In [5]:
# Импорт необходимых библиотек
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier, RandomForestRegressor
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, mean_absolute_error, mean_squared_error, r2_score
from sklearn.preprocessing import StandardScaler

# Загрузка данных для Titanic и House Prices
def load_and_preprocess_titanic(filepath):
    data = pd.read_csv(filepath)
    data['Age'].fillna(data['Age'].median(), inplace=True)
    data['Fare'].fillna(data['Fare'].median(), inplace=True)
    data.dropna(subset=['Embarked'], inplace=True)
    data = pd.get_dummies(data, columns=['Sex', 'Embarked'])
    features = ['Pclass', 'Age', 'SibSp', 'Parch', 'Fare', 'Sex_female', 'Sex_male', 'Embarked_C', 'Embarked_Q', 'Embarked_S']
    X = data[features]
    y = data['Survived']
    return train_test_split(X, y, test_size=0.2, random_state=42)

def load_and_preprocess_house_data(filepath):
    data = pd.read_csv(filepath)
    data['LotFrontage'].fillna(data['LotFrontage'].median(), inplace=True)
    data = pd.get_dummies(data)
    features = ['OverallQual', 'GrLivArea', 'GarageCars', 'TotalBsmtSF', '1stFlrSF']
    X = data[features]
    y = data['SalePrice']
    return train_test_split(X, y, test_size=0.2, random_state=42)

# Загрузка и разделение данных
X_train_titanic, X_test_titanic, y_train_titanic, y_test_titanic = load_and_preprocess_titanic('titanic/train.csv')
X_train_house, X_test_house, y_train_house, y_test_house = load_and_preprocess_house_data('house-prices-advanced-regression-techniques/train.csv')

In [2]:
# Обучение модели Random Forest для классификации на данных Titanic
rf_clf = RandomForestClassifier(random_state=42, n_estimators=100)
rf_clf.fit(X_train_titanic, y_train_titanic)

# Предсказания
y_pred_titanic = rf_clf.predict(X_test_titanic)

In [3]:
# Обучение модели Random Forest для регрессии на данных House Prices
rf_regr = RandomForestRegressor(random_state=42, n_estimators=100)
rf_regr.fit(X_train_house, y_train_house)

# Предсказания
y_pred_house = rf_regr.predict(X_test_house)

In [6]:
print("Random Forest Classifier - Titanic Dataset:")
print("Accuracy:", accuracy_score(y_test_titanic, y_pred_titanic))
print("Precision:", precision_score(y_test_titanic, y_pred_titanic))
print("Recall:", recall_score(y_test_titanic, y_pred_titanic))
print("F1 Score:", f1_score(y_test_titanic, y_pred_titanic))

print("\nRandom Forest Regressor - House Prices Dataset:")
print("Mean Absolute Error (MAE):", mean_absolute_error(y_test_house, y_pred_house))
print("Mean Squared Error (MSE):", mean_squared_error(y_test_house, y_pred_house))
print("Root Mean Squared Error (RMSE):", np.sqrt(mean_squared_error(y_test_house, y_pred_house)))
print("R-squared (R²):", r2_score(y_test_house, y_pred_house))

Random Forest Classifier - Titanic Dataset:
Accuracy: 0.7640449438202247
Precision: 0.68
Recall: 0.7391304347826086
F1 Score: 0.7083333333333334

Random Forest Regressor - House Prices Dataset:
Mean Absolute Error (MAE): 19993.524931237902
Mean Squared Error (MSE): 876487910.748193
Root Mean Squared Error (RMSE): 29605.538514747423
R-squared (R²): 0.8857300345418269


In [7]:
def enhanced_preprocess_titanic(filepath):
    data = pd.read_csv(filepath)
    # Интеллектуальное заполнение пропущенных значений
    data['Age'].fillna(data['Age'].median(), inplace=True)
    data['Fare'].fillna(data.groupby('Pclass')['Fare'].transform('median'), inplace=True)
    data.dropna(subset=['Embarked'], inplace=True)

    # Лог-преобразование тарифа для уменьшения воздействия выбросов
    data['Fare'] = np.log1p(data['Fare'])

    data = pd.get_dummies(data, columns=['Sex', 'Embarked'])
    features = ['Pclass', 'Age', 'SibSp', 'Parch', 'Fare', 'Sex_female', 'Sex_male', 'Embarked_C', 'Embarked_Q', 'Embarked_S']
    X = data[features]
    y = data['Survived']
    return train_test_split(X, y, test_size=0.2, random_state=42)

# Аналогичная процедура для House Prices
def enhanced_preprocess_house(filepath):
    data = pd.read_csv(filepath)
    data['LotFrontage'].fillna(data.groupby('Neighborhood')['LotFrontage'].transform('median'), inplace=True)

    # Анализ важности признаков
    importances = rf_regr.feature_importances_
    indices = np.argsort(importances)[::-1]

    # Используем только самые важные признаки
    important_features = ['OverallQual', 'GrLivArea', 'GarageCars', 'TotalBsmtSF', '1stFlrSF']

    data = pd.get_dummies(data)
    X = data[important_features]
    y = data['SalePrice']
    return train_test_split(X, y, test_size=0.2, random_state=42)

In [10]:
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV

param_grid_classifier = {
    'n_estimators': [100, 200],
    'max_features': ['auto', 'sqrt'],
    'max_depth': [None, 6, 10],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4],
    'bootstrap': [True, False]
}

grid_clf = GridSearchCV(estimator=rf_clf, param_grid=param_grid_classifier, cv=5, scoring='accuracy', n_jobs=-1)
grid_clf.fit(X_train_titanic, y_train_titanic)

print("Best parameters for Random Forest Classifier:", grid_clf.best_params_)

Best parameters for Random Forest Classifier: {'bootstrap': False, 'max_depth': 10, 'max_features': 'auto', 'min_samples_leaf': 1, 'min_samples_split': 5, 'n_estimators': 200}


In [11]:
# Обучение улучшенной модели Random Forest для Titanic
improved_rf_clf = RandomForestClassifier(**grid_clf.best_params_, random_state=42)
improved_rf_clf.fit(X_train_titanic, y_train_titanic)
y_pred_titanic_improved = improved_rf_clf.predict(X_test_titanic)

# Аналогичный процесс для Random Forest для House Prices с улучшением
grid_regr = RandomizedSearchCV(estimator=rf_regr, param_distributions=param_grid_classifier, cv=5, n_jobs=-1, scoring='neg_mean_squared_error', n_iter=100, random_state=42)
grid_regr.fit(X_train_house, y_train_house)

improved_rf_regr = RandomForestRegressor(**grid_regr.best_params_, random_state=42)
improved_rf_regr.fit(X_train_house, y_train_house)
y_pred_house_improved = improved_rf_regr.predict(X_test_house)

In [12]:
print("Improved Random Forest Classifier - Titanic Dataset:")
print("Accuracy:", accuracy_score(y_test_titanic, y_pred_titanic_improved))
print("Precision:", precision_score(y_test_titanic, y_pred_titanic_improved))
print("Recall:", recall_score(y_test_titanic, y_pred_titanic_improved))
print("F1 Score:", f1_score(y_test_titanic, y_pred_titanic_improved))

print("\nImproved Random Forest Regressor - House Prices Dataset:")
print("Mean Absolute Error (MAE):", mean_absolute_error(y_test_house, y_pred_house_improved))
print("Mean Squared Error (MSE):", mean_squared_error(y_test_house, y_pred_house_improved))
print("Root Mean Squared Error (RMSE):", np.sqrt(mean_squared_error(y_test_house, y_pred_house_improved)))
print("R-squared (R²):", r2_score(y_test_house, y_pred_house_improved))


Improved Random Forest Classifier - Titanic Dataset:
Accuracy: 0.7921348314606742
Precision: 0.7424242424242424
Recall: 0.7101449275362319
F1 Score: 0.725925925925926

Improved Random Forest Regressor - House Prices Dataset:
Mean Absolute Error (MAE): 20588.888082848378
Mean Squared Error (MSE): 1081613086.9121847
Root Mean Squared Error (RMSE): 32887.88662885143
R-squared (R²): 0.8589873419074786


In [20]:
import numpy as np
from collections import Counter
from sklearn.tree import DecisionTreeClassifier
from sklearn.utils import resample

class SimpleRandomForestClassifier:
    def __init__(self, n_estimators=10, max_depth=None, bootstrap=True, random_state=None):
        self.n_estimators = n_estimators
        self.max_depth = max_depth
        self.bootstrap = bootstrap
        self.random_state = random_state
        self.trees = []

    def fit(self, X, y):
        np.random.seed(self.random_state)
        for _ in range(self.n_estimators):
            if self.bootstrap:
                X_sample, y_sample = resample(X, y, n_samples=len(X), random_state=self.random_state)
            else:
                X_sample, y_sample = X, y

            tree = DecisionTreeClassifier(max_depth=self.max_depth, random_state=self.random_state)
            tree.fit(X_sample, y_sample)
            self.trees.append(tree)

    def predict(self, X):
        tree_preds = np.array([tree.predict(X) for tree in self.trees])
        tree_preds_majority = [Counter(tree_preds[:, i]).most_common(1)[0][0] for i in range(X.shape[0])]
        return np.array(tree_preds_majority)
    

In [21]:
from sklearn.tree import DecisionTreeRegressor

class SimpleRandomForestRegressor:
    def __init__(self, n_estimators=10, max_depth=None, bootstrap=True, random_state=None):
        self.n_estimators = n_estimators
        self.max_depth = max_depth
        self.bootstrap = bootstrap
        self.random_state = random_state
        self.trees = []

    def fit(self, X, y):
        np.random.seed(self.random_state)
        for _ in range(self.n_estimators):
            if self.bootstrap:
                X_sample, y_sample = resample(X, y, n_samples=len(X), random_state=self.random_state)
            else:
                X_sample, y_sample = X, y
            
            tree = DecisionTreeRegressor(max_depth=self.max_depth, random_state=self.random_state)
            tree.fit(X_sample, y_sample)
            self.trees.append(tree)

    def predict(self, X):
        tree_preds = np.array([tree.predict(X) for tree in self.trees])
        return np.mean(tree_preds, axis=0)

In [22]:
# Обучаем простую модель случайного леса для Titanic
simple_rf_clf = SimpleRandomForestClassifier(n_estimators=10, max_depth=None, random_state=42)
simple_rf_clf.fit(X_train_titanic, y_train_titanic)
y_pred_titanic_simple = simple_rf_clf.predict(X_test_titanic)

In [23]:
# Обучаем простую модель случайного леса для House Prices
simple_rf_regr = SimpleRandomForestRegressor(n_estimators=10, max_depth=None, random_state=42)
simple_rf_regr.fit(X_train_house, y_train_house)
y_pred_house_simple = simple_rf_regr.predict(X_test_house)

In [24]:
print("Simple Random Forest Classifier - Titanic Dataset:")
print("Accuracy:", accuracy_score(y_test_titanic, y_pred_titanic_simple))
print("Precision:", precision_score(y_test_titanic, y_pred_titanic_simple))
print("Recall:", recall_score(y_test_titanic, y_pred_titanic_simple))
print("F1 Score:", f1_score(y_test_titanic, y_pred_titanic_simple))

print("\nSimple Random Forest Regressor - House Prices Dataset:")
print("Mean Absolute Error (MAE):", mean_absolute_error(y_test_house, y_pred_house_simple))
print("Mean Squared Error (MSE):", mean_squared_error(y_test_house, y_pred_house_simple))
print("Root Mean Squared Error (RMSE):", np.sqrt(mean_squared_error(y_test_house, y_pred_house_simple)))
print("R-squared (R²):", r2_score(y_test_house, y_pred_house_simple))

Simple Random Forest Classifier - Titanic Dataset:
Accuracy: 0.7528089887640449
Precision: 0.6582278481012658
Recall: 0.7536231884057971
F1 Score: 0.7027027027027026

Simple Random Forest Regressor - House Prices Dataset:
Mean Absolute Error (MAE): 28740.007990867576
Mean Squared Error (MSE): 1703506782.8245814
Root Mean Squared Error (RMSE): 41273.56033618352
R-squared (R²): 0.7779094738854275


In [26]:
def enhanced_preprocess_titanic(filepath):
    data = pd.read_csv(filepath)
    
    # Заполнение пропусков и лог-преобразование для устойчивости к выбросам
    data['Age'].fillna(data['Age'].median(), inplace=True)
    data['Fare'].fillna(data.groupby('Pclass')['Fare'].transform('median'), inplace=True)
    data['Fare'] = np.log1p(data['Fare'])
    
    # Очистка данных
    data.dropna(subset=['Embarked'], inplace=True)

    # Кодирование категориальных признаков
    data = pd.get_dummies(data, columns=['Sex', 'Embarked'])
    
    features = ['Pclass', 'Age', 'SibSp', 'Parch', 'Fare', 'Sex_female', 'Sex_male', 'Embarked_C', 'Embarked_Q', 'Embarked_S']
    X = data[features]
    y = data['Survived']
    
    return train_test_split(X, y, test_size=0.2, random_state=42)

X_train_titanic, X_test_titanic, y_train_titanic, y_test_titanic = enhanced_preprocess_titanic('titanic/train.csv')

In [27]:
def enhanced_preprocess_house(filepath):
    data = pd.read_csv(filepath)
    
    # Заполнение пропусков для LotFrontage
    data['LotFrontage'].fillna(data.groupby('Neighborhood')['LotFrontage'].transform('median'), inplace=True)

    # Важные признаки на основе предыдущего анализа
    important_features = ['OverallQual', 'GrLivArea', 'GarageCars', 'TotalBsmtSF', '1stFlrSF']
    
    data = pd.get_dummies(data)
    X = data[important_features]
    y = data['SalePrice']
    
    return train_test_split(X, y, test_size=0.2, random_state=42)

X_train_house, X_test_house, y_train_house, y_test_house = enhanced_preprocess_house('house-prices-advanced-regression-techniques/train.csv')

In [28]:
# Улучшенное обучение моделей с обновленной предобработкой данных
improved_simple_rf_clf = SimpleRandomForestClassifier(n_estimators=20, max_depth=6, random_state=42)
improved_simple_rf_clf.fit(X_train_titanic, y_train_titanic)
y_pred_titanic_improved = improved_simple_rf_clf.predict(X_test_titanic)

In [29]:
# Улучшенное обучение моделей с обновленной предобработкой данных
improved_simple_rf_regr = SimpleRandomForestRegressor(n_estimators=20, max_depth=6, random_state=42)
improved_simple_rf_regr.fit(X_train_house, y_train_house)
y_pred_house_improved = improved_simple_rf_regr.predict(X_test_house)

In [30]:
print("Improved Simple Random Forest Classifier - Titanic Dataset:")
print("Accuracy:", accuracy_score(y_test_titanic, y_pred_titanic_improved))
print("Precision:", precision_score(y_test_titanic, y_pred_titanic_improved))
print("Recall:", recall_score(y_test_titanic, y_pred_titanic_improved))
print("F1 Score:", f1_score(y_test_titanic, y_pred_titanic_improved))

print("\nImproved Simple Random Forest Regressor - House Prices Dataset:")
print("Mean Absolute Error (MAE):", mean_absolute_error(y_test_house, y_pred_house_improved))
print("Mean Squared Error (MSE):", mean_squared_error(y_test_house, y_pred_house_improved))
print("Root Mean Squared Error (RMSE):", np.sqrt(mean_squared_error(y_test_house, y_pred_house_improved)))
print("R-squared (R²):", r2_score(y_test_house, y_pred_house_improved))

Improved Simple Random Forest Classifier - Titanic Dataset:
Accuracy: 0.7640449438202247
Precision: 0.6901408450704225
Recall: 0.7101449275362319
F1 Score: 0.7

Improved Simple Random Forest Regressor - House Prices Dataset:
Mean Absolute Error (MAE): 25109.791981868377
Mean Squared Error (MSE): 1288310921.053171
Root Mean Squared Error (RMSE): 35893.04836668475
R-squared (R²): 0.8320396178397187
