In [1]:
import pandas as pd
from sklearn.model_selection import StratifiedKFold, cross_val_score
from sklearn import linear_model, tree, ensemble

In [24]:
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, classification_report

In [2]:
import pandas as pd
import numpy as np
from sklearn.model_selection import KFold, cross_val_score, train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.tree import DecisionTreeRegressor
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.impute import SimpleImputer
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
import warnings
warnings.filterwarnings('ignore')

In [3]:
train_data = pd.read_csv('/content/drive/MyDrive/Обучение/Правильное построение кросс-валидации/train.csv')

In [4]:
train_data.head()

Unnamed: 0,Id,MSSubClass,MSZoning,LotFrontage,LotArea,Street,Alley,LotShape,LandContour,Utilities,...,PoolArea,PoolQC,Fence,MiscFeature,MiscVal,MoSold,YrSold,SaleType,SaleCondition,SalePrice
0,1,60,RL,65.0,8450,Pave,,Reg,Lvl,AllPub,...,0,,,,0,2,2008,WD,Normal,208500
1,2,20,RL,80.0,9600,Pave,,Reg,Lvl,AllPub,...,0,,,,0,5,2007,WD,Normal,181500
2,3,60,RL,68.0,11250,Pave,,IR1,Lvl,AllPub,...,0,,,,0,9,2008,WD,Normal,223500
3,4,70,RL,60.0,9550,Pave,,IR1,Lvl,AllPub,...,0,,,,0,2,2006,WD,Abnorml,140000
4,5,60,RL,84.0,14260,Pave,,IR1,Lvl,AllPub,...,0,,,,0,12,2008,WD,Normal,250000


In [5]:
train_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1460 entries, 0 to 1459
Data columns (total 81 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   Id             1460 non-null   int64  
 1   MSSubClass     1460 non-null   int64  
 2   MSZoning       1460 non-null   object 
 3   LotFrontage    1201 non-null   float64
 4   LotArea        1460 non-null   int64  
 5   Street         1460 non-null   object 
 6   Alley          91 non-null     object 
 7   LotShape       1460 non-null   object 
 8   LandContour    1460 non-null   object 
 9   Utilities      1460 non-null   object 
 10  LotConfig      1460 non-null   object 
 11  LandSlope      1460 non-null   object 
 12  Neighborhood   1460 non-null   object 
 13  Condition1     1460 non-null   object 
 14  Condition2     1460 non-null   object 
 15  BldgType       1460 non-null   object 
 16  HouseStyle     1460 non-null   object 
 17  OverallQual    1460 non-null   int64  
 18  OverallC

In [6]:
train_data.isnull().sum()

Unnamed: 0,0
Id,0
MSSubClass,0
MSZoning,0
LotFrontage,259
LotArea,0
...,...
MoSold,0
YrSold,0
SaleType,0
SaleCondition,0


In [7]:
# Анализ пропущенных значений
missing_values = train_data.isnull().sum()
missing_percentage = (missing_values / len(train_data)) * 100
missing_df = pd.DataFrame({
    'Пропущенные значения': missing_values,
    'Процент': missing_percentage
})
missing_df = missing_df[missing_df['Пропущенные значения'] > 0].sort_values('Процент', ascending=False)

print(missing_df)

              Пропущенные значения    Процент
PoolQC                        1453  99.520548
MiscFeature                   1406  96.301370
Alley                         1369  93.767123
Fence                         1179  80.753425
MasVnrType                     872  59.726027
FireplaceQu                    690  47.260274
LotFrontage                    259  17.739726
GarageType                      81   5.547945
GarageYrBlt                     81   5.547945
GarageFinish                    81   5.547945
GarageQual                      81   5.547945
GarageCond                      81   5.547945
BsmtExposure                    38   2.602740
BsmtFinType2                    38   2.602740
BsmtQual                        37   2.534247
BsmtCond                        37   2.534247
BsmtFinType1                    37   2.534247
MasVnrArea                       8   0.547945
Electrical                       1   0.068493


In [10]:
# Функция предобработки данных
def simple_preprocess(df):
    df_processed = df.copy()

    # Удаляем Id
    if 'Id' in df_processed.columns:
        df_processed = df_processed.drop('Id', axis=1)

    # Логарифмирование целевой переменной
    y = np.log1p(df_processed['SalePrice'])
    X = df_processed.drop('SalePrice', axis=1)

    # Разделяем признаки
    numerical_cols = X.select_dtypes(include=[np.number]).columns.tolist()
    categorical_cols = X.select_dtypes(include=['object']).columns.tolist()

    # Заполняем пропущенные значения
    # Числовые - медианой
    for col in numerical_cols:
        X[col] = X[col].fillna(X[col].median())

    # Категориальные - модой
    for col in categorical_cols:
        X[col] = X[col].fillna(X[col].mode()[0] if len(X[col].mode()) > 0 else 'missing')

    # Масштабируем числовые признаки
    scaler = StandardScaler()
    X[numerical_cols] = scaler.fit_transform(X[numerical_cols])

    # Кодируем категориальные признаки с помощью OneHotEncoding
    X_encoded = pd.get_dummies(X, columns=categorical_cols)

    feature_names = X_encoded.columns.tolist()

    return X_encoded.values, y, feature_names

In [11]:
# Предобработка данных
X_processed, y, feature_names = simple_preprocess(train_data)

In [14]:
# K-Fold Cross Validation
k = 5
kf = KFold(n_splits=k, shuffle=True, random_state=42)

# Модели для сравнения
models = {
    'Random Forest': RandomForestRegressor(
        n_estimators=100,
        max_depth=10,
        random_state=42,
        n_jobs=-1
    ),
    'Decision Tree': DecisionTreeRegressor(
        max_depth=10,
        random_state=42
    )
}

In [21]:
results = {}
for name, model in models.items():
    # Вычисляем R² score для каждого фолда
    cv_scores = cross_val_score(model, X_processed, y, cv=kf, scoring='r2')

    results[name] = {
        'scores': cv_scores,
        'mean_score': cv_scores.mean(),
        'std_score': cv_scores.std(),
        'min_score': cv_scores.min(),
        'max_score': cv_scores.max()
    }

    print(f"\n{name}:")
    print(f"  Средний R² score: {cv_scores.mean():.4f} (±{cv_scores.std():.4f})")
    print(f"  Лучший score: {cv_scores.max():.4f}")
    print(f"  Худший score: {cv_scores.min():.4f}")
    print(f"  Scores по фолдам: {[f'{x:.4f}' for x in cv_scores]}")


Random Forest:
  Средний R² score: 0.8610 (±0.0441)
  Лучший score: 0.8913
  Худший score: 0.7737
  Scores по фолдам: ['0.8813', '0.8913', '0.7737', '0.8723', '0.8862']

Decision Tree:
  Средний R² score: 0.7280 (±0.0745)
  Лучший score: 0.8305
  Худший score: 0.6147
  Scores по фолдам: ['0.8305', '0.7472', '0.6147', '0.7689', '0.6790']


DECISION TREE CLASSIFIER TUNING

In [31]:
X_classification = X_processed
median_price = train_data['SalePrice'].median()
y_binary = (train_data['SalePrice'] > median_price).astype(int)

In [32]:
dt_param_grid = {
    'max_depth': [3, 5, 7, 10, 15, 20, None],
    'min_samples_split': [2, 5, 10, 15, 20],
    'min_samples_leaf': [1, 2, 4, 6, 8],
    'max_features': ['auto', 'sqrt', 'log2', None],
    'criterion': ['gini', 'entropy'],
    'splitter': ['best', 'random']
}

In [33]:
dt_grid = RandomizedSearchCV(
    DecisionTreeClassifier(random_state=42),
    dt_param_grid,
    n_iter=50,  # Ограничиваем количество итераций
    cv=3,
    scoring='accuracy',
    n_jobs=-1,
    random_state=42,
    verbose=1
)
dt_grid.fit(X_classification, y_binary)

Fitting 3 folds for each of 50 candidates, totalling 150 fits


In [34]:
print(f"Лучшие параметры Decision Tree: {dt_grid.best_params_}")
print(f"Лучшая accuracy: {dt_grid.best_score_:.4f}")

Лучшие параметры Decision Tree: {'splitter': 'best', 'min_samples_split': 5, 'min_samples_leaf': 8, 'max_features': None, 'max_depth': 20, 'criterion': 'gini'}
Лучшая accuracy: 0.8849


RANDOM FOREST CLASSIFIER TUNING

In [35]:
rf_param_grid = {
    'n_estimators': [50, 100, 200],
    'max_depth': [5, 10, 15, 20, None],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4],
    'max_features': ['auto', 'sqrt', 'log2'],
    'bootstrap': [True, False],
    'class_weight': [None, 'balanced', 'balanced_subsample']
}

In [36]:
rf_grid = RandomizedSearchCV(
    RandomForestClassifier(random_state=42, n_jobs=-1),
    rf_param_grid,
    n_iter=30,  # Ограничиваем количество итераций
    cv=3,
    scoring='accuracy',
    n_jobs=-1,
    random_state=42,
    verbose=1
)
rf_grid.fit(X_classification, y_binary)

Fitting 3 folds for each of 30 candidates, totalling 90 fits


In [37]:
print(f"Лучшие параметры Random Forest: {rf_grid.best_params_}")
print(f"Лучшая accuracy: {rf_grid.best_score_:.4f}")

Лучшие параметры Random Forest: {'n_estimators': 100, 'min_samples_split': 5, 'min_samples_leaf': 2, 'max_features': 'sqrt', 'max_depth': 15, 'class_weight': 'balanced_subsample', 'bootstrap': False}
Лучшая accuracy: 0.9267
