In [51]:
from pathlib import Path

import pandas as pd
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import train_test_split
from sklearn.metrics import r2_score, mean_absolute_error
import joblib

In [52]:
# Load the raw listings, indexed by listing id.
cars_raw = pd.read_csv('../datasets/cars_dataset.csv').set_index('id')

# Columns used for modelling; every other column of the CSV is ignored.
keep_cols = [
    'Mark', 'Model', 'Generation', 'Car_type', 'Restyling',
    'Fuel', 'Drive', 'Transmissions', 'Boost_type',
    'Engine_volume',
    'Year', 'Price'
]

# Select the modelling columns *before* dropping incomplete rows, so that a row
# is only discarded when one of the columns we actually use is missing (a gap in
# an unused column is irrelevant to the models).
# The whole transformation is one chain producing a fresh frame: no in-place
# mutation, no SettingWithCopyWarning, and the cell can be re-run safely.
# NOTE(review): the training log prints names with doubled spaces, which suggests
# trailing whitespace in Mark/Model/Generation — confirm before stripping, since
# saved model file names are derived from `full_name`.
df = (
    cars_raw[keep_cols]
    .dropna()
    .assign(full_name=lambda d: d['Mark'] + ' ' + d['Model'] + ' ' + d['Generation'])
    .drop(columns=['Mark', 'Model', 'Generation'])
)
print(df.shape)
df.head()

(17194, 10)


Unnamed: 0_level_0,Car_type,Restyling,Fuel,Drive,Transmissions,Boost_type,Engine_volume,Year,Price,full_name
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
2,SUV,0.0,AI-95,full,9.0,turbocharging,3982.0,2019.0,13890000.0,Mercedes-Benz G-Class III (W463) 2018 – now
3,SUV,0.0,AI-95,full,9.0,turbocharging,3982.0,2020.0,13116000.0,Mercedes-Benz G-Class III (W463) 2018 – now
4,SUV,0.0,AI-95,full,9.0,turbocharging,3982.0,2019.0,11450000.0,Mercedes-Benz G-Class III (W463) 2018 – now
8,SUV,0.0,AI-95,full,9.0,turbocharging,3982.0,2020.0,18545400.0,Mercedes-Benz G-Class III (W463) 2018 – now
11,SUV,0.0,AI-95,full,9.0,turbocharging,3982.0,2020.0,15048000.0,Mercedes-Benz G-Class III (W463) 2018 – now


In [53]:
# Containers filled by the training cell: fitted estimators and their hold-out
# metrics, both keyed by `full_name`.
models = {}
metrics = {}

# Features that are integer-encoded before training.
categorical_features = [
    'Car_type', 'Restyling',
    'Fuel', 'Drive', 'Transmissions', 'Boost_type'
]

# Replace each categorical column by its integer category code and remember the
# code -> original value mapping. Without it the saved models cannot be applied
# to raw (un-encoded) inputs later, because the codes depend on the set of
# values present in this particular dataset.
# Run this cell once per fresh load of `df`: a second run would encode the codes
# themselves and record an identity mapping.
category_mappings = {}
for col in categorical_features:
    as_category = df[col].astype('category')
    category_mappings[col] = dict(enumerate(as_category.cat.categories))
    df[col] = as_category.cat.codes

# Persist the mapping so the encoding can be reproduced at inference time.
joblib.dump(category_mappings, 'category_mappings.pkl')



In [54]:
# Train one RandomForest price model per car generation (`full_name`), evaluate
# it on a 20% hold-out split, save it to disk and collect the metrics.

MIN_GROUP_SIZE = 10          # generations with fewer rows are skipped
MODELS_DIR = Path('./models')
# joblib.dump does not create missing directories, so make sure the target exists.
MODELS_DIR.mkdir(parents=True, exist_ok=True)

# Start every run from a clean slate: otherwise re-running this cell after the
# data changed would leave stale entries in `models`/`metrics`, and the averages
# below (sum over this run / len(models)) would be computed inconsistently.
models = {}
metrics = {}
all_mae = 0
all_r = 0

for name, group in df.groupby('full_name'):
    if len(group) < MIN_GROUP_SIZE:
        print(f"\n⚠️ Пропуск: {name} (слишком мало данных.)")
        continue

    print(f"\n🚗 Обучаем модель для: {name} ({len(group)} записей)")

    # Everything except the target and the grouping key is a feature.
    X = group.drop(columns=['Price', 'full_name'])
    y = group['Price']

    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

    model = RandomForestRegressor(
        n_estimators=200,
        max_depth=20,
        random_state=42
    )

    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)

    r2 = r2_score(y_test, y_pred)
    mae = mean_absolute_error(y_test, y_pred)
    metrics[name] = {"r2": r2, "mae": mae}

    print(f"✅ R² = {r2:.3f}, MAE = {mae:.0f} руб.")

    models[name] = model
    # File name scheme is unchanged: '/' and ' ' in the name become '_'.
    safe_name = name.replace('/', '_').replace(' ', '_')
    joblib.dump(model, MODELS_DIR / f"model_{safe_name}.pkl")
    all_mae += mae
    all_r += r2

print("\n=== Обучение завершено! ===")
print(f"Количество обученных моделей: {len(models)}")
if models:
    print(f"Средняя MAE: {all_mae / len(models):.0f} руб.")
    print(f"Среднее R²: {all_r / len(models):.3f}")
else:
    # No generation reached MIN_GROUP_SIZE rows; averaging would divide by zero.
    print("Ни одна модель не обучена: во всех группах слишком мало данных.")

# === 7. Save the per-model metrics to a file ===
pd.DataFrame(metrics).T.to_csv('submodel_metrics.csv')


⚠️ Пропуск: AMC  Eagle  1979 – 1987  (слишком мало данных.)

⚠️ Пропуск: Acura  MDX  III 2013 – 2015  (слишком мало данных.)

🚗 Обучаем модель для: Acura  RDX  I 2006 – 2009  (22 записей)
✅ R² = 0.588, MAE = 97886 руб.

⚠️ Пропуск: Acura  RDX  I Restyling 2009 – 2012  (слишком мало данных.)

⚠️ Пропуск: Acura  RDX  II 2013 – 2015  (слишком мало данных.)

⚠️ Пропуск: Acura  TLX  I 2014 – 2017  (слишком мало данных.)

⚠️ Пропуск: Alfa Romeo  147  I Restyling 2004 – 2010  (слишком мало данных.)

⚠️ Пропуск: Alfa Romeo  159  2005 – 2011  (слишком мало данных.)

⚠️ Пропуск: Alfa Romeo  Brera  2006 – 2010  (слишком мало данных.)

⚠️ Пропуск: Aston Martin  DB11  I 2016 – now  (слишком мало данных.)

🚗 Обучаем модель для: Audi  100  III (C3) 1982 – 1988  (16 записей)
✅ R² = -0.717, MAE = 85839 руб.

🚗 Обучаем модель для: Audi  100  IV (C4) 1990 – 1994  (26 записей)
✅ R² = 0.135, MAE = 21283 руб.

🚗 Обучаем модель для: Audi  80  IV (B3) 1986 – 1991  (14 записей)
✅ R² = -2.005, MAE = 49843 руб.

In [55]:
# from sklearn.preprocessing import LabelEncoder

# cat_cols = ['full_name', 'restyling', 'fuel_t',
#             'transmission', 'drive', 'body-type', 'steering', 'equipment']

# encoders = {}
# for col in cat_cols:
#     le = LabelEncoder()
#     df[col] = le.fit_transform(df[col].astype(str))
#     encoders[col] = le  

In [56]:
# from sklearn.model_selection import train_test_split

# X = df.drop(columns=['price'])
# y = df['price']

# X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)


In [57]:
# from sklearn.model_selection import GridSearchCV
# from sklearn.ensemble import RandomForestRegressor

# param_grid = {
#     'n_estimators': [100],
#     'random_state': [i for i in range(42, 52)],
#     'max_depth': [10, 15, 20, None],
#     'min_samples_split': [2, 5, 10],
#     'min_samples_leaf': [1, 2, 4],
#     'max_features': ['sqrt', 'log2', 0.7],
#     'bootstrap': [True, False]
# }

# rf = RandomForestRegressor(random_state=42, n_jobs=-1)

# grid = GridSearchCV(estimator=rf, param_grid=param_grid,
#                     scoring='r2', cv=3, n_jobs=-1, verbose=2)

# grid.fit(X_train, y_train)
# print("Лучшие параметры:", grid.best_params_)
# print("Лучший R²:", grid.best_score_)


In [58]:

# from sklearn.ensemble import RandomForestRegressor
# from sklearn.metrics import mean_absolute_error, r2_score


# rf = RandomForestRegressor(
#     n_estimators=100, 
#     bootstrap=False, 
#     random_state=5,
#     max_depth=15, 
#     max_features=0.7,
#     min_samples_leaf=4,
#     min_samples_split=2)
# rf.fit(X_train, y_train)

# y_pred = rf.predict(X_test)
# print("MAE:", mean_absolute_error(y_test, y_pred))
# print("R2 :", r2_score(y_test, y_pred))


In [59]:
# y_pred = rf.predict(X_test)

# print("MAE:", mean_absolute_error(y_test, y_pred))
# print("R2 :", r2_score(y_test, y_pred))


In [60]:
# import matplotlib.pyplot as plt

# feat_imp = pd.Series(rf.feature_importances_, index=X.columns)
# feat_imp.sort_values(ascending=True).plot(kind='barh', figsize=(10,6))
# plt.title("Важность признаков (Random Forest)")
# plt.show()


In [61]:
# import joblib

# joblib.dump(rf, 'car_price_model.pkl')
# joblib.dump(encoders, 'label_encoders.pkl')
