## Train linear regression model to predict prices

In [69]:
%load_ext autoreload
%autoreload 2

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [70]:
import pandas as pd
from src.paths import TRANSFORMED_DATA_DIR, MODELS_DIR

In [71]:
# Read the transformed dataset (CSV) from the project's transformed-data directory
data = pd.read_csv(TRANSFORMED_DATA_DIR / 'transformed_data_for_lr.csv')

In [72]:
# Sanity check: (rows, columns) of the loaded frame
data.shape

(4817, 137)

In [73]:
# Cast object-dtype columns to categorical via the project helper
from src.data import convert_object_columns_to_category

data = data.pipe(convert_object_columns_to_category)

Split data into train and test sets

In [74]:
from src.data import get_train_test_data

In [75]:
# Keep the column index around; the feature groups below are selected from it by name
data_columns = data.columns

In [76]:
def _columns_containing(fragment):
    """Return the names in ``data_columns`` that contain ``fragment`` (substring match)."""
    return [name for name in data_columns if fragment in name]


# One list of column names per group, selected by the fragment in the column name
model_keys = _columns_containing('model_key')
fuels = _columns_containing('fuel')
paint_colors = _columns_containing('paint_color')
car_types = _columns_containing('car_type')
month_sold_ats = _columns_containing('month_sold_at')
season_sold_ats = _columns_containing('season_sold_at')
model_initials = _columns_containing('model_initial')

In [77]:
# Columns listed explicitly by name
explicit_columns = [
    'mileage', 'engine_power',
    'feature_1', 'feature_2', 'feature_3', 'feature_4',
    'feature_5', 'feature_6', 'feature_7', 'feature_8',
    'age_in_months_when_sold',
]
# Column groups selected by name fragment, in the order they are appended
column_groups = (model_keys, fuels, paint_colors, car_types,
                 month_sold_ats, season_sold_ats, model_initials)

# Final model inputs: explicit columns first, then every group flattened in order
features = explicit_columns + [name for group in column_groups for name in group]
target = 'price'

In [78]:
# there should be 137 - 4 = 133 features
# Guard against a column being selected twice: the group lists are built by
# substring matching on column names, so one name could match several fragments.
assert len(set(features)) == len(features), "duplicate entries in features"
len(features)

133

In [79]:
# Persist the feature list and the target name in the models directory
import joblib

features_path, target_path = (
    MODELS_DIR / filename for filename in ('features.pkl', 'target.pkl')
)

joblib.dump(features, features_path)
joblib.dump(target, target_path)

['C:\\Users\\ciroalfonsom\\learning\\cars-price\\models\\target.pkl']

In [80]:
# Build the full (X, y) pair plus the train/test partitions with the project helper.
# NOTE(review): the split ratio and any random seed live inside get_train_test_data —
# confirm it is seeded so the metrics below are reproducible.
X, y, X_train, X_test, y_train, y_test = get_train_test_data(data, features, target)

In [81]:
# Label the feature-matrix (X) and target (y) shapes separately; previously both
# pairs were printed as "Train shape"/"Test shape", making the output ambiguous.
print(f"X train shape: {X_train.shape}")
print(f"X test shape: {X_test.shape}")
print(f"y train shape: {y_train.shape}")
print(f"y test shape: {y_test.shape}")

Train shape: (3853, 133)
Test shape: (964, 133)
Train shape: (3853,)
Test shape: (964,)


Train model

In [82]:
from sklearn.linear_model import LinearRegression

In [83]:
# Linear regression estimator with scikit-learn's default settings
regressor = LinearRegression()

In [84]:
# Fit on the training split only; the test split is held out for evaluation below
regressor.fit(X_train, y_train)

In [85]:
# In-sample predictions (used for the train metrics)
pred_train = regressor.predict(X_train)

In [86]:
# Predictions on the held-out test split (used for the test metrics)
pred_test = regressor.predict(X_test)

Calculate regression error metrics (MSE, RMSE, MAE, R²)

In [87]:
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score, root_mean_squared_error

In [88]:
# Evaluate every metric on the train split, in the order MSE, RMSE, MAE, R2
mse_train, rmse_train, mae_train, r2_train = (
    metric(y_train, pred_train)
    for metric in (mean_squared_error, root_mean_squared_error,
                   mean_absolute_error, r2_score)
)

In [89]:
# Evaluate every metric on the test split, in the order MSE, RMSE, MAE, R2
mse_test, rmse_test, mae_test, r2_test = (
    metric(y_test, pred_test)
    for metric in (mean_squared_error, root_mean_squared_error,
                   mean_absolute_error, r2_score)
)

In [90]:
# Report the train-split metrics, one per line
for label, value in (("MSE", mse_train), ("RMSE", rmse_train),
                     ("MAE", mae_train), ("R2", r2_train)):
    print(f"Train {label}: {value}")

Train MSE: 22242775.458663937
Train RMSE: 4716.22470400467
Train MAE: 2356.7496063647104
Train R2: 0.7468706209043934


In [91]:
# Report the test-split metrics, one per line
for label, value in (("MSE", mse_test), ("RMSE", rmse_test),
                     ("MAE", mae_test), ("R2", r2_test)):
    print(f"Test {label}: {value}")

Test MSE: 11222871.672975114
Test RMSE: 3350.0554731190814
Test MAE: 2259.4956752301623
Test R2: 0.8298267612623653


Find feature importances (linear regression coefficients)

In [92]:
# Fitted coefficients of the linear model, used here as the "importance" of each feature
importances = regressor.coef_

In [93]:
# Pair each feature name with its coefficient and tabulate the pairs
name_coefficient_pairs = list(zip(features, importances))
feature_importance_df = pd.DataFrame(
    name_coefficient_pairs,
    columns=['feature', 'importance'],
)

In [94]:
# Sort by signed coefficient, descending: largest positive values first, most negative last
feature_importance_df = feature_importance_df.sort_values(by='importance', ascending=False)

In [95]:
# Display the sorted coefficient table
feature_importance_df

Unnamed: 0,feature,importance
85,model_key_i8,41915.689401
132,model_initial_i,18114.456595
72,model_key_M4,17419.646513
82,model_key_X6 M,12244.834983
62,model_key_640 Gran Coupé,10403.508345
...,...,...
70,model_key_M235,-9023.072766
76,model_key_X3,-9259.586201
75,model_key_X1,-12816.407426
63,model_key_650,-17718.921801


In [96]:
# Save feature importances to model directory
# (these coefficients come from the fit on the training split, taken before any later refit)
joblib.dump(feature_importance_df, MODELS_DIR / 'feature_importance_df.pkl')

['C:\\Users\\ciroalfonsom\\learning\\cars-price\\models\\feature_importance_df.pkl']

Save the model

In [97]:
# Retrain the model with all the data
# Note: this refits the same estimator object in place. The metrics and coefficients
# computed earlier in the notebook describe the train-split fit, not this refit.
regressor.fit(X, y)

In [98]:
# Save the fitted estimator to the models directory with joblib
joblib.dump(regressor, MODELS_DIR / 'model.pkl')

['C:\\Users\\ciroalfonsom\\learning\\cars-price\\models\\model.pkl']

In [99]:
# Save metadata about the model
import json

# Evaluation metrics, the feature/target definition and a display name for the model
metadata = {
    'mse_train': mse_train,
    'rmse_train': rmse_train,
    'mae_train': mae_train,
    'r2_train': r2_train,
    'mse_test': mse_test,
    'rmse_test': rmse_test,
    'mae_test': mae_test,
    'r2_test': r2_test,
    'features': features,
    'target': target,
    'name': 'Linear regression model',
}

# Explicit UTF-8 keeps the file independent of the platform's default encoding;
# indentation makes the JSON readable and diff-friendly.
with open(MODELS_DIR / 'metadata.json', 'w', encoding='utf-8') as f:
    json.dump(metadata, f, indent=2)