In [11]:
import numpy as np
import pandas as pd
import functions as func
import yaml
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler, StandardScaler
from sklearn.neighbors import KNeighborsRegressor
from sklearn.linear_model import LinearRegression
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import BaggingRegressor, RandomForestRegressor,AdaBoostRegressor, GradientBoostingRegressor
from sklearn.tree import export_text
from sklearn.metrics import r2_score, mean_absolute_error, mean_squared_error
import statsmodels.api as sm
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import RandomizedSearchCV

%matplotlib inline

db_locations = func.import_yaml()

#load df
#df = pd.read_csv(db_locations['data_raw']['train'])
df = pd.read_csv(db_locations['data_clean']['train_merged'])

#### Data Cleaning
We used the cleaned data: db_locations['data_clean']['train_merged']

#### Feature Engineering

In [19]:
# One-hot encoding for ext_col', 'int_col'
df = pd.get_dummies(df, columns=['ext_col', 'int_col'])

#### Feature Selection

Predictors:
- brand: 57 brands: Ford (12%), Merc (10%), BMW (9%), Chev (8%)
- model 
- model_year (numerical)
- milage (numerical)
- fuel_type: Diesel, Electric, Gasoline, Hybrid
- engine
    - horsepower (numerical)
    - engine_size (numerical)
    - cylinders (numerical)
- transmission: Automatic, CVT, Manual, Other
- ext_col: 16 colors: Black, White, Gray, Silver, etc
- int_col: 13 colors: Black, Beige, Gray, etc
- accident (dummy)
- clean_title (dummy)

Target:
- price

In [18]:
# Create correlation matrix
corr_matrix = np.abs(df.corr()) # corr(x,y) = corr(y, x), corr(x,x) = 1

ValueError: could not convert string to float: 'Yellow'

In [13]:
threshold = 0.5

# Create a mask to avoid duplicate pairs (since the matrix is symmetric)
# Only keep the upper triangle of the matrix
upper_triangle = np.triu(np.ones(corr_matrix.shape), k=1).astype(bool)

# Apply the mask to the correlation matrix and filter based on the threshold
filtered_corr = corr_matrix.where(upper_triangle)  # Keep upper triangle

# Find pairs with correlation higher than the threshold
high_corr_pairs = filtered_corr.stack().reset_index()

# Rename the columns for clarity
high_corr_pairs.columns = ['Feature 1', 'Feature 2', 'Correlation']

# Filter the pairs based on the threshold
high_corr_pairs = high_corr_pairs[high_corr_pairs['Correlation'].abs() > threshold]

NameError: name 'corr_matrix' is not defined

In [4]:
high_corr_pairs.sort_values(by='Correlation', ascending=False)

NameError: name 'high_corr_pairs' is not defined

In [320]:
# Checking features which has high correlation with Price
corr_matrix["price"].sort_values(ascending=False).head(40)

price                     1.000000
milage                    0.284189
horsepower                0.276135
model_year                0.236145
cylinders                 0.132266
accident                  0.125423
engine_size               0.096972
brand_Lamborghini         0.096535
clean_title               0.089867
brand_Bentley             0.085501
brand_Porsche             0.083633
ext_col_Others            0.074695
int_col_Gray              0.068767
int_col_Beige             0.067914
brand_Rolls-Royce         0.063265
int_col_Others            0.052033
fuel_type_Gasoline        0.044026
transmission_Automatic    0.043217
ext_col_Silver            0.042090
brand_Toyota              0.041665
brand_Ferrari             0.040802
transmission_Manual       0.038521
int_col_Black             0.036528
brand_McLaren             0.036367
fuel_type_Electric        0.035577
int_col_Red               0.035465
brand_Subaru              0.032303
brand_Mercedes-Benz       0.031583
brand_Aston         

In [20]:
df.dropna(inplace=True)

#### Train Test Split

In [22]:
features = df[['brand_Lamborghini','brand_Bentley','brand_Porsche','brand_Rolls-Royce','brand_Toyota','brand_Ferrari','brand_McLaren','brand_Mercedes-Benz','brand_Aston','brand_Honda','brand_Mazda','brand_Land','brand_Lexus','model_year','milage','horsepower','fuel_type_Gasoline','transmission_Automatic','ext_col_Silver','int_col_Beige','accident','clean_title']]
target = df['price']

In [390]:
#features = df[['model_year','milage','horsepower','fuel_type_Gasoline','transmission_Automatic','accident','clean_title']]
#target = df['price']

In [23]:
x_train, x_test, y_train, y_test = train_test_split(features, target, test_size = 0.20, random_state=0)

#### Normalization

In [427]:
normalizer = MinMaxScaler()
normalizer.fit(x_train)
x_train_norm = normalizer.transform(x_train)
x_test_norm = normalizer.transform(x_test)

In [428]:
x_train_norm = pd.DataFrame(x_train_norm, columns = x_train.columns)
x_test_norm = pd.DataFrame(x_test_norm, columns = x_test.columns)

#### Regression Model & Evaluation
- KNN Regression (Aisyah)
- Linear Regression (Aisyah)
- Decision Trees (Paola)
- Bagging and Pasting (Paola)
- Random Forest (Flory)
- Gradient Boosting (Flory)
- Adaptive Boosting (Paola)

**KNN Regression**

In [250]:
knn5 = KNeighborsRegressor(n_neighbors=5)
knn5.fit(x_train, y_train)
pred = knn5.predict(x_train)
print("MAE", round(mean_absolute_error(pred, y_train),2))
print("RMSE", round(mean_squared_error(pred, y_train, squared=False),2))
print(f"R2: {knn5.score(x_train, y_train): .2f}")

MAE 15343.22642271021
RMSE 54536.190173016235
R2:  0.35


In [251]:
pred = knn5.predict(x_test)
print("MAE", round(mean_absolute_error(pred, y_test),2))
print("RMSE", round(mean_squared_error(pred, y_test, squared=False),2))
print(f"R2: {knn5.score(x_test, y_test): .2f}")

MAE 18736.61541425819
RMSE 68115.88078192051
R2:  0.01


In [252]:
knn8 = KNeighborsRegressor(n_neighbors=8)
knn8.fit(x_train, y_train)
pred = knn8.predict(x_train)
print("MAE", round(mean_absolute_error(pred, y_train),2))
print("RMSE", round(mean_squared_error(pred, y_train, squared=False),2))
print(f"R2: {knn8.score(x_train, y_train): .2f}")

MAE 16235.936362164048
RMSE 57577.9527119675
R2:  0.27


In [253]:
pred = knn8.predict(x_test)
print("MAE", round(mean_absolute_error(pred, y_test),2))
print("RMSE", round(mean_squared_error(pred, y_test, squared=False),2))
print(f"R2: {knn8.score(x_test, y_test): .2f}")

MAE 18447.614739053883
RMSE 66724.60583957477
R2:  0.05


In [254]:
knn10 = KNeighborsRegressor(n_neighbors=10)
knn10.fit(x_train, y_train)
pred = knn10.predict(x_train)
print("MAE", round(mean_absolute_error(pred, y_train),2))
print("RMSE", round(mean_squared_error(pred, y_train, squared=False),2))
print(f"R2: {knn10.score(x_train, y_train): .2f}")

MAE 16488.86688316003
RMSE 58476.61960234941
R2:  0.25


In [255]:
pred = knn10.predict(x_test)
print("MAE", round(mean_absolute_error(pred, y_test),2))
print("RMSE", round(mean_squared_error(pred, y_test, squared=False),2))
print(f"R2: {knn10.score(x_test, y_test): .2f}")

MAE 18270.39
RMSE 65993.76
R2:  0.07


In [266]:
knn10 = KNeighborsRegressor(n_neighbors=10)
knn10.fit(x_train_norm, y_train)
pred = knn10.predict(x_train_norm)
print("MAE", round(mean_absolute_error(pred, y_train),2))
print("RMSE", round(mean_squared_error(pred, y_train, squared=False),2))
print(f"R2: {knn10.score(x_train_norm, y_train): .2f}")

MAE 16192.93
RMSE 58489.56
R2:  0.25


In [267]:
pred = knn10.predict(x_test_norm)
print("MAE", round(mean_absolute_error(pred, y_test),2))
print("RMSE", round(mean_squared_error(pred, y_test, squared=False),2))
print(f"R2: {knn10.score(x_test_norm, y_test): .2f}")

MAE 17621.76
RMSE 65671.77
R2:  0.08


**Linear Regression**

In [346]:
lin_reg = LinearRegression()
lin_reg.fit(x_train_norm, y_train)
pred = lin_reg.predict(x_train_norm)
print("MAE", round(mean_absolute_error(pred, y_train),2))
print("RMSE", round(mean_squared_error(pred, y_train, squared=False),2))
print("R2", round(lin_reg.score(x_train_norm, y_train),2))

MAE 18905.0
RMSE 63030.71
R2 0.13


In [347]:
lin_reg = LinearRegression()
lin_reg.fit(x_test_norm, y_test)
pred = lin_reg.predict(x_test_norm)
print("MAE", round(mean_absolute_error(pred, y_test),2))
print("RMSE", round(mean_squared_error(pred, y_test, squared=False),2))
print("R2", round(lin_reg.score(x_test_norm, y_test),2))

MAE 18764.52
RMSE 63775.98
R2 0.13


In [269]:
lin_reg = LinearRegression()
lin_reg.fit(x_train, y_train)
pred = lin_reg.predict(x_train)
print("MAE", round(mean_absolute_error(pred, y_train),2))
print("RMSE", round(mean_squared_error(pred, y_train, squared=False),2))
print("R2", round(lin_reg.score(x_train, y_train),4))

MAE 18905.0
RMSE 63030.71
R2 0.1285


In [257]:
lin_reg = LinearRegression()
lin_reg.fit(x_test, y_test)
pred = lin_reg.predict(x_test)
print("MAE", round(mean_absolute_error(pred, y_test),2))
print("RMSE", round(mean_squared_error(pred, y_test, squared=False),2))
print("R2", round(lin_reg.score(x_test, y_test),4))

MAE 18764.52
RMSE 63775.98
R2 0.1306


In [258]:
lin_reg_coef = {feature : coef for feature, coef in zip(x_train.columns, lin_reg.coef_)}
lin_reg_coef

{'brand_Lamborghini': 62059.85921591837,
 'brand_Bentley': 41683.39125625693,
 'brand_Porsche': 17563.75176351432,
 'brand_Rolls-Royce': 56581.32347680951,
 'brand_Toyota': 4503.788740985932,
 'brand_Ferrari': 34373.70147518913,
 'brand_McLaren': 30045.481180099894,
 'brand_Mercedes-Benz': 3270.860652314882,
 'brand_Aston': 113723.18945526963,
 'brand_Honda': 6132.627299627769,
 'brand_Mazda': 10665.421200089699,
 'brand_Land': 6109.0677728725395,
 'brand_Lexus': 3347.5232505647077,
 'model_year': 851.5258860083977,
 'milage': -0.23685739205609482,
 'horsepower': 89.32676358828957,
 'fuel_type_Gasoline': -4223.286589988465,
 'transmission_Automatic': -39.18627822760944,
 'ext_col_Silver': -3623.038502456024,
 'int_col_Beige': -1825.0155948654533,
 'accident': -3588.754323168338,
 'clean_title': -9665.840653856172}

In [259]:
est = sm.OLS(y_train, x_train)
est2 = est.fit()
print(est2.summary())

                                 OLS Regression Results                                
Dep. Variable:                  price   R-squared (uncentered):                   0.344
Model:                            OLS   Adj. R-squared (uncentered):              0.344
Method:                 Least Squares   F-statistic:                              2872.
Date:                Thu, 19 Sep 2024   Prob (F-statistic):                        0.00
Time:                        11:25:18   Log-Likelihood:                     -1.5016e+06
No. Observations:              120404   AIC:                                  3.003e+06
Df Residuals:                  120382   BIC:                                  3.004e+06
Df Model:                          22                                                  
Covariance Type:            nonrobust                                                  
                             coef    std err          t      P>|t|      [0.025      0.975]
-----------------------------

**Decision Trees**

In [436]:
tree = DecisionTreeRegressor(max_depth=5)
tree.fit(x_train_norm, y_train)

DecisionTreeRegressor(max_depth=5)

In [437]:
pred = tree.predict(x_train_norm)

print(f"MAE, {mean_absolute_error(pred, y_train): .2f}")
print(f"RMSE, {mean_squared_error(pred, y_train, squared=False): .2f}")
print(f"R2 score, {tree.score(x_train_norm, y_train): .2f}")

MAE,  17086.12
RMSE,  62104.66
R2 score,  0.15


In [438]:
pred = tree.predict(x_test_norm)

print(f"MAE, {mean_absolute_error(pred, y_test): .2f}")
print(f"RMSE, {mean_squared_error(pred, y_test, squared=False): .2f}")
print(f"R2 score, {tree.score(x_test_norm, y_test): .2f}")

MAE,  17058.02
RMSE,  63677.51
R2 score,  0.13


In [439]:
tree_importance = {feature : importance for feature, importance in zip(x_train_norm.columns, tree.feature_importances_)}
tree_importance

{'brand_Lamborghini': 0.0,
 'brand_Bentley': 0.0,
 'brand_Porsche': 0.022372575422196003,
 'brand_Rolls-Royce': 0.0,
 'brand_Toyota': 0.0,
 'brand_Ferrari': 0.0,
 'brand_McLaren': 0.0,
 'brand_Mercedes-Benz': 0.0,
 'brand_Aston': 0.0,
 'brand_Honda': 0.0,
 'brand_Mazda': 0.0,
 'brand_Land': 0.0,
 'brand_Lexus': 0.0,
 'model_year': 0.08043960683725168,
 'milage': 0.7432303609744664,
 'horsepower': 0.15395745676608588,
 'fuel_type_Gasoline': 0.0,
 'transmission_Automatic': 0.0,
 'ext_col_Silver': 0.0,
 'int_col_Beige': 0.0,
 'accident': 0.0,
 'clean_title': 0.0}

In [440]:
tree_viz = export_text(tree, feature_names=list(x_train_norm.columns))
print(tree_viz)

|--- milage <= 0.09
|   |--- horsepower <= 0.54
|   |   |--- milage <= 0.03
|   |   |   |--- horsepower <= 0.39
|   |   |   |   |--- model_year <= 0.91
|   |   |   |   |   |--- value: [51433.34]
|   |   |   |   |--- model_year >  0.91
|   |   |   |   |   |--- value: [70524.62]
|   |   |   |--- horsepower >  0.39
|   |   |   |   |--- brand_Porsche <= 0.50
|   |   |   |   |   |--- value: [78141.50]
|   |   |   |   |--- brand_Porsche >  0.50
|   |   |   |   |   |--- value: [104390.53]
|   |   |--- milage >  0.03
|   |   |   |--- horsepower <= 0.42
|   |   |   |   |--- model_year <= 0.91
|   |   |   |   |   |--- value: [44728.30]
|   |   |   |   |--- model_year >  0.91
|   |   |   |   |   |--- value: [56430.45]
|   |   |   |--- horsepower >  0.42
|   |   |   |   |--- brand_Porsche <= 0.50
|   |   |   |   |   |--- value: [62716.26]
|   |   |   |   |--- brand_Porsche >  0.50
|   |   |   |   |   |--- value: [87973.59]
|   |--- horsepower >  0.54
|   |   |--- milage <= 0.02
|   |   |   |--- br

**Bagging and Pasting**

In [493]:
bagging_reg = BaggingRegressor(DecisionTreeRegressor(max_depth=5),
                               n_estimators=100,
                               max_samples = 1000)

In [494]:
bagging_reg.fit(x_train_norm, y_train)

BaggingRegressor(base_estimator=DecisionTreeRegressor(max_depth=5),
                 max_samples=1000, n_estimators=100)

In [495]:
pred = bagging_reg.predict(x_train_norm)

print(f"MAE {mean_absolute_error(pred, y_train): .2f}")
print(f"RMSE {mean_squared_error(pred, y_train, squared=False): .2f}")
print(f"R2 score {bagging_reg.score(x_train_norm, y_train): .2f}")

MAE  16999.82
RMSE  62138.30
R2 score  0.15


In [496]:
pred = bagging_reg.predict(x_test_norm)

print(f"MAE {mean_absolute_error(pred, y_test): .2f}")
print(f"RMSE {mean_squared_error(pred, y_test, squared=False): .2f}")
print(f"R2 score {bagging_reg.score(x_test_norm, y_test): .2f}")

MAE  16979.84
RMSE  63378.77
R2 score  0.14


**Random Patches**

In [497]:
forest = RandomForestRegressor(n_estimators=100,max_depth=5)

In [498]:
forest.fit(x_train_norm, y_train)

RandomForestRegressor(max_depth=5)

In [499]:
pred = forest.predict(x_train_norm)

print(f"MAE, {mean_absolute_error(pred, y_train): .2f}")
print(f"RMSE, {mean_squared_error(pred, y_train, squared=False): .2f}")
print(f"R2 score, {forest.score(x_train_norm, y_train): .2f}")

MAE,  16720.68
RMSE,  61442.69
R2 score,  0.17


In [500]:
pred = forest.predict(x_test_norm)

print(f"MAE, {mean_absolute_error(pred, y_test): .2f}")
print(f"RMSE, {mean_squared_error(pred, y_test, squared=False): .2f}")
print(f"R2 score, {forest.score(x_test_norm, y_test): .2f}")

MAE,  16689.23
RMSE,  63194.52
R2 score,  0.15


**Ada Boost**

In [501]:
ada_reg = AdaBoostRegressor(DecisionTreeRegressor(max_depth=5),n_estimators=100)

In [502]:
ada_reg.fit(x_train_norm, y_train)

AdaBoostRegressor(base_estimator=DecisionTreeRegressor(max_depth=5),
                  n_estimators=100)

In [503]:
pred = ada_reg.predict(x_train_norm)

print(f"MAE, {mean_absolute_error(pred, y_train): .2f}")
print(f"RMSE, {mean_squared_error(pred, y_train, squared=False): .2f}")
print(f"R2 score, {ada_reg.score(x_train_norm, y_train): .2f}")

MAE,  104136.53
RMSE,  199123.50
R2 score, -7.70


In [504]:
pred = ada_reg.predict(x_test_norm)

print(f"MAE, {mean_absolute_error(pred, y_test): .2f}")
print(f"RMSE, {mean_squared_error(pred, y_test, squared=False): .2f}")
print(f"R2 score, {ada_reg.score(x_test_norm, y_test): .2f}")

MAE,  105777.62
RMSE,  204248.22
R2 score, -7.92


**Gradient Boosting**

In [505]:
gb_reg = GradientBoostingRegressor(max_depth=5,n_estimators=100)

In [506]:
gb_reg.fit(x_train_norm, y_train)

GradientBoostingRegressor(max_depth=5)

In [507]:
pred = gb_reg.predict(x_train_norm)

print(f"MAE, {mean_absolute_error(pred, y_train): .2f}")
print(f"RMSE, {mean_squared_error(pred, y_train, squared=False): .2f}")
print(f"R2 score, {gb_reg.score(x_train_norm, y_train): .2f}")

MAE,  15732.42
RMSE,  55245.34
R2 score,  0.33


In [509]:
pred = gb_reg.predict(x_test_norm)

print(f"MAE, {mean_absolute_error(pred, y_test): .2f}")
print(f"RMSE, {mean_squared_error(pred, y_test, squared=False): .2f}")
print(f"R2 score, {gb_reg.score(x_test_norm, y_test): .2f}")

MAE,  16205.48
RMSE,  63809.18
R2 score,  0.13


**Hyperparameter Tuning**

In [531]:
grid = {"max_leaf_nodes": [25,50,100],
        "max_depth":[5,10,30]}

In [532]:
dt = DecisionTreeRegressor()

In [533]:
model = GridSearchCV(estimator = dt, param_grid = grid, cv=5, verbose=10) # The "cv" option here is used to provide the desired number of folds K.

In [534]:
model.fit(x_train_norm, y_train)

Fitting 5 folds for each of 9 candidates, totalling 45 fits
[CV 1/5; 1/9] START max_depth=5, max_leaf_nodes=25..............................
[CV 1/5; 1/9] END max_depth=5, max_leaf_nodes=25;, score=0.103 total time=   0.2s
[CV 2/5; 1/9] START max_depth=5, max_leaf_nodes=25..............................
[CV 2/5; 1/9] END max_depth=5, max_leaf_nodes=25;, score=0.145 total time=   0.1s
[CV 3/5; 1/9] START max_depth=5, max_leaf_nodes=25..............................
[CV 3/5; 1/9] END max_depth=5, max_leaf_nodes=25;, score=0.187 total time=   0.1s
[CV 4/5; 1/9] START max_depth=5, max_leaf_nodes=25..............................
[CV 4/5; 1/9] END max_depth=5, max_leaf_nodes=25;, score=0.116 total time=   0.2s
[CV 5/5; 1/9] START max_depth=5, max_leaf_nodes=25..............................
[CV 5/5; 1/9] END max_depth=5, max_leaf_nodes=25;, score=0.138 total time=   0.2s
[CV 1/5; 2/9] START max_depth=5, max_leaf_nodes=50..............................
[CV 1/5; 2/9] END max_depth=5, max_leaf_node

GridSearchCV(cv=5, estimator=DecisionTreeRegressor(),
             param_grid={'max_depth': [5, 10, 30],
                         'max_leaf_nodes': [25, 50, 100]},
             verbose=10)

In [535]:
model.best_params_

{'max_depth': 5, 'max_leaf_nodes': 50}

In [536]:
best_model = model.best_estimator_

In [537]:
pred = best_model.predict(x_train_norm)

print(f"MAE: {mean_absolute_error(pred, y_train): .2f}")
print(f"RMSE: {mean_squared_error(pred, y_train, squared=False): .2f}")
print(f"R2 score:  {best_model.score(x_train_norm, y_train): .2f}")

MAE:  17086.12
RMSE:  62104.66
R2 score:   0.15


In [538]:
pred = best_model.predict(x_test_norm)

print(f"MAE: {mean_absolute_error(pred, y_test): .2f}")
print(f"RMSE: {mean_squared_error(pred, y_test, squared=False): .2f}")
print(f"R2 score:  {best_model.score(x_test_norm, y_test): .2f}")

MAE:  17058.02
RMSE:  63677.51
R2 score:   0.13
