In [3]:
import numpy as np
import pandas as pd
import statsmodels.api as sm
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.preprocessing import scale, StandardScaler
from sklearn.model_selection import train_test_split, GridSearchCV, cross_val_score
from sklearn.metrics import confusion_matrix, accuracy_score, mean_squared_error, r2_score, roc_auc_score, roc_curve, classification_report
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import RandomForestRegressor
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.metrics import mean_absolute_error
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
import warnings
warnings.filterwarnings("ignore")

In [4]:
df = pd.read_csv("scout_cleaned.csv")

In [5]:
df.shape

(15915, 160)

In [7]:
X = df.drop(["price"], axis=1)

In [8]:
y = df["price"]

In [9]:
# from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2, random_state = 42)

In [10]:
def eval_metrics(actual, pred):
    rmse = np.sqrt(mean_squared_error(actual, pred))
    mae = mean_absolute_error(actual, pred)
    mse = mean_squared_error(actual, pred)
    score = r2_score(actual, pred)
    return print("r2_score:", score, "\n","mae:", mae, "\n","mse:",mse, "\n","rmse:",rmse)

## Random Forest

In [14]:
# Gridsearch:
rf = RandomForestRegressor()

# hyper parameters
# "n_estimators" : number of trees - takes features every time random.
# "max_depth" : the size of tree
# "max_features" : number of columns
# "min_samples_split": the number of minimum sample to divide the node
rf_params = {"n_estimators": [50, 100, 300],
             "max_depth": [3, 5, 7],
            "max_features": [2, 4, 6, 8],
             "min_samples_split": [2, 4, 6]
            }

rf_cv_model = GridSearchCV(rf, rf_params, cv = 5, n_jobs = -1, verbose = 2).fit(X_train, y_train)

Fitting 5 folds for each of 108 candidates, totalling 540 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done  33 tasks      | elapsed:   14.6s
[Parallel(n_jobs=-1)]: Done 154 tasks      | elapsed:  1.1min
[Parallel(n_jobs=-1)]: Done 357 tasks      | elapsed:  2.8min
[Parallel(n_jobs=-1)]: Done 540 out of 540 | elapsed:  4.4min finished


In [15]:
rf_cv_model.best_params_

{'max_depth': 7,
 'max_features': 8,
 'min_samples_split': 2,
 'n_estimators': 100}

In [18]:
# rf_model = RandomForestClassifier(n_estimators = 300).fit(X_train, y_train)
rf_tuned = RandomForestRegressor(max_depth = 33,
                                  max_features = 50,
                                  min_samples_split = 2,
                                  n_estimators = 500).fit(X_train, y_train)
y_pred = rf_tuned.predict(X_test)
eval_metrics(y_test, y_pred)

r2_score: 0.9781947297400577 
 mae: 541.1015105395688 
 mse: 1176777.6781572078 
 rmse: 1084.793841316039


## XGBoost

In [19]:
from xgboost import XGBRegressor

In [20]:
xgb = XGBRegressor()

xgb_params = {"n_estimators": [50, 100, 300],
             "subsample":[0.5,0.8,1],
             "max_depth":[3,5,7],
             "learning_rate":[0.1,0.01,0.3]}

In [21]:
xgb_cv_model = GridSearchCV(xgb, xgb_params, cv = 5, n_jobs = -1, verbose = 2).fit(X_train, y_train)

Fitting 5 folds for each of 81 candidates, totalling 405 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done  33 tasks      | elapsed:  1.3min
[Parallel(n_jobs=-1)]: Done 154 tasks      | elapsed: 12.4min
[Parallel(n_jobs=-1)]: Done 357 tasks      | elapsed: 30.4min
[Parallel(n_jobs=-1)]: Done 405 out of 405 | elapsed: 36.1min finished


In [22]:
xgb_cv_model.best_params_

{'learning_rate': 0.1, 'max_depth': 7, 'n_estimators': 300, 'subsample': 0.8}

In [23]:
# xgb_tuned = XGBClassifier().fit(X_train, y_train)
xgb_tuned = XGBRegressor(n_estimators=300,
                         subsample=0.8,
                         max_depth=7,
                         learning_rate=0.1).fit(X_train, y_train)
y_pred = xgb_tuned.predict(X_test)
eval_metrics(y_test, y_pred)

r2_score: 0.9833655891915621 
 mae: 513.9736128701304 
 mse: 897718.9044350978 
 rmse: 947.4802923729326


In [24]:
# xgb_tuned = XGBClassifier().fit(X_train, y_train)
xgb_tuned = XGBRegressor(n_estimators=1000,
                         subsample=0.8,
                         max_depth=7,
                         learning_rate=0.1).fit(X_train, y_train)
y_pred = xgb_tuned.predict(X_test)
eval_metrics(y_test, y_pred)

r2_score: 0.9847499426753916 
 mae: 451.89089391518417 
 mse: 823008.6963510273 
 rmse: 907.1982673875801


In [25]:
# xgb_tuned = XGBClassifier().fit(X_train, y_train)
xgb_tuned = XGBRegressor(n_estimators=1000,
                         subsample=0.8,
                         max_depth=33,
                         learning_rate=0.1).fit(X_train, y_train)
y_pred = xgb_tuned.predict(X_test)
eval_metrics(y_test, y_pred)

r2_score: 0.9853421820988452 
 mae: 405.09885662724827 
 mse: 791046.9675883662 
 rmse: 889.4082120086177


In [12]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 15915 entries, 0 to 15914
Columns: 160 entries, Unnamed: 0 to drive_chain_rear
dtypes: float64(10), int64(150)
memory usage: 19.4 MB


In [13]:
print(list(df.columns))

