In [43]:
import pandas as pd
import numpy as np
from matplotlib import pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error

## Random Forest 1 : all variables

In [2]:
# Inputs: the labelled training table and the table to predict on.
t_data, p_data = (
    pd.read_csv(csv_path) for csv_path in ('data/t_dummies', 'data/p_dummies')
)

In [3]:
# Random Forest 1: fit on every feature column, report RMSE, then predict
# the unlabelled set.

# Scaler fitted on the labelled features; reused (not re-fitted) further down.
sc = StandardScaler()

# Target and standardised feature matrix ('id' is dropped with the target).
y = t_data['price']
X = sc.fit_transform(t_data.drop(columns=['price', 'id']))

# Hold out 20% of the labelled rows for evaluation.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=.20, random_state=1)

# Fit on the training split only.
regressor = RandomForestRegressor(n_estimators=20, random_state=0)
regressor.fit(X_train, y_train)

# Predictions on the training split, the hold-out split and all labelled rows.
train_y_pred = regressor.predict(X_train)
test_y_pred = regressor.predict(X_test)
global_y_pred = regressor.predict(X)

# RMSE is taken as sqrt(MSE) so the cell does not rely on the `squared`
# keyword of mean_squared_error.
rmse_train = np.sqrt(mean_squared_error(y_true=y_train, y_pred=train_y_pred))
print(f'rmse_train:{rmse_train}')

rmse_test = np.sqrt(mean_squared_error(y_true=y_test, y_pred=test_y_pred))
print(f'rmse_test:{rmse_test}')

rmse_global = np.sqrt(mean_squared_error(y_true=y, y_pred=global_y_pred))
print(f'rmse_global:{rmse_global}')


# Refit once on every labelled row before predicting the unlabelled set.
regressor.fit(X, y)

# Scale the prediction features with the statistics learned from the labelled
# data (transform, not fit_transform) so both sets share one feature scale.
# NOTE: from here on X holds the prediction-set features.
X = sc.transform(p_data.drop(columns=['id']))
y_pred = regressor.predict(X)

rmse_train:231.51117855204245
rmse_test:564.7473694670788
rmse_global:326.59739573309764


In [4]:
# Output1: one 'price' column indexed by 'id', written to output/output1.
output1 = (
    pd.DataFrame(y_pred)
    .rename(columns={0: 'price'})
    .rename_axis(index='id')
)
output1.to_csv('output/output1')

In [5]:
# Reload the full-feature tables.
# NOTE(review): the next cell rebinds both names to the '_col' files, so this
# reload looks redundant - confirm before removing.
t_data, p_data = map(pd.read_csv, ('data/t_dummies', 'data/p_dummies'))

## Random Forest 2: Removing x to reduce collinearity

In [6]:
# Inputs for model 2: the '_col' variants of the training and prediction tables.
t_data = pd.read_csv(filepath_or_buffer='data/t_dummies_col')
p_data = pd.read_csv(filepath_or_buffer='data/p_dummies_col')

In [7]:
# Random Forest 2: same pipeline as model 1, run on the tables currently bound
# to t_data / p_data.

# Scaler fitted on the labelled features; reused (not re-fitted) further down.
sc = StandardScaler()

# Target and standardised feature matrix ('id' is dropped with the target).
y = t_data['price']
X = sc.fit_transform(t_data.drop(columns=['price', 'id']))

# Hold out 20% of the labelled rows for evaluation.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=.20, random_state=1)

# Fit on the training split only.
regressor = RandomForestRegressor(n_estimators=20, random_state=0)
regressor.fit(X_train, y_train)

# Predictions on the training split, the hold-out split and all labelled rows.
train_y_pred = regressor.predict(X_train)
test_y_pred = regressor.predict(X_test)
global_y_pred = regressor.predict(X)

# RMSE is taken as sqrt(MSE) so the cell does not rely on the `squared`
# keyword of mean_squared_error.
rmse_train = np.sqrt(mean_squared_error(y_true=y_train, y_pred=train_y_pred))
print(f'rmse_train:{rmse_train}')

rmse_test = np.sqrt(mean_squared_error(y_true=y_test, y_pred=test_y_pred))
print(f'rmse_test:{rmse_test}')

rmse_global = np.sqrt(mean_squared_error(y_true=y, y_pred=global_y_pred))
print(f'rmse_global:{rmse_global}')


# Refit once on every labelled row before predicting the unlabelled set.
regressor.fit(X, y)

# Scale the prediction features with the statistics learned from the labelled
# data (transform, not fit_transform) so both sets share one feature scale.
# NOTE: from here on X holds the prediction-set features.
X = sc.transform(p_data.drop(columns=['id']))
y_pred = regressor.predict(X)

rmse_train:229.67828909209354
rmse_test:564.4036460562791
rmse_global:325.4412209916582


In [8]:
# Output2: one 'price' column indexed by 'id', written to output/output2.
output2 = (
    pd.DataFrame(y_pred)
    .rename(columns={0: 'price'})
    .rename_axis(index='id')
)
output2.to_csv('output/output2')

## Random Forest 3: Removing X and Z

In [9]:
# Inputs for model 3: the same '_col' tables; 'z' is dropped later, at fit time.
t_data, p_data = (
    pd.read_csv(csv_path) for csv_path in ('data/t_dummies_col', 'data/p_dummies_col')
)

In [10]:
# Random Forest 3: additionally drops 'z', limits tree depth to 17 and holds
# out 10% of the labelled rows.

# Scaler fitted on the labelled features; reused (not re-fitted) further down.
sc = StandardScaler()

# Target and standardised feature matrix ('id' and 'z' are dropped with the target).
y = t_data['price']
X = sc.fit_transform(t_data.drop(columns=['price', 'id', 'z']))

# Hold out 10% of the labelled rows for evaluation.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=.10, random_state=1)

# Fit on the training split only.
regressor = RandomForestRegressor(n_estimators=20, random_state=0, max_depth=17)
regressor.fit(X_train, y_train)

# Predictions on the training split, the hold-out split and all labelled rows.
train_y_pred = regressor.predict(X_train)
test_y_pred = regressor.predict(X_test)
global_y_pred = regressor.predict(X)

# RMSE is taken as sqrt(MSE) so the cell does not rely on the `squared`
# keyword of mean_squared_error.
rmse_train = np.sqrt(mean_squared_error(y_true=y_train, y_pred=train_y_pred))
print(f'rmse_train:{rmse_train}')

rmse_test = np.sqrt(mean_squared_error(y_true=y_test, y_pred=test_y_pred))
print(f'rmse_test:{rmse_test}')

rmse_global = np.sqrt(mean_squared_error(y_true=y, y_pred=global_y_pred))
print(f'rmse_global:{rmse_global}')


# Refit once on every labelled row before predicting the unlabelled set.
regressor.fit(X, y)

# Scale the prediction features with the statistics learned from the labelled
# data (transform, not fit_transform) so both sets share one feature scale.
# NOTE: from here on X holds the prediction-set features.
X = sc.transform(p_data.drop(columns=['id', 'z']))
y_pred = regressor.predict(X)

rmse_train:292.4195697763558
rmse_test:560.8203094032219
rmse_global:329.2613804939887


In [38]:
# Output3: one 'price' column indexed by 'id', written to output/output3.
output3 = (
    pd.DataFrame(y_pred)
    .rename(columns={0: 'price'})
    .rename_axis(index='id')
)
output3.to_csv('output/output3')

## Adjusting the max_depth parameter manually for a ready-to-go solution

In [12]:
# Manual sweep over max_depth (12..19): for each depth, fit a forest on the
# training split and print train / hold-out / full-data RMSE.

# Data preparation does not depend on `depth`, so it is done once up front.
sc = StandardScaler()
y = t_data['price']
X = sc.fit_transform(t_data.drop(columns=['price', 'id', 'z']))

# Fixed random_state: every depth is evaluated on the same 90/10 split.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=.10, random_state=1)

for depth in range(12, 20):
    print(depth)

    # Fit on the training split with the current depth limit.
    regressor = RandomForestRegressor(n_estimators=20, random_state=0, max_depth=depth)
    regressor.fit(X_train, y_train)

    # Predictions on the training split, the hold-out split and all labelled rows.
    train_y_pred = regressor.predict(X_train)
    test_y_pred = regressor.predict(X_test)
    global_y_pred = regressor.predict(X)

    # RMSE is taken as sqrt(MSE) so the loop does not rely on the `squared`
    # keyword of mean_squared_error.
    rmse_train = np.sqrt(mean_squared_error(y_true=y_train, y_pred=train_y_pred))
    print(f'rmse_train:{rmse_train}')

    rmse_test = np.sqrt(mean_squared_error(y_true=y_test, y_pred=test_y_pred))
    print(f'rmse_test:{rmse_test}')

    rmse_global = np.sqrt(mean_squared_error(y_true=y, y_pred=global_y_pred))
    print(f'rmse_global:{rmse_global}')


12
rmse_train:487.77304961915166
rmse_test:584.0538483347638
rmse_global:498.24036203716764
13
rmse_train:432.8154082809027
rmse_test:569.0211583791322
rmse_global:448.30397529332674
14
rmse_train:388.51590103787424
rmse_test:559.2782995943667
rmse_global:408.8170334077544
15
rmse_train:350.48946890796566
rmse_test:558.8091483822353
rmse_global:376.5470603317221
16
rmse_train:318.4466450158363
rmse_test:560.715758405345
rmse_global:350.30028233119447
17
rmse_train:292.4195697763558
rmse_test:560.8203094032219
rmse_global:329.2613804939887
18
rmse_train:271.3485152803008
rmse_test:567.2727278572232
rmse_global:313.7672976299009
19
rmse_train:255.91280188081112
rmse_test:562.4727587425901
rmse_global:300.9699107290037


In [18]:
from sklearn.model_selection import RandomizedSearchCV
from pprint import pprint
import numpy as np

In [19]:
# Candidate hyper-parameter values for a randomised search over the forest.

# Number of trees in random forest: 200, 400, ..., 2000
n_estimators = [int(x) for x in np.linspace(start=200, stop=2000, num=10)]
# Number of features to consider at every split.
# 1.0 (= all features) replaces the former 'auto' option, which newer
# scikit-learn releases reject; for a regressor both mean "use every feature".
max_features = [1.0, 'sqrt']
# Maximum number of levels in tree: 10, 20, ..., 110, plus None (unlimited)
max_depth = [int(x) for x in np.linspace(10, 110, num=11)]
max_depth.append(None)
# Minimum number of samples required to split a node
min_samples_split = [2, 5, 10]
# Minimum number of samples required at each leaf node
min_samples_leaf = [1, 2, 4]
# Method of selecting samples for training each tree
bootstrap = [True, False]
# Create the random grid
random_grid = {'n_estimators': n_estimators,
               'max_features': max_features,
               'max_depth': max_depth,
               'min_samples_split': min_samples_split,
               'min_samples_leaf': min_samples_leaf,
               'bootstrap': bootstrap}

## Fine Tuning Random Forest Params 

In [25]:
from sklearn.model_selection import GridSearchCV

# Exhaustive grid: 2 x 2 x 3 x 2 = 24 parameter combinations.
param_grid = [
    {
        'n_estimators': [10, 25],
        'max_features': [5, 10],
        'max_depth': [10, 50, None],
        'bootstrap': [True, False],
    },
]

# 10-fold cross-validation scored by negative MSE. The base estimator is
# whatever `regressor` currently holds; the grid overrides the listed
# parameters for each candidate.
grid_search_forest = GridSearchCV(
    estimator=regressor,
    param_grid=param_grid,
    cv=10,
    scoring='neg_mean_squared_error',
)
grid_search_forest.fit(X_train, y_train)

GridSearchCV(cv=10,
             estimator=RandomForestRegressor(max_depth=19, n_estimators=20,
                                             random_state=0),
             param_grid=[{'bootstrap': [True, False],
                          'max_depth': [10, 50, None], 'max_features': [5, 10],
                          'n_estimators': [10, 25]}],
             scoring='neg_mean_squared_error')

In [27]:
# Print the cross-validated RMSE next to each candidate's parameters.
# Scores are negative MSE, so negate before taking the square root.
cvres = grid_search_forest.cv_results_
candidate_rmse = np.sqrt(-cvres["mean_test_score"])
for rmse, candidate in zip(candidate_rmse, cvres["params"]):
    print(rmse, candidate)

850.5680434062261 {'bootstrap': True, 'max_depth': 10, 'max_features': 5, 'n_estimators': 10}
835.2736700981227 {'bootstrap': True, 'max_depth': 10, 'max_features': 5, 'n_estimators': 25}
713.3625832461244 {'bootstrap': True, 'max_depth': 10, 'max_features': 10, 'n_estimators': 10}
704.3944250165373 {'bootstrap': True, 'max_depth': 10, 'max_features': 10, 'n_estimators': 25}
638.1479115277159 {'bootstrap': True, 'max_depth': 50, 'max_features': 5, 'n_estimators': 10}
608.6540379687269 {'bootstrap': True, 'max_depth': 50, 'max_features': 5, 'n_estimators': 25}
598.5004356355074 {'bootstrap': True, 'max_depth': 50, 'max_features': 10, 'n_estimators': 10}
575.4670695632499 {'bootstrap': True, 'max_depth': 50, 'max_features': 10, 'n_estimators': 25}
638.1479115277159 {'bootstrap': True, 'max_depth': None, 'max_features': 5, 'n_estimators': 10}
608.6540379687269 {'bootstrap': True, 'max_depth': None, 'max_features': 5, 'n_estimators': 25}
598.5004356355074 {'bootstrap': True, 'max_depth': N

In [28]:
# Show the estimator (with its parameters) selected by the grid search.
grid_search_forest.best_estimator_

RandomForestRegressor(bootstrap=False, max_depth=50, max_features=10,
                      n_estimators=25, random_state=0)

## Grid Search Fine-Tuned Model

In [31]:
# Grid-search tuned forest: no bootstrap, max_depth=50, max_features=10,
# 25 trees; evaluated on a 10% hold-out.

# Scaler fitted on the labelled features; reused (not re-fitted) further down.
sc = StandardScaler()

# Target and standardised feature matrix ('id' and 'z' are dropped with the target).
y = t_data['price']
X = sc.fit_transform(t_data.drop(columns=['price', 'id', 'z']))

# Hold out 10% of the labelled rows for evaluation.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=.10, random_state=1)

# Fit on the training split only.
regressor = RandomForestRegressor(bootstrap=False, max_depth=50, max_features=10,
                                  n_estimators=25, random_state=25)
regressor.fit(X_train, y_train)

# Predictions on the training split, the hold-out split and all labelled rows.
train_y_pred = regressor.predict(X_train)
test_y_pred = regressor.predict(X_test)
global_y_pred = regressor.predict(X)

# RMSE is taken as sqrt(MSE) so the cell does not rely on the `squared`
# keyword of mean_squared_error.
rmse_train = np.sqrt(mean_squared_error(y_true=y_train, y_pred=train_y_pred))
print(f'rmse_train:{rmse_train}')

rmse_test = np.sqrt(mean_squared_error(y_true=y_test, y_pred=test_y_pred))
print(f'rmse_test:{rmse_test}')

rmse_global = np.sqrt(mean_squared_error(y_true=y, y_pred=global_y_pred))
print(f'rmse_global:{rmse_global}')


# Refit once on every labelled row before predicting the unlabelled set.
regressor.fit(X, y)

# Scale the prediction features with the statistics learned from the labelled
# data (transform, not fit_transform) so both sets share one feature scale.
# NOTE: from here on X holds the prediction-set features.
X = sc.transform(p_data.drop(columns=['id', 'z']))
y_pred = regressor.predict(X)

rmse_train:0.1340485272688229
rmse_test:555.4732797694369
rmse_global:175.6669750668043


## Fine-tuned Model: Adding Bootstrap to reduce overfitting

In [63]:
# Fine-tuned forest with bootstrap enabled: max_depth=50, max_features=10,
# 150 trees; evaluated on a 10% hold-out.
#
# This cell no longer writes 'output/output2'. That write used the previous
# cell's y_pred, so it overwrote model 2's file with another model's predictions.

# Scaler fitted on the labelled features; reused (not re-fitted) further down.
sc = StandardScaler()

# Target and standardised feature matrix ('id' and 'z' are dropped with the target).
y = t_data['price']
X = sc.fit_transform(t_data.drop(columns=['price', 'id', 'z']))

# Hold out 10% of the labelled rows for evaluation.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=.10, random_state=1)

# Fit on the training split only.
regressor = RandomForestRegressor(bootstrap=True, max_depth=50, max_features=10,
                                  n_estimators=150, random_state=13)
regressor.fit(X_train, y_train)

# Predictions on the training split, the hold-out split and all labelled rows.
train_y_pred = regressor.predict(X_train)
test_y_pred = regressor.predict(X_test)
global_y_pred = regressor.predict(X)

# RMSE is taken as sqrt(MSE) so the cell does not rely on the `squared`
# keyword of mean_squared_error.
rmse_train = np.sqrt(mean_squared_error(y_true=y_train, y_pred=train_y_pred))
print(f'rmse_train:{rmse_train}')

rmse_test = np.sqrt(mean_squared_error(y_true=y_test, y_pred=test_y_pred))
print(f'rmse_test:{rmse_test}')

rmse_global = np.sqrt(mean_squared_error(y_true=y, y_pred=global_y_pred))
print(f'rmse_global:{rmse_global}')


# Generating output on the prediction data

# Refit once on every labelled row before predicting the unlabelled set.
regressor.fit(X, y)

# Scale the prediction features with the statistics learned from the labelled
# data (transform, not fit_transform) so both sets share one feature scale.
# NOTE: from here on X holds the prediction-set features.
X = sc.transform(p_data.drop(columns=['id', 'z']))
y_pred = regressor.predict(X)

rmse_train:207.07553282722475
rmse_test:536.3612946133123
rmse_global:259.54502138376955


In [40]:
# Output4: one 'price' column indexed by 'id', written to output/output4.
output4 = (
    pd.DataFrame(y_pred)
    .rename(columns={0: 'price'})
    .rename_axis(index='id')
)
output4.to_csv('output/output4')

# Exploring optimization around the fine-tuned model to reduce overfitting

In [100]:
# Variation on the fine-tuned forest: shallower trees (max_depth=30) and more
# of them (500), bootstrap enabled; evaluated on a 10% hold-out.
#
# This cell no longer writes 'output/output2'. That write used the previous
# cell's y_pred, so it overwrote model 2's file with another model's predictions.

# Scaler fitted on the labelled features; reused (not re-fitted) further down.
sc = StandardScaler()

# Target and standardised feature matrix ('id' and 'z' are dropped with the target).
y = t_data['price']
X = sc.fit_transform(t_data.drop(columns=['price', 'id', 'z']))

# Hold out 10% of the labelled rows for evaluation.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=.10, random_state=1)

# Fit on the training split only.
regressor = RandomForestRegressor(bootstrap=True, max_depth=30, max_features=10,
                                  n_estimators=500, random_state=13)
regressor.fit(X_train, y_train)

# Predictions on the training split, the hold-out split and all labelled rows.
train_y_pred = regressor.predict(X_train)
test_y_pred = regressor.predict(X_test)
global_y_pred = regressor.predict(X)

# RMSE is taken as sqrt(MSE) so the cell does not rely on the `squared`
# keyword of mean_squared_error.
rmse_train = np.sqrt(mean_squared_error(y_true=y_train, y_pred=train_y_pred))
print(f'rmse_train:{rmse_train}')

rmse_test = np.sqrt(mean_squared_error(y_true=y_test, y_pred=test_y_pred))
print(f'rmse_test:{rmse_test}')

rmse_global = np.sqrt(mean_squared_error(y_true=y, y_pred=global_y_pred))
print(f'rmse_global:{rmse_global}')


# Generating output on the prediction data

# Refit once on every labelled row before predicting the unlabelled set.
regressor.fit(X, y)

# Scale the prediction features with the statistics learned from the labelled
# data (transform, not fit_transform) so both sets share one feature scale.
# NOTE: from here on X holds the prediction-set features.
X = sc.transform(p_data.drop(columns=['id', 'z']))
y_pred = regressor.predict(X)
        

rmse_train:204.62164241188577
rmse_test:535.5286981092925
rmse_global:257.61432533531


In [101]:
# Output5: one 'price' column indexed by 'id', written to output/output5.
output5 = (
    pd.DataFrame(y_pred)
    .rename(columns={0: 'price'})
    .rename_axis(index='id')
)
output5.to_csv('output/output5')