In [53]:
#basic imports
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import plotly as px

In [54]:
# algo and and estimators
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestRegressor

In [55]:
df =  pd.read_csv('../../datasets/diamond_pricing.csv')
df.head()

Unnamed: 0,Paleonium,Pressure,Price
0,17,6567,2810.280298
1,59,5253,1986.967089
2,123,9715,2083.132087
3,182,2073,2236.340285
4,133,6400,1903.323339


In [56]:
x = df[['Paleonium','Pressure']]
y = df['Price']

In [57]:
trainx,testx,trainy,testy = train_test_split(x,y,test_size=.2,random_state=0)

In [58]:
model = RandomForestRegressor()
model.fit(trainx,trainy)
model.score(testx,testy)*100

98.28222458168844

### checking the model for performance using cross validation

In [59]:
from sklearn.model_selection import cross_val_score

In [60]:
scores = cross_val_score(model,x,y,cv=5)
scores

array([0.9854187 , 0.97772327, 0.9866685 , 0.98183685, 0.98178719])

## testing cross validation on another model

In [73]:
df = pd.read_csv('../../datasets/house_pricing.csv')
X = df[['Beds','Baths','SquareFeet']]
scaler = StandardScaler()
scaledX = scaler.fit_transform(X)
y = df['Price']
trainx, testx, trainy, testy = train_test_split(scaledX,y,test_size=.2, random_state=0)
model2 = RandomForestRegressor()
model2.fit(trainx,trainy)
print('score:',model2.score(testx,testy) * 100)
scores = cross_val_score(model2,scaledX,y,cv=5)
print('cross validation:',scores)
print('validation score',scores.mean() * 100)

score: 82.35869528268431
cross validation: [0.79631465 0.70858158 0.67911361 0.50412789 0.62651779]
validation score 66.2931103665979


In [62]:
from sklearn.svm import SVR

In [68]:

df = pd.read_csv('../../datasets/house_pricing.csv')
X = df[['Beds','Baths','SquareFeet']]
y = df['Price']
trainx, testx, trainy, testy = train_test_split(X,y,test_size=.2, random_state=0)
model2 = SVR(kernel='linear')
model2.fit(trainx,trainy)
print('score:',model2.score(testx,testy) * 100)
scores = cross_val_score(model2,X,y,cv=5)
print('cross validation:',scores)
print('validation score',scores.mean() * 100)

score: 88.2129083768925
cross validation: [0.8977449  0.75262472 0.71825867 0.62053769 0.52122559]
validation score 70.20783152313051


## Grid Search for model performance tuning

In [74]:
from sklearn.model_selection import GridSearchCV

In [75]:

model_grid_options = {
    'n_estimators' : [10, 50, 100, 200, 500],
    'criterion' : ['mse','mae'],
    'max_depth' : [5, 10, 25]
}

In [76]:
grid = GridSearchCV(model2,model_grid_options,cv=3,n_jobs=-1)

In [77]:
grid.fit(X,y)

GridSearchCV(cv=3, estimator=RandomForestRegressor(), n_jobs=-1,
             param_grid={'criterion': ['mse', 'mae'], 'max_depth': [5, 10, 25],
                         'n_estimators': [10, 50, 100, 200, 500]})

In [78]:
grid.cv_results_.keys()

dict_keys(['mean_fit_time', 'std_fit_time', 'mean_score_time', 'std_score_time', 'param_criterion', 'param_max_depth', 'param_n_estimators', 'params', 'split0_test_score', 'split1_test_score', 'split2_test_score', 'mean_test_score', 'std_test_score', 'rank_test_score'])

In [79]:
grid.cv_results_.get('rank_test_score')

array([ 9,  5, 10,  8,  7, 29, 16, 22, 23, 20, 30, 27, 26, 25, 24,  6,  1,
        4,  3,  2, 14, 11, 13, 15, 12, 28, 18, 21, 19, 17])

In [80]:
grid.best_estimator_

RandomForestRegressor(criterion='mae', max_depth=5, n_estimators=50)

In [88]:
model_final = RandomForestRegressor(criterion='mae', max_depth=5, n_estimators=200)
model_final.fit(trainx,trainy)

RandomForestRegressor(criterion='mae', max_depth=5, n_estimators=200)

In [81]:
RandomForestRegressor?

[1;31mInit signature:[0m
[0mRandomForestRegressor[0m[1;33m([0m[1;33m
[0m    [0mn_estimators[0m[1;33m=[0m[1;36m100[0m[1;33m,[0m[1;33m
[0m    [1;33m*[0m[1;33m,[0m[1;33m
[0m    [0mcriterion[0m[1;33m=[0m[1;34m'mse'[0m[1;33m,[0m[1;33m
[0m    [0mmax_depth[0m[1;33m=[0m[1;32mNone[0m[1;33m,[0m[1;33m
[0m    [0mmin_samples_split[0m[1;33m=[0m[1;36m2[0m[1;33m,[0m[1;33m
[0m    [0mmin_samples_leaf[0m[1;33m=[0m[1;36m1[0m[1;33m,[0m[1;33m
[0m    [0mmin_weight_fraction_leaf[0m[1;33m=[0m[1;36m0.0[0m[1;33m,[0m[1;33m
[0m    [0mmax_features[0m[1;33m=[0m[1;34m'auto'[0m[1;33m,[0m[1;33m
[0m    [0mmax_leaf_nodes[0m[1;33m=[0m[1;32mNone[0m[1;33m,[0m[1;33m
[0m    [0mmin_impurity_decrease[0m[1;33m=[0m[1;36m0.0[0m[1;33m,[0m[1;33m
[0m    [0mmin_impurity_split[0m[1;33m=[0m[1;32mNone[0m[1;33m,[0m[1;33m
[0m    [0mbootstrap[0m[1;33m=[0m[1;32mTrue[0m[1;33m,[0m[1;33m
[0m    [0moob_score[0m[1;33m=[0