In [None]:
from sklearn.compose import ColumnTransformer
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_absolute_error,mean_squared_error,r2_score
from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import OrdinalEncoder

In [None]:
def evaluation_metrics(y_test,y_pred):
    """Compute RMSE, MAE and R^2 for a set of predictions.

    Parameters
    ----------
    y_test : array-like
        Ground-truth target values.
    y_pred : array-like
        Predicted target values, aligned with ``y_test``.

    Returns
    -------
    tuple
        ``(rmse, mae, r2)`` in that order.
    """
    # ``mean_squared_error(..., squared=False)`` is deprecated since
    # scikit-learn 1.4 and scheduled for removal in 1.6, so the root is taken
    # explicitly here. Rooting the per-output MSE before averaging reproduces
    # the old ``squared=False`` result, including for multi-output targets.
    per_output_mse = mean_squared_error(y_test, y_pred, multioutput="raw_values")
    rmse = (per_output_mse ** 0.5).mean()
    mae = mean_absolute_error(y_test, y_pred)
    r2 = r2_score(y_test, y_pred)
    return rmse, mae, r2

In [None]:
# Read the merged CSV through Spark (first row is the header, column types are
# inferred), then convert it to a pandas DataFrame for scikit-learn.
# NOTE(review): `spark` is not defined in this notebook; presumably it is the
# session object injected by the runtime -- confirm.
spark_df = (
    spark.read
    .format('csv')
    .option('header', True)
    .option('inferSchema', True)
    .load('/FileStore/tables/consolidate/merged_df.csv')
)

df = spark_df.toPandas()
df.head()

Unnamed: 0,Year,Store,Store_Type,Dept,Temperature,Fuel_Price,CPI,Unemployment,Total_Sales
0,2012,1,A,1,86.11,3.417,221.949864,6.908,16628.31
1,2011,1,A,2,75.64,3.899,215.964053,7.682,44274.15
2,2010,1,A,3,46.63,2.561,211.319643,8.106,11135.17
3,2012,1,A,6,75.55,3.749,221.6718,7.143,4658.18
4,2012,1,A,6,68.55,3.617,223.181477,6.573,3843.29


In [None]:
# Separate the regression target (Total_Sales) from the predictor columns.
y = df['Total_Sales']
X = df.drop(columns = 'Total_Sales')

In [None]:
# Hold out 20% of the rows for testing; the fixed seed keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size = 0.2,
    random_state = 42,
)
X_train.head(2)

Unnamed: 0,Year,Store,Store_Type,Dept,Temperature,Fuel_Price,CPI,Unemployment
138466,2010,37,C,93,79.93,2.705,209.939809,8.464
289214,2010,26,A,42,61.65,2.906,132.293936,8.512


In [None]:
# Encoding Categorical Columns
#
# Store_Type is ordinal-encoded with the explicit order A -> 0, B -> 1, C -> 2.
# Every other column passes through unchanged; in the output the encoded column
# comes first, followed by the passthrough columns.
# The column is selected by name instead of by position so the encoder keeps
# targeting Store_Type even if the feature column order changes.

transformer = ColumnTransformer(
    [('trans1', OrdinalEncoder(categories=[['A','B','C']]), ['Store_Type'])],
    remainder = 'passthrough',
)

# Fit on the training split only, then apply the same mapping to the test split.
x_train_encoded = transformer.fit_transform(X_train)
x_test_encoded = transformer.transform(X_test)

In [None]:
# Hyperparameter search for the random forest.
# GridSearchCV is imported with the other scikit-learn imports at the top of the notebook.

# Defining a base model
base_model = RandomForestRegressor(random_state = 42)

# Dictionary of hyperparameter values to search
search_space = {"n_estimators":[100,150]}

# Building the GridSearchCV object
GS = GridSearchCV(estimator = base_model,
                  param_grid = search_space,
                  scoring = 'r2',
                  refit = True, # refit the best candidate (by mean CV r2) on the full training data
                  cv = 3,
                  verbose = 4)

GS.fit(x_train_encoded,y_train)

best_params = GS.best_params_ # best value found for each hyperparameter that was searched
n_est = best_params['n_estimators'] # the selected value itself rather than a dict_values view
print('n_estimator',n_est)

best_score = GS.best_score_ # mean cross-validated r2 of the best candidate
print('R2 Score :',best_score)

Fitting 3 folds for each of 2 candidates, totalling 6 fits
[CV 1/3] END ..................n_estimators=100;, score=0.935 total time= 2.2min
[CV 2/3] END ..................n_estimators=100;, score=0.926 total time= 2.1min
[CV 3/3] END ..................n_estimators=100;, score=0.933 total time= 2.1min
[CV 1/3] END ..................n_estimators=150;, score=0.935 total time= 3.1min
[CV 2/3] END ..................n_estimators=150;, score=0.927 total time= 3.1min
[CV 3/3] END ..................n_estimators=150;, score=0.933 total time= 3.1min


Uploading artifacts:   0%|          | 0/5 [00:00<?, ?it/s]

Uploading /local_disk0/repl_tmp_data/ReplId-33b2f-71652-d5087-c/tmp6mkjpnv1/model/model.pkl:   0%|          | …

Uploading artifacts:   0%|          | 0/5 [00:00<?, ?it/s]

Uploading /local_disk0/repl_tmp_data/ReplId-33b2f-71652-d5087-c/tmp42kdv0g0/model/model.pkl:   0%|          | …

[0;31m---------------------------------------------------------------------------[0m
[0;31mAttributeError[0m                            Traceback (most recent call last)
File [0;32m<command-3461394460691664>, line 21[0m
[1;32m     19[0m best_params [38;5;241m=[39m GS[38;5;241m.[39mbest_params_ [38;5;66;03m# To get only the best hyperparameter values that we searched for[39;00m
[1;32m     20[0m n_est [38;5;241m=[39m best_params[38;5;241m.[39mvalues()
[0;32m---> 21[0m [38;5;28mprint[39m([38;5;124m'[39m[38;5;124mn_estimator[39m[38;5;124m'[39m[38;5;241m.[39mn_est)
[1;32m     23[0m best_score [38;5;241m=[39m GS[38;5;241m.[39mbest_score_ [38;5;66;03m# score according to the metric we passes in refit[39;00m
[1;32m     24[0m [38;5;28mprint[39m([38;5;124m'[39m[38;5;124mR2 Score :[39m[38;5;124m'[39m,best_score)

[0;31mAttributeError[0m: 'str' object has no attribute 'n_est'

In [None]:
# Fit the final random forest with a fixed number of trees and a fixed seed.
n_estimators = 100
model = RandomForestRegressor(random_state = 42, n_estimators = n_estimators)
model.fit(x_train_encoded, y_train)

# Metrics on the data the model was fitted on.
y_train_pred = model.predict(x_train_encoded)
rmse_tr, mae_tr, r2_tr = evaluation_metrics(y_train, y_train_pred)
print(f"Trained Data Metrics - RMSE : {rmse_tr} | MAE : {mae_tr} | r2_score : {r2_tr}")

# Metrics on the held-out test split.
y_test_pred = model.predict(x_test_encoded)
rmse_tt, mae_tt, r2_tt = evaluation_metrics(y_test, y_test_pred)
print(f"Predicted Data Metrics - RMSE : {rmse_tt} | MAE : {mae_tt} | r2_score : {r2_tt}")

Trained Data Metrics - RMSE : 2076.5978566488047 | MAE : 708.1864164631615 | r2_score : 0.9915950036666051
Predicted Data Metrics - RMSE : 6197.813142295127 | MAE : 1923.0415724897402 | r2_score : 0.9270752003057658




Uploading artifacts:   0%|          | 0/5 [00:00<?, ?it/s]

Uploading /local_disk0/repl_tmp_data/ReplId-33b2f-71652-d5087-c/tmps3qfa2f4/model/model.pkl:   0%|          | …



Uploading artifacts:   0%|          | 0/5 [00:00<?, ?it/s]

In [None]:
# User predictions
# Build a single-row frame with the training feature columns, run it through
# the already-fitted transformer, and predict with the fitted model.

import pandas as pd

feature_names = [
    'Year', 'Store', 'Store_Type', 'Dept',
    'Temperature', 'Fuel_Price', 'CPI', 'Unemployment',
]
feature_values = [2012, 1, 'A', 1, 86.11, 3.417, 221.949864, 6.908]

user_input = pd.DataFrame([feature_values], columns = feature_names)

# transform() only: the encoder must not be re-fitted on the user's row.
user_input_transformed = transformer.transform(user_input)

result = model.predict(user_input_transformed)
print('The predicted value is :',result)

The predicted value is : [16354.9561]
