<a href="https://colab.research.google.com/github/abhranil-datascience/TATASteelPOC/blob/master/TataSteelPOC_4HrsChallenge.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [0]:
######################### Suppress Warnings #######################################

#import warnings
#warnings.filterwarnings('ignore')

############################## Mount Drive ########################################

from google.colab import drive
drive.mount('/content/gdrive')

############################## Change Directory ###################################

import os
os.chdir('/content/gdrive/My Drive/TCS/TATASteelInductionPOC/')

################# Root Import Statements #####################

import pandas as pd
import pandas_profiling
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.preprocessing import StandardScaler
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.svm import SVR
from sklearn.model_selection import GridSearchCV
from sklearn.ensemble import RandomForestRegressor
import operator
from sklearn.model_selection import KFold
from sklearn import metrics as mt
import statistics as st

########################## Steps ############################# 

##1. Import Dataset
dataframe=pd.read_excel('TATASteelDataset.xlsm', 'Dataset')

##2. Understand Dataset features
pandas_profiling.ProfileReport(dataframe)

""" Profile report shows 2 warnings: 
a. Y has 134 zero values (Confirmed By Nitish that 0 is invalid value)"""
dataframe=dataframe[dataframe.Y != 0]
"""
b. X11 has 609 zero values (This is ok)
c. X2 is categorical so needs to be one  hot encoded"""
dataframe[dataframe.X2 == "REVERSAL"]=1

"""
e. X1 is boolean
f. Rest 20 columns (ID column excluded) is numerical"""

## 3. Check Correlation with Target Variable
corr = dataframe.corr()
sns.heatmap(corr,xticklabels=corr.columns,yticklabels=corr.columns)
plt.show()
"""
Based on heat map X1 and X20 have very little impact on Y so removing this column
"""
drop_list=['X1','X20']
dataframe=dataframe.drop(drop_list, axis=1)

## 4. Convert to Numpy Array
X=dataframe.iloc[:,1:20].values
Y=dataframe.iloc[:,20].values

## 5. Scale Dataset
XScaler=StandardScaler()
YScaler=StandardScaler()
XScaled=XScaler.fit_transform(X)
YScaled=YScaler.fit_transform(Y.reshape(-1,1))

## 6. Train Test Split
XTrain,XTest,YTrain,YTest=train_test_split(XScaled,YScaled,test_size=0.1,random_state=42)

## 7. Check if dataset is linear or non linear
RegressorSVR=SVR()
parameters=[{'kernel':['linear','rbf','poly','sigmoid']}]
GS=GridSearchCV(estimator=RegressorSVR,param_grid=parameters,scoring='neg_mean_squared_error',cv=9)
GS.fit(XTrain,YTrain)
best_params=GS.best_params_#linear


## 8. Verify Feature Dependencies
RegressorRF=RandomForestRegressor()
RegressorRF.fit(XTrain,YTrain)
imp=RegressorRF.feature_importances_
Importances={}
for indexes in range(0,len(imp)):
  Importances[indexes]=imp[indexes]
sorted_x = sorted(Importances.items(), key=operator.itemgetter(1))
"""
Based on the above feature importances we need to drop column number 4,8,9,10,13,18 as their score is  < 0.01
"""
XTrain=XTrain[:,[0,1,2,3,5,6,7,11,12,14,15,16,17]]
XTest=XTest[:,[0,1,2,3,5,6,7,11,12,14,15,16,17]]

## 9. Hypertune RandomForest Regressor
RegressorHyp=RandomForestRegressor()
parametersRF=[{'n_estimators':[5,6,7,8,9,10,11,12,13,14,15]}]
GS=GridSearchCV(estimator=RegressorHyp,param_grid=parametersRF,scoring='r2',cv=10)
GS.fit(XTrain,YTrain)
best_params_RF=GS.best_params_#n_estimators=13

""" We can do more hypertuning of parameter but due to lack of time i am limiting this to only n_estimators"""

## 10. Create Regressor and ApplyKFold
regressor=RandomForestRegressor(n_estimators=13)
r2=[]
kfold=KFold(n_splits=9,shuffle=True)
for count in range(0,100):
  for train_idx,val_idx in kfold.split(XTrain):
    XTrainKF_Train,XTrainKF_Val=XTrain[train_idx],XTrain[val_idx]
    YTrainKF_Train,YTrainKF_Val=YTrain[train_idx],YTrain[val_idx]
    regressor.fit(XTrainKF_Train,YTrainKF_Train)
    YPred=regressor.predict(XTrainKF_Val)
    r2.append(mt.r2_score(YTrainKF_Val,YPred))
print("Goodness Of Fit in Training Set: "+str(st.mean(r2)))#Goodness Of Fit: 0.8244056285284584

## 11. Predict Test Data
FinalRegressor=regressor.fit(XTrain,YTrain)
TestPred=FinalRegressor.predict(XTest)
FinalPrediction=YScaler.inverse_transform(TestPred)
GOF=mt.r2_score(YScaler.inverse_transform(YTest),FinalPrediction)
print("Goodness Of Fit in Test Set: "+str(GOF))#Goodness Of Fit: 0.841222269015523

Goodness Of Fit in Test Set: 0.838557040754645
