# Training

## Importing Libraries

In [57]:
from sklearn.model_selection import train_test_split, cross_validate
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.ensemble import RandomForestRegressor
from sklearn.preprocessing import OneHotEncoder
from sklearn.metrics import r2_score
import pandas as pd
import numpy as np
import random
import pickle

## Loading Processed Dataset

In [58]:
# Location of the processed dataset.
# Written as a raw string because the Windows path contains backslashes that
# are not valid escape sequences (\A, \B, \D, \G, \M, \d, \p); in a normal
# string literal those trigger an "invalid escape sequence" warning.
# NOTE(review): this is an absolute, machine-specific path — consider making
# it relative to the project root so the notebook runs on other machines.
PROCESSED_CSV = r'G:\Ashish Yadav\Backup Google Drive\Documents-Lecture\Github\ML-Mini-Project\data\processed\processed.csv'

df = pd.read_csv(PROCESSED_CSV)
# Show five randomly chosen rows as a quick sanity check of the load.
df.sample(5)

Unnamed: 0,Make,Model,Year,Kilometer,Fuel Type,Transmission,Location,Owner,Engine,Drivetrain,Price
1304,Maruti Suzuki,DZire VXi,2020,42000,Petrol,Manual,Lucknow,First,1197.0,FWD,650000
1227,BMW,X1 sDrive20d xLine,2019,59000,Diesel,Automatic,Mumbai,First,1995.0,RWD,3025000
1361,Maruti Suzuki,Celerio X Zxi AMT [2017-2019],2018,44000,Petrol,Automatic,Delhi,First,998.0,FWD,484999
508,BMW,5-Series 520d Luxury Line,2015,70000,Diesel,Automatic,Coimbatore,Second,1995.0,RWD,2550000
1524,Maruti Suzuki,Ciaz Delta Hybrid 1.5 [2018-2020],2019,6200,Petrol,Manual,Mumbai,First,1462.0,FWD,750000


In [59]:
# Quick size check of the loaded frame: (rows, columns).
df.shape

(1624, 11)

In [60]:
# Positional split of the frame: the last column is the target,
# everything before it is a feature. Both are taken as NumPy arrays.
target_position = df.shape[1] - 1
X = df.iloc[:, :target_position].values
y = df.iloc[:, target_position].values

In [61]:
# Positions (within the feature columns, i.e. every column except the last)
# of the columns that contain text and therefore need one-hot encoding.
# Every row is inspected instead of only the first one, so a text column
# whose first entry happens to be missing is still detected, and an empty
# frame yields [] instead of raising on `iloc[0]`.
feature_frame = df.iloc[:, :-1]
categorical_indices = [
    position
    for position in range(feature_frame.shape[1])
    if feature_frame.iloc[:, position].map(lambda value: isinstance(value, str)).any()
]

## Train test split

In [62]:
# Hold out 20% of the rows for testing; the fixed seed keeps the split stable.
split_options = dict(test_size=0.20, random_state=8)
X_train, X_test, y_train, y_test = train_test_split(X, y, **split_options)

## One hot encoding and model creation

In [101]:
# Random forest regressor with fixed hyperparameters and seed.
# NOTE(review): the search result recorded further down in this notebook
# lists n_estimators=234, while this cell uses 22 — confirm which is intended.
model = RandomForestRegressor(
    n_estimators=22,
    max_depth=16,
    min_samples_split=9,
    random_state=83,
)

# One-hot encoder for the text columns, packaged as the
# (name, transformer, columns) tuple that ColumnTransformer consumes.
# NOTE(review): drop='first' is combined with handle_unknown='ignore' —
# confirm how unseen categories are encoded by the installed scikit-learn.
one_hot_encoder = OneHotEncoder(
    drop='first',
    handle_unknown='ignore',
    sparse_output=False,
)
encoding = ('encoding', one_hot_encoder, categorical_indices)

## Training using Random Forest Regressor

In [102]:
# Encode the categorical columns and pass the remaining columns through unchanged.
preprocessing = ColumnTransformer(transformers=[encoding], remainder='passthrough')

# Chain preprocessing and the regressor so both are fitted (and later
# applied) together on the same inputs.
pipeline_steps = [
    ('preprocessing', preprocessing),
    ('model', model),
]
pipe = Pipeline(pipeline_steps)
pipe.fit(X_train, y_train)

## Making predictions

In [103]:
# Predict on the held-out features; reused below when computing r2_score.
y_pred = pipe.predict(X_test)



## Evaluating the Model

In [104]:
def _as_percent(score):
    """Convert a fractional score to a percentage rounded to two decimals."""
    return round(score * 100, 2)


# Report the fit on the training data, on the held-out data, and the R2 of
# the stored predictions (the recorded output shows the last two coincide).
print(f'Train Score: {_as_percent(pipe.score(X_train, y_train))}%')
print(f'Test Score: {_as_percent(pipe.score(X_test, y_test))}%')
print(f'R2 Score: {_as_percent(r2_score(y_test, y_pred))}%')

Train Score: 96.63%
Test Score: 84.58%
R2 Score: 84.58%




## Cross Validation

In [109]:
# 5-fold cross-validation of the whole pipeline on the training split,
# keeping the per-fold training scores alongside the validation scores.
cv_dict = cross_validate(
    pipe,
    X_train,
    y_train,
    cv=5,
    return_train_score=True,
)
cv_dict



{'fit_time': array([2.49999857, 2.37799883, 2.34599972, 2.29098678, 2.31101012]),
 'score_time': array([0.06400681, 0.06500506, 0.06401944, 0.06400967, 0.06398988]),
 'test_score': array([0.87878368, 0.9051024 , 0.88598558, 0.85839884, 0.82445555]),
 'train_score': array([0.9630239 , 0.96550113, 0.96512432, 0.96375906, 0.96275151])}

In [106]:
# Print the mean score across folds, first for the training folds and then
# for the validation folds, as a percentage with two decimals.
for split_label, score_key in (('train', 'train_score'), ('test', 'test_score')):
    mean_percent = round(cv_dict[score_key].mean()*100,2)
    print(f"Cross validation {split_label} score mean: {mean_percent}%")

Cross validation train score mean: 96.4%
Cross validation test score mean: 87.05%


## Hyperparameter tuning

In [69]:
from sklearn.model_selection import RandomizedSearchCV

In [74]:
# Candidate values for the randomized search. Keys use the
# ``<step>__<parameter>`` form, addressing the 'model' step and the
# 'encoding' transformer inside the 'preprocessing' step.
#
# The candidates are drawn from a dedicated, seeded generator so the lists
# are identical after every kernel restart; drawing from the unseeded
# module-level `random` produced a different search space on each run.
SEARCH_SEED = 42
_search_rng = random.Random(SEARCH_SEED)


def _draw_candidates(low, high, count=10):
    """Return `count` integers drawn uniformly from the inclusive range [low, high]."""
    return [_search_rng.randint(low, high) for _ in range(count)]


# NOTE(review): 'model__random_state' makes the forest's seed part of the
# search space — confirm that tuning the seed is intended.
params = {
    'model__n_estimators': _draw_candidates(5, 250),
    'model__min_samples_split': _draw_candidates(5, 50),
    'model__max_depth': _draw_candidates(5, 50),
    'model__random_state': _draw_candidates(5, 250),
    # Encoder settings are held fixed (single-element lists).
    'preprocessing__encoding__sparse_output': [False],
    'preprocessing__encoding__drop': ['first'],
    'preprocessing__encoding__handle_unknown': ['ignore'],
}

In [None]:
# Randomized search over `params` with 7-fold cross-validation on the
# training split. random_state fixes which parameter combinations are
# sampled, so re-running the notebook evaluates the same candidates.
rscv = RandomizedSearchCV(
    pipe,
    param_distributions=params,
    cv=7,
    verbose=0,
    random_state=42,
)
rscv.fit(X_train, y_train)

In [108]:
# Best parameter combination found by the randomized search.
rscv.best_params_

{'preprocessing__encoding__sparse_output': False,
 'preprocessing__encoding__handle_unknown': 'ignore',
 'preprocessing__encoding__drop': 'first',
 'model__random_state': 83,
 'model__n_estimators': 234,
 'model__min_samples_split': 9,
 'model__max_depth': 16}

## Saving the model

In [107]:
# Destination for the serialized pipeline.
# Written as a raw string so every backslash in the Windows path is literal;
# the previous literal mixed escaped (\\t, \\r) and unescaped (\A, \B, ...)
# backslashes, and the unescaped ones are invalid escape sequences.
# NOTE(review): this is an absolute, machine-specific path — consider making
# it relative to the project root.
MODEL_PATH = r'G:\Ashish Yadav\Backup Google Drive\Documents-Lecture\Github\ML-Mini-Project\data\trained_models\rfr_model.pkl'

# Persist the fitted `pipe` (preprocessing + model) — note this is the
# pipeline fitted earlier in the notebook, not the search object `rscv`.
with open(MODEL_PATH, 'wb') as file:
    pickle.dump(pipe, file)