# Training

## Importing Libraries

In [113]:
from sklearn.model_selection import train_test_split, cross_val_score, cross_validate
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.ensemble import RandomForestRegressor
from sklearn.preprocessing import OneHotEncoder
from sklearn.metrics import r2_score
import pandas as pd
import numpy as np
import pickle

## Loading Processed Dataset

In [114]:
df = pd.read_csv('G:\Ashish Yadav\Backup Google Drive\Documents-Lecture\Github\ML-Mini-Project\data\processed\processed.csv')
df.sample(5)

Unnamed: 0,Make,Model,Year,Kilometer,Fuel Type,Transmission,Location,Color,Owner,Engine,Drivetrain,Price
345,Maruti Suzuki,Alto LXi BS-III,2011,87513,Petrol,Manual,Salem,Blue,Second,796.0,FWD,227000
664,Hyundai,Creta SX 1.6 (O) Petrol,2018,37000,Petrol,Manual,Delhi,Black,Second,1591.0,FWD,1115000
1283,Maruti Suzuki,Ertiga ZXi AT,2019,21894,Petrol,Automatic,Delhi,White,First,1462.0,FWD,990000
680,Tata,Tiago Revotron XTA [2017-2019],2019,12000,Petrol,Automatic,Mumbai,Silver,First,1199.0,FWD,585000
450,Maruti Suzuki,Vitara Brezza ZDi Plus,2019,90000,Diesel,Manual,Hyderabad,White,First,1248.0,FWD,1025000


In [115]:
df.shape

(1624, 12)

In [116]:
X = df.iloc[:,:-1].values
y = df.iloc[:,-1].values

## Train test split

In [117]:
X_train,X_test,y_train,y_test = train_test_split(X,y,test_size=0.20,random_state=8)

## One hot encoding and model creation

In [118]:
model = RandomForestRegressor(n_estimators=22,min_samples_split=15,max_depth=15,random_state=8)
encoding = ('encoding',OneHotEncoder(sparse_output=False,drop='first',handle_unknown='ignore'),[0,1,4,5,6,7,8,10])

## Training using Random Forest Regressor

In [119]:
preprocessing = ColumnTransformer(
    transformers=[encoding],
    remainder='passthrough'
)
pipe = Pipeline([
    ('preprocessing',preprocessing),
    ('model',model)
])
pipe.fit(X_train,y_train)

## Evaluating the Model

In [120]:
print(f'Train Score: {round(pipe.score(X_train,y_train)*100,2)}%')
print(f'Test Score: {round(pipe.score(X_test,y_test)*100,2)}%')
print(f'R2 Score: {round(r2_score(y_test,y_pred)*100,2)}%')

Train Score: 94.93%
Test Score: 82.87%
R2 Score: 83.25%




## Making prediction

In [121]:
y_pred = pipe.predict(X_test)



## Cross Validation

In [122]:
cv_dict = cross_validate(pipe,X_train,y_train,cv=5,return_train_score=True)
cv_dict



{'fit_time': array([2.28000236, 2.22800612, 2.20499039, 2.15000701, 2.46796417]),
 'score_time': array([0.07300401, 0.06699443, 0.06202269, 0.0639987 , 0.07101679]),
 'test_score': array([0.85564288, 0.88236263, 0.85364072, 0.82974879, 0.85719917]),
 'train_score': array([0.94452162, 0.9401311 , 0.94336452, 0.94738819, 0.94547064])}

In [123]:
print(f"Cross validation train score mean: {round(cv_dict['train_score'].mean()*100,2)}%")
print(f"Cross validation test score mean: {round(cv_dict['test_score'].mean()*100,2)}%")

Cross validation train score mean: 94.42%
Cross validation test score mean: 85.57%


## Saving the model

In [124]:
with open('G:\Ashish Yadav\Backup Google Drive\Documents-Lecture\Github\ML-Mini-Project\data\\trained_models\\rfr_model.pkl','wb') as file:
  pickle.dump(pipe,file)