<a href="https://colab.research.google.com/github/Yogi-Puvvala/Machine_Learning/blob/main/Random_Forest.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# **Random Forest (Classifier)**

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import StandardScaler, OneHotEncoder, OrdinalEncoder
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report

In [None]:
# Read bank.csv into a DataFrame.
# NOTE(review): the relative path presumably assumes Google Drive is mounted under
# the working directory — no mount cell is visible in this notebook; confirm.
df = pd.read_csv(filepath_or_buffer="drive/MyDrive/Colab_Projects/bank.csv")

In [None]:
# Preview the first five rows to get a feel for the columns and their values.
df.head(n=5)

Unnamed: 0,age,job,marital,education,default,balance,housing,loan,contact,day,month,duration,campaign,pdays,previous,poutcome,deposit
0,59,admin.,married,secondary,no,2343,yes,no,unknown,5,may,1042,1,-1,0,unknown,yes
1,56,admin.,married,secondary,no,45,no,no,unknown,5,may,1467,1,-1,0,unknown,yes
2,41,technician,married,secondary,no,1270,yes,no,unknown,5,may,1389,1,-1,0,unknown,yes
3,55,services,married,secondary,no,2476,yes,no,unknown,5,may,579,1,-1,0,unknown,yes
4,54,admin.,married,tertiary,no,184,no,no,unknown,5,may,673,2,-1,0,unknown,yes


In [None]:
# Frequency of each education level (most common first).
df.loc[:, "education"].value_counts()

Unnamed: 0_level_0,count
education,Unnamed: 1_level_1
secondary,5476
tertiary,3689
primary,1500
unknown,497


In [None]:
# Frequency of each job category (most common first).
df.loc[:, "job"].value_counts()

Unnamed: 0_level_0,count
job,Unnamed: 1_level_1
management,2566
blue-collar,1944
technician,1823
admin.,1334
services,923
retired,778
self-employed,405
student,360
unemployed,357
entrepreneur,328


In [None]:
# List the column names available in the loaded frame.
df.columns

Index(['age', 'job', 'marital', 'education', 'default', 'balance', 'housing',
       'loan', 'contact', 'day', 'month', 'duration', 'campaign', 'pdays',
       'previous', 'poutcome', 'deposit'],
      dtype='object')

In [None]:
# Count missing values per column (isnull is the alias of isna).
df.isnull().sum()

Unnamed: 0,0
age,0
job,0
marital,0
education,0
default,0
balance,0
housing,0
loan,0
contact,0
day,0


In [None]:
# Print per-column dtypes and non-null counts.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 11162 entries, 0 to 11161
Data columns (total 17 columns):
 #   Column     Non-Null Count  Dtype 
---  ------     --------------  ----- 
 0   age        11162 non-null  int64 
 1   job        11162 non-null  object
 2   marital    11162 non-null  object
 3   education  11162 non-null  object
 4   default    11162 non-null  object
 5   balance    11162 non-null  int64 
 6   housing    11162 non-null  object
 7   loan       11162 non-null  object
 8   contact    11162 non-null  object
 9   day        11162 non-null  int64 
 10  month      11162 non-null  object
 11  duration   11162 non-null  int64 
 12  campaign   11162 non-null  int64 
 13  pdays      11162 non-null  int64 
 14  previous   11162 non-null  int64 
 15  poutcome   11162 non-null  object
 16  deposit    11162 non-null  object
dtypes: int64(7), object(10)
memory usage: 1.4+ MB


In [None]:
# "deposit" is the label to predict; every other column is a candidate feature.
y = df["deposit"]
X = df.drop(columns=["deposit"])

In [None]:
# Columns encoded with an explicit category order. Each inner list of `order`
# lines up positionally with the matching entry of `ordinal_cols`.
# NOTE(review): "job" has no natural ranking, so its order below is arbitrary —
# consider one-hot encoding it instead.
ordinal_cols = ["education", "job"]
order = [
    ["unknown", "primary", "secondary", "tertiary"],
    [
        "management", "technician", "admin.", "services", "entrepreneur",
        "self-employed", "blue-collar", "housemaid", "student", "unemployed",
        "retired", "unknown",
    ],
]

# Route every feature to exactly one transformer: numeric dtypes are passed
# through, the ordinal columns get their own encoder, and everything left over
# is one-hot encoded. Using select_dtypes instead of comparing against the
# literal "int64"/"float64"/"O" strings means other numeric widths or pandas
# string dtypes are not left out of all lists (and then dropped by the
# ColumnTransformer, whose default is remainder="drop").
numerical_cols = X.select_dtypes(include="number").columns.tolist()
categorical_cols = [
    col
    for col in X.columns
    if col not in numerical_cols and col not in ordinal_cols
]

In [None]:
# Per-column preprocessing:
#   num -> numeric columns are forwarded unchanged
#   cat -> nominal columns are one-hot encoded (unseen categories are ignored)
#   ord -> ordinal columns are mapped to integers following `order`
preprocessor = ColumnTransformer(
    transformers=[
        ("num", "passthrough", numerical_cols),
        (
            "cat",
            OneHotEncoder(handle_unknown="ignore", sparse_output=False),
            categorical_cols,
        ),
        ("ord", OrdinalEncoder(categories=order), ordinal_cols),
    ]
)

In [None]:
# End-to-end classifier: preprocessing followed by a random forest.
rfc = Pipeline(
    steps=[
        ("preprocess", preprocessor),
        # Fixed seed so the forest's random sampling — and therefore the
        # reported scores — are repeatable across reruns of the notebook.
        ("model", RandomForestClassifier(random_state=42)),
    ]
)

In [None]:
# Hold out 20% of the rows for testing. Stratifying on the label keeps the
# yes/no proportion the same in the train and test splits.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

In [None]:
# Fit the preprocessing steps and the forest on the training split only.
rfc.fit(X=X_train, y=y_train)

In [None]:
# Compare the score on seen vs. unseen data; a large gap points to overfitting.
train_score = rfc.score(X_train, y_train)
test_score = rfc.score(X_test, y_test)
print("Training Score:", train_score)
print("Testing Score:", test_score)

Training Score: 1.0
Testing Score: 0.8365427675772503


In [None]:
# Hyper-parameter grid for the "model" step of the pipeline
# (2 * 3 * 2 * 2 * 2 = 48 combinations).
param_grid = dict(
    model__n_estimators=[100, 200],
    model__max_depth=[10, 20, None],
    model__min_samples_split=[2, 5],
    model__min_samples_leaf=[1, 2],
    model__max_features=["sqrt", "log2"],
)

# Exhaustive 5-fold cross-validated search over the grid, scored by accuracy,
# using all available cores.
gscv_rfc = GridSearchCV(
    estimator=rfc,
    param_grid=param_grid,
    scoring="accuracy",
    cv=5,
    n_jobs=-1,
    verbose=2,
)
gscv_rfc.fit(X_train, y_train)

Fitting 5 folds for each of 48 candidates, totalling 240 fits


In [None]:
# Score the tuned search object on both splits.
tuned_train_score = gscv_rfc.score(X_train, y_train)
tuned_test_score = gscv_rfc.score(X_test, y_test)
print("Training Score:", tuned_train_score)
print("Testing Score:", tuned_test_score)

Training Score: 0.9501623922051742
Testing Score: 0.8369905956112853


In [None]:
# Per-class precision / recall / F1 of the tuned model on the held-out split.
gscv_rfc_pred = gscv_rfc.predict(X_test)
print(
    classification_report(y_true=y_test, y_pred=gscv_rfc_pred)
)

              precision    recall  f1-score   support

          no       0.87      0.81      0.84      1166
         yes       0.81      0.86      0.83      1067

    accuracy                           0.84      2233
   macro avg       0.84      0.84      0.84      2233
weighted avg       0.84      0.84      0.84      2233



# **Random Forest (Regressor)**

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import OneHotEncoder, OrdinalEncoder
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.compose import ColumnTransformer
from sklearn.ensemble import RandomForestRegressor
from sklearn.pipeline import Pipeline
from sklearn.metrics import r2_score

In [None]:
# Read CarPrice_Assignment.csv into a DataFrame (this rebinds `df`, replacing the
# frame loaded in the classifier section).
# NOTE(review): the relative path presumably assumes Google Drive is mounted under
# the working directory — no mount cell is visible in this notebook; confirm.
df = pd.read_csv(filepath_or_buffer="drive/MyDrive/Colab_Projects/CarPrice_Assignment.csv")

In [None]:
# Preview the first five rows of the car-price data.
df.head(n=5)

Unnamed: 0,car_ID,symboling,CarName,fueltype,aspiration,doornumber,carbody,drivewheel,enginelocation,wheelbase,...,enginesize,fuelsystem,boreratio,stroke,compressionratio,horsepower,peakrpm,citympg,highwaympg,price
0,1,3,alfa-romero giulia,gas,std,two,convertible,rwd,front,88.6,...,130,mpfi,3.47,2.68,9.0,111,5000,21,27,13495.0
1,2,3,alfa-romero stelvio,gas,std,two,convertible,rwd,front,88.6,...,130,mpfi,3.47,2.68,9.0,111,5000,21,27,16500.0
2,3,1,alfa-romero Quadrifoglio,gas,std,two,hatchback,rwd,front,94.5,...,152,mpfi,2.68,3.47,9.0,154,5000,19,26,16500.0
3,4,2,audi 100 ls,gas,std,four,sedan,fwd,front,99.8,...,109,mpfi,3.19,3.4,10.0,102,5500,24,30,13950.0
4,5,2,audi 100ls,gas,std,four,sedan,4wd,front,99.4,...,136,mpfi,3.19,3.4,8.0,115,5500,18,22,17450.0


In [None]:
# (rows, columns) of the loaded frame.
df.shape

(205, 26)

In [None]:
# Count missing values per column (isnull is the alias of isna).
df.isnull().sum()

Unnamed: 0,0
car_ID,0
symboling,0
CarName,0
fueltype,0
aspiration,0
doornumber,0
carbody,0
drivewheel,0
enginelocation,0
wheelbase,0


In [None]:
# Print per-column dtypes and non-null counts.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 205 entries, 0 to 204
Data columns (total 26 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   car_ID            205 non-null    int64  
 1   symboling         205 non-null    int64  
 2   CarName           205 non-null    object 
 3   fueltype          205 non-null    object 
 4   aspiration        205 non-null    object 
 5   doornumber        205 non-null    object 
 6   carbody           205 non-null    object 
 7   drivewheel        205 non-null    object 
 8   enginelocation    205 non-null    object 
 9   wheelbase         205 non-null    float64
 10  carlength         205 non-null    float64
 11  carwidth          205 non-null    float64
 12  carheight         205 non-null    float64
 13  curbweight        205 non-null    int64  
 14  enginetype        205 non-null    object 
 15  cylindernumber    205 non-null    object 
 16  enginesize        205 non-null    int64  
 1

In [None]:
# "price" is the regression target.
# car_ID looks like a running row identifier (1, 2, 3, ... in df.head()) rather
# than a property of the car, so it is excluded from the features as well;
# otherwise the forest can split on row position.
# NOTE(review): CarName appears to be a near-unique make+model string — consider
# reducing it to the brand before one-hot encoding; left unchanged here.
X = df.drop(columns=["price", "car_ID"])
y = df["price"]

In [None]:
# Split the features by dtype: numeric columns are passed through, all others
# are one-hot encoded. select_dtypes replaces the literal "int64"/"float64"/"O"
# comparisons so that other numeric widths or pandas string dtypes are not left
# out of both lists (and then dropped by the ColumnTransformer, whose default is
# remainder="drop").
numerical_cols = X.select_dtypes(include="number").columns.tolist()
categorical_cols = X.select_dtypes(exclude="number").columns.tolist()

In [None]:
# Per-column preprocessing:
#   num -> numeric columns are forwarded unchanged
#   cat -> non-numeric columns are one-hot encoded (unseen categories are ignored)
preprocessor = ColumnTransformer(
    transformers=[
        ("num", "passthrough", numerical_cols),
        (
            "cat",
            OneHotEncoder(handle_unknown="ignore", sparse_output=False),
            categorical_cols,
        ),
    ]
)

In [None]:
# End-to-end regressor: preprocessing followed by a random forest.
rfr = Pipeline(
    steps=[
        ("preprocess", preprocessor),
        # Fixed seed so the forest's random sampling — and therefore the
        # reported scores — are repeatable across reruns of the notebook.
        ("model", RandomForestRegressor(random_state=42)),
    ]
)

In [None]:
# Hold out 20% of the rows for testing, with a fixed seed for a repeatable split.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    random_state=43,
    test_size=0.2,
)

In [None]:
# Fit the preprocessing steps and the forest on the training split only.
rfr.fit(X=X_train, y=y_train)

In [None]:
# Compare the score on seen vs. unseen data; a large gap points to overfitting.
train_score = rfr.score(X_train, y_train)
test_score = rfr.score(X_test, y_test)
print("Training Score:", train_score)
print("Testing Score:", test_score)

Training Score: 0.9907323893539272
Testing Score: 0.8081172519681556


In [None]:
# Hyper-parameter grid for the "model" step of the pipeline (2 * 3 = 6 combinations).
param_grid = dict(
    model__n_estimators=[100, 200],
    model__max_depth=[10, 20, None],
)

# Exhaustive 5-fold cross-validated search over the grid on all available cores;
# no `scoring` is passed, so the estimator's own score method ranks candidates.
gscv_rfr = GridSearchCV(
    estimator=rfr,
    param_grid=param_grid,
    cv=5,
    n_jobs=-1,
    verbose=2,
)
gscv_rfr.fit(X_train, y_train)

Fitting 5 folds for each of 6 candidates, totalling 30 fits




In [None]:
# R^2 of the tuned search object's predictions on the held-out split.
test_predictions = gscv_rfr.predict(X_test)
print("r2_score:", r2_score(y_true=y_test, y_pred=test_predictions))

r2_score: 0.831528710134491
