In [1]:
import pandas as pd
import numpy as np

In [2]:
# here df we will be using which is already encoded during explaining EDA and feature engineering part , in EDA file
df=pd.read_csv('train_df.csv')
df.head(1)

Unnamed: 0,category,main_promotion,color,stars,success_indicator
0,5,0,3,1.0,0


In [3]:
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler  # as distance based algorithem is there
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from sklearn.neural_network import MLPClassifier

In [4]:
import warnings
warnings.filterwarnings('ignore')  #to do away with unwanted or unuseful warnings which makes code lengthy 

# Model_selection_Pipeline 

In [5]:

data = df
x=df.drop('success_indicator',axis=1)
y=df['success_indicator']


x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2, random_state=77)

pipeline = Pipeline([('scaler', StandardScaler()),('classifier', None) ])

# Adding models to the param_grids list so to compare their accuracy with same data but different hyperparamere
param_grids = [
    {'classifier': [LogisticRegression()],    
     'classifier__C': [0.1, 1, 10]},
    {'classifier': [RandomForestClassifier()],
     'classifier__n_estimators': [100, 200, 300]},  #number of base model used to predict output
    {'classifier': [MLPClassifier()],
     'classifier__hidden_layer_sizes': [(100,), (50, 50), (25, 25, 25)],
     'classifier__activation': ['relu', 'tanh'],  # these will be our activation function
     'classifier__solver': ['adam']}              # adam will be optimiazer for ANN
]

best_model = None
best_score = 0

for entity in param_grids:
    gs = GridSearchCV(pipeline, entity, cv=5, scoring='accuracy')  # cross validation in 5 , while measure of comparison 
    gs.fit(x_train, y_train)                                       # is acccuracy , we may use random search cv also if 
                                                                    # data is more

    y_pred = gs.predict(x_test)
    accuracy = accuracy_score(y_test, y_pred)
    precision = precision_score(y_test, y_pred)
    recall = recall_score(y_test, y_pred)
    f1 = f1_score(y_test, y_pred)

    print("Best Parameters:", gs.best_params_)
    print("Best Score:", gs.best_score_)
    print("Accuracy:", accuracy)
    print("Precision:", precision)
    print("Recall:", recall)
    print("F1 Score:", f1)
    print("***************************************************")

    if gs.best_score_ > best_score:
        best_model = gs.best_estimator_
        best_score = gs.best_score_

print("Best Model:", best_model)
print("Best Score:", best_score)


Best Parameters: {'classifier': LogisticRegression(C=0.1), 'classifier__C': 0.1}
Best Score: 0.7579559467526525
Accuracy: 0.7512054001928641
Precision: 0.7098919368246052
Recall: 0.8364348677766895
F1 Score: 0.7679856115107914
***************************************************
Best Parameters: {'classifier': RandomForestClassifier(n_estimators=200), 'classifier__n_estimators': 200}
Best Score: 0.8294354270608656
Accuracy: 0.8278688524590164
Precision: 0.7938053097345132
Recall: 0.8785504407443683
F1 Score: 0.8340306834030683
***************************************************
Best Parameters: {'classifier': MLPClassifier(hidden_layer_sizes=(50, 50)), 'classifier__activation': 'relu', 'classifier__hidden_layer_sizes': (50, 50), 'classifier__solver': 'adam'}
Best Score: 0.8105107591305547
Accuracy: 0.8211186113789778
Precision: 0.7850877192982456
Recall: 0.8765915768854065
F1 Score: 0.828320222119389
***************************************************
Best Model: Pipeline(steps=[('scale

# Random forest classifier will be the model which we will be choosing

# Reasons


1.Accuracy perspective : Among majority of the models which we have used for modelling the data we getting high accuracy on random forest classifier.

2.Principle of working of random-forest classifier: Random Forest is an ensemble learning method that combines multiple decision trees to make predictions. And as we are taking decision from multiple models problem of overfitting reduces along with increase in accuracy.

3.Robustness to outliers: As Random forest is ruled based algorithem unlike logistic regression which is distance based it is generally robust to outliers

4.Easy to interprete and understand : Random forest is based on wisdom of group principle and hence easy to understand along with ease in interpretation of hyperparametre for tuning model.

These are the main 4 reason which we should take into consideration while choosing Random-Forest classifier as our model for training data

