In [3]:
import pandas as pd
import numpy as np

In [4]:
# here df we will be using which is already encoded during explaining EDA and feature engineering part , in EDA file
df=pd.read_csv('train_df.csv')
df.head(1)

Unnamed: 0,category,main_promotion,color,stars,success_indicator
0,5,0,3,1.0,0


In [5]:
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler  # as distance based algorithem is there
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from sklearn.neural_network import MLPClassifier

In [6]:
import warnings
warnings.filterwarnings('ignore')  #to do away with unwanted or unuseful warnings which makes code lengthy

# Model_selection_Pipeline

In [7]:

data = df
x=df.drop('success_indicator',axis=1)
y=df['success_indicator']


x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2, random_state=77)

pipeline = Pipeline([('scaler', StandardScaler()),('classifier', None) ])

# Adding models to the param_grids list so to compare their accuracy with same data but different hyperparamere
param_grids = [
    {'classifier': [LogisticRegression()],
     'classifier__C': [0.1, 1, 10]},
    {'classifier': [RandomForestClassifier()],
     'classifier__n_estimators': [100, 200, 300]},  #number of base model used to predict output
    {'classifier': [MLPClassifier()],
     'classifier__hidden_layer_sizes': [(100,), (50, 50), (25, 25, 25)],
     'classifier__activation': ['relu', 'tanh'],  # these will be our activation function
     'classifier__solver': ['adam']}              # adam will be optimiazer for ANN
]

best_model = None
best_score = 0

for entity in param_grids:
    gs = GridSearchCV(pipeline, entity, cv=5, scoring='accuracy')  # cross validation in 5 , while measure of comparison
    gs.fit(x_train, y_train)                                       # is acccuracy , we may use random search cv also if
                                                                    # data is more

    y_pred = gs.predict(x_test)
    accuracy = accuracy_score(y_test, y_pred)
    precision = precision_score(y_test, y_pred)
    recall = recall_score(y_test, y_pred)
    f1 = f1_score(y_test, y_pred)

    print("Best Parameters:", gs.best_params_)
    print("Best Score:", gs.best_score_)
    print("Accuracy:", accuracy)
    print("Precision:", precision)
    print("Recall:", recall)
    print("F1 Score:", f1)
    print("***************************************************")

    if gs.best_score_ > best_score:
        best_model = gs.best_estimator_
        best_score = gs.best_score_

print("Best Model:", best_model)
print("Best Score:", best_score)


Best Parameters: {'classifier': LogisticRegression(C=0.1), 'classifier__C': 0.1}
Best Score: 0.7579559467526525
Accuracy: 0.7512054001928641
Precision: 0.7098919368246052
Recall: 0.8364348677766895
F1 Score: 0.7679856115107914
***************************************************
Best Parameters: {'classifier': RandomForestClassifier(n_estimators=200), 'classifier__n_estimators': 200}
Best Score: 0.8294356449305358
Accuracy: 0.8293153326904532
Precision: 0.7964444444444444
Recall: 0.8775710088148874
F1 Score: 0.8350419384902144
***************************************************
Best Parameters: {'classifier': MLPClassifier(activation='tanh', hidden_layer_sizes=(25, 25, 25)), 'classifier__activation': 'tanh', 'classifier__hidden_layer_sizes': (25, 25, 25), 'classifier__solver': 'adam'}
Best Score: 0.8120777504230302
Accuracy: 0.8288331726133076
Precision: 0.8
Recall: 0.8697355533790402
F1 Score: 0.8334115438761145
***************************************************
Best Model: Pipeline(s

# Random forest classifier will be the model we select.


# why?



1. Accuracy perspective: We obtain good accuracy on the random forest classifier using most of the models we have used to model the data.

2. The Random Forest classifier's operating principle: Random Forest is an ensemble learning technique that combines several decision trees to generate predictions. Additionally, as we make decisions based on a variety of models, the overfitting issue gets less severe and accuracy rises.

3. Robustness to outliers: Unlike logistic regression, which is based on distance, which is rule-based, random forest is typically robust to outliers.

4. straightforward to interpret and comprehend: Because random forests are founded on the wisdom of group concept, they are simple to comprehend and the hyperparameters needed to tune the model are straightforward to read.

These are the main 4 reason which we should take into consideration while choosing Random-Forest classifier as our model for training data