In [3]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier, ExtraTreesClassifier, BaggingClassifier, StackingClassifier, GradientBoostingClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import LogisticRegression, SGDClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.naive_bayes import BernoulliNB, GaussianNB
from sklearn.svm import SVC
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV, cross_val_score
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix, roc_curve, roc_auc_score, recall_score, precision_score, f1_score
import warnings
warnings.filterwarnings("ignore")

In [4]:
train = pd.read_csv("train.csv")
test = pd.read_csv("test.csv")

In [5]:
print(train.shape)
print(test.shape)

(6922, 80)
(2308, 80)


In [6]:
train.drop("Unnamed: 0", axis = 1, inplace = True)
test.drop("Unnamed: 0", axis = 1, inplace = True)

In [7]:
train.sample(frac = 0.1).reset_index().drop("index", axis = 1, inplace = True)
test.sample(frac = 0.1).reset_index().drop("index", axis = 1, inplace = True)

In [8]:
x_train = train.drop("Converted", axis = 1)
y_train = train.iloc[:, -1]
x_test = test.drop("Converted", axis = 1)
y_test = test.iloc[:, -1]

In [9]:
pd.set_option('display.max_columns', 500)

In [10]:
x_train.head()

Unnamed: 0,Tags_Will revert after reading the email,Total Time Spent on Website,Lead Profile_Potential Lead,Last Notable Activity_SMS Sent,Lead Origin_Lead Add Form,Tags_Interested in other courses,Last Activity_SMS Sent,Tags_Ringing,What is your current occupation_Working Professional,Lead Source_Reference,Lead Profile_Unspecified,What is your current occupation_Unemployed,Tags_Closed by Horizzon,Last Notable Activity_Modified,Last Activity_Olark Chat Conversation,Lead Source_Welingak Website,Specialization_Unspecified,Lead Source_Olark Chat,Tags_Still Thinking,Last Activity_Converted to Lead,Tags_switched off,Last Notable Activity_View in browser link Clicked,Tags_Interested in Next batch,Last Notable Activity_Email Link Clicked,Tags_Interested in full time MBA,Tags_Lateral student,Last Activity_Page Visited on Website,Last Notable Activity_Olark Chat Conversation,Last Activity_Email Bounced,City_Tier II Cities,Tags_opp hangup,Tags_Recognition issue (DEC approval),Tags_Lost to EINS,Lead Profile_Student of SomeSchool,Lead Source_Social Media,Do Not Email_Yes,Last Activity_Email Marked Spam,Lead Source_WeLearn,Lead Profile_Other Leads,Specialization_Finance Management,Specialization_E-COMMERCE,City_Other Cities of Maharashtra,Tags_Shall take in the next coming month,Tags_Lost to Others,TotalVisits,Specialization_Hospitality Management,How did you hear about X Education_Word Of Mouth,Specialization_Retail Management,How did you hear about X Education_Student of SomeSchool,Lead Source_Press_Release,City_Thane & Outskirts,Tags_Graduation in progress,Tags_In confusion whether part time or DLP,How did you hear about X Education_Unspecified,Lead Source_Pay per Click Ads,Tags_Diploma holder (Not Eligible),Tags_Not doing further education,Lead Origin_Lead Import,Lead Source_NC_EDM,Specialization_Operations Management,Specialization_Supply Chain Management,Last Notable Activity_Had a Phone Conversation,Lead Source_Live Chat,Lead Source_Organic Search,Tags_University not recognized,City_Unspecified,Specialization_Media and Advertising,Page Views Per Visit,A free copy of Mastering The Interview_Yes,Last Activity_Email Link Clicked,Last Notable Activity_Form Submitted on Website,Lead Source_Others,How did you hear about X Education_SMS,City_Other Cities,Last Notable Activity_Page Visited on Website,Tags_number not provided,Last Notable Activity_Email Marked Spam,Tags_invalid number
0,1,-0.737186,0,1,0,0,1,0,0,0,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,-0.383861,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,1,0,0,0,0,0,0,-0.169195,1,0,0,0,0,0,0,0,0,0
1,0,0.200558,0,0,0,0,0,0,0,0,1,0,0,1,0,0,1,0,0,1,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1.285678,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0.794993,0,0,0,0,0,0,0,0,0,0
2,0,1.972054,1,0,0,0,0,0,1,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,6.016039,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,-0.130628,1,0,0,0,0,0,0,0,0,0
3,1,-0.815635,1,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,-0.383861,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,-0.169195,0,0,0,0,0,0,0,0,0,0
4,0,-0.427037,0,0,0,0,0,0,0,0,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,1,0,0,0,0,0.450908,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,-0.530766,1,0,0,0,0,0,0,0,0,0


In [11]:
ct = ColumnTransformer([('se', StandardScaler(), ['Total Time Spent on Website', 'Page Views Per Visit', 'TotalVisits'])], remainder='passthrough')

In [13]:
random_forest_pipeline = Pipeline([('transformer', ct), ('RandomForest', RandomForestClassifier(random_state = 42))])

In [14]:
pipeline_list = [random_forest_pipeline]

In [15]:
pipe_dict = {0: "RandomForest"}

In [16]:
for idx, pipe in enumerate(pipeline_list):
    score = cross_val_score(pipe, x_train, y_train, cv = 10, scoring = 'accuracy')
    print(pipe_dict[idx], ":", score.mean())

RandomForest : 0.9063846558066212


Based on the above results, we will be choosing the **RandomForest Classifier, GradientBoosting, LightGBM & Catboost** on which we are going to test the other metrics to see in depth performance of these 4 models based on several different metrics to choose the best model for our analysis.

In [17]:
def evaluate_model(model, x_train, y_train, x_test, y_test):
    model = model.fit(x_train, y_train)
    predict_train_y = model.predict(x_train)
    predict_test_y = model.predict(x_test)

    print("**Accuracy Score**")
    train_accuracy = accuracy_score(y_train, predict_train_y)
    test_accuracy = accuracy_score(y_test, predict_test_y)
    print("Train Accuracy is: %s"%(train_accuracy))
    print("\nTest Accuracy is: %s"%(test_accuracy))
    print("---------------------------------------------------------")

    print("\n**Accuracy Error**")
    train_error = (1-train_accuracy)
    test_error = (1-test_accuracy)
    print("Train Error: %s"%(train_error))
    print("\nTest Error: %s"%(test_error))
    print("---------------------------------------------------------")

    print("\n**Classification Report**")
    train_cf_report = pd.DataFrame(classification_report(y_train, predict_train_y, output_dict = True))
    test_cf_report = pd.DataFrame(classification_report(y_test, predict_test_y, output_dict = True))
    print("Train Classification Report:")
    print(train_cf_report)
    print("\n Test Classification Report:")
    print(test_cf_report)
    print("---------------------------------------------------------")

    print("\n**Confusion Matrix**")
    train_conf = confusion_matrix(y_train, predict_train_y)
    test_conf = confusion_matrix(y_test, predict_test_y)
    print("Train Confusion Matrix Report:")
    print((train_conf))
    print("\n Test Confusion Matrix Report:")
    print((test_conf))

### RANDOM FOREST CLASSIFIER

In [18]:
rforest = RandomForestClassifier(random_state= 42)

In [19]:
evaluate_model(rforest, x_train, y_train, x_test, y_test)

**Accuracy Score**
Train Accuracy is: 0.985408841375325

Test Accuracy is: 0.9137781629116117
---------------------------------------------------------

**Accuracy Error**
Train Error: 0.014591158624674971

Test Error: 0.08622183708838826
---------------------------------------------------------

**Classification Report**
Train Classification Report:
                     0            1  accuracy    macro avg  weighted avg
precision     0.980623     0.993429  0.985409     0.987026      0.985533
recall        0.996017     0.968350  0.985409     0.982183      0.985409
f1-score      0.988260     0.980729  0.985409     0.984494      0.985372
support    4268.000000  2654.000000  0.985409  6922.000000   6922.000000

 Test Classification Report:
                     0           1  accuracy    macro avg  weighted avg
precision     0.910143    0.920143  0.913778     0.915143      0.914060
recall        0.952279    0.853982  0.913778     0.903131      0.913778
f1-score      0.930734    0.885829  

### Random Forest Hyperparameter Tuning

In [20]:
new_pipeline = Pipeline([('transformer', ct), ('classifier', RandomForestClassifier(random_state=42))])

In [21]:
rf_params = [{
                'classifier': [RandomForestClassifier()],
                'classifier__n_estimators': np.arange(100,2000, 200),
                'classifier__max_depth': [None, 10, 20, 30, 50, 70, 80, 100],
                'classifier__min_samples_split': [2, 3, 5, 7, 10],
                'classifier__min_samples_leaf': [1,2,3,4,5,],
                'classifier__max_features': ['auto', 'sqrt', 'log2'],
                'classifier__bootstrap': [True, False]
               }]

In [25]:
random_search = RandomizedSearchCV(estimator = new_pipeline, param_distributions = rf_params, scoring = 'accuracy', n_jobs = -1, cv = 10, random_state = 42)

In [29]:
best_rf_model = random_search.fit(x_train, y_train)

In [None]:
best_rf_model.best_params_

{'classifier__n_estimators': 300,
 'classifier__min_samples_split': 10,
 'classifier__min_samples_leaf': 2,
 'classifier__max_features': 'auto',
 'classifier__max_depth': None,
 'classifier__bootstrap': False,
 'classifier': RandomForestClassifier(bootstrap=False, min_samples_leaf=2,
                        min_samples_split=10, n_estimators=300)}

In [30]:
print("Best Score: %s" %(best_rf_model.best_score_))

Best Score: 0.9082618088398435


In [31]:
best_rf_model.best_estimator_

In [36]:
rf_classif_pipeline = Pipeline([('transformer', ct), ('RandomForest', RandomForestClassifier(n_estimators = 300, min_samples_split = 10, min_samples_leaf = 2, bootstrap = False, max_depth = None, random_state = 42))])

In [37]:
rf_classif_pipeline.fit(x_train, y_train)

In [38]:
test_prediction = rf_classif_pipeline.predict(x_test)

In [39]:
accuracy_score(y_test, test_prediction)

0.9202772963604853

In [40]:
def check_metric(y_test, y_predict):

    print("**Accuracy Score**")
    test_accuracy = accuracy_score(y_test, y_predict)
    print("\nTest Accuracy is: %s"%(test_accuracy))
    print("---------------------------------------------------------")

    print("\n**Accuracy Error**")
    test_error = (1-test_accuracy)
    print("\nTest Error: %s"%(test_error))
    print("---------------------------------------------------------")

    print("\n**Classification Report**")
    test_cf_report = pd.DataFrame(classification_report(y_test, y_predict, output_dict = True))
    print("\n Test Classification Report:")
    print(test_cf_report)
    print("---------------------------------------------------------")

    print("\n**Confusion Matrix**")
    test_conf = confusion_matrix(y_test, y_predict)
    print("\n Test Confusion Matrix Report:")
    print((test_conf))

In [41]:
check_metric(y_test, test_prediction)

**Accuracy Score**

Test Accuracy is: 0.9202772963604853
---------------------------------------------------------

**Accuracy Error**

Test Error: 0.07972270363951472
---------------------------------------------------------

**Classification Report**

 Test Classification Report:
                     0           1  accuracy    macro avg  weighted avg
precision     0.915531    0.928571  0.920277     0.922051      0.920639
recall        0.957265    0.862832  0.920277     0.910048      0.920277
f1-score      0.935933    0.894495  0.920277     0.915214      0.919703
support    1404.000000  904.000000  0.920277  2308.000000   2308.000000
---------------------------------------------------------

**Confusion Matrix**

 Test Confusion Matrix Report:
[[1344   60]
 [ 124  780]]


## Final Model

In [44]:
random_forest = Pipeline([('transformer', ct), ('RandomForest', RandomForestClassifier(n_estimators = 300, min_samples_split = 10, min_samples_leaf = 2, bootstrap = False, max_depth = None, random_state = 42))])

In [45]:
random_forest.fit(x_train, y_train)

In [46]:
y_train_predict = random_forest.predict(x_train)

In [47]:
y_test_predict = random_forest.predict(x_test)

In [48]:
print("Train Accuracy: %s"%(accuracy_score(y_train, y_train_predict)))
print("Test Accuracy: %s"%(accuracy_score(y_test, y_test_predict)))

Train Accuracy: 0.9436579023403641
Test Accuracy: 0.9202772963604853


In [49]:
x_train.loc[:, "Actual Class"] = y_train
x_train.loc[:, "Predicted Class"] = y_train_predict
x_test.loc[:, "Actual Class"] = y_test
x_test.loc[:, "Predicted Class"] = y_test_predict

In [53]:
predicted_df = pd.concat([x_train,x_test])

In [55]:
import pickle

# Save the trained pipeline as a .pkl file
with open("random_forest.pkl", "wb") as f:
    pickle.dump(random_forest_pipeline, f)


In [56]:
from google.colab import files
files.download("random_forest.pkl")

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>