<a href="https://colab.research.google.com/github/bhupeshmahara/smart-lead-scoring-engine/blob/main/Smart_Lead_Scoring_Engine.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [81]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split, cross_val_score, GridSearchCV, cross_validate
from sklearn.metrics import classification_report, f1_score, confusion_matrix
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier, AdaBoostClassifier
from xgboost import XGBClassifier
from sklearn.svm import SVC

# allow plots to appear directly in the notebook
%matplotlib inline

import warnings
warnings.filterwarnings('ignore')

In [82]:
train_df = pd.read_csv("https://raw.githubusercontent.com/bhupeshmahara/smart-lead-scoring-engine/main/train.csv")
test_df = pd.read_csv("https://raw.githubusercontent.com/bhupeshmahara/smart-lead-scoring-engine/main/test.csv")
submission_df = pd.read_csv("https://raw.githubusercontent.com/bhupeshmahara/smart-lead-scoring-engine/main/sample_submission.csv")

print(train_df.shape)
print(test_df.shape)
print(submission_df.shape)

(39161, 19)
(13184, 18)
(13184, 2)


In [83]:
train_df.head()

Unnamed: 0,id,created_at,campaign_var_1,campaign_var_2,products_purchased,signup_date,user_activity_var_1,user_activity_var_2,user_activity_var_3,user_activity_var_4,user_activity_var_5,user_activity_var_6,user_activity_var_7,user_activity_var_8,user_activity_var_9,user_activity_var_10,user_activity_var_11,user_activity_var_12,buy
0,1,2021-01-01,1,2,2.0,2020-09-24,0,0,0,0,0,0,1,1,0,0,0,0,0
1,2,2021-01-01,2,1,2.0,2020-09-19,1,0,1,0,0,0,1,0,0,0,0,0,0
2,3,2021-01-01,9,3,3.0,2021-08-11,1,0,0,0,0,0,0,0,0,0,0,0,0
3,4,2021-01-01,6,7,2.0,2017-10-04,0,0,0,0,0,0,0,0,0,0,0,0,0
4,5,2021-01-01,4,6,,2020-06-08,0,0,0,0,0,0,1,0,0,0,1,0,0


In [84]:
train_df.describe()

Unnamed: 0,id,campaign_var_1,campaign_var_2,products_purchased,user_activity_var_1,user_activity_var_2,user_activity_var_3,user_activity_var_4,user_activity_var_5,user_activity_var_6,user_activity_var_7,user_activity_var_8,user_activity_var_9,user_activity_var_10,user_activity_var_11,user_activity_var_12,buy
count,39161.0,39161.0,39161.0,18250.0,39161.0,39161.0,39161.0,39161.0,39161.0,39161.0,39161.0,39161.0,39161.0,39161.0,39161.0,39161.0,39161.0
mean,19581.0,6.523812,6.452746,2.154137,0.400092,0.006716,0.102832,0.011465,0.151503,0.499834,0.286612,0.174434,0.01144,0.000383,0.218942,0.000562,0.05102
std,11304.951283,3.472944,2.614296,0.779815,0.509194,0.081676,0.303743,0.106463,0.359681,0.558166,0.455784,0.379689,0.106346,0.019568,0.431544,0.023696,0.220042
min,1.0,1.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,9791.0,4.0,5.0,2.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
50%,19581.0,6.0,6.0,2.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
75%,29371.0,9.0,8.0,3.0,1.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0
max,39161.0,16.0,15.0,4.0,3.0,1.0,1.0,1.0,2.0,3.0,2.0,2.0,1.0,1.0,4.0,1.0,1.0


In [85]:
# checking for missing data in the train dataset

train_df.isnull().sum()

id                          0
created_at                  0
campaign_var_1              0
campaign_var_2              0
products_purchased      20911
signup_date             15113
user_activity_var_1         0
user_activity_var_2         0
user_activity_var_3         0
user_activity_var_4         0
user_activity_var_5         0
user_activity_var_6         0
user_activity_var_7         0
user_activity_var_8         0
user_activity_var_9         0
user_activity_var_10        0
user_activity_var_11        0
user_activity_var_12        0
buy                         0
dtype: int64

here we can see that 53% and 38% of products_purchased and signup_date are having null values, and we can not drop these columns also, as these are crucial for classifying the target.

In [86]:
train_df['products_purchased'].mode()

0    2.0
dtype: float64

In [87]:
train_df['signup_date'].mode()

0    2021-07-18
dtype: object

In [88]:
train_df['products_purchased'].value_counts()

2.0    8867
3.0    5024
1.0    3643
4.0     716
Name: products_purchased, dtype: int64

In [89]:
# imputing "products_purchased" and "signup_date" with mode

train_df['products_purchased'] = train_df['products_purchased'].fillna(train_df['products_purchased'].mode()[0])
train_df['signup_date'] = train_df['signup_date'].fillna(train_df['signup_date'].mode()[0])

train_df.isnull().sum()

id                      0
created_at              0
campaign_var_1          0
campaign_var_2          0
products_purchased      0
signup_date             0
user_activity_var_1     0
user_activity_var_2     0
user_activity_var_3     0
user_activity_var_4     0
user_activity_var_5     0
user_activity_var_6     0
user_activity_var_7     0
user_activity_var_8     0
user_activity_var_9     0
user_activity_var_10    0
user_activity_var_11    0
user_activity_var_12    0
buy                     0
dtype: int64

In [90]:
train_df['products_purchased'].value_counts()

2.0    29778
3.0     5024
1.0     3643
4.0      716
Name: products_purchased, dtype: int64

In [91]:
# checking for duplicate data

train_df[train_df.duplicated()].count()

id                      0
created_at              0
campaign_var_1          0
campaign_var_2          0
products_purchased      0
signup_date             0
user_activity_var_1     0
user_activity_var_2     0
user_activity_var_3     0
user_activity_var_4     0
user_activity_var_5     0
user_activity_var_6     0
user_activity_var_7     0
user_activity_var_8     0
user_activity_var_9     0
user_activity_var_10    0
user_activity_var_11    0
user_activity_var_12    0
buy                     0
dtype: int64

In [92]:
train_df['created_year'] = pd.DatetimeIndex(train_df['created_at']).year
train_df['created_month'] = pd.DatetimeIndex(train_df['created_at']).month
train_df['created_day'] = pd.DatetimeIndex(train_df['created_at']).day

train_df['signup_year'] = pd.DatetimeIndex(train_df['signup_date']).year
train_df['signup_month'] = pd.DatetimeIndex(train_df['signup_date']).month
train_df['signup_day'] = pd.DatetimeIndex(train_df['signup_date']).day

train_df.head()

Unnamed: 0,id,created_at,campaign_var_1,campaign_var_2,products_purchased,signup_date,user_activity_var_1,user_activity_var_2,user_activity_var_3,user_activity_var_4,...,user_activity_var_10,user_activity_var_11,user_activity_var_12,buy,created_year,created_month,created_day,signup_year,signup_month,signup_day
0,1,2021-01-01,1,2,2.0,2020-09-24,0,0,0,0,...,0,0,0,0,2021,1,1,2020,9,24
1,2,2021-01-01,2,1,2.0,2020-09-19,1,0,1,0,...,0,0,0,0,2021,1,1,2020,9,19
2,3,2021-01-01,9,3,3.0,2021-08-11,1,0,0,0,...,0,0,0,0,2021,1,1,2021,8,11
3,4,2021-01-01,6,7,2.0,2017-10-04,0,0,0,0,...,0,0,0,0,2021,1,1,2017,10,4
4,5,2021-01-01,4,6,2.0,2020-06-08,0,0,0,0,...,0,1,0,0,2021,1,1,2020,6,8


In [93]:
train_df.columns

Index(['id', 'created_at', 'campaign_var_1', 'campaign_var_2',
       'products_purchased', 'signup_date', 'user_activity_var_1',
       'user_activity_var_2', 'user_activity_var_3', 'user_activity_var_4',
       'user_activity_var_5', 'user_activity_var_6', 'user_activity_var_7',
       'user_activity_var_8', 'user_activity_var_9', 'user_activity_var_10',
       'user_activity_var_11', 'user_activity_var_12', 'buy', 'created_year',
       'created_month', 'created_day', 'signup_year', 'signup_month',
       'signup_day'],
      dtype='object')

In [94]:
# re-arranging columns

train = train_df[['id', 'created_at', 'created_year', 'created_month', 'created_day', 'campaign_var_1', 'campaign_var_2', 'products_purchased',
                  'signup_date', 'signup_year', 'signup_month', 'signup_day', 'user_activity_var_1', 'user_activity_var_2', 'user_activity_var_3',
                  'user_activity_var_4', 'user_activity_var_5', 'user_activity_var_6', 'user_activity_var_7', 'user_activity_var_8',
                  'user_activity_var_9', 'user_activity_var_10', 'user_activity_var_11', 'user_activity_var_12', 'buy']]

In [95]:
# drop columns 'created_at' and 'signup_date' since their data is extracted and they're not needed anymore

train.drop(['created_at', 'signup_date'], axis=1, inplace = True)

In [96]:
train.columns

Index(['id', 'created_year', 'created_month', 'created_day', 'campaign_var_1',
       'campaign_var_2', 'products_purchased', 'signup_year', 'signup_month',
       'signup_day', 'user_activity_var_1', 'user_activity_var_2',
       'user_activity_var_3', 'user_activity_var_4', 'user_activity_var_5',
       'user_activity_var_6', 'user_activity_var_7', 'user_activity_var_8',
       'user_activity_var_9', 'user_activity_var_10', 'user_activity_var_11',
       'user_activity_var_12', 'buy'],
      dtype='object')

In [97]:
train.head()

Unnamed: 0,id,created_year,created_month,created_day,campaign_var_1,campaign_var_2,products_purchased,signup_year,signup_month,signup_day,...,user_activity_var_4,user_activity_var_5,user_activity_var_6,user_activity_var_7,user_activity_var_8,user_activity_var_9,user_activity_var_10,user_activity_var_11,user_activity_var_12,buy
0,1,2021,1,1,1,2,2.0,2020,9,24,...,0,0,0,1,1,0,0,0,0,0
1,2,2021,1,1,2,1,2.0,2020,9,19,...,0,0,0,1,0,0,0,0,0,0
2,3,2021,1,1,9,3,3.0,2021,8,11,...,0,0,0,0,0,0,0,0,0,0
3,4,2021,1,1,6,7,2.0,2017,10,4,...,0,0,0,0,0,0,0,0,0,0
4,5,2021,1,1,4,6,2.0,2020,6,8,...,0,0,0,1,0,0,0,1,0,0


### __DATASET IS CLEAN NOW__

In [98]:
# train test split

X = train.loc[:,train.columns != 'buy']
y = train.buy
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.20, random_state=42)

In [99]:
print('Train cases as below')
print('X_train shape: ',X_train.shape)
print('y_train shape: ',y_train.shape)
print('\nTest cases as below')
print('X_test shape: ',X_test.shape)
print('y_test shape: ',y_test.shape)


Train cases as below
X_train shape:  (31328, 22)
y_train shape:  (31328,)

Test cases as below
X_test shape:  (7833, 22)
y_test shape:  (7833,)


In [100]:
# Standard Scaling before modelling

from sklearn.preprocessing import StandardScaler
scaler = StandardScaler()

X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

In [105]:
models = [  LogisticRegression(),
            DecisionTreeClassifier(criterion='entropy', min_samples_leaf=22, min_samples_split=2, random_state=42, splitter='best'),
            RandomForestClassifier(criterion='entropy', max_features='auto', n_estimators=100, random_state=42),
            KNeighborsClassifier(n_neighbors=5, algorithm='auto', leaf_size=30, metric='manhattan', n_jobs=-1),
            GaussianNB(),
            GradientBoostingClassifier(n_estimators=200, criterion='friedman_mse', max_depth=8, learning_rate=0.1, random_state=42),
            XGBClassifier(base_score=0.5, booster='gbtree', learning_rate=0.1, max_depth=3, n_estimators=200, n_jobs=-1, objective='binary:logistic', random_state=42),
            SVC(decision_function_shape='ovr', degree=3, gamma='scale', kernel='rbf', random_state=42)
         ]

f1score = []
model_dict = {}
for model in models:
    model_dict[model.__class__.__name__] = model
    cv_results = cross_validate(model, X_train, y_train, cv=10, scoring='f1', return_train_score=True)
    f1score.append(np.mean(cv_results['test_score']))


In [106]:
f1score

[0.6567048375616109,
 0.6608054871025334,
 0.6722300209097416,
 0.6206838387465301,
 0.6084397340251034,
 0.6771780919087222,
 0.684501181367079,
 0.6745945356591905]

In [110]:
xgb = XGBClassifier(base_score=0.5, booster='gbtree', learning_rate=0.1, max_depth=3,
                    n_estimators=200, n_jobs=-1, objective='binary:logistic', random_state=42)
xgb.fit(X_train, y_train)
xgb_pred = xgb.predict(X_test)


In [111]:
xgb_f1 = f1_score(y_test, xgb_pred)*100
print('F1 Score is :', xgb_f1)


F1 Score is : 71.3224368499257


### __Predicting on Test Data__

In [112]:
df_test = test_df.copy()
df_test.head()

Unnamed: 0,id,created_at,campaign_var_1,campaign_var_2,products_purchased,signup_date,user_activity_var_1,user_activity_var_2,user_activity_var_3,user_activity_var_4,user_activity_var_5,user_activity_var_6,user_activity_var_7,user_activity_var_8,user_activity_var_9,user_activity_var_10,user_activity_var_11,user_activity_var_12
0,39162,2022-01-01,2,2,,2021-08-17,1,1,0,0,0,1,0,0,0,0,1,0
1,39163,2022-01-01,4,7,3.0,2020-05-21,1,0,0,0,0,0,0,1,0,0,0,0
2,39164,2022-01-01,8,7,,,0,0,0,0,1,1,0,0,0,0,0,0
3,39165,2022-01-01,9,8,2.0,2020-06-22,0,0,0,0,1,1,1,0,0,0,2,0
4,39166,2022-01-01,4,5,2.0,2021-03-10,1,0,0,0,0,0,0,0,0,0,0,0


In [113]:
# checking for missing data in the test dataset

df_test.isnull().sum()

id                         0
created_at                 0
campaign_var_1             0
campaign_var_2             0
products_purchased      8136
signup_date             6649
user_activity_var_1        0
user_activity_var_2        0
user_activity_var_3        0
user_activity_var_4        0
user_activity_var_5        0
user_activity_var_6        0
user_activity_var_7        0
user_activity_var_8        0
user_activity_var_9        0
user_activity_var_10       0
user_activity_var_11       0
user_activity_var_12       0
dtype: int64

In [114]:
df_test['products_purchased'].mode()
df_test['signup_date'].mode()

0    2022-03-06
dtype: object

In [115]:
df_test['products_purchased'].value_counts()

2.0    2484
3.0    1260
1.0    1106
4.0     198
Name: products_purchased, dtype: int64

In [116]:
# imputing "products_purchased" and "signup_date" with mode

df_test['products_purchased'] = df_test['products_purchased'].fillna(df_test['products_purchased'].mode()[0])
df_test['signup_date'] = df_test['signup_date'].fillna(df_test['signup_date'].mode()[0])

df_test.isnull().sum()

id                      0
created_at              0
campaign_var_1          0
campaign_var_2          0
products_purchased      0
signup_date             0
user_activity_var_1     0
user_activity_var_2     0
user_activity_var_3     0
user_activity_var_4     0
user_activity_var_5     0
user_activity_var_6     0
user_activity_var_7     0
user_activity_var_8     0
user_activity_var_9     0
user_activity_var_10    0
user_activity_var_11    0
user_activity_var_12    0
dtype: int64

In [117]:
df_test['products_purchased'].value_counts()

2.0    10620
3.0     1260
1.0     1106
4.0      198
Name: products_purchased, dtype: int64

In [118]:
df_test['created_year'] = pd.DatetimeIndex(df_test['created_at']).year
df_test['created_month'] = pd.DatetimeIndex(df_test['created_at']).month
df_test['created_day'] = pd.DatetimeIndex(df_test['created_at']).day

df_test['signup_year'] = pd.DatetimeIndex(df_test['signup_date']).year
df_test['signup_month'] = pd.DatetimeIndex(df_test['signup_date']).month
df_test['signup_day'] = pd.DatetimeIndex(df_test['signup_date']).day

df_test.head()

Unnamed: 0,id,created_at,campaign_var_1,campaign_var_2,products_purchased,signup_date,user_activity_var_1,user_activity_var_2,user_activity_var_3,user_activity_var_4,...,user_activity_var_9,user_activity_var_10,user_activity_var_11,user_activity_var_12,created_year,created_month,created_day,signup_year,signup_month,signup_day
0,39162,2022-01-01,2,2,2.0,2021-08-17,1,1,0,0,...,0,0,1,0,2022,1,1,2021,8,17
1,39163,2022-01-01,4,7,3.0,2020-05-21,1,0,0,0,...,0,0,0,0,2022,1,1,2020,5,21
2,39164,2022-01-01,8,7,2.0,2022-03-06,0,0,0,0,...,0,0,0,0,2022,1,1,2022,3,6
3,39165,2022-01-01,9,8,2.0,2020-06-22,0,0,0,0,...,0,0,2,0,2022,1,1,2020,6,22
4,39166,2022-01-01,4,5,2.0,2021-03-10,1,0,0,0,...,0,0,0,0,2022,1,1,2021,3,10


In [120]:
# re-arranging columns

test = df_test[['id', 'created_at', 'created_year', 'created_month', 'created_day', 'campaign_var_1', 'campaign_var_2', 'products_purchased',
                  'signup_date', 'signup_year', 'signup_month', 'signup_day', 'user_activity_var_1', 'user_activity_var_2', 'user_activity_var_3',
                  'user_activity_var_4', 'user_activity_var_5', 'user_activity_var_6', 'user_activity_var_7', 'user_activity_var_8',
                  'user_activity_var_9', 'user_activity_var_10', 'user_activity_var_11', 'user_activity_var_12']]

In [121]:
# drop columns 'created_at' and 'signup_date' since their data is extracted and they're not needed anymore

test.drop(['created_at', 'signup_date'], axis=1, inplace = True)

In [122]:
predictions = xgb.predict(test)
predictions_int = [int(round(value)) for value in predictions]


### __Submission__

In [124]:
submission_df = pd.read_csv("https://raw.githubusercontent.com/bhupeshmahara/smart-lead-scoring-engine/main/sample_submission.csv")
print(submission_df.shape)
submission_df.head()

(13184, 2)


Unnamed: 0,id,buy
0,39162,1
1,39163,1
2,39164,1
3,39165,1
4,39166,1


In [125]:
submission_df['buy'] = predictions_int
submission_df.head()

Unnamed: 0,id,buy
0,39162,0
1,39163,0
2,39164,0
3,39165,0
4,39166,0


In [127]:
submission_df.to_csv('submission.csv', index=False)

In [None]:
# train_df['products_purchased'].value_counts()
# train_df['products_purchased'] = np.where((train_df['buy'] == 1) & (train_df['products_purchased'].isnull()), train_df['products_purchased'].mode(), 0)
# train_df.isnull().sum()
# train_df['products_purchased'].value_counts()