In [1]:
import pandas as pd
from sklearn.metrics import (
    accuracy_score, confusion_matrix, classification_report
)
from sklearn.preprocessing import OrdinalEncoder, OneHotEncoder
from sklearn.preprocessing import StandardScaler

from sklearn.impute import KNNImputer
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer

In [2]:
df = pd.read_excel('dataset-new.xlsx')
df = df[list(df)[:31]]

In [3]:
df

Unnamed: 0,Corporate,Zone,Region,Branch,Branch Code,Lead ID,Lead Product,Lead Amount,Lead Rating,Lead Source,...,Date of Birth,Age,Gender,Purpose Category,Occupation,Marital Status,Qualification,Appointment Date,Assign To Name,Time In StatusCode
0,NORTH,DELHI,DELHI NORTH WEST,DELHI-MANGOLPUR KALN,1311,42238330,Gold Loan,9500,Hot,Walk-in,...,31048.0,36 Years,Female,Personal,House Wife,Married,Graduate,,PURNIMA,6 D
1,NORTH,NORTH,KARNAL,ROORKEE-(UT),2695,41260593,Gold Loan,30000,Warm,Walk-in,...,35626.0,23 Years,Male,Personal,Self Employed,Married,Graduate,,SINGH,31 D
2,SOUTH,KARNATAKA,MYSORE,BELUR - HASSAN DIST,4046,41260595,Gold Loan,14000,Hot,Marketing Activity,...,32143.0,33 Years,Male,Personal,Agriculture/Related Activities,Married,School,,N,31 D
3,NORTH,CENTRAL,NAGPUR,REWA-(MP),2919,42238329,Gold Loan,20000,Hot,Sales Activity,...,34201.0,27 Years,Male,Personal,Salaried,Single,Graduate,,RUBY,6 D
4,SOUTH,TELANGANA,CYBERABAD,HYDERABAD - ERRAGADDA,3703,42238322,Gold Loan,8400,Cold,Marketing Activity,...,33386.0,29 Years,Male,Personal,Salaried,Married,Graduate,,VINOD KUMAR,6 D
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
991,NORTH,WEST,NAVI MUMBAI,MUMBAI - THANE,1010,41776763,Gold Loan,2000,Hot,Digital,...,,,Male,Others,,,,,Shivani,2 D
992,NORTH,BHUBANESWAR,BHUBANESWAR,CUTTACK - MAHANADI VIHAR,3535,41256697,Gold Loan,2000,Hot,Digital,...,,,,Personal,,,,,SEN,31 D
993,NORTH,NORTH,KARNAL,PUNDRI - (HA),3288,41738700,Gold Loan,2000,Cold,Marketing Activity,...,,,,Business,,,,44086.513889,DEEPIKA,17 D
994,NORTH,BHUBANESWAR,VARANASI,SIDDHARTH NAGAR,5038,41727678,Gold Loan,50000,Warm,Call Centre,...,,,,Agriculture,,,,44098.500000,SHUKLA,8 D


In [4]:
list(df)

['Corporate',
 'Zone',
 'Region',
 'Branch',
 'Branch Code',
 'Lead ID',
 'Lead Product',
 'Lead Amount',
 'Lead Rating',
 'Lead Source',
 'Medium',
 'Lead Status Code(Text)',
 'Disbursed Amount',
 'Created By Code',
 'Lead Owner',
 'Lead Created On',
 'Lead Converted On',
 'Assign To Code',
 'Date of Purchase',
 'CIF No.',
 'Existing Customer',
 'Date of Birth',
 'Age',
 'Gender',
 'Purpose Category',
 'Occupation',
 'Marital Status',
 'Qualification',
 'Appointment Date',
 'Assign To Name',
 'Time In StatusCode']

In [5]:
# cat_column = ['Corporate',
#  'Zone',
#  'Lead Product',
#  'Lead Rating',
#  'Lead Source',
#  'Gender',
#  'Purpose Category',
#  'Occupation',
#  'Marital Status',
#  'Qualification','Region','Branch','Medium','Lead Owner']

In [6]:
df['Lead Status Code(Text)'] = df['Lead Status Code(Text)'].replace(['Disbursed', 'Rejected'],[1,0])

In [7]:
# df = pd.get_dummies(df, columns=col1)

In [8]:
# for col in ('Region','Branch','Medium','Lead Owner'):
#     df[col] = df[col].replace(df[col].unique(),list(range(len(df[col].unique()))))

# df['Lead Status Code(Text)'] = df['Lead Status Code(Text)'].replace(['Disbursed', 'Rejected'],[1,0])

In [9]:
df.isna().sum()

Corporate                   0
Zone                        0
Region                      0
Branch                      0
Branch Code                 0
Lead ID                     0
Lead Product                0
Lead Amount                 0
Lead Rating                 0
Lead Source                 0
Medium                      0
Lead Status Code(Text)      0
Disbursed Amount          497
Created By Code             0
Lead Owner                  0
Lead Created On             0
Lead Converted On         521
Assign To Code              0
Date of Purchase          520
CIF No.                     0
Existing Customer           0
Date of Birth             442
Age                       442
Gender                    339
Purpose Category            1
Occupation                447
Marital Status            498
Qualification             465
Appointment Date          669
Assign To Name              0
Time In StatusCode         16
dtype: int64

In [10]:
col_with_less_null = set(df.isna().sum()[df.isna().sum()<len(df)*0.1].index)

In [11]:
df = df[col_with_less_null]

In [12]:
cat_column = ['Branch',
 'Corporate',
 'Lead Product',
 'Lead Rating',
 'Lead Source',
 'Medium',
 'Purpose Category',
 'Region',
 'Zone']

In [13]:
column_to_del = ['Assign To Name', 'Assign To Code', 'Created By Code', 
                 'Existing Customer','CIF No.','Lead Created On','Lead ID','Branch Code',
                 'Lead Owner','Time In StatusCode']

In [14]:
imp_columns = set(list(df)) - set(column_to_del)

In [15]:
imp_columns

{'Branch',
 'Corporate',
 'Lead Amount',
 'Lead Product',
 'Lead Rating',
 'Lead Source',
 'Lead Status Code(Text)',
 'Medium',
 'Purpose Category',
 'Region',
 'Zone'}

In [16]:
df = df[imp_columns]

In [17]:
pd.set_option('display.max_rows', 500)
pd.set_option('display.max_columns', 500)
pd.set_option('display.width', 1000)

In [18]:
# df.select_dtypes(include=['object']).columns

In [19]:
X = df.drop('Lead Status Code(Text)',axis=1)
y = df['Lead Status Code(Text)']

In [20]:
all_cols = list(df)
all_cols.remove('Lead Status Code(Text)')
num_col = list(set(all_cols)-set(df.select_dtypes(include=['object']).columns))

In [21]:
num_col

['Lead Amount']

In [22]:
cat_column

['Branch',
 'Corporate',
 'Lead Product',
 'Lead Rating',
 'Lead Source',
 'Medium',
 'Purpose Category',
 'Region',
 'Zone']

In [23]:
num_pipeline = Pipeline([
        ('imputer', KNNImputer(n_neighbors=5)),
        ('std_scaler', StandardScaler()),
    ])

num_attribs = num_col #list(housing_num)
cat_attribs = cat_column

# will be applying tranasformer object to the given subset of data
# NOTE: e.g., the "num_pipeline" transformer object will be applied on the given set of columns
full_pipeline = ColumnTransformer([
        ("num", num_pipeline, num_attribs),
        ("cat", OneHotEncoder(), cat_attribs),
    ])

 # sending the data through a chain of transformations
 # NOTE: the final dataset is a 2D numpy array containg all numerical data

In [24]:
X = full_pipeline.fit_transform(X)

In [25]:
# Splitting the dataset into the Training set and Test set
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.25, random_state = 42)

In [26]:
X_train.shape , X_test.shape

((747, 804), (249, 804))

# Logistic regression

In [27]:
# Fitting Logistic Regression to the Training set
from sklearn.linear_model import LogisticRegression
classifier = LogisticRegression(random_state = 0)
classifier.fit(X_train, y_train)

LogisticRegression(random_state=0)

In [28]:
y_pred = classifier.predict(X_test)

In [29]:
print(classification_report(y_test,y_pred))

              precision    recall  f1-score   support

           0       0.92      0.90      0.91       120
           1       0.91      0.92      0.92       129

    accuracy                           0.91       249
   macro avg       0.91      0.91      0.91       249
weighted avg       0.91      0.91      0.91       249



In [30]:
pd.DataFrame({'Y_True':y_test,'Y_Pred':y_pred})

Unnamed: 0,Y_True,Y_Pred
832,0,0
970,0,0
96,1,1
587,0,0
450,1,1
266,1,1
290,1,1
158,1,1
668,0,0
572,0,0


In [31]:
print(round(accuracy_score(y_test,y_pred)*100,2))

91.16


# Decision Tree

In [32]:
from sklearn.tree import DecisionTreeClassifier
classifier = DecisionTreeClassifier(criterion = 'entropy', random_state = 0)
classifier.fit(X_train, y_train)

DecisionTreeClassifier(criterion='entropy', random_state=0)

In [33]:
y_pred = classifier.predict(X_test)

In [34]:
print(accuracy_score(y_test,y_pred)*100)

91.96787148594377


In [35]:
accuracy_score(y_test,y_pred)*100

91.96787148594377

# Random forest

In [36]:
# Fitting Random Forest Classification to the Training set
from sklearn.ensemble import RandomForestClassifier
classifier = RandomForestClassifier(n_estimators = 10, criterion = 'entropy', random_state = 0)
classifier.fit(X_train, y_train)

RandomForestClassifier(criterion='entropy', n_estimators=10, random_state=0)

In [37]:
y_pred = classifier.predict(X_test)
print(accuracy_score(y_test,y_pred)*100)

92.3694779116466


# correlation

In [38]:
import numpy as np

In [39]:
df.corr().unstack().sort_values(ascending=False).drop_duplicates()

Lead Amount  Lead Amount               1.000000
             Lead Status Code(Text)    0.183083
dtype: float64

# columns required for prediction

In [40]:
# list(df)

imp_col = ['Branch',
 'Lead Product',
 'Purpose Category',
 'Region',
 'Medium',
 'Lead Source',
 'Lead Rating',
 'Zone',
 'Corporate',
 'Lead Amount']

In [41]:
X_for_prediction = df[imp_col].iloc[:2]
X_for_prediction

Unnamed: 0,Branch,Lead Product,Purpose Category,Region,Medium,Lead Source,Lead Rating,Zone,Corporate,Lead Amount
0,DELHI-MANGOLPUR KALN,Gold Loan,Personal,DELHI NORTH WEST,Bit Notices,Walk-in,Hot,DELHI,NORTH,9500
1,ROORKEE-(UT),Gold Loan,Personal,KARNAL,Bit Notices,Walk-in,Warm,NORTH,NORTH,30000


In [42]:
X_processed = full_pipeline.transform(X_for_prediction)

In [43]:
X_processed

<2x804 sparse matrix of type '<class 'numpy.float64'>'
	with 20 stored elements in Compressed Sparse Row format>

In [44]:
# data = np.zeros((len(X_for_prediction),X_train.shape[1]))

In [45]:
# data[:,:X_processed.shape[1]] = X_processed

In [46]:
# data

In [47]:
y_pred = classifier.predict(X_processed)

In [48]:
X_for_prediction

Unnamed: 0,Branch,Lead Product,Purpose Category,Region,Medium,Lead Source,Lead Rating,Zone,Corporate,Lead Amount
0,DELHI-MANGOLPUR KALN,Gold Loan,Personal,DELHI NORTH WEST,Bit Notices,Walk-in,Hot,DELHI,NORTH,9500
1,ROORKEE-(UT),Gold Loan,Personal,KARNAL,Bit Notices,Walk-in,Warm,NORTH,NORTH,30000


In [49]:
X_for_prediction['Predicted_Result'] = y_pred

In [50]:
X_for_prediction

Unnamed: 0,Branch,Lead Product,Purpose Category,Region,Medium,Lead Source,Lead Rating,Zone,Corporate,Lead Amount,Predicted_Result
0,DELHI-MANGOLPUR KALN,Gold Loan,Personal,DELHI NORTH WEST,Bit Notices,Walk-in,Hot,DELHI,NORTH,9500,1
1,ROORKEE-(UT),Gold Loan,Personal,KARNAL,Bit Notices,Walk-in,Warm,NORTH,NORTH,30000,1
