In [1]:
import pandas as pd
import numpy as np
pd.set_option('display.max_rows', 500)
pd.set_option('display.max_columns', 500)
pd.set_option('display.width', 1000)

In [2]:
df = pd.read_excel('New-dataset.xlsx')

In [3]:
col = ['Created Month','Date of Purchase','Lead ID','Lead Converted On','Lead Created On','Converted Month',
'Converted Year','Date of Birth','CIF No','Assign To Code','Appointment Date','Created by Code']

In [4]:
data = df.copy()

In [5]:
df.drop(col,axis=1,inplace=True)

In [6]:
df.drop(['Purpose of Category','Disbursed Amount','Converted Month No'],axis=1,inplace=True)

In [7]:
df.drop(['Existing Customer','Created Month No','Created Year'],axis=1,inplace=True)

In [8]:
df.nunique()

Qualification             5
Corporate                 2
Lead Rating               4
Lead Product             20
Age                      77
Gender                    2
Branch Code            4154
Time In Status Code       4
Zone                     13
Lead Source              14
Medium                  112
Region Code              80
Lead Owner             3939
Occupation               19
Branch                 4155
Marital Status            2
Lead Status Code         16
Lead Amount            2000
dtype: int64

In [9]:
import pandas as pd
from sklearn.metrics import (
    accuracy_score, confusion_matrix, classification_report
)
from sklearn.preprocessing import OrdinalEncoder, OneHotEncoder
from sklearn.preprocessing import StandardScaler

from sklearn.impute import KNNImputer
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer

In [10]:
df.head().T

Unnamed: 0,0,1,2,3,4
Qualification,Graduate,Graduate,Graduate,Graduate,Graduate
Corporate,NORTH,NORTH,SOUTH,SOUTH,SOUTH
Lead Rating,Hot,Hot,Cold,Hot,Hot
Lead Product,Mutual Fund,Mutual Fund,Mutual Fund,Mutual Fund,Mutual Fund
Age,44.0,31.0,36.0,29.0,30.0
Gender,Male,Female,Female,Male,Male
Branch Code,328,4239,2988,4887,467
Time In Status Code,85,84,87,87,87
Zone,DELHI,WEST,TELANGANA-ANDHRA,KARNATAKA,TAMILNADU SOUTH
Lead Source,Walk-in,Walk-in,Walk-in,Marketing Activity,Marketing Activity


In [11]:
df.dtypes

Qualification           object
Corporate               object
Lead Rating             object
Lead Product            object
Age                    float64
Gender                  object
Branch Code              int64
Time In Status Code      int64
Zone                    object
Lead Source             object
Medium                  object
Region Code             object
Lead Owner              object
Occupation              object
Branch                  object
Marital Status          object
Lead Status Code        object
Lead Amount            float64
dtype: object

In [12]:
df.drop(['Branch','Branch Code','Lead Owner'],axis=1,inplace=True)

In [13]:
df.head()

Unnamed: 0,Qualification,Corporate,Lead Rating,Lead Product,Age,Gender,Time In Status Code,Zone,Lead Source,Medium,Region Code,Occupation,Marital Status,Lead Status Code,Lead Amount
0,Graduate,NORTH,Hot,Mutual Fund,44.0,Male,85,DELHI,Walk-in,Bit Notices,DELHI EAST,Business,Married,Disbursed,5000.0
1,Graduate,NORTH,Hot,Mutual Fund,31.0,Female,84,WEST,Walk-in,Banner,MUMBAI MAIN,Business,Married,Disbursed,2000.0
2,Graduate,SOUTH,Cold,Mutual Fund,36.0,Female,87,TELANGANA-ANDHRA,Walk-in,Bit Notices,HYDERABAD,House Wife,Married,Disbursed,2000.0
3,Graduate,SOUTH,Hot,Mutual Fund,29.0,Male,87,KARNATAKA,Marketing Activity,Branch Activation,MANGALORE,Salaried,Married,Disbursed,2000.0
4,Graduate,SOUTH,Hot,Mutual Fund,30.0,Male,87,TAMILNADU SOUTH,Marketing Activity,Bit Notices,MADURAI,Salaried,Married,Disbursed,2000.0


In [14]:
df['Time In Status Code'] = df['Time In Status Code'].apply(lambda x:str(x))

In [15]:
cat_col = set(list(df.select_dtypes(include=['object']))) - set(['Lead Status Code'])

In [16]:
num_col = ['Age','Time In Status Code','Lead Amount']

In [17]:
df['Lead Status Code'] = np.where(df['Lead Status Code']=='Disbursed',1,0)

In [18]:
num_pipeline = Pipeline([
        ('imputer', KNNImputer(n_neighbors=5)),
        ('std_scaler', StandardScaler()),
    ])

num_attribs = num_col #list(housing_num)
cat_attribs = list(cat_col)

# will be applying tranasformer object to the given subset of data
# NOTE: e.g., the "num_pipeline" transformer object will be applied on the given set of columns
full_pipeline = ColumnTransformer([
        ("num", num_pipeline, num_attribs),
        ("cat", OneHotEncoder(), cat_attribs),
    ])

 # sending the data through a chain of transformations
 # NOTE: the final dataset is a 2D numpy array containg all numerical data

In [19]:
X = df.drop('Lead Status Code',axis=1)
y = df['Lead Status Code']

In [20]:
X = full_pipeline.fit_transform(X)

In [21]:
# Splitting the dataset into the Training set and Test set
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.25, random_state = 42)

In [22]:
X_train.shape , X_test.shape

((23889, 280), (7964, 280))

In [23]:
# Fitting Logistic Regression to the Training set
from sklearn.linear_model import LogisticRegression
lr = LogisticRegression(max_iter=1000,random_state = 42)
lr.fit(X_train, y_train)

LogisticRegression(max_iter=1000, random_state=42)

In [24]:
prob = lr.predict_proba(X_test)

In [25]:
lr.classes_

array([0, 1])

In [26]:
# probability
prob[:,1]

array([0.85551107, 0.95900624, 0.94331507, ..., 0.78841173, 0.83173718,
       0.98465141])

In [27]:
# column needed for prediction
imp_col = ['Qualification', 'Corporate', 'Lead Rating', 'Lead Product', 'Age', 
           'Gender', 'Time In Status Code', 'Zone', 'Lead Source', 'Medium', 
           'Region Code', 'Occupation', 'Marital Status', 'Lead Amount']

In [28]:
X_for_prediction = df[imp_col].iloc[:2]
X_for_prediction

Unnamed: 0,Qualification,Corporate,Lead Rating,Lead Product,Age,Gender,Time In Status Code,Zone,Lead Source,Medium,Region Code,Occupation,Marital Status,Lead Amount
0,Graduate,NORTH,Hot,Mutual Fund,44.0,Male,85,DELHI,Walk-in,Bit Notices,DELHI EAST,Business,Married,5000.0
1,Graduate,NORTH,Hot,Mutual Fund,31.0,Female,84,WEST,Walk-in,Banner,MUMBAI MAIN,Business,Married,2000.0


In [29]:
X_processed = full_pipeline.transform(X_for_prediction)

In [30]:
prob = lr.predict_proba(X_processed)
X_for_prediction['Probability'] = prob[:,1]

In [31]:
X_for_prediction

Unnamed: 0,Qualification,Corporate,Lead Rating,Lead Product,Age,Gender,Time In Status Code,Zone,Lead Source,Medium,Region Code,Occupation,Marital Status,Lead Amount,Probability
0,Graduate,NORTH,Hot,Mutual Fund,44.0,Male,85,DELHI,Walk-in,Bit Notices,DELHI EAST,Business,Married,5000.0,0.958491
1,Graduate,NORTH,Hot,Mutual Fund,31.0,Female,84,WEST,Walk-in,Banner,MUMBAI MAIN,Business,Married,2000.0,0.94529


In [32]:
X_for_prediction.to_csv('probability.csv')