# Marketing Leads Conversion - Data Sanity Checks
- Join Data
- Identify Data Types - Continuous, Categorical, Date
    - Convert to right Data Type
- Identify Missing Values
- Impute Missing Values
    - Continuous
    - Categorical
- Create Features
    - Lead_Creation_Date
    - Existing EMI Percentage

## 1/ Import Libraries

In [1]:
# pandas, numpy
import pandas as pd
import numpy as np

## 2/ Import Data
- t1_leads
- t6_emi
- test

**Extra:** Join other tables to build a more comprehensive training data

In [2]:
t1_leads = pd.read_csv('t1_leads.csv', low_memory=False)

t2_city = pd.read_csv('t2_city.csv', low_memory=False)
t3_employer = pd.read_csv('t3_employer.csv', low_memory=False)
t4_bank = pd.read_csv('t4_bank.csv', low_memory=False)
t5_source = pd.read_csv('t5_source.csv', low_memory=False)
t6_emi = pd.read_csv('t6_emi.csv', low_memory=False)

#Test Data - Final Predictions
test = pd.read_csv('test.csv', low_memory=False)

### Check first 5 rows

In [3]:
t1_leads.head()

Unnamed: 0,ID,Gender,DOB,Lead_Creation_Date,City_Code,Employer_Code,Monthly_Income,Customer_Existing_Primary_Bank_Code,Contacted,Source,Existing_EMI,Loan_Amount,Loan_Period,Var1,Approved
0,APPC90493171225,Female,23-07-1979,15-07-2016,C10001,COM0044082,2000.0,B001,N,S122,0.0,,,0,0
1,APPD40611263344,Male,07-12-1986,04-07-2016,C10003,COM0000002,3500.0,B002,Y,S122,0.0,20000.0,2.0,10,0
2,APPE70289249423,Male,10-12-1982,19-07-2016,C10125,COM0005267,2250.0,B003,Y,S143,0.0,45000.0,4.0,0,0
3,APPF80273865537,Male,30-01-1989,09-07-2016,C10477,COM0004143,3500.0,B003,Y,S143,0.0,92000.0,5.0,7,0
4,APPG60994436641,Male,19-04-1985,20-07-2016,C10002,COM0001781,10000.0,B001,Y,S134,2500.0,50000.0,2.0,10,0


In [4]:
#t3_employer
t3_employer.head()

Unnamed: 0,Employer_Code,Employer_Category1,Employer_Category2
0,COM0044082,A,4.0
1,COM0000002,C,1.0
2,COM0005267,C,4.0
3,COM0004143,A,4.0
4,COM0001781,A,4.0


In [5]:
#test
test.head()

Unnamed: 0,ID,Gender,DOB,Lead_Creation_Date,City_Code,City_Category,Employer_Code,Employer_Category1,Employer_Category2,Monthly_Income,...,Primary_Bank_Type,Contacted,Source,Source_Category,Existing_EMI,Loan_Amount,Loan_Period,Interest_Rate,EMI,Var1
0,APPA70109647212,Male,03/06/88,05/07/16,C10028,C,COM0002222,A,4.0,2150.0,...,P,Y,S122,B,0.0,10000.0,3.0,20.0,372.0,4
1,APPB10687939341,Male,13/07/81,01/07/16,C10003,A,COM0001784,C,4.0,4200.0,...,P,Y,S133,C,0.0,69000.0,5.0,24.0,1985.0,7
2,APPC80449411414,Female,19/11/90,01/07/16,C10009,B,COM0045260,B,4.0,1000.0,...,P,N,S133,B,0.0,,,,,0
3,APPD30665094501,Female,15/10/92,01/07/16,C10005,A,COM0000085,A,3.0,1465.0,...,P,N,S133,C,0.0,,,,,0
4,APPE80379821637,Male,21/09/88,01/07/16,C10005,A,COM0006422,A,4.0,2340.0,...,P,Y,S143,B,500.0,10000.0,2.0,,,0


## 3/ Join/Merge Data

In [6]:
# Which join should you use?
# 70000 Applicants - Only 22277 have EMI and Interest Rate?
# Left - t1_leads
# Right - t2_city
train = pd.merge(t1_leads, t2_city, how = 'left',on = 'City_Code')

In [7]:
# Join Employer - t3_employer
train = pd.merge(train, t3_employer, how = 'left',on = 'Employer_Code')

In [8]:
# Join Bank - t4_bank
train = pd.merge(train, t4_bank, how = 'left',on = 'Customer_Existing_Primary_Bank_Code')

In [9]:
# Join Bank - t5_source
train = pd.merge(train, t5_source, how = 'left',on = 'Source')

In [10]:
# Join Bank - t6_emi
train = pd.merge(train, t6_emi, how = 'left',on = 'ID')

### Check Merged Data

In [11]:
train.head()

Unnamed: 0,ID,Gender,DOB,Lead_Creation_Date,City_Code,Employer_Code,Monthly_Income,Customer_Existing_Primary_Bank_Code,Contacted,Source,...,Loan_Period,Var1,Approved,City_Category,Employer_Category1,Employer_Category2,Primary_Bank_Type,Source_Category,Interest_Rate,EMI
0,APPC90493171225,Female,23-07-1979,15-07-2016,C10001,COM0044082,2000.0,B001,N,S122,...,,0,0,A,A,4.0,P,G,,
1,APPC90493171225,Female,23-07-1979,15-07-2016,C10001,COM0044082,2000.0,B001,N,S122,...,,0,0,A,A,4.0,P,C,,
2,APPC90493171225,Female,23-07-1979,15-07-2016,C10001,COM0044082,2000.0,B001,N,S122,...,,0,0,A,A,4.0,P,B,,
3,APPC90493171225,Female,23-07-1979,15-07-2016,C10001,COM0044082,2000.0,B001,N,S122,...,,0,0,A,A,4.0,P,E,,
4,APPC90493171225,Female,23-07-1979,15-07-2016,C10001,COM0044082,2000.0,B001,N,S122,...,,0,0,A,A,4.0,P,F,,


In [12]:
train.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 319900 entries, 0 to 319899
Data columns (total 22 columns):
 #   Column                               Non-Null Count   Dtype  
---  ------                               --------------   -----  
 0   ID                                   319900 non-null  object 
 1   Gender                               319900 non-null  object 
 2   DOB                                  319810 non-null  object 
 3   Lead_Creation_Date                   319900 non-null  object 
 4   City_Code                            315016 non-null  object 
 5   Employer_Code                        296373 non-null  object 
 6   Monthly_Income                       319900 non-null  float64
 7   Customer_Existing_Primary_Bank_Code  271208 non-null  object 
 8   Contacted                            319900 non-null  object 
 9   Source                               319900 non-null  object 
 10  Existing_EMI                         319596 non-null  float64
 11  Loan_Amount  

## 4/ Convert String to Date Format

### Convert: Lead_Creation_Date

In [13]:
# Train
train['Lead_Creation_Date'] = pd.to_datetime(train['Lead_Creation_Date'], infer_datetime_format = True)

# Test (need to do the same for Test too)
test['Lead_Creation_Date'] = pd.to_datetime(test['Lead_Creation_Date'], infer_datetime_format = True)

In [14]:
train.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 319900 entries, 0 to 319899
Data columns (total 22 columns):
 #   Column                               Non-Null Count   Dtype         
---  ------                               --------------   -----         
 0   ID                                   319900 non-null  object        
 1   Gender                               319900 non-null  object        
 2   DOB                                  319810 non-null  object        
 3   Lead_Creation_Date                   319900 non-null  datetime64[ns]
 4   City_Code                            315016 non-null  object        
 5   Employer_Code                        296373 non-null  object        
 6   Monthly_Income                       319900 non-null  float64       
 7   Customer_Existing_Primary_Bank_Code  271208 non-null  object        
 8   Contacted                            319900 non-null  object        
 9   Source                               319900 non-null  object        
 

### Convert: DOB

In [15]:
# Train
train['DOB'] = pd.to_datetime(train['DOB'], infer_datetime_format = True)

# Test
test['DOB'] = pd.to_datetime(test['DOB'], infer_datetime_format = True)

In [16]:
train.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 319900 entries, 0 to 319899
Data columns (total 22 columns):
 #   Column                               Non-Null Count   Dtype         
---  ------                               --------------   -----         
 0   ID                                   319900 non-null  object        
 1   Gender                               319900 non-null  object        
 2   DOB                                  319810 non-null  datetime64[ns]
 3   Lead_Creation_Date                   319900 non-null  datetime64[ns]
 4   City_Code                            315016 non-null  object        
 5   Employer_Code                        296373 non-null  object        
 6   Monthly_Income                       319900 non-null  float64       
 7   Customer_Existing_Primary_Bank_Code  271208 non-null  object        
 8   Contacted                            319900 non-null  object        
 9   Source                               319900 non-null  object        
 

## 5/ Identify Missing Values

### Option 1: info

In [17]:
train.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 319900 entries, 0 to 319899
Data columns (total 22 columns):
 #   Column                               Non-Null Count   Dtype         
---  ------                               --------------   -----         
 0   ID                                   319900 non-null  object        
 1   Gender                               319900 non-null  object        
 2   DOB                                  319810 non-null  datetime64[ns]
 3   Lead_Creation_Date                   319900 non-null  datetime64[ns]
 4   City_Code                            315016 non-null  object        
 5   Employer_Code                        296373 non-null  object        
 6   Monthly_Income                       319900 non-null  float64       
 7   Customer_Existing_Primary_Bank_Code  271208 non-null  object        
 8   Contacted                            319900 non-null  object        
 9   Source                               319900 non-null  object        
 

### Option 2: isnull + sum

In [18]:
train.isnull().sum()

ID                                          0
Gender                                      0
DOB                                        90
Lead_Creation_Date                          0
City_Code                                4884
Employer_Code                           23527
Monthly_Income                              0
Customer_Existing_Primary_Bank_Code     48692
Contacted                                   0
Source                                      0
Existing_EMI                              304
Loan_Amount                            130885
Loan_Period                            130885
Var1                                        0
Approved                                    0
City_Category                            4884
Employer_Category1                      23527
Employer_Category2                      24912
Primary_Bank_Type                       48692
Source_Category                             0
Interest_Rate                          220786
EMI                               

### Option 3: Percentage Missing

In [19]:
train.isnull().sum()/len(train)

ID                                     0.000000
Gender                                 0.000000
DOB                                    0.000281
Lead_Creation_Date                     0.000000
City_Code                              0.015267
Employer_Code                          0.073545
Monthly_Income                         0.000000
Customer_Existing_Primary_Bank_Code    0.152210
Contacted                              0.000000
Source                                 0.000000
Existing_EMI                           0.000950
Loan_Amount                            0.409143
Loan_Period                            0.409143
Var1                                   0.000000
Approved                               0.000000
City_Category                          0.015267
Employer_Category1                     0.073545
Employer_Category2                     0.077874
Primary_Bank_Type                      0.152210
Source_Category                        0.000000
Interest_Rate                          0

## **Additional : Export Missing Value Perc Data
- Convert Series (Index + Values) to DataFrame
- Export Table (use name - missing_value_summary.csv)

In [20]:
# Create Series of Percentage Missing
missing_value_percentage = (train.isnull().sum()/len(train))

In [21]:
missing_value_percentage.values

array([0.00000000e+00, 0.00000000e+00, 2.81337918e-04, 0.00000000e+00,
       1.52672710e-02, 7.35448578e-02, 0.00000000e+00, 1.52210066e-01,
       0.00000000e+00, 0.00000000e+00, 9.50296968e-04, 4.09143482e-01,
       4.09143482e-01, 0.00000000e+00, 0.00000000e+00, 1.52672710e-02,
       7.35448578e-02, 7.78743357e-02, 1.52210066e-01, 0.00000000e+00,
       6.90171929e-01, 6.90171929e-01])

In [22]:
# Check Data Type
type(missing_value_percentage)

pandas.core.series.Series

In [23]:
# Create DataFrame
missing_value_summary = pd.DataFrame({
   'Feature_Name' : missing_value_percentage.index,
    'Missing_Percentage' : missing_value_percentage.values
})

In [24]:
#missing_value_summary

In [25]:
#Export DataFrame
missing_value_summary.to_csv('missing_value_summary.csv', index = False)

## 6/ Impute Missing Values

### 1/ Continuous - Indentify and Impute

In [26]:
numerical_features = [feature for feature in train.columns if train[feature].dtypes!='O' and feature !='ID' and feature !='Approved']
numerical_features

['DOB',
 'Lead_Creation_Date',
 'Monthly_Income',
 'Existing_EMI',
 'Loan_Amount',
 'Loan_Period',
 'Var1',
 'Employer_Category2',
 'Interest_Rate',
 'EMI']

In [27]:
# Check Missing
train[numerical_features].isnull().sum()

DOB                       90
Lead_Creation_Date         0
Monthly_Income             0
Existing_EMI             304
Loan_Amount           130885
Loan_Period           130885
Var1                       0
Employer_Category2     24912
Interest_Rate         220786
EMI                   220786
dtype: int64

#### Option 1: fillna - default options
- bfill
- ffill

In [28]:
train['Monthly_Income'].fillna(method = 'bfill').isnull().sum()
train['Existing_EMI'].fillna(method = 'bfill').isnull().sum()
train['Loan_Amount'].fillna(method = 'bfill').isnull().sum()
train['Loan_Period'].fillna(method = 'bfill').isnull().sum()
train['Var1'].fillna(method = 'bfill').isnull().sum()

0

#### Option 2: fillna with mean, median, mode

In [29]:
train['Monthly_Income'].fillna(train['Monthly_Income'].median())
train['Existing_EMI'].fillna(train['Existing_EMI'].median())
train['Loan_Amount'].fillna(train['Loan_Amount'].median())
train['Loan_Period'].fillna(train['Loan_Period'].median())
train['Var1'].fillna(train['Var1'].median())

0          0
1          0
2          0
3          0
4          0
          ..
319895    10
319896    10
319897    10
319898    10
319899    10
Name: Var1, Length: 319900, dtype: int64

#### Commit Changes to Missing Value Column

In [30]:
train['Monthly_Income'] = train['Monthly_Income'].fillna(train['Monthly_Income'].median())
train['Existing_EMI'] = train['Existing_EMI'].fillna(train['Existing_EMI'].median())
train['Loan_Amount'] = train['Loan_Amount'].fillna(train['Loan_Amount'].median())
train['Loan_Period'] = train['Loan_Period'].fillna(train['Loan_Period'].median())
train['Var1'] = train['Var1'].fillna(train['Var1'].median())
train['Employer_Category2'] = train['Employer_Category2'].fillna(train['Var1'].median())
train['Interest_Rate'] = train['Interest_Rate'].fillna(train['Interest_Rate'].median())
train['EMI'] = train['EMI'].fillna(train['EMI'].median())

#### Check Again

In [31]:
train.isnull().sum()

ID                                         0
Gender                                     0
DOB                                       90
Lead_Creation_Date                         0
City_Code                               4884
Employer_Code                          23527
Monthly_Income                             0
Customer_Existing_Primary_Bank_Code    48692
Contacted                                  0
Source                                     0
Existing_EMI                               0
Loan_Amount                                0
Loan_Period                                0
Var1                                       0
Approved                                   0
City_Category                           4884
Employer_Category1                     23527
Employer_Category2                         0
Primary_Bank_Type                      48692
Source_Category                            0
Interest_Rate                              0
EMI                                        0
dtype: int

### 3/ Categorical - Identify and Impute

In [32]:
categorical_features = [feature for feature in train.columns if train[feature].dtypes=='O' and feature !='ID' and feature !='Approved']
categorical_features

['Gender',
 'City_Code',
 'Employer_Code',
 'Customer_Existing_Primary_Bank_Code',
 'Contacted',
 'Source',
 'City_Category',
 'Employer_Category1',
 'Primary_Bank_Type',
 'Source_Category']

In [33]:
# Check Missing
train[categorical_features].isnull().sum()

Gender                                     0
City_Code                               4884
Employer_Code                          23527
Customer_Existing_Primary_Bank_Code    48692
Contacted                                  0
Source                                     0
City_Category                           4884
Employer_Category1                     23527
Primary_Bank_Type                      48692
Source_Category                            0
dtype: int64

#### Option 1 - fillna (missing)

In [34]:
train['DOB'].fillna("missing")
train['City_Code'].fillna("missing")
train['Employer_Code'].fillna("missing")
train['Customer_Existing_Primary_Bank_Code'].fillna("missing")
train['City_Category'].fillna("missing")
train['Employer_Category1'].fillna("missing")
train['Primary_Bank_Type'].fillna("missing")

0               P
1               P
2               P
3               P
4               P
           ...   
319895    missing
319896    missing
319897    missing
319898    missing
319899    missing
Name: Primary_Bank_Type, Length: 319900, dtype: object

#### Option 2 - fillna (mode)

In [35]:
train['City_Category'].mode()[0]

'A'

In [36]:
train['City_Category'] = train['City_Category'].fillna(train['City_Category'].mode()[0])

#### Commit Changes

In [37]:
train['DOB'] = train['DOB'].fillna("missing")
train['City_Code'] = train['City_Code'].fillna("missing")
train['Employer_Code'] = train['Employer_Code'].fillna("missing")
train['Customer_Existing_Primary_Bank_Code'] = train['Customer_Existing_Primary_Bank_Code'].fillna("missing")
train['City_Category'] = train['City_Category'].fillna("missing")
train['Employer_Category1'] = train['Employer_Category1'].fillna("missing")
train['Primary_Bank_Type'] = train['Primary_Bank_Type'].fillna("missing")

In [38]:
train.isnull().sum()

ID                                     0
Gender                                 0
DOB                                    0
Lead_Creation_Date                     0
City_Code                              0
Employer_Code                          0
Monthly_Income                         0
Customer_Existing_Primary_Bank_Code    0
Contacted                              0
Source                                 0
Existing_EMI                           0
Loan_Amount                            0
Loan_Period                            0
Var1                                   0
Approved                               0
City_Category                          0
Employer_Category1                     0
Employer_Category2                     0
Primary_Bank_Type                      0
Source_Category                        0
Interest_Rate                          0
EMI                                    0
dtype: int64

## 7/ Build Model

## Prepare Data

In [39]:
#create X and y datasets for splitting 
X = train.drop(['ID', 'Approved','DOB', 'Lead_Creation_Date', 'City_Code', 'Employer_Code', 'Customer_Existing_Primary_Bank_Code', 'Source'], axis=1)
y = train['Approved']

## Create Dummy

In [40]:
X = pd.get_dummies(X, drop_first=True)

## Import LR Package and create Instance

In [41]:
from sklearn.linear_model import LogisticRegression
lr = LogisticRegression()

## Build Model

In [42]:
lr.fit(X,y)

LogisticRegression()

### Check Score

In [43]:
lr.score(X, y)

0.9846201938105658