# Marketing Leads Conversion - Missing Data
- Identify Missing Values
- Impute Missing Values
    - Continuous
    - Categorical

## 1/ Import Libraries

In [6]:
# pandas, numpy
import pandas as pd
import numpy as np

## 2/ Import Data
- t1_leads
- t6_emi
- test

**Extra:** Join other tables to build a more comprehensive training data

In [2]:
# Train
train = pd.read_csv('data/train.csv', low_memory=False)

#Test Data - Final Predictions
test = pd.read_csv('data/test.csv', low_memory=False)

### Check first 5 rows

In [7]:
train.head()

Unnamed: 0,ID,Gender,DOB,Lead_Creation_Date,City_Code,City_Category,Employer_Code,Employer_Category1,Employer_Category2,Monthly_Income,...,Contacted,Source,Source_Category,Existing_EMI,Loan_Amount,Loan_Period,Interest_Rate,EMI,Var1,Approved
0,APPC90493171225,Female,1979-07-23,2016-07-15,C10001,A,COM0044082,A,4.0,2000.0,...,N,S122,G,0.0,,,,,0,0
1,APPD40611263344,Male,1986-07-12,2016-04-07,C10003,A,COM0000002,C,1.0,3500.0,...,Y,S122,G,0.0,20000.0,2.0,13.25,953.0,10,0
2,APPE70289249423,Male,1982-10-12,2016-07-19,C10125,C,COM0005267,C,4.0,2250.0,...,Y,S143,B,0.0,45000.0,4.0,,,0,0
3,APPF80273865537,Male,1989-01-30,2016-09-07,C10477,C,COM0004143,A,4.0,3500.0,...,Y,S143,B,0.0,92000.0,5.0,,,7,0
4,APPG60994436641,Male,1985-04-19,2016-07-20,C10002,A,COM0001781,A,4.0,10000.0,...,Y,S134,B,2500.0,50000.0,2.0,,,10,0


In [8]:
#test
test.head()

Unnamed: 0,ID,Gender,DOB,Lead_Creation_Date,City_Code,City_Category,Employer_Code,Employer_Category1,Employer_Category2,Monthly_Income,...,Primary_Bank_Type,Contacted,Source,Source_Category,Existing_EMI,Loan_Amount,Loan_Period,Interest_Rate,EMI,Var1
0,APPA70109647212,Male,1988-03-06,2016-05-07,C10028,C,COM0002222,A,4.0,2150.0,...,P,Y,S122,B,0.0,10000.0,3.0,20.0,372.0,4
1,APPB10687939341,Male,1981-07-13,2016-01-07,C10003,A,COM0001784,C,4.0,4200.0,...,P,Y,S133,C,0.0,69000.0,5.0,24.0,1985.0,7
2,APPC80449411414,Female,1990-11-19,2016-01-07,C10009,B,COM0045260,B,4.0,1000.0,...,P,N,S133,B,0.0,,,,,0
3,APPD30665094501,Female,1992-10-15,2016-01-07,C10005,A,COM0000085,A,3.0,1465.0,...,P,N,S133,C,0.0,,,,,0
4,APPE80379821637,Male,1988-09-21,2016-01-07,C10005,A,COM0006422,A,4.0,2340.0,...,P,Y,S143,B,500.0,10000.0,2.0,,,0


## 3/ Convert String to Date Format

### Convert: Lead_Creation_Date

In [9]:
# Train
train['Lead_Creation_Date'] = pd.to_datetime(train['Lead_Creation_Date'], infer_datetime_format = True)
train['DOB'] = pd.to_datetime(train['DOB'], infer_datetime_format = True)

# Test (need to do the same for Test too)
test['Lead_Creation_Date'] = pd.to_datetime(test['Lead_Creation_Date'], infer_datetime_format = True)
test['DOB'] = pd.to_datetime(test['DOB'], infer_datetime_format = True)

## 5/ Identify Missing Values

### Plot a

In [10]:
train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 69713 entries, 0 to 69712
Data columns (total 22 columns):
 #   Column                               Non-Null Count  Dtype         
---  ------                               --------------  -----         
 0   ID                                   69713 non-null  object        
 1   Gender                               69713 non-null  object        
 2   DOB                                  69698 non-null  datetime64[ns]
 3   Lead_Creation_Date                   69713 non-null  datetime64[ns]
 4   City_Code                            68899 non-null  object        
 5   City_Category                        68899 non-null  object        
 6   Employer_Code                        65695 non-null  object        
 7   Employer_Category1                   65695 non-null  object        
 8   Employer_Category2                   65415 non-null  float64       
 9   Monthly_Income                       69713 non-null  float64       
 10  Customer_E

### Option 2: isnull + sum

In [11]:
train.isnull().sum()

ID                                         0
Gender                                     0
DOB                                       15
Lead_Creation_Date                         0
City_Code                                814
City_Category                            814
Employer_Code                           4018
Employer_Category1                      4018
Employer_Category2                      4298
Monthly_Income                             0
Customer_Existing_Primary_Bank_Code     9391
Primary_Bank_Type                       9391
Contacted                                  0
Source                                     0
Source_Category                            0
Existing_EMI                              51
Loan_Amount                            27709
Loan_Period                            27709
Interest_Rate                          47437
EMI                                    47437
Var1                                       0
Approved                                   0
dtype: int

### Option 3: Percentage Missing

In [12]:
train.isnull().sum()/len(train)

ID                                     0.000000
Gender                                 0.000000
DOB                                    0.000215
Lead_Creation_Date                     0.000000
City_Code                              0.011676
City_Category                          0.011676
Employer_Code                          0.057636
Employer_Category1                     0.057636
Employer_Category2                     0.061653
Monthly_Income                         0.000000
Customer_Existing_Primary_Bank_Code    0.134709
Primary_Bank_Type                      0.134709
Contacted                              0.000000
Source                                 0.000000
Source_Category                        0.000000
Existing_EMI                           0.000732
Loan_Amount                            0.397472
Loan_Period                            0.397472
Interest_Rate                          0.680461
EMI                                    0.680461
Var1                                   0

## **Additional : Export Missing Value Perc Data
- Convert Series (Index + Values) to DataFrame
- Export Table (use name - missing_value_summary.csv)

In [13]:
# Create Series of Percentage Missing
missing_value_percentage = (train.isnull().sum()/len(train))

In [14]:
missing_value_percentage.values

array([0.00000000e+00, 0.00000000e+00, 2.15167903e-04, 0.00000000e+00,
       1.16764449e-02, 1.16764449e-02, 5.76363089e-02, 5.76363089e-02,
       6.16527764e-02, 0.00000000e+00, 1.34709452e-01, 1.34709452e-01,
       0.00000000e+00, 0.00000000e+00, 0.00000000e+00, 7.31570869e-04,
       3.97472494e-01, 3.97472494e-01, 6.80461320e-01, 6.80461320e-01,
       0.00000000e+00, 0.00000000e+00])

In [15]:
# Check Data Type
type(missing_value_percentage)

pandas.core.series.Series

In [16]:
# Create DataFrame
missing_value_summary = pd.DataFrame({
   'Feature_Name' : missing_value_percentage.index,
    'Missing_Percentage' : missing_value_percentage.values
})

In [17]:
#missing_value_summary

In [18]:
#Export DataFrame
missing_value_summary.to_csv('missing_value_summary.csv', index = False)

## 6/ Impute Missing Values

In [19]:
train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 69713 entries, 0 to 69712
Data columns (total 22 columns):
 #   Column                               Non-Null Count  Dtype         
---  ------                               --------------  -----         
 0   ID                                   69713 non-null  object        
 1   Gender                               69713 non-null  object        
 2   DOB                                  69698 non-null  datetime64[ns]
 3   Lead_Creation_Date                   69713 non-null  datetime64[ns]
 4   City_Code                            68899 non-null  object        
 5   City_Category                        68899 non-null  object        
 6   Employer_Code                        65695 non-null  object        
 7   Employer_Category1                   65695 non-null  object        
 8   Employer_Category2                   65415 non-null  float64       
 9   Monthly_Income                       69713 non-null  float64       
 10  Customer_E

In [20]:
train.dtypes

ID                                             object
Gender                                         object
DOB                                    datetime64[ns]
Lead_Creation_Date                     datetime64[ns]
City_Code                                      object
City_Category                                  object
Employer_Code                                  object
Employer_Category1                             object
Employer_Category2                            float64
Monthly_Income                                float64
Customer_Existing_Primary_Bank_Code            object
Primary_Bank_Type                              object
Contacted                                      object
Source                                         object
Source_Category                                object
Existing_EMI                                  float64
Loan_Amount                                   float64
Loan_Period                                   float64
Interest_Rate               

### 1/ Continuous - Indentify and Impute

In [31]:
numerical_features = [feature for feature in train.columns if train[feature].dtypes in ['float64','int64'] and feature !='ID' and feature !='Approved']
numerical_features

['Employer_Category2',
 'Monthly_Income',
 'Existing_EMI',
 'Loan_Amount',
 'Loan_Period',
 'Interest_Rate',
 'EMI',
 'Var1']

In [32]:
# Check Missing
train[numerical_features].isnull().sum()

Employer_Category2     4298
Monthly_Income            0
Existing_EMI             51
Loan_Amount           27709
Loan_Period           27709
Interest_Rate         47437
EMI                   47437
Var1                      0
dtype: int64

#### Impute Numerical with SimpleImputer

In [33]:
# import package
from sklearn.impute import SimpleImputer

In [34]:
# Create Instance
imp_num = SimpleImputer(strategy='median')

In [35]:
# Fit - Calculate Median for all columns
imp_num.fit(train[numerical_features])

SimpleImputer(strategy='median')

In [38]:
# Transform - Impute Missing Values with Median
train[numerical_features] = imp_num.transform(train[numerical_features])

#### Check Again

In [39]:
train.isnull().sum()

ID                                        0
Gender                                    0
DOB                                      15
Lead_Creation_Date                        0
City_Code                               814
City_Category                           814
Employer_Code                          4018
Employer_Category1                     4018
Employer_Category2                        0
Monthly_Income                            0
Customer_Existing_Primary_Bank_Code    9391
Primary_Bank_Type                      9391
Contacted                                 0
Source                                    0
Source_Category                           0
Existing_EMI                              0
Loan_Amount                               0
Loan_Period                               0
Interest_Rate                             0
EMI                                       0
Var1                                      0
Approved                                  0
dtype: int64

### 3/ Categorical - Identify and Impute

In [40]:
categorical_features = [feature for feature in train.columns if train[feature].dtypes=='O' and feature !='ID' and feature !='Approved']
categorical_features

['Gender',
 'City_Code',
 'City_Category',
 'Employer_Code',
 'Employer_Category1',
 'Customer_Existing_Primary_Bank_Code',
 'Primary_Bank_Type',
 'Contacted',
 'Source',
 'Source_Category']

In [41]:
# Check Missing
train[categorical_features].isnull().sum()

Gender                                    0
City_Code                               814
City_Category                           814
Employer_Code                          4018
Employer_Category1                     4018
Customer_Existing_Primary_Bank_Code    9391
Primary_Bank_Type                      9391
Contacted                                 0
Source                                    0
Source_Category                           0
dtype: int64

#### Impute Categorical with SimpleImputer

In [37]:
# import package
from sklearn.impute import SimpleImputer

In [43]:
# Create Instance
imp_cat = SimpleImputer(strategy='constant', fill_value = 'missing')

In [44]:
imp_cat.fit(train[categorical_features])

SimpleImputer(fill_value='missing', strategy='constant')

In [46]:
train[categorical_features] = imp_cat.transform(train[categorical_features])

#### Check Again

In [47]:
train.isnull().sum()

ID                                      0
Gender                                  0
DOB                                    15
Lead_Creation_Date                      0
City_Code                               0
City_Category                           0
Employer_Code                           0
Employer_Category1                      0
Employer_Category2                      0
Monthly_Income                          0
Customer_Existing_Primary_Bank_Code     0
Primary_Bank_Type                       0
Contacted                               0
Source                                  0
Source_Category                         0
Existing_EMI                            0
Loan_Amount                             0
Loan_Period                             0
Interest_Rate                           0
EMI                                     0
Var1                                    0
Approved                                0
dtype: int64

### Outlier Treatement

### Tranformation of Numerical Features

In [56]:
from sklearn.preprocessing import StandardScaler

## 7/ Build Model

## Prepare Data

In [48]:
#create X and y datasets for splitting 
X = train.drop(['ID', 'Approved','DOB', 'Lead_Creation_Date', 'City_Code', 'Employer_Code', 'Customer_Existing_Primary_Bank_Code', 'Source'], axis=1)
y = train['Approved']

## Create Dummy

In [51]:
X = pd.get_dummies(X, drop_first=True)

In [53]:
X.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 69713 entries, 0 to 69712
Data columns (total 24 columns):
 #   Column                      Non-Null Count  Dtype  
---  ------                      --------------  -----  
 0   Employer_Category2          69713 non-null  float64
 1   Monthly_Income              69713 non-null  float64
 2   Existing_EMI                69713 non-null  float64
 3   Loan_Amount                 69713 non-null  float64
 4   Loan_Period                 69713 non-null  float64
 5   Interest_Rate               69713 non-null  float64
 6   EMI                         69713 non-null  float64
 7   Var1                        69713 non-null  float64
 8   Gender_Male                 69713 non-null  uint8  
 9   City_Category_B             69713 non-null  uint8  
 10  City_Category_C             69713 non-null  uint8  
 11  City_Category_missing       69713 non-null  uint8  
 12  Employer_Category1_B        69713 non-null  uint8  
 13  Employer_Category1_C        697

## Import LR Package and create Instance

In [49]:
from sklearn.linear_model import LogisticRegression
lr = LogisticRegression(max_iter = 1000)

## Build Model

In [54]:
lr.fit(X,y)

LogisticRegression()

### Check Score

In [55]:
lr.score(X, y)

0.9852968599830735