In [99]:
from sklearn.impute import SimpleImputer
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder
from sklearn.decomposition import PCA
from sklearn.feature_selection import SelectKBest
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.pipeline import Pipeline, FeatureUnion
from sklearn.preprocessing import FunctionTransformer
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix, accuracy_score, f1_score, precision_score, recall_score
from xgboost import XGBClassifier
from sklearn.model_selection import GridSearchCV
from Funcs import *
import warnings
warnings.filterwarnings("ignore")

In [100]:
import pandas as pd
import numpy as np
from matplotlib import pyplot as plt

data = pd.read_csv("data.csv") 
data

Unnamed: 0,Loan_ID,Gender,Married,Dependents,Education,Self_Employed,ApplicantIncome,CoapplicantIncome,LoanAmount,Loan_Amount_Term,Credit_History,Property_Area,Loan_Status
0,LP001002,Male,No,0,Graduate,No,5849,0.0,,360.0,1.0,Urban,Y
1,LP001003,Male,Yes,1,Graduate,No,4583,1508.0,128.0,360.0,1.0,Rural,N
2,LP001005,Male,Yes,0,Graduate,Yes,3000,0.0,66.0,360.0,1.0,Urban,Y
3,LP001006,Male,Yes,0,Not Graduate,No,2583,2358.0,120.0,360.0,1.0,Urban,Y
4,LP001008,Male,No,0,Graduate,No,6000,0.0,141.0,360.0,1.0,Urban,Y
...,...,...,...,...,...,...,...,...,...,...,...,...,...
609,LP002978,Female,No,0,Graduate,No,2900,0.0,71.0,360.0,1.0,Rural,Y
610,LP002979,Male,Yes,3+,Graduate,No,4106,0.0,40.0,180.0,1.0,Rural,Y
611,LP002983,Male,Yes,1,Graduate,No,8072,240.0,253.0,360.0,1.0,Urban,Y
612,LP002984,Male,Yes,2,Graduate,No,7583,0.0,187.0,360.0,1.0,Urban,Y


In [101]:
X = data.drop(columns='Loan_Status')
y = data['Loan_Status']

In [102]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)

## Building Pipelines

### Splitting data into Num and Cat

In [103]:
getNum = FunctionTransformer(numFeat)
getCat = FunctionTransformer(catFeat)

### Imputing categorical missing values based on fraction of existing data

In [104]:
catImputed = FunctionTransformer(getImputedCat)

### Imputing numerical missing values based on Mode

In [105]:
numImputed = FunctionTransformer(getImputedNum)

### Engineering Features

In [106]:
engineerd = FunctionTransformer(getFeatures)

### Choose Final Model 

In [107]:
finalNum = FunctionTransformer(getFinalNum)
finalCat = FunctionTransformer(getFinalCat)

### Creating Dummies for Categorical

In [108]:
cat_dummy = OneHotEncoder()

In [109]:
to_dense = ToDenseTransformer()

###  Choosing Base Model

In [110]:
# Use base_model 
base_model = LogisticRegression()

## Deploying Pipelines

### Data Cleaning and Feature Engineering

In [111]:
cat_processing = Pipeline([('getCat',getCat),('catImputed',catImputed),("dumies",cat_dummy),('to_dense',to_dense)])

In [112]:
num_processing = Pipeline([('getNum',getNum),('numImputed',numImputed),('engineeredNum',engineerd),('finalNum',finalNum)])


### Preprocessing for Modeling

In [113]:
combined_features = FeatureUnion([('cat',cat_processing),('num',num_processing)])

In [114]:
combined_features.fit(X_train)

FeatureUnion(n_jobs=None,
             transformer_list=[('cat',
                                Pipeline(memory=None,
                                         steps=[('getCat',
                                                 FunctionTransformer(accept_sparse=False,
                                                                     check_inverse=True,
                                                                     func=<function catFeat at 0x7f1a16263950>,
                                                                     inv_kw_args=None,
                                                                     inverse_func=None,
                                                                     kw_args=None,
                                                                     validate=False)),
                                                ('catImputed',
                                                 FunctionTransformer(accept_sparse=False,
                                 

### Fitting Model

In [115]:
#Use base_model 
base_model = LogisticRegression()

In [116]:
baseline_pipeline = Pipeline([
   ('features',combined_features),
    ('model',base_model)
])

In [117]:
y_train

285    Y
177    N
24     N
103    Y
349    Y
      ..
335    Y
582    Y
279    Y
28     N
19     Y
Name: Loan_Status, Length: 491, dtype: object

In [118]:
baseline = baseline_pipeline.fit(X_train, y_train)

In [119]:
baseline_preds = baseline_pipeline.predict(X_train)

In [120]:
cm_baseline = confusion_matrix(y_train,baseline_preds)
accuracy_baseline = accuracy_score(y_train,baseline_preds)
print(f'Confusion Matrix:\n {cm_baseline}')
print(f'Accuracy: {accuracy_baseline}')

Confusion Matrix:
 [[ 63  90]
 [  9 329]]
Accuracy: 0.7983706720977597


In [121]:
baseline_preds_test = baseline_pipeline.predict(X_test)

In [122]:
cm_baseline = confusion_matrix(y_test,baseline_preds_test)
accuracy_baseline = accuracy_score(y_test,baseline_preds_test)
print(f'Confusion Matrix:\n {cm_baseline}')
print(f'Accuracy: {accuracy_baseline}')

Confusion Matrix:
 [[15 24]
 [ 1 83]]
Accuracy: 0.7967479674796748


### Grid Search

In [123]:
params = [
    {'model': [LogisticRegression()],
     'model__penalty': ['l1','l2'],
     'model__class_weight':['None', 'balanced'],
     'model__C': [0.1,0.3,0.5,1,2],
     'model__solver': ['newton-cg', 'lbfgs', 'liblinear'],
     'model__max_iter':[50,100,150]},
     
     

    {'model': [XGBClassifier()],
'model__max_depth': [4,6,8], 
'model__n_estimators':[100,150,200], 
'model__learning_rate':[0.01,0.05,0.5,5], 
'model__alpha' : [5,10,15], 
'model__objective' :['reg:linear','reg:squarederror'],
'model__min_child_weight': [2,4,6],
'model__gamma':[np.random.rand(10)]}
]

In [124]:
tuned_grid = GridSearchCV(baseline_pipeline, params, verbose=1, refit=True).fit(X_train, y_train)

Fitting 5 folds for each of 828 candidates, totalling 4140 fits


[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.




[Parallel(n_jobs=1)]: Done 4140 out of 4140 | elapsed:  2.2min finished


In [125]:
print('Final score is: ', tuned_grid.score(X_train, y_train))
print('Best Parameters are: ', tuned_grid.best_params_)

Final score is:  0.8126272912423625
Best Parameters are:  {'model': LogisticRegression(C=0.5, class_weight='None', dual=False, fit_intercept=True,
                   intercept_scaling=1, l1_ratio=None, max_iter=100,
                   multi_class='auto', n_jobs=None, penalty='l2',
                   random_state=None, solver='newton-cg', tol=0.0001, verbose=0,
                   warm_start=False), 'model__C': 0.5, 'model__class_weight': 'None', 'model__max_iter': 100, 'model__penalty': 'l2', 'model__solver': 'newton-cg'}


In [126]:
print('Final score is: ', tuned_grid.score(X_test, y_test))
print('Best Parameters are: ', tuned_grid.best_params_)

Final score is:  0.8130081300813008
Best Parameters are:  {'model': LogisticRegression(C=0.5, class_weight='None', dual=False, fit_intercept=True,
                   intercept_scaling=1, l1_ratio=None, max_iter=100,
                   multi_class='auto', n_jobs=None, penalty='l2',
                   random_state=None, solver='newton-cg', tol=0.0001, verbose=0,
                   warm_start=False), 'model__C': 0.5, 'model__class_weight': 'None', 'model__max_iter': 100, 'model__penalty': 'l2', 'model__solver': 'newton-cg'}


### Fitting Best Model

In [127]:
best_model = tuned_grid.best_params_['model']

tuned_model_pipeline = Pipeline([
    ('features',combined_features),
    ('model',best_model)
])

In [128]:
tuned_model_pipeline

Pipeline(memory=None,
         steps=[('features',
                 FeatureUnion(n_jobs=None,
                              transformer_list=[('cat',
                                                 Pipeline(memory=None,
                                                          steps=[('getCat',
                                                                  FunctionTransformer(accept_sparse=False,
                                                                                      check_inverse=True,
                                                                                      func=<function catFeat at 0x7f1a16263950>,
                                                                                      inv_kw_args=None,
                                                                                      inverse_func=None,
                                                                                      kw_args=None,
                                                    

In [129]:
tuned_model_pipeline = tuned_model_pipeline.fit(X_train,y_train)

In [130]:
tuned_model_preds_train = tuned_model_pipeline.predict(X_train)

In [131]:
cm_tuned_model = confusion_matrix(y_train,tuned_model_preds_train)
accuracy_tuned_model  = accuracy_score(y_train,tuned_model_preds_train)
print(f'Confusion Matrix:\n {cm_tuned_model}')
print(f'Accuracy: {accuracy_tuned_model}')

Confusion Matrix:
 [[ 67  86]
 [  6 332]]
Accuracy: 0.8126272912423625


In [132]:
tuned_model_preds_test = tuned_model_pipeline.predict(X_test)

In [133]:
cm_tuned_model = confusion_matrix(y_test,tuned_model_preds_test)
accuracy_tuned_model  = accuracy_score(y_test,tuned_model_preds_test)
print(f'Confusion Matrix:\n {cm_tuned_model}')
print(f'Accuracy: {accuracy_tuned_model}')

Confusion Matrix:
 [[18 21]
 [ 2 82]]
Accuracy: 0.8130081300813008


## 6. Deploy your model to cloud and test it with PostMan, BASH or Python

In [134]:
import pickle

In [135]:
filename = 'tuned_model'
pickle.dump(tuned_model_pipeline,open(filename,'wb'))