In [1]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

from sklearn import preprocessing
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier, GradientBoostingClassifier, VotingClassifier, StackingClassifier
from sklearn.model_selection import ParameterGrid, train_test_split
from sklearn.dummy import DummyClassifier
from sklearn.model_selection import GridSearchCV, StratifiedKFold, RepeatedStratifiedKFold
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import MinMaxScaler
from sklearn.decomposition import PCA
from sklearn.discriminant_analysis import QuadraticDiscriminantAnalysis
from sklearn.metrics import ConfusionMatrixDisplay
from sklearn.pipeline import Pipeline

from collections import Counter
import pickle
import json
import warnings
warnings.filterwarnings("ignore")

class Default_Prediction:
    """Preprocess data, train and apply a logistic-regression default classifier.

    The object works on ``self.df``, which is set by ``model_training`` /
    ``model_prediction`` and is modified by the preprocessing steps.
    Training persists two artifacts in the working directory:

    * ``model_artifact.json`` -- imputation values and the encoded column list
    * ``model.sav``           -- the pickled scikit-learn pipeline

    Prediction reloads both artifacts, so it can run on a fresh instance.
    """

    def __init__(self):
        # Columns dropped before any other processing.
        self.discard_col = [
            'decision_id',
            'label',
            'device_name',
            'gms_version',
            'brand',
            'carrier',
            'manufacturer',
            'screen_height', 
            'v11', 
            'v12', 
            'v19', 
            'v2', 
            'v21', 
            'v35', 
            'v6',
        ]
        # Columns imputed with the mode and one-hot encoded.
        self.cat_col = [
            'screen_dpi',
            'network_type'
        ]
        
        # Filled by remove_outliers(): every column that is not in cat_col.
        self.num_col = []
        
    def remove_null_row(self, null_count_threshold = 40):
        """Drop rows of ``self.df`` that have at least ``null_count_threshold`` nulls."""
        null_per_row = self.df.isna().sum(axis=1)
        null_row_index = list(self.df.index[null_per_row >= null_count_threshold])
        self.df.drop(null_row_index, axis=0, inplace=True)
    
    def add_null_flag_column(self, null_flag_cols = ['v27','v1','v44','v45']):
        """Add a ``<col>_nflag`` 0/1 column marking where ``<col>`` is null.

        ``null_flag_cols`` is only read, never mutated.
        """
        for col in null_flag_cols:
            # Vectorized replacement for a row-wise apply(np.where(np.isnan(x), 1, 0)).
            self.df[col+'_nflag'] = self.df[col].isna().astype(int)
    
    def data_impute(self, impute_dict = None):
        """Fill nulls in ``self.df`` and remember the fill values in ``self.impute_dict``.

        Parameters
        ----------
        impute_dict : dict, optional
            Mapping column -> fill value. When omitted or empty, the values are
            computed from ``self.df``: the mode for columns in ``self.cat_col``,
            the median for every other column.

        Notes
        -----
        A fresh dict is built on every call. The previous ``impute_dict = {}``
        default was shared between calls, so a second training run silently
        reused the fill values computed for the first dataset.
        Columns with no entry in the mapping are left untouched; the encoding
        step drops columns that were not seen during training.
        """
        self.impute_dict = dict(impute_dict) if impute_dict else {}
        if not self.impute_dict:
            for col in self.df.columns:
                if col in self.cat_col:
                    fill_value = self.df[col].mode()[0]
                else:
                    fill_value = self.df[col].median()
                # Store plain Python scalars so the dict can be written with json.dump.
                if isinstance(fill_value, np.generic):
                    fill_value = fill_value.item()
                self.impute_dict[col] = fill_value
        for col in self.df.columns:
            if col in self.impute_dict:
                # Assign back instead of fillna(inplace=True) on a column selection,
                # which is not guaranteed to update the parent frame.
                self.df[col] = self.df[col].fillna(self.impute_dict[col])

    
    def remove_outliers(self, outlier_count_threshold = 20, iqr_multiplier = 1.5):
        """Drop rows that are IQR outliers in more than ``outlier_count_threshold`` columns.

        A value is an outlier for its column when it lies below
        ``Q1 - iqr_multiplier * IQR`` or above ``Q3 + iqr_multiplier * IQR``.
        Every column outside ``self.cat_col`` is inspected; that list is stored
        in ``self.num_col``.
        """
        self.num_col = [c for c in self.df.columns if c not in self.cat_col]
        # Index label -> number of columns in which that row is an outlier.
        outlier_counts = Counter()
        for c in self.num_col:
            Q1 = np.percentile(self.df[c], 25)
            Q3 = np.percentile(self.df[c], 75)
            outlier_step = (Q3 - Q1) * iqr_multiplier
            is_outlier = (self.df[c] < Q1 - outlier_step) | (self.df[c] > Q3 + outlier_step)
            outlier_counts.update(self.df[is_outlier].index)
        multiple_outliers = [i for i, v in outlier_counts.items() if v > outlier_count_threshold]
        self.df.drop(multiple_outliers, axis = 0, inplace = True)
    

    def get_data_encoding(self, code_run_mode = 'Train'):
        """One-hot encode ``self.cat_col`` and align columns with the training layout.

        In ``'Train'`` mode the encoded column list is stored in
        ``self.model_cols``. In any other mode the frame is reindexed to
        ``self.model_cols``: dummy columns missing from this data are added as
        zeros, columns unseen during training are dropped, and the training
        column order is restored.
        """
        self.df = pd.get_dummies(self.df, columns=self.cat_col, dummy_na=False)
        if(code_run_mode == 'Train'):
            self.model_cols = list(self.df.columns)
        else:
            self.df = self.df.reindex(columns=self.model_cols, fill_value=0)
    
    def data_preprocessing(self, code_run_mode = 'Train'):
        """Run the preprocessing steps on ``self.df``, printing the shape after each.

        Row-removing steps (sparse rows, outliers) only run in ``'Train'`` mode.
        In other modes ``self.impute_dict`` and ``self.model_cols`` must already
        be set, e.g. by ``load_model_artifact``.
        """
        print('0 data preprocessing starts', self.df.shape)
        
        self.df.drop(self.discard_col, axis = 1, inplace = True)
        print("1 dropping unwanted column", self.df.shape)       
        
        self.add_null_flag_column()
        print("2 add null columns", self.df.shape)
        
        for col in self.cat_col:
            self.df[col] = self.df[col].str.lower()
        print("3 lower-case cat columns", self.df.shape)
        
        if(code_run_mode == 'Train'):
            self.remove_null_row()
            print("4 dropping null rows", self.df.shape)
            
            self.data_impute()
            print("5 data_imputation", self.df.shape)

            self.remove_outliers()
            print("6 remove outliers", self.df.shape)
            
        else:
            self.data_impute(self.impute_dict)
            print("5 data imputation", self.df.shape)
            
        self.get_data_encoding(code_run_mode)
        print("7 data_encoding", self.df.shape)
        
    def get_model_hyper_param(self):
        """Return the fixed LogisticRegression hyper-parameters."""
        param = {'C': 100, 'penalty': 'l1', 'solver': 'liblinear'}
        return param
    
    def save_model_artifact(self):
        """Write ``impute_dict`` and ``model_cols`` to ``model_artifact.json``."""
        model_artifact = {
            'impute_dict': self.impute_dict,
            'model_cols': list(self.model_cols),
        }
        with open("model_artifact.json", "w") as outfile:
            json.dump(model_artifact, outfile)

    def load_model_artifact(self):
        """Restore ``impute_dict`` and ``model_cols`` from ``model_artifact.json``."""
        with open('model_artifact.json', 'r') as openfile:
            model_artifact = json.load(openfile)
        
        self.impute_dict = model_artifact['impute_dict']
        self.model_cols = model_artifact['model_cols']

        
    def model_training(self, df):
        """Preprocess ``df``, fit the scaler + logistic-regression pipeline, persist it.

        ``df`` is modified by the preprocessing, so pass a copy if the caller
        still needs the original. The target is the ``'default'`` column.
        Writes ``model_artifact.json`` and ``model.sav``.
        """
        self.df = df
        print('0 model trainaing begins', self.df.shape)
        self.data_preprocessing(code_run_mode = 'Train')
        
        X = self.df.drop(['default'], axis = 1)
        y = self.df['default']
        
        model = LogisticRegression()
        model.set_params(**self.get_model_hyper_param())
        
        feature_scale = MinMaxScaler()
        self.pipeline = Pipeline(steps = [('feature-scaling', feature_scale), ('model', model)])
        self.pipeline = self.pipeline.fit(X, y)
        
        self.save_model_artifact()
        # Context manager so the file handle is flushed and closed deterministically.
        with open('model.sav', 'wb') as model_file:
            pickle.dump(self.pipeline, model_file)
        
        
    def model_prediction(self, df):
        """Preprocess ``df`` with the saved artifacts and return the predicted labels.

        ``df`` is modified by the preprocessing, so pass a copy if the caller
        still needs the original. Returns whatever ``Pipeline.predict`` returns
        for the rows of ``df`` (no rows are removed in prediction mode).
        """
        self.df = df
        self.load_model_artifact()
        self.data_preprocessing(code_run_mode = 'Test')
        # NOTE: pickle.load executes arbitrary code; only load a model.sav you produced.
        with open('model.sav', 'rb') as model_file:
            self.pipeline = pickle.load(model_file)
        
        # model_cols was captured before the target was split off, so it still
        # contains 'default'; remove it to get the feature matrix.
        X_pred = self.df[self.model_cols].drop('default', axis = 1)
        y_pred = self.pipeline.predict(X_pred)
        return y_pred
        



#### Data Loading

In [2]:
# Load the full dataset once, then split it on the `label` column.
df = pd.read_csv('dataset - v2.csv')
df_tr = df.loc[df['label'].eq('modeling')]  # for crossvalidation and hyperparameter tuning
df_test = df.loc[df['label'].eq('oot')]
model_obj = Default_Prediction()


#### Training

In [3]:
# Train on a copy: the preprocessing drops rows/columns of the frame it is given in place.
model_obj.model_training(df=df_tr.copy())

0 model trainaing begins (4661, 62)
0 data preprocessing starts (4661, 62)
1 dropping unwanted column (4661, 47)
2 add null columns (4661, 51)
3 lower-case cat columns (4661, 51)
4 dropping null rows (4510, 51)
5 data_imputation (4510, 51)
6 remove outliers (4487, 51)
7 data_encoding (4487, 60)


#### Prediction

In [4]:
# Strip any 'yhat' left over from a previous run of this cell: the imputation step
# looks every input column up in the saved impute_dict, and 'yhat' is not in it,
# so re-running the cell used to fail. drop() returns a new frame, so df_test
# itself is never modified by the preprocessing.
features_oot = df_test.drop(columns=['yhat'], errors='ignore')

# assign() builds a new frame instead of writing a column into a filtered slice of `df`.
df_test = df_test.assign(yhat=model_obj.model_prediction(features_oot))
output = df_test[['decision_id', 'yhat']]
output.to_excel('output.xlsx', index = False)

0 data preprocessing starts (1585, 62)
1 dropping unwanted column (1585, 47)
2 add null columns (1585, 51)
3 lower-case cat columns (1585, 51)
5 data imputation (1585, 51)
7 data_encoding (1585, 60)
