In [None]:
# Import all the necessary libraries
import pandas as pd
import numpy as np
from itertools import combinations

from sklearn.preprocessing import LabelEncoder 

from sklearn.model_selection import train_test_split
from xgboost import XGBClassifier
from catboost import CatBoostClassifier

from sklearn.metrics import roc_curve
import matplotlib.pyplot as plt

import pickle 
from sklearn.externals import joblib 

import warnings
warnings.filterwarnings("ignore")

 FINAL PIPELINE FOR THE XGB CLASSIFIER ALGORITHM

In [2]:
#MISSING VALUE IMPUTATION
def missing_value(df):
    """Fill missing values in the three policy/health columns with fixed sentinels.

    The filled columns are written back into ``df`` itself, and that same
    object is returned, so callers may use either the return value or the
    frame they passed in.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain the columns 'Health Indicator', 'Holding_Policy_Type'
        and 'Holding_Policy_Duration'.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object with NaNs in those columns replaced.
    """
    # One sentinel per column, treated downstream as its own category/value.
    fill_values = {
        'Health Indicator': "X0",
        'Holding_Policy_Type': 0,
        'Holding_Policy_Duration': "not",
    }
    for column, sentinel in fill_values.items():
        # Assign the result back instead of `df[column].fillna(..., inplace=True)`:
        # the chained in-place form operates on a temporary Series and does not
        # update the frame under pandas Copy-on-Write.
        df[column] = df[column].fillna(sentinel)
    return df

In [3]:
def frequency_encoding(column_name,output_column_name,df):
    """Frequency encoding: replace each category by its share of rows in ``df``.

    The share is ``(rows with that category) / len(df)``. The result is written
    into ``df[output_column_name]`` in place; nothing is returned. Passing the
    same name for input and output overwrites the source column.

    Parameters
    ----------
    column_name : str
        Column whose categories are counted.
    output_column_name : str
        Column that receives the per-row frequency.
    df : pandas.DataFrame
        Frame to read from and write to.
    """
    # groupby drops NaN keys, so missing values have no entry in this lookup.
    category_share = df.groupby(column_name).size() / len(df)
    # Series.map does the lookup in one vectorised pass and yields NaN for keys
    # absent from the lookup (e.g. NaN), where a per-row `category_share[x]`
    # would raise KeyError.
    df[output_column_name] = df[column_name].map(category_share)

In [5]:
def _merge_group_aggregates(df, group_keys, agg_spec, prefix):
    """Aggregate ``df`` per ``group_keys`` and left-join the statistics onto every row.

    New columns are named ``prefix + '<source column>_<statistic>'``. There is
    no separator after ``prefix``; this matches the existing column naming and
    must not change, otherwise downstream column names change too.
    """
    group_stats = df.groupby(group_keys).agg(agg_spec)
    group_stats.columns = [prefix + '_'.join(pair).strip('_') for pair in group_stats.columns]
    return pd.merge(df, group_stats, on=group_keys, how='left')


def feature_engineering(df):
    """Derive interaction, frequency-encoded and group-aggregate features.

    Steps, in order (the column order of the result follows this order):

    1. Every pair of the eight base categorical columns is concatenated as
       strings into ``'<a>_<b>'`` and then overwritten by its frequency encoding.
    2. Frequency encodings of 'City_Code', 'Region_Code', 'Holding_Policy_Type'
       and 'Health Indicator' are added as ``*_fe`` columns, and
       'Holding_Policy_Duration' is overwritten by its frequency encoding.
    3. Six group-level aggregate tables are computed and left-merged onto the rows.
    4. For every interaction column, max/min/mean/std within city,
       city+region and city+recommended-policy-category groups are added.

    Side effect: steps 1 and 2 write their columns into the ``df`` object that
    was passed in. Steps 3 and 4 build new frames, so the returned frame is a
    different object from the argument.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain the base categorical columns listed below plus
        'Holding_Policy_Duration', 'Lower_Age' and 'Reco_Policy_Premium'.

    Returns
    -------
    pandas.DataFrame
        A new frame holding the input columns and all derived features.
    """
    base_columns = ['City_Code','Accomodation_Type','Reco_Insurance_Type','Health Indicator',
                    'Is_Spouse','Region_Code','Holding_Policy_Type','Reco_Policy_Cat']

    # Interaction features: combine two categorical columns, then frequency-encode the pair.
    interaction_columns = []
    for first, second in combinations(base_columns, 2):
        pair_name = f'{first}_{second}'
        df[pair_name] = df[first].astype(str) + '_' + df[second].astype(str)
        frequency_encoding(pair_name, pair_name, df)
        interaction_columns.append(pair_name)

    # Frequency encoding of single columns (Holding_Policy_Duration is replaced in place).
    frequency_encoding('City_Code','City_Code_fe',df)
    frequency_encoding('Region_Code','Region_Code_fe',df)
    frequency_encoding('Holding_Policy_Duration','Holding_Policy_Duration',df)
    frequency_encoding('Holding_Policy_Type','Holding_Policy_Type_fe',df)
    frequency_encoding('Health Indicator','Health_Indicator_fe',df)

    def build_agg_spec(categorical_columns):
        # Numeric columns always come first; categorical columns follow in the
        # given order. The dict order determines the order of the output columns.
        spec = {'Lower_Age': ['mean', 'max', 'min', 'std'],
                'Reco_Policy_Premium': ['mean', 'max', 'min', 'std', 'sum']}
        for name in categorical_columns:
            spec[name] = ['nunique', 'count']
        return spec

    # (group keys, column-name prefix, categorical columns summarised by nunique/count).
    # The order of the plans and of the columns inside each plan is significant:
    # it fixes the column order of the returned frame.
    aggregate_plans = [
        # Characteristics of each city.
        (['City_Code'],
         'city_aggregate_features',
         ['Region_Code', 'Accomodation_Type', 'Reco_Insurance_Type',
          'Health Indicator', 'Holding_Policy_Type', 'Reco_Policy_Cat']),
        # Characteristics of each city and region combined.
        (['City_Code', 'Region_Code'],
         'city_region_aggregate_features',
         ['Accomodation_Type', 'Reco_Insurance_Type',
          'Health Indicator', 'Holding_Policy_Type', 'Reco_Policy_Cat']),
        # Characteristics of each city and recommended policy category combined.
        (['City_Code', 'Reco_Policy_Cat'],
         'city_recopolicycat_aggregate_features',
         ['Region_Code', 'Accomodation_Type', 'Reco_Insurance_Type',
          'Health Indicator', 'Holding_Policy_Type']),
        # Characteristics of each city and the Region_Code_Reco_Policy_Cat interaction feature.
        (['City_Code', 'Region_Code_Reco_Policy_Cat'],
         'city_regioncoderecopolicycat_aggregate_features',
         ['Region_Code', 'Accomodation_Type', 'Reco_Insurance_Type',
          'Health Indicator', 'Holding_Policy_Type', 'Reco_Policy_Cat']),
        # Characteristics of each Holding_Policy_Type.
        (['Holding_Policy_Type'],
         'holdingpolicytype_aggregate_features',
         ['Region_Code', 'Accomodation_Type', 'Reco_Insurance_Type',
          'Health Indicator', 'City_Code']),
        # Characteristics of each Health Indicator.
        (['Health Indicator'],
         'Health_Indicator_aggregate_features',
         ['Region_Code', 'Reco_Insurance_Type', 'Accomodation_Type',
          'Holding_Policy_Type', 'City_Code']),
    ]
    for group_keys, prefix, categorical_columns in aggregate_plans:
        df = _merge_group_aggregates(df, group_keys, build_agg_spec(categorical_columns), prefix)

    # Group-level statistics of every interaction feature. The frame is not
    # modified inside the loop, so each grouping is built once and reused.
    groupings = [
        ('city', df.groupby('City_Code')),
        ('city_region', df.groupby(['City_Code','Region_Code'])),
        ('city_recopolicycat', df.groupby(['City_Code','Reco_Policy_Cat'])),
    ]
    derived = {}
    for feature in interaction_columns:
        for label, grouped in groupings:
            for stat in ('max', 'min', 'mean', 'std'):
                derived[f'{label}_{feature}_{stat}'] = grouped[feature].transform(stat)

    # Attach all derived columns in one concat rather than hundreds of single
    # column inserts, which fragment the frame; insertion order is preserved.
    df = pd.concat([df, pd.DataFrame(derived, index=df.index)], axis=1)

    return df


In [7]:
def label_encoder(df):
    """Replace the categorical columns of ``df`` with integer label codes.

    Each listed column is cast to ``str`` and then encoded with a
    ``LabelEncoder`` that is refit on that column's values in ``df``.
    The columns are overwritten in place and the same frame is returned.
    """
    categorical_columns = [
        'City_Code',
        'Accomodation_Type',
        'Reco_Insurance_Type',
        'Health Indicator',
        'Is_Spouse',
        'Holding_Policy_Duration',
        'Holding_Policy_Type',
        'Reco_Policy_Cat',
    ]
    encoder = LabelEncoder()
    for name in categorical_columns:
        # fit_transform refits the shared encoder, so codes are per column.
        df[name] = encoder.fit_transform(df[name].astype('str'))
    return df

In [8]:
def final_pipe_line(df):
    """Run the full preprocessing pipeline and return the model-ready frame.

    Stages, each fed with the output of the previous one:
    missing-value imputation -> feature engineering -> label encoding.

    Parameters
    ----------
    df : pandas.DataFrame
        Raw input rows. Note that the stages write some columns into this
        object in place.

    Returns
    -------
    pandas.DataFrame
        The frame produced by the last stage, including every engineered feature.
    """
    df_final = missing_value(df)
    # feature_engineering returns a NEW frame (its merges do not modify the
    # input), so the result must be passed on. Encoding the original `df`
    # instead would silently drop all merged aggregate/statistic features.
    # NOTE(review): the loaded model must have been trained on this same
    # feature set -- confirm against the training notebook.
    df_final = feature_engineering(df_final)
    df_final = label_encoder(df_final)
    return df_final

In [10]:
# Read the test split and drop the 'ID' column before preprocessing.
test_data_set = pd.read_csv("/content/test_YCcRUnU.csv").drop(columns=['ID'])

In [11]:
# Preprocess the test frame. The name is rebound to the processed result, so
# re-running this cell would feed already-processed data back into the pipeline.
test_data_set = final_pipe_line(test_data_set)

In [None]:
# Load the pretrained model. joblib.load unpickles the file, which can execute
# arbitrary code -- only load artifacts from a trusted source.
model = joblib.load('xgb.pkl')
# Predicted class probabilities for the processed test frame.
submission = model.predict_proba(test_data_set)