In [None]:
import numpy as np
import pandas as pd
from sklearn.preprocessing import OneHotEncoder
import matplotlib.pyplot as plt
import seaborn as sns
import math
import gc
import random
import time
import xgboost as xgb
from sklearn.preprocessing import StandardScaler, MinMaxScaler    
import lightgbm as lgb
from collections import defaultdict
import joblib
from statsmodels.graphics.tsaplots import plot_acf, plot_pacf

# Shared scaler instances; the modify_* helpers below call them through these
# module-level names.
scaler = StandardScaler()
minmax = MinMaxScaler()

# Locations of the train / test CSV files (Kaggle input directory).
train_file = "/kaggle/input/santander-pr/train.csv"
test_file = "/kaggle/input/santander-pr/test.csv"

# The 24 product-ownership indicator columns that are modelled one by one.
targetcols = ["ind_ahor_fin_ult1","ind_aval_fin_ult1","ind_cco_fin_ult1","ind_cder_fin_ult1","ind_cno_fin_ult1","ind_ctju_fin_ult1","ind_ctma_fin_ult1",
              "ind_ctop_fin_ult1","ind_ctpp_fin_ult1","ind_deco_fin_ult1","ind_deme_fin_ult1","ind_dela_fin_ult1", "ind_ecue_fin_ult1","ind_fond_fin_ult1",
              "ind_hip_fin_ult1", "ind_plan_fin_ult1","ind_pres_fin_ult1","ind_reca_fin_ult1","ind_tjcr_fin_ult1","ind_valo_fin_ult1","ind_viv_fin_ult1",
              "ind_nomina_ult1","ind_nom_pens_ult1","ind_recibo_ult1"]

# dtypes handed to pd.read_csv when the product columns are loaded.
# NOTE(review): ind_nomina_ult1 / ind_nom_pens_ult1 are read as float64,
# presumably because they contain missing values in the raw file - confirm.
dtype_list = {'ind_cco_fin_ult1': 'uint8',
              'ind_deme_fin_ult1': 'uint8',
              'ind_aval_fin_ult1': 'uint8',
              'ind_valo_fin_ult1': 'uint8',
              'ind_reca_fin_ult1': 'uint8',
              'ind_ctju_fin_ult1': 'uint8',
              'ind_cder_fin_ult1': 'uint8', 
              'ind_plan_fin_ult1': 'uint8',
              'ind_fond_fin_ult1': 'uint8', 
              'ind_hip_fin_ult1': 'uint8',
              'ind_pres_fin_ult1': 'uint8', 
              'ind_nomina_ult1': 'float64', 
              'ind_cno_fin_ult1': 'uint8',
              'ind_ctpp_fin_ult1': 'uint8',
              'ind_ahor_fin_ult1': 'uint8',
              'ind_dela_fin_ult1': 'uint8',
              'ind_ecue_fin_ult1': 'uint8',
              'ind_nom_pens_ult1': 'float64',
              'ind_recibo_ult1': 'uint8',
              'ind_deco_fin_ult1': 'uint8',
              'ind_tjcr_fin_ult1': 'uint8', 
              'ind_ctop_fin_ult1': 'uint8',
              'ind_viv_fin_ult1': 'uint8',
              'ind_ctma_fin_ult1': 'uint8',
             'ncodpers' : 'uint32'}  

# Customer attribute columns read from both CSV files as model features.
feature_cols = ['ncodpers','fecha_dato','age','renta','nomprov', 'ind_nuevo', 
               'segmento', 'ind_actividad_cliente', 'pais_residencia', 'ind_empleado', 
                'sexo', 'tiprel_1mes', 'indrel_1mes', 'antiguedad',  'indrel', 'indext', 'indresi', 'indfall', 'canal_entrada']

## Modification Functions
### Helper Function

In [None]:
def string_num_age(x):
    """Convert one raw ``age`` cell to a number.

    Strings are stripped and parsed with ``int``. The ``NA`` marker (with any
    amount of surrounding whitespace) and blank strings become ``np.nan``.
    Non-string values are returned unchanged.
    """
    if isinstance(x, str):
        text = x.strip()
        # The raw file pads the marker with spaces (' NA'); stripping makes
        # the check independent of how many spaces were used.
        if text in ('NA', ''):
            return np.nan
        return int(text)
    return x

def string_num_senior(x):
    """Convert one raw ``antiguedad`` (seniority) cell to a number.

    Strings are stripped and parsed with ``int``. The ``NA`` marker (with any
    amount of surrounding whitespace) and blank strings become ``np.nan``.
    Non-string values are returned unchanged.
    """
    if isinstance(x, str):
        text = x.strip()
        # The marker appears space-padded ('     NA') in the raw data; do not
        # depend on the exact padding width.
        if text in ('NA', ''):
            return np.nan
        return int(text)
    return x

def string_num_primary(x):
    """Convert one raw ``indrel_1mes`` cell to a number.

    * the string ``'P'`` maps to ``2.5``;
    * any other string is parsed with ``float``;
    * a non-NaN float (including numpy floats) is truncated with ``int``;
    * everything else (NaN, ints, ...) is returned unchanged.
    """
    if isinstance(x, str):
        text = x.strip()
        return 2.5 if text == 'P' else float(text)
    # A comparison with np.nan is always true, so NaN must be detected with
    # math.isnan rather than ``!=``.
    if isinstance(x, float) and not math.isnan(x):
        return int(x)
    return x

def modify_age(train, test):
    """Clean, clip and min-max scale the ``age`` column.

    Both frames are copied. Ages are parsed with ``string_num_age``; train
    rows without an age are dropped (test rows are kept). Ages are clipped to
    the range 14..90. The shared ``minmax`` scaler is fitted on the train
    ages only and then applied to the test ages, so both frames share one
    scale.

    Returns:
        (train_copy, test_copy) with the transformed ``age`` column.
    """
    print("Modifying...age")
    temp_train = train.copy()
    temp_test = test.copy()

    temp_train.age = temp_train.age.apply(string_num_age)
    temp_train = temp_train.loc[temp_train.age.notnull()].copy()
    # The test column can hold the same string encoding as train, so parse it
    # too before any numeric comparison.
    temp_test.age = temp_test.age.apply(string_num_age)

    temp_train.age = temp_train.age.astype(float).clip(lower=14, upper=90)
    temp_test.age = temp_test.age.astype(float).clip(lower=14, upper=90)

    # Fit on train only: re-fitting on test would scale the two frames
    # differently.
    temp_train.age = minmax.fit_transform(temp_train.age.to_numpy().reshape(-1, 1)).ravel()
    temp_test.age = minmax.transform(temp_test.age.to_numpy().reshape(-1, 1)).ravel()

    return temp_train, temp_test

def modify_renta(train, test):
    """Impute and standardise the ``renta`` (income) column.

    For each frame (on a copy):
      * ``age`` is parsed with ``string_num_age`` and rows without an age are
        dropped;
      * missing ``nomprov`` is filled with that frame's mode and then bucketed
        with ``nom_mod``;
      * missing ``renta`` is filled with the median ``renta`` of the row's
        province bucket, computed on that same frame.

    The shared ``scaler`` is fitted on the train ``renta`` only and then
    applied to test, so both frames are standardised with the same mean and
    variance.

    Returns:
        (train_copy, test_copy)
    """

    def _impute(frame, tag):
        # One cleaning pass, shared by train and test.
        frame = frame.copy()
        frame.age = frame.age.apply(string_num_age)
        frame = frame.loc[frame.age.notnull()].copy()

        frame.nomprov = frame.nomprov.fillna(frame.nomprov.mode()[0]).apply(nom_mod)

        print('Modifying ' + tag.lower() + '...renta')
        # Coerce so that string-encoded missing markers become NaN instead of
        # breaking the median computation.
        frame.renta = pd.to_numeric(frame.renta, errors='coerce')
        medians = frame.groupby('nomprov')['renta'].median()
        print(tag + ' Medians found ->')
        print(medians)

        frame.renta = frame.renta.fillna(frame.nomprov.map(medians))
        return frame

    temp_train = _impute(train, 'Train')
    temp_test = _impute(test, 'Test')

    # Fit on train only and reuse the fitted statistics for test.
    temp_train.renta = scaler.fit_transform(temp_train.renta.to_numpy().reshape(-1, 1)).ravel()
    temp_test.renta = scaler.transform(temp_test.renta.to_numpy().reshape(-1, 1)).ravel()
    return temp_train, temp_test

def modify_segmento(train, test):
    """Fill missing ``segmento`` values with each frame's own most frequent value.

    The inputs are left untouched; filled copies are returned as
    (train_copy, test_copy).
    """
    print("Modifying....segmento")
    filled = []
    for frame in (train, test):
        patched = frame.copy()
        most_common = patched.segmento.mode()[0]
        patched.segmento = patched.segmento.fillna(most_common)
        filled.append(patched)
    return filled[0], filled[1]

def modify_sexo(train, test):
    """Fill missing ``sexo`` values with each frame's own most frequent value.

    The inputs are left untouched; filled copies are returned as
    (train_copy, test_copy).
    """
    print("Modifying....sexo")
    filled = []
    for frame in (train, test):
        patched = frame.copy()
        most_common = patched.sexo.mode()[0]
        patched.sexo = patched.sexo.fillna(value=most_common)
        filled.append(patched)
    return filled[0], filled[1]

def modify_antiguedad(train, test):
    """Clean and min-max scale the ``antiguedad`` (seniority) column.

    Values are parsed with ``string_num_senior``; missing values and the
    ``-999999`` sentinel are both replaced by ``-1``. The shared ``minmax``
    scaler is fitted on the train column only and then applied to test, so
    both frames share one scale.

    Returns:
        (train_copy, test_copy)
    """
    print("Modifying....antiguedad")
    temp_train = train.copy()
    temp_test = test.copy()

    def _clean(column):
        # NaN and the -999999 sentinel both collapse to -1.
        parsed = column.apply(string_num_senior).astype(float)
        return parsed.fillna(-1).replace(-999999, -1)

    temp_train.antiguedad = _clean(temp_train.antiguedad)
    temp_test.antiguedad = _clean(temp_test.antiguedad)

    # Fit on train only; test reuses the fitted min/max.
    temp_train.antiguedad = minmax.fit_transform(temp_train.antiguedad.to_numpy().reshape(-1, 1)).ravel()
    temp_test.antiguedad = minmax.transform(temp_test.antiguedad.to_numpy().reshape(-1, 1)).ravel()
    return temp_train, temp_test

def modify_fecha_dato(train, test):
    """Turn ``fecha_dato`` timestamps into ``yyyymm`` integers.

    On copies of both frames, ``age`` is parsed with ``string_num_age``, rows
    without an age are dropped and ``fecha_dato`` is replaced by
    ``100 * year + month``. Returns (train_copy, test_copy).
    """
    print("Modifying....fecha_dato")
    converted = []
    for frame in (train, test):
        work = frame.copy()
        work.age = work.age.apply(lambda raw: string_num_age(raw))
        work = work.loc[work.age.isnull()==False]
        work.fecha_dato = work['fecha_dato'].apply(lambda stamp: 100*stamp.year + stamp.month)
        converted.append(work)
    return converted[0], converted[1]

def modify_fecha_alta(train, test):
    """Turn ``fecha_alta`` timestamps into ``yyyymm`` integers.

    On copies of both frames, ``age`` is parsed with ``string_num_age``, rows
    without an age are dropped and ``fecha_alta`` is replaced by
    ``100 * year + month``. Returns (train_copy, test_copy).
    """
    print("Modifying....fecha_alta")
    converted = []
    for frame in (train, test):
        work = frame.copy()
        work.age = work.age.apply(lambda raw: string_num_age(raw))
        work = work.loc[work.age.isnull()==False]
        work.fecha_alta = work['fecha_alta'].apply(lambda stamp: 100*stamp.year + stamp.month)
        converted.append(work)
    return converted[0], converted[1]

def modify_indrel_1mes(train, test):
    """Make ``indrel_1mes`` numeric and fill its gaps.

    Each value is passed through ``string_num_primary``; remaining missing
    values are replaced by the median of the converted column of the same
    frame. Returns (train_copy, test_copy).
    """
    print("Modifying...indrel_1mes")
    results = []
    for frame in (train, test):
        work = frame.copy()
        numeric = work.indrel_1mes.apply(lambda raw: string_num_primary(raw))
        work.indrel_1mes = numeric
        work.indrel_1mes = work.indrel_1mes.fillna(work.indrel_1mes.median())
        results.append(work)
    return results[0], results[1]

def pais_mod(x):
    """Keep ``x`` when it is one of the ten listed country codes, else return 'Outside'."""
    kept_codes = ['ES','FR','AR','DE','GB','US','CO','IT','RO','MX']
    return x if x in kept_codes else 'Outside'
    
def modify_pais_residencia(train, test):
    """Bucket ``pais_residencia`` through ``pais_mod`` on copies of both frames.

    Returns (train_copy, test_copy).
    """
    bucketed = []
    for frame in (train, test):
        work = frame.copy()
        work.pais_residencia = work.pais_residencia.apply(pais_mod)
        bucketed.append(work)
    return bucketed[0], bucketed[1]

def canal_mod(x):
    """Keep ``x`` when it is one of the ten listed channel codes, else return 'UNK'."""
    kept_channels = ['KHE','KAT','KFC','KHQ','KFA','KHK','KHM','KHD','KHN','KAS']
    return x if x in kept_channels else 'UNK'
    
def modify_canal_entrada(train, test):
    """Fill and bucket the ``canal_entrada`` column.

    On a copy of each frame, missing values are filled with that frame's most
    frequent channel, then every value is passed through ``canal_mod``.
    Returns (train_copy, test_copy).
    """
    results = []
    for frame in (train, test):
        work = frame.copy()
        most_common = work.canal_entrada.mode()[0]
        work.canal_entrada = work.canal_entrada.fillna(most_common)
        work.canal_entrada = work.canal_entrada.apply(canal_mod)
        results.append(work)
    return results[0], results[1]

def nom_mod(x):
    """Keep ``x`` when it is one of the ten listed provinces, else return 'OTHER'."""
    kept_provinces = ['MADRID','BARCELONA','VALENCIA','SEVILLA','CORUÑA, A','MURCIA','MALAGA','ZARAGOZA','ALICANTE','CADIZ']
    return x if x in kept_provinces else 'OTHER'

# Short code for every province bucket that nom_mod can return; the codes end
# up as the suffixes of the one-hot column names built from nomprov.
nom_dict = {
    'MADRID': 'M',
    'BARCELONA' : 'B',
    'VALENCIA' : 'V',
    'SEVILLA' : 'S',
    'CORUÑA, A' : 'C',
    'MURCIA' : 'M1',
    'MALAGA': 'M2',
    'ZARAGOZA' : 'Z',
    'ALICANTE' : 'A1',
    'CADIZ' : 'C1',
    'OTHER' : 'O'
}
def modify_nomprov(train, test):
    """Fill, bucket and abbreviate the ``nomprov`` column.

    On a copy of each frame, missing provinces are filled with that frame's
    most frequent one, values are bucketed with ``nom_mod`` and finally
    replaced by their short code from ``nom_dict``.
    Returns (train_copy, test_copy).
    """
    results = []
    for frame in (train, test):
        work = frame.copy()
        most_common = work.nomprov.mode()[0]
        provinces = work.nomprov.fillna(most_common).apply(nom_mod)
        work.nomprov = provinces
        work.nomprov = work.nomprov.apply(lambda label: nom_dict[label])
        results.append(work)
    return results[0], results[1]

### Memory management code

In [None]:
def reduce_mem_usage(props, columns_now):
    """Downcast the listed numeric columns of ``props`` to smaller dtypes.

    For every numeric (or boolean) column named in ``columns_now``:
      * if it has no missing or infinite values and every value is a whole
        number, it is cast to the narrowest unsigned (when ``min >= 0``) or
        signed integer type that holds its range;
      * otherwise it is cast to ``float32``.
    Non-numeric columns are left untouched.

    Args:
        props: DataFrame to shrink; it is modified in place.
        columns_now: iterable of column names to consider.

    Returns:
        The same ``props`` object, for call-site convenience.
    """
    start_mem_usg = props.memory_usage().sum() / 1024**2
    print("Memory usage of properties dataframe is :", start_mem_usg, " MB")

    for col in columns_now:
        print(col)
        series = props[col]
        if not pd.api.types.is_numeric_dtype(series):  # Exclude strings etc.
            continue

        print("******************************")
        print("dtype before: ", series.dtype)

        mx = series.max()
        mn = series.min()

        # A column is integer-like only if EVERY value equals its integer
        # cast. Summing signed differences lets fractions cancel out (0.5 and
        # -0.5), and NaN / inf cannot be stored in an integer dtype at all.
        is_int = False
        has_gap = bool(series.isnull().any())
        if not has_gap and pd.api.types.is_float_dtype(series):
            has_gap = bool(np.isinf(series).any())
        if not has_gap:
            is_int = bool((series == series.astype(np.int64)).all())

        if is_int:
            if mn >= 0:
                candidates = (np.uint8, np.uint16, np.uint32, np.uint64)
            else:
                candidates = (np.int8, np.int16, np.int32, np.int64)
            for candidate in candidates:
                bounds = np.iinfo(candidate)
                if mn >= bounds.min and mx <= bounds.max:
                    props[col] = series.astype(candidate)
                    break
        else:
            props[col] = series.astype(np.float32)

    # Report once after all columns are processed instead of once per column.
    print("___MEMORY USAGE AFTER COMPLETION:___")
    mem_usg = props.memory_usage().sum() / 1024**2
    print("Memory usage is: ", mem_usg, " MB")
    if start_mem_usg > 0:
        print("This is ", 100*mem_usg/start_mem_usg, "% of the initial size")
    return props

## Dataframe Creation

### Reading CSV

In [None]:
# Load the feature columns of both files, then store the snapshot date as a
# yyyymm integer (100 * year + month).
x_train, x_test = (
    pd.read_csv(csv_path, usecols=feature_cols, parse_dates=['fecha_dato'])
    for csv_path in (train_file, test_file)
)
for frame in (x_train, x_test):
    frame['fecha_dato'] = frame['fecha_dato'].apply(lambda stamp: 100*stamp.year + stamp.month)

In [None]:
# Clean every feature column in turn and one-hot encode the categorical ones.
# x_train / x_test are rebound on each step, so this cell must run exactly
# once after the CSV-reading cell.
ids = []
x_train, x_test = modify_age(x_train, x_test)
col_to_drop = []
# x_train.columns is evaluated once, so dummy columns added inside the loop
# are not visited.
for col in x_train.columns:

    print("Reading...." + str(col))

    if col == 'age' or col == 'fecha_dato':
        # Already handled: age by modify_age above, fecha_dato at load time.
        continue

    elif col == "renta":
        x_train, x_test = modify_renta(x_train, x_test)
        print(col + "...Done!")

    elif col == "segmento":
        x_train, x_test = modify_segmento(x_train, x_test)
        print(col + "...Done!")

    elif col == 'sexo':
        x_train, x_test = modify_sexo(x_train, x_test)
        print(col + "...Done!")

    elif col == "ind_nuevo":
        print("Modifying...."+col)
        x_train.ind_nuevo = x_train.ind_nuevo.fillna(value=1)
        x_test.ind_nuevo = x_test.ind_nuevo.fillna(value=1)
        print(col + "...Done!")

    elif col == "antiguedad":
        x_train, x_test = modify_antiguedad(x_train, x_test)
        print(col + "...Done!")

    elif col == 'indrel':
        print("Modifying...."+col)
        x_train.indrel = x_train.indrel.fillna(value=1)
        x_test.indrel = x_test.indrel.fillna(value=1)
        print(col + "...Done!")

    elif col == 'tiprel_1mes':
        print("Modifying...."+col)
        x_train.tiprel_1mes = x_train.tiprel_1mes.fillna(x_train.tiprel_1mes.mode()[0])
        x_train.tiprel_1mes = np.where((x_train.tiprel_1mes=='N')|(x_train.tiprel_1mes=='R'), 'I',x_train.tiprel_1mes)
        x_test.tiprel_1mes = x_test.tiprel_1mes.fillna(x_test.tiprel_1mes.mode()[0])
        x_test.tiprel_1mes = np.where((x_test.tiprel_1mes=='N')|(x_test.tiprel_1mes=='R'), 'I',x_test.tiprel_1mes)
        print(col + "...Done!")

    elif col == 'indext':
        print("Modifying...."+col)
        x_train.indext = x_train.indext.fillna(value='U')
        x_test.indext = x_test.indext.fillna(value='U')
        print(col + "...Done!")

    elif col == "ind_actividad_cliente":
        print("modifying..."+col)
        print(col + "...Done!")

    elif col== 'ncodpers':
        print("Modifying...."+col)
        ids = x_test.ncodpers.unique()
        print(col + "...Done!")

    elif col == "nomprov":
        print("Modifying...."+ col)
        x_train, x_test = modify_nomprov(x_train, x_test)
        print(col + "...Done!")

    elif col == 'fecha_alta':
        x_train, x_test = modify_fecha_alta(x_train, x_test)
        print(col + "...Done!")

    elif col == 'pais_residencia':
        print("Modifying...."+ col)
        x_train, x_test = modify_pais_residencia(x_train, x_test)
        print(col + "...Done!")

    elif col == 'canal_entrada':
        print("Modifying...."+col)
        x_train, x_test = modify_canal_entrada(x_train, x_test)
        print(col + "...Done!")

    elif col == 'indrel_1mes':
        x_train, x_test = modify_indrel_1mes(x_train, x_test)
        print(col + "...Done!")

    else:
        print("Modifying...."+ col)
        print(col + "...Done!")

    # Null values are filled at this point; only categorical (object) columns
    # need further work. Numeric columns are kept exactly as they are.
    if x_train[col].dtype != 'object':
        continue

    # Fill any remaining gaps in BOTH frames with the train mode so that the
    # two frames are encoded consistently.
    mode_value = x_train[col].mode()[0]
    x_train[col] = x_train[col].fillna(mode_value)
    x_test[col] = x_test[col].fillna(mode_value)

    cat_enc_train = pd.get_dummies(x_train[col], prefix=col)
    # Align the test dummies to the train dummy columns: a category missing
    # from test becomes an all-zero column, and a category unseen in train is
    # dropped, so both frames end up with the same feature set.
    cat_enc_test = pd.get_dummies(x_test[col], prefix=col).reindex(
        columns=cat_enc_train.columns, fill_value=0)
    columns_now = cat_enc_train.columns.to_list()

    x_train = pd.concat([x_train, cat_enc_train], axis=1)
    x_test = pd.concat([x_test, cat_enc_test], axis=1)
    col_to_drop.append(col)
    del cat_enc_train, cat_enc_test

    x_train = reduce_mem_usage(x_train, columns_now)
    print("Train Mem reduction...Done!")
    x_test = reduce_mem_usage(x_test, columns_now)
    print("Test Mem reduction...Done!")

# The original categorical columns are replaced by their dummy columns.
x_train = x_train.drop(columns=col_to_drop)
x_test = x_test.drop(columns=col_to_drop)

print(x_train.shape)
print(x_test.shape)

In [None]:
# Load the product-ownership columns (plus the keys needed to align them with
# x_train) from the train file.
y_train = pd.read_csv(
    train_file,
    usecols=['ncodpers', 'age', 'fecha_dato'] + targetcols,
    dtype=dtype_list,
    parse_dates=['fecha_dato'],
)

# Selection of rows: keep the same rows as x_train, i.e. those with an age.
y_train.age = y_train.age.apply(string_num_age)
y_train = y_train.loc[y_train.age.notnull()].copy()
y_train.fecha_dato = y_train['fecha_dato'].apply(lambda x: 100*x.year + x.month)
y_train = y_train.fillna(0)

# reduce_mem_usage performs the actual downcasting.
y_train = reduce_mem_usage(y_train, y_train.columns)

### Lags

In [None]:
def _months_before(yyyymm, months):
    """Return the ``yyyymm`` integer that lies ``months`` months before ``yyyymm``."""
    year, month = divmod(int(yyyymm), 100)
    total = year * 12 + (month - 1) - months
    return (total // 12) * 100 + total % 12 + 1


def create_lags(lag, date, x_train, df_name):
    """Append lagged product-ownership columns to a feature frame.

    For every lag ``i`` in ``lag`` and every month ``j`` in ``date``, the rows
    of ``x_train`` with ``fecha_dato == j`` are left-joined on ``ncodpers``
    with the product columns of the global ``y_train`` taken ``i`` months
    earlier, renamed to ``<product>_lag_<i>``. When the lagged month is
    ``201500`` or earlier there is no history, so the lag columns are all
    zero. Rows whose ``fecha_dato`` is not in ``date`` are dropped. Remaining
    missing values are filled with 0.

    Args:
        lag: iterable of positive month offsets; iteration stops at the first 0.
        date: iterable of ``yyyymm`` integers to keep.
        x_train: feature frame with ``fecha_dato`` and ``ncodpers`` columns.
            It is not modified.
        df_name: label used only in progress messages.

    Returns:
        A new DataFrame with the lag columns appended.
    """
    for i in lag:
        if i == 0:
            break
        rename_dict = {j: j + '_lag_' + str(i) for j in targetcols}
        col_names = list(rename_dict.values())

        chunks = []
        for j in date:
            # Calendar-aware subtraction: 201601 minus one month is 201512.
            cur = _months_before(j, i)
            dum = x_train[x_train.fecha_dato == j]
            if cur <= 201500:
                # No data that far back: reuse the current month's customers
                # and force every lag column to zero.
                df_lag = (y_train[y_train.fecha_dato == j]
                          .rename(columns=rename_dict)
                          .drop(columns=['fecha_dato', 'age']))
                df_lag[col_names] = 0
            else:
                df_lag = (y_train[y_train.fecha_dato == cur]
                          .rename(columns=rename_dict)
                          .drop(columns=['fecha_dato', 'age']))
            dum = dum.merge(df_lag, on=['ncodpers'], how='left')
            print("1_>"+str(j)+"->"+str(dum.shape))
            chunks.append(dum)

        # Concatenate once per lag instead of growing a frame inside the loop.
        x_train = pd.concat(chunks, axis=0) if chunks else pd.DataFrame()
        print("1--->"+str(x_train.shape))
        print('Lag '+str(i)+' for ' + df_name +'...Done!!')

    # Not in place: when no lag was applied, x_train is still the caller's frame.
    x_train = x_train.fillna(0)
    return x_train

## Two-way split of the time frame

In [None]:
# Window 1: the 2015 months, with four lags.
lags_1 = [1, 2, 3, 4]
date_1 = [201503, 201504, 201505, 201506, 201507]
# Window 2: the single month 201604, with five lags.
lags_2 = [1, 2, 3, 4, 5]
date_2 = [201604]

x_train_1 = reduce_mem_usage(
    *(lambda frame: (frame, frame.columns))(create_lags(lags_1, date_1, x_train, 'x_train_1')))
x_test_1 = reduce_mem_usage(
    *(lambda frame: (frame, frame.columns))(create_lags(lags_1, [201605], x_test, 'x_test_1')))
y_train_1 = y_train[(y_train.fecha_dato >= 201503) & (y_train.fecha_dato <= 201507)]
y_train_1 = reduce_mem_usage(y_train_1, y_train_1.columns)

x_train_2 = reduce_mem_usage(
    *(lambda frame: (frame, frame.columns))(create_lags(lags_2, date_2, x_train, 'x_train_2')))
x_test_2 = reduce_mem_usage(
    *(lambda frame: (frame, frame.columns))(create_lags(lags_2, [201605], x_test, 'x_test_2')))
y_train_2 = y_train[y_train.fecha_dato == 201604]
y_train_2 = reduce_mem_usage(y_train_2, y_train_2.columns)

In [None]:
# Product holdings in the last training month (201604); used later to adjust
# the predictions for each customer.
recent_prod = y_train[y_train.fecha_dato==201604]

# Free the large frames that are no longer needed.
del y_train
del x_train, x_test

recent_prod.drop(columns=['fecha_dato'], inplace=True)
recent_prod = reduce_mem_usage(recent_prod, recent_prod.columns)

# Product columns are everything except the customer id and the age.
product_col = [name for name in recent_prod.columns.tolist() if name not in ('ncodpers', 'age')]

ids = x_test_1['ncodpers'].values

## Model 
### Weighted Average LGBM 

In [None]:
from collections import defaultdict
import joblib

# Customer ids in test-row order, and an (initially empty) map from customer
# id to a list of per-product scores.
ids = x_test_1['ncodpers'].values
id_preds = defaultdict(list)

First-pass models trained on all features; their feature importances drive the feature selection below.

In [None]:
params = {'boosting_type': 'gbdt',
          'max_depth' : -1,
          'objective': 'binary',
          'num_leaves': 64,
          'learning_rate': 0.1,
          'num_iterations': 200,
          'max_bin': 512,
          'subsample_for_bin': 200,
          'subsample': 1,
          'subsample_freq': 1,
          'colsample_bytree': 0.8,
          'reg_alpha': 5,
          'reg_lambda': 10,
          'min_split_gain': 0.5,
          'min_child_weight': 1,
          'min_child_samples': 5,
          'scale_pos_weight': 1,
          'num_class' : 1,
          'metric' : 'binary_error',
         'verbosity' : 1}
# NOTE(review): only the entries referenced in _fit_all_feature_models are
# passed to LightGBM; num_leaves, colsample_bytree, reg_alpha, reg_lambda,
# subsample_for_bin, num_class and metric are currently unused - confirm
# whether that is intended.

id_preds = defaultdict(list)


def _fit_all_feature_models(features, targets, tag):
    """Fit one binary LGBM classifier per product and return them in a dict.

    Args:
        features: feature frame; ``fecha_dato`` and ``ncodpers`` are removed
            before fitting.
        targets: frame holding one label column per entry of ``product_col``.
        tag: suffix appended to the product name in the progress message.
    """
    # drop() takes its axis by keyword only in current pandas; the matrix is
    # loop-invariant, so it is built once.
    design = features.drop(columns=['fecha_dato', 'ncodpers'])
    fitted = {}
    for c in product_col:
        print(c + tag)
        model = lgb.LGBMClassifier(
            boosting_type='gbdt',
            objective='binary',
            max_depth=params['max_depth'],
            max_bin=params['max_bin'],
            subsample=params['subsample'],
            subsample_freq=params['subsample_freq'],
            min_split_gain=params['min_split_gain'],
            min_child_weight=params['min_child_weight'],
            min_child_samples=params['min_child_samples'],
            scale_pos_weight=params['scale_pos_weight'],
            learning_rate=params['learning_rate'],
            num_iterations=params['num_iterations'],
            verbosity=params['verbosity']
        )
        model.fit(design, targets[c])
        fitted[c] = model
    return fitted


# These models are only used for their feature importances, so no test-set
# predictions are computed here.
combined_2015 = _fit_all_feature_models(x_train_1, y_train_1, "-first")
joblib.dump(combined_2015,'./combined2015.pkl')

combined_2016 = _fit_all_feature_models(x_train_2, y_train_2, "-second")
joblib.dump(combined_2016,'./combined2016.pkl')

In [None]:
# Shorter aliases for the two first-pass model dictionaries.
model_15, model_16 = combined_2015, combined_2016

### Feature Selection on Importance

In [None]:
def _ranked_importance(models, features):
    """Sum the per-product feature importances and rank them ascending.

    Args:
        models: dict mapping each entry of ``product_col`` to a fitted model.
        features: the frame the models were trained on (including the
            ``fecha_dato`` / ``ncodpers`` columns, which are ignored here).

    Returns:
        List of ``(feature_name, total_importance)`` tuples, least important
        first.
    """
    feature_names = features.drop(columns=['fecha_dato', 'ncodpers']).columns
    # Accumulate in a numpy array sized from the data: ``+=`` on a Python
    # list would append the importances instead of adding them element-wise.
    total = np.zeros(len(feature_names), dtype=float)
    for product in product_col:
        total += models[product].feature_importances_
    print(len(total))
    print(len(feature_names))

    ranked = sorted(zip(feature_names, total), key=lambda pair: pair[1], reverse=False)
    for pair in ranked:
        print(pair)
    return ranked


feat_imp_1 = _ranked_importance(model_15, x_train_1)
feat_imp_2 = _ranked_importance(model_16, x_train_2)

In [None]:
# Features whose summed importance is below 5 are discarded, separately for
# each of the two training windows.
drop_2015 = [name for name, score in feat_imp_1 if score < 5]
drop_2016 = [name for name, score in feat_imp_2 if score < 5]

Second-pass models, retrained after dropping the low-importance features.

In [None]:
import lightgbm as lgb
from collections import defaultdict

params = {'boosting_type': 'gbdt',
          'max_depth' : -1,
          'objective': 'binary',
          'num_leaves': 64,
          'learning_rate': 0.1,
          'num_iterations': 200,
          'max_bin': 512,
          'subsample_for_bin': 200,
          'subsample': 1,
          'subsample_freq': 1,
          'colsample_bytree': 0.8,
          'reg_alpha': 5,
          'reg_lambda': 10,
          'min_split_gain': 0.5,
          'min_child_weight': 1,
          'min_child_samples': 5,
          'scale_pos_weight': 1,
          'num_class' : 1,
          'metric' : 'binary_error',
         'verbosity' : 1}

id_preds = defaultdict(list)
ids = x_test_1['ncodpers'].values


def _fit_and_predict(train_features, train_targets, test_features, tag):
    """Fit one binary LGBM classifier per product and score the test frame.

    Args:
        train_features / test_features: feature frames; ``fecha_dato`` and
            ``ncodpers`` are removed before fitting / predicting.
        train_targets: frame holding one label column per entry of
            ``product_col``.
        tag: suffix appended to the product name in the progress message.

    Returns:
        ``(models, predictions)``: two dicts keyed by product name; the
        predictions are the positive-class probabilities for the test rows.
    """
    # drop() takes its axis by keyword only in current pandas; both matrices
    # are loop-invariant, so they are built once.
    design = train_features.drop(columns=['fecha_dato', 'ncodpers'])
    holdout = test_features.drop(columns=['fecha_dato', 'ncodpers'])
    models, predictions = {}, {}
    for c in product_col:
        print(c + tag)
        model = lgb.LGBMClassifier(
            boosting_type='gbdt',
            objective='binary',
            max_depth=params['max_depth'],
            max_bin=params['max_bin'],
            subsample=params['subsample'],
            subsample_freq=params['subsample_freq'],
            min_split_gain=params['min_split_gain'],
            min_child_weight=params['min_child_weight'],
            min_child_samples=params['min_child_samples'],
            scale_pos_weight=params['scale_pos_weight'],
            learning_rate=params['learning_rate'],
            num_iterations=params['num_iterations'],
            verbosity=params['verbosity']
        )
        model.fit(design, train_targets[c])
        predictions[c] = model.predict_proba(holdout)[:, 1]
        models[c] = model
    return models, predictions


new_x_train_1 = x_train_1.drop(columns=drop_2015)
new_x_test_1 = x_test_1.drop(columns=drop_2015)
del x_train_1, x_test_1

models_2015, predictions_2015 = _fit_and_predict(new_x_train_1, y_train_1, new_x_test_1, "-first")
joblib.dump(models_2015,'./Newcombined2015.pkl')

new_x_train_2 = x_train_2.drop(columns=drop_2016)
new_x_test_2 = x_test_2.drop(columns=drop_2016)
del x_train_2, x_test_2

models_2016, predictions_2016 = _fit_and_predict(new_x_train_2, y_train_2, new_x_test_2, "-second")
joblib.dump(models_2016,'./Newcombined2016.pkl')

Weighted Average

In [None]:
from collections import defaultdict
# Blend the two windows (20 % from the 2015 models, 80 % from the 2016 models)
# and collect, per customer, one score per product in product_col order.
id_preds = defaultdict(list)

for c in product_col:
    print(c)
    blended = predictions_2015[c]*0.2 + predictions_2016[c]*0.8
    for customer_id, score in zip(ids, blended):
        id_preds[customer_id].append(score)


## Final Prediction Selection

In [None]:
from tqdm import tqdm

# Build the holdings lookup once: customer id -> array of 0/1 flags in
# product_col order. Filtering recent_prod for every customer would scan the
# whole frame each time.
owned_lookup = recent_prod.drop_duplicates(subset='ncodpers').set_index('ncodpers')[product_col]
owned_by_id = dict(zip(owned_lookup.index, owned_lookup.to_numpy()))

top_n = 5  # number of products kept per customer
train_preds = {}

for customer_id, p in tqdm(id_preds.items(), desc='Loading....'):
    scores = np.asarray(p[:len(product_col)], dtype=float)
    owned = owned_by_id.get(customer_id)
    if owned is not None:
        # For products the customer held in the most recent month the score is
        # flipped to 1 - p. NOTE(review): confirm this heuristic is intended;
        # excluding already-owned products outright may be what is wanted.
        scores = np.where(owned[:len(scores)] == 1, 1 - scores, scores)
    # Customers without recent holdings keep their raw scores.

    # A stable sort on the negated scores keeps product_col order for ties.
    best = np.argsort(-scores, kind='stable')[:top_n]
    train_preds[customer_id] = [product_col[k] for k in best]

final_df = pd.DataFrame(
    {
        'ncodpers': list(train_preds.keys()),
        # Each product name is preceded by a single space.
        'changed': ["".join(" " + name for name in names) for names in train_preds.values()],
    },
    columns=['ncodpers', 'changed'],
)

# Show a sample instead of printing every row of the submission.
print(final_df.head())
final_df.to_csv('/kaggle/working/lgbm_sub1.csv', index=False)