## Imports

In [1]:
import pandas as pd
pd.options.display.max_columns = 999
import numpy as np

from sklearn.linear_model import LogisticRegression
from sklearn.metrics import roc_auc_score
from sklearn.model_selection import KFold, StratifiedKFold

from IPython.display import clear_output
import time
start = time.time()

## Data and parameters

In [2]:
DATA_PATH = 'E:/Projects/Datacup/'
seed = 32
np.random.seed(seed=seed)

In [3]:
train = pd.read_csv(DATA_PATH+'performance_train.csv')
test = pd.read_csv(DATA_PATH+'performance_test.csv')
train_fac = pd.read_csv(DATA_PATH + 'facturation_train.csv')
test_fac = pd.read_csv(DATA_PATH + 'facturation_test.csv')

compl = pd.concat([train,test],axis=0,ignore_index=True)
compl.loc[:len(train),'trainset']=1
compl.loc[len(train):,'trainset']=0

compl_fac = pd.concat([train_fac,test_fac],axis=0).sort_values(['ID_CPTE','PERIODID_MY'])

In [4]:
compl_fac['ratio_ctotal_credit'] = compl_fac['CurrentTotalBalance']/compl_fac['CreditLimit']

In [5]:
compl_fac.fillna(0,inplace=True)

## Features

In [6]:
numericals = []

In [7]:
def get_lags_compte(data,var,lags,numericals):
    gp = data.groupby(['ID_CPTE'])
    for lag in lags:
        compl[var+str(lag)] = compl['ID_CPTE'].map(gp[var].nth(lag))
        if not var+str(lag) in numericals:
            numericals.append(var+str(lag))

In [8]:
get_lags_compte(compl_fac,'ratio_ctotal_credit',[-1],numericals)

In [9]:
target = 'Default'
features = numericals.copy()

tr_idx = np.where(compl.trainset==1)
test_idx = np.where(compl.trainset==0)
df_train = compl.iloc[tr_idx]
df_test = compl.iloc[test_idx]

X = df_train[features]
X_test = df_test[features]
y = df_train[target]

## Models

In [10]:
lr = LogisticRegression(C=0.01)

In [15]:
%%time

rocs = []
kf = KFold(10,shuffle=True,random_state=seed)
X_meta = np.zeros((compl.shape[0],2))
X_meta[:X.shape[0],0]=y
for train_idx, val_idx in kf.split(X):

    X_tr = X.iloc[train_idx]
    y_tr = y.iloc[train_idx]
    X_val = X.iloc[val_idx]
    y_val = y.iloc[val_idx]

    lr.fit(X_tr,y_tr)
    
    predict_tr = lr.predict_proba(X_tr)[:,1]
    roc_auc_tr = roc_auc_score(y_tr,predict_tr)

    predict_val =  lr.predict_proba(X_val)[:,1]
    X_meta[val_idx,1]=predict_val
    roc_auc = roc_auc_score(y_val,predict_val)

    rocs.append(roc_auc)
    print('TRAIN:',roc_auc_tr,'VAL:',roc_auc)

clear_output()
avg_roc = sum(rocs)/len(rocs)
print('Average ROC:',avg_roc)

Average ROC: 0.850369654863474
Wall time: 81.2 ms


In [17]:
lgb_model = lr.fit(X,y)
start_pred = time.time()
predict_test = lr.predict_proba(X_test)[:,1]
print((time.time()-start_pred))
test['Default'] = predict_test
X_meta[X.shape[0]:,1]=predict_test
#test[['ID_CPTE','Default']].to_csv('SinglePredictions/'+f'{avg_roc:.5f}_'+'LR_full.csv',index=False)
#np.save('../Stacksfull/'+f'{avg_roc:.5f}_'+'MetaLR_full.npy',X_meta)

0.0


In [13]:
print((time.time()-start))

0.6727902889251709
