# Import libraries

In [None]:
% matplotlib inline
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

from sklearn.model_selection import StratifiedKFold
import xgboost as xgb
# Shared random seed: passed to StratifiedKFold and to the XGBoost params below.
seed =45

# Import Dataset

In [None]:
# Read the train and test CSVs from the input directory; every -1 entry is
# parsed as a missing value (na_values=-1).
path = '../input/'
train, test = [
    pd.read_csv(path + file_name, na_values=-1)
    for file_name in ('train.csv', 'test.csv')
]
print('Number rows and columns:', train.shape)
print('Number rows and columns:', test.shape)

# Explore Dataset

In [None]:
# Show the first three rows, transposed so that each column becomes a row.
train.head(3).transpose()

# Target Variable

In [None]:
# Distribution of the target column: bar chart followed by the raw counts.
plt.figure(figsize=(10, 3))
# Pass the series by keyword: newer seaborn releases take the first
# positional argument as `data`, not `x`.
sns.countplot(x=train['target'], palette='rainbow')
plt.xlabel('Target')

train['target'].value_counts()

# Correlation Plot

In [None]:
# Heatmap of the pairwise correlations between the columns of train.
plt.figure(figsize=(16, 10))
cor = train.corr()
sns.heatmap(cor)

# The "ps_calc" features show no relation to the remaining variables

In [None]:
# Remove every column whose name starts with 'ps_calc' from both frames,
# then show the resulting shape of train.
ps_cal = train.columns[train.columns.str.startswith('ps_calc')]
train, test = train.drop(ps_cal, axis=1), test.drop(ps_cal, axis=1)
train.shape

In [None]:
# Convert the target column to pandas' categorical dtype.
train['target'] = train['target'].astype('category')

# Missing value in Dataset

In [None]:
# Per-column missing-value counts for train and test, shown side by side.
k= pd.DataFrame()
k['train']= train.isnull().sum()
k['test'] = test.isnull().sum()
k

In [None]:
def missing_value(df):
    """Fill the missing values of every column of ``df`` with that column's mode.

    The frame is modified in place and nothing is returned. Columns without
    missing values are left untouched. A column that contains only missing
    values is also left untouched, because it has no mode to fill with.

    Args:
        df: pandas DataFrame to impute.
    """
    for col in df.columns:
        if not df[col].isnull().any():
            continue
        modes = df[col].mode()
        if modes.empty:
            # Every value is missing, so there is nothing to impute with.
            continue
        # Assign the filled column back to the frame instead of calling
        # fillna(inplace=True) on the selected column: the chained form
        # operates on a temporary and does not update ``df`` when pandas
        # Copy-on-Write is active.
        df[col] = df[col].fillna(modes.iloc[0])

In [None]:
# Mode-impute both frames in place; each frame is filled from its own column modes.
missing_value(train)
missing_value(test)

# Univariate Analysis

In [None]:
# Names of the train columns that contain '_cat'.
cat_col = [name for name in train.columns if '_cat' in name]
print(cat_col)

In [None]:
# Store the '_cat' columns as unsigned 8-bit integers in both frames.
for c in cat_col:
    train[c] = train[c].astype(np.uint8)
    test[c] = test[c].astype(np.uint8)

In [None]:
# Names of the train columns that contain 'bin'.
bin_col = [name for name in train.columns if 'bin' in name]
print(bin_col)

In [None]:
# Store the 'bin' columns as unsigned 8-bit integers in both frames.
for c in bin_col:
    train[c] = train[c].astype(np.uint8)
    test[c] = test[c].astype(np.uint8)

In [None]:
def category_col(df, max_unique=104):
    """Return the columns of ``df`` that have at most ``max_unique`` distinct values.

    Distinct values are counted with ``Series.nunique()``.

    Args:
        df: pandas DataFrame to inspect.
        max_unique: Inclusive upper limit on the number of distinct values
            for a column to be selected. Defaults to 104, the threshold this
            function previously had hard-coded.

    Returns:
        List of the selected column labels, in the frame's column order.
    """
    return [name for name in df.columns if df[name].nunique() <= max_unique]


# Columns selected by category_col for train, then the subset that is in
# neither the '_cat' nor the 'bin' name list.
tot_cat_col = category_col(train)
other_cat_col = [
    name for name in tot_cat_col
    if name not in cat_col and name not in bin_col
]
other_cat_col

In [None]:
# Hand-picked additional columns to store as unsigned 8-bit integers.
ot_col = [
    'ps_ind_01',
    'ps_ind_03',
    'ps_ind_14',
    'ps_ind_15',
    'ps_car_11',
]
for c in ot_col:
    train[c] = train[c].astype(np.uint8)
    test[c] = test[c].astype(np.uint8)

In [None]:
# Columns of train that category_col did not select, with 'id' removed.
num_col = [name for name in train.columns if name not in tot_cat_col]
num_col.remove('id')
num_col

# Determine Outliers in Dataset

In [None]:
def outlier(df,columns):
    """Cap IQR outliers of the given columns of ``df`` in place.

    For each column, values below ``Q1 - 1.5 * IQR`` are replaced by the
    column's 1st percentile and values above ``Q3 + 1.5 * IQR`` by its 99th
    percentile. All percentiles are computed from ``df`` itself. The bounds
    and replacement values are printed for every column.

    Args:
        df: pandas DataFrame to modify.
        columns: Iterable of column labels to process.
    """
    for col in columns:
        quartile_1, quartile_3 = np.percentile(df[col], [25, 75])
        quartile_f, quartile_l = np.percentile(df[col], [1, 99])
        iqr = quartile_3 - quartile_1
        lower_bound = quartile_1 - (1.5 * iqr)
        upper_bound = quartile_3 + (1.5 * iqr)
        print(col,lower_bound,upper_bound,quartile_f,quartile_l)

        # Write through a single .loc call on the frame. The chained form
        # df[col].loc[mask] = value assigns into a temporary selection, which
        # triggers SettingWithCopyWarning and leaves ``df`` unchanged when
        # pandas Copy-on-Write is active.
        df.loc[df[col] < lower_bound, col] = quartile_f
        df.loc[df[col] > upper_bound, col] = quartile_l
        
# Rebind num_col to three hand-picked columns and cap their outliers.
# Each call computes its bounds from the frame it is given, so train and
# test are capped with their own percentiles.
num_col = ['ps_reg_03', 'ps_car_13', 'ps_car_14']
outlier(train,num_col)
outlier(test,num_col)

# Split Dataset

In [None]:
# Feature matrix (train without 'target' and 'id'), categorical label
# series, and the test features (test without 'id').
X = train.drop(labels=['target', 'id'], axis=1)
y = train['target'].astype('category')
x_test = test.drop(labels='id', axis=1)

# XGBoost

In [None]:
def runXGB(xtrain,xvalid,ytrain,yvalid,xtest,eta=0.1,num_rounds=100,max_depth=4):
    """Train an XGBoost binary classifier with early stopping and predict ``xtest``.

    The model is trained on ``xtrain``/``ytrain``, with both the training and
    the validation set on the watchlist. Training runs for at most
    ``num_rounds`` rounds with ``early_stopping_rounds=50``, and the evaluation
    metric is AUC. Test predictions use the trees up to the best iteration.

    Args:
        xtrain: Training features.
        xvalid: Validation features.
        ytrain: Training labels.
        yvalid: Validation labels.
        xtest: Features to predict.
        eta: Learning rate.
        num_rounds: Maximum number of boosting rounds.
        max_depth: Maximum tree depth.

    Returns:
        Tuple ``(pred_test, model)``: the predictions for ``xtest`` and the
        trained booster.
    """
    params = {
        'objective': 'binary:logistic',
        'max_depth': max_depth,
        'learning_rate': eta,
        'eval_metric': 'auc',
        'min_child_weight': 6,
        'subsample': 0.8,
        'colsample_bytree': 0.8,
        'seed': seed,
        'reg_lambda': 1.3,
        'reg_alpha': 8,
        'gamma': 10,
        'scale_pos_weight': 1.6,
    }

    dtrain = xgb.DMatrix(xtrain, label=ytrain)
    dvalid = xgb.DMatrix(xvalid, label=yvalid)
    dtest = xgb.DMatrix(xtest)
    watchlist = [(dtrain, 'train'), (dvalid, 'test')]

    model = xgb.train(params, dtrain, num_rounds, watchlist,
                      early_stopping_rounds=50, verbose_eval=50)

    # The validation-set prediction that used to be computed here was never
    # used, so it has been dropped.
    if hasattr(model, 'best_ntree_limit'):
        # Older XGBoost releases: limit the prediction via ntree_limit.
        pred_test = model.predict(dtest, ntree_limit=model.best_ntree_limit)
    else:
        # XGBoost 2.x removed ntree_limit / best_ntree_limit; the same trees
        # are selected with iteration_range.
        pred_test = model.predict(
            dtest, iteration_range=(0, model.best_iteration + 1))
    return pred_test, model
    

In [None]:
# Stratified K-fold cross-validation: train one XGBoost model per fold,
# accumulate its test-set predictions and record its best validation score.
# shuffle=True is required for random_state to take effect; current
# scikit-learn raises ValueError when random_state is given without it.
kf = StratifiedKFold(n_splits=2, shuffle=True, random_state=seed)
pred_test_full = 0
cv_score = []
for i, (train_index, test_index) in enumerate(kf.split(X, y), start=1):
    print('{} of KFold {}'.format(i, kf.n_splits))
    # kf.split yields positional indices, so select rows with .iloc rather
    # than by label.
    xtr, xvl = X.iloc[train_index], X.iloc[test_index]
    ytr, yvl = y.iloc[train_index], y.iloc[test_index]

    pred_test, xg_model = runXGB(xtr, xvl, ytr, yvl, x_test, num_rounds=1000, eta=0.1)
    pred_test_full += pred_test
    cv_score.append(xg_model.best_score)

# Model Score (validation AUC)

In [None]:
# Per-fold best validation score reported by XGBoost (eval_metric is 'auc'),
# followed by the mean across folds.
print(cv_score)
print('Mean cv_score',np.mean(cv_score))

In [None]:
# Average the accumulated test predictions over the folds. Dividing by
# kf.n_splits keeps the average correct if the number of folds changes.
pred_xgb = pred_test_full / kf.n_splits

# Important Features

In [None]:
# Feature-importance chart for xg_model, which at this point is the model
# trained in the last cross-validation fold.
fig,ax = plt.subplots(figsize=(14,10))
xgb.plot_importance(xg_model,ax=ax,height=0.8,color='r')
plt.show()

# Predict

In [None]:
# Build the submission frame (test ids with the fold-averaged predictions)
# and write it to CSV without the index column.
y_pred = pred_xgb
submit = pd.DataFrame({'id':test['id'],'target':y_pred})
submit.to_csv('xgb_porto.csv',index=False)