# Loading dataset

In [1]:
import pandas as pd
df = pd.read_csv("https://raw.githubusercontent.com/catanaranjo/fraudDetection/master/dev.csv") 
dfo = pd.read_csv("https://raw.githubusercontent.com/catanaranjo/fraudDetection/master/oot0.csv")

# Creating new column for identifying train and test dataset

In [2]:
df['dataset'] = 'train'
dfo['dataset'] = 'test'
dfo['ob_target'] = 0
trainTestDF= df.append(dfo)

# Identifying numeric features

In [3]:
numericList = []
for i in range(65,82):
    numericList.append('if_var_' + str(i))

# Removing outliers

In [4]:
import numpy as np
import warnings; warnings.simplefilter('ignore')
def change_outliers(dataframe):
    list = []
    list = numericList
    for column in list:
        q1 = dataframe[column].quantile(0.25)
        q3 = dataframe[column].quantile(0.75)
        iqr = q3-q1 #Interquartile range
        highest = q3 + 1.5*iqr
        lowest = q1 - 1.5*iqr
        median = dataframe.loc[dataframe[column]<q3 + 1.5*iqr, column].median()
        outliersUp = (dataframe[column]).abs() > round(q3 + 1.5*iqr)
        outliersDown = (dataframe[column]).abs() < round(q1 - 1.5*iqr)
        dataframe[column][outliersUp] = np.nan
        dataframe[column].fillna(highest, inplace=True)
        dataframe[column][outliersDown] = np.nan
        dataframe[column].fillna(lowest, inplace=True)

numericDF = trainTestDF[numericList]
change_outliers(numericDF)

# Removing NA values

In [5]:
trainTestDF['ib_var_12']=trainTestDF.ib_var_12.transform(lambda x: x.fillna(x.mode()[0]))
trainTestDF['ib_var_15']=trainTestDF.ib_var_15.transform(lambda x: x.fillna(x.mode()[0]))
trainTestDF['ib_var_16']=trainTestDF.ib_var_16.transform(lambda x: x.fillna(x.mode()[0]))
trainTestDF['ib_var_17']=trainTestDF.ib_var_17.transform(lambda x: x.fillna(x.mode()[0]))
trainTestDF['ib_var_18']=trainTestDF.ib_var_18.transform(lambda x: x.fillna(x.mode()[0]))
trainTestDF['ib_var_19']=trainTestDF.ib_var_19.transform(lambda x: x.fillna(x.mode()[0]))
trainTestDF['ib_var_20']=trainTestDF.ib_var_20.transform(lambda x: x.fillna(x.mode()[0]))
trainTestDF['ib_var_21']=trainTestDF.ib_var_21.transform(lambda x: x.fillna(x.mode()[0]))
trainTestDF['icn_var_22']=trainTestDF.icn_var_22.transform(lambda x: x.fillna(x.mode()[0]))
trainTestDF['icn_var_24']=trainTestDF.icn_var_24.transform(lambda x: x.fillna(x.mode()[0]))
trainTestDF['ico_var_26']=trainTestDF.ico_var_26.transform(lambda x: x.fillna(x.median()))
trainTestDF['ico_var_33']=trainTestDF.ico_var_33.transform(lambda x: x.fillna(x.median()))
trainTestDF['ico_var_37']=trainTestDF.ico_var_37.transform(lambda x: x.fillna(x.median()))
trainTestDF['if_var_78']=trainTestDF.if_var_78.transform(lambda x: x.fillna(x.median()))

# PCA for dimensionality reduction

In [6]:
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler

scaler = StandardScaler()
numericDF = trainTestDF[numericList]
numericDF = pd.DataFrame(scaler.fit_transform(numericDF),columns=numericDF.columns)
numericDF.head()

pca = PCA(n_components=8,random_state=42)
transformed = pca.fit_transform(numericDF)

pcaDF = pd.DataFrame(data = transformed
             , columns = ['PCA1', 'PCA2', 'PCA3','PCA4','PCA5','PCA6','PCA7','PCA8'])
pcaDF.reset_index(inplace=True)
trainTestDF = trainTestDF.join(pcaDF)

# OHE for categorical features

In [7]:
categorical_transformed_trainDF = trainTestDF[trainTestDF['dataset']=='train']
categorical_transformed_testDF = trainTestDF[trainTestDF['dataset']=='test']

# y_train = trainTestDF['ob_target']
y_train = categorical_transformed_trainDF['ob_target']

drops = ['ob_target','dataset']
categorical_ohe_train = categorical_transformed_trainDF.loc[:,~categorical_transformed_trainDF.columns.isin(drops)]
categorical_ohe_test = categorical_transformed_testDF.loc[:,~categorical_transformed_testDF.columns.isin(drops)]

X_train = pd.get_dummies(categorical_ohe_train) 
X_test = pd.get_dummies(categorical_ohe_test)
X_train.head()

Unnamed: 0,id,ib_var_1,ib_var_2,ib_var_3,ib_var_4,ib_var_5,ib_var_6,ib_var_7,ib_var_8,ib_var_9,...,if_var_81,index,PCA1,PCA2,PCA3,PCA4,PCA5,PCA6,PCA7,PCA8
0,1,0,1,0,0,1,1,0,0,0,...,1,0,1.702443,-0.24049,-1.858499,-0.151677,0.666923,0.086148,-0.543337,0.091393
1,2,0,1,0,0,0,1,0,0,0,...,4,1,0.651663,-0.711393,1.271747,2.329395,1.393649,0.745848,-0.610135,0.152565
2,3,0,0,0,0,1,1,0,0,0,...,2,2,0.908728,0.399952,0.895837,-0.064963,-0.342891,0.816829,-0.470053,-0.042002
3,4,0,1,0,1,1,1,0,0,0,...,3,3,1.794498,-0.480808,0.091275,1.277642,1.36812,2.154262,-0.365349,0.675085
4,5,0,0,0,0,0,1,0,0,0,...,3,4,0.052171,0.127745,1.042783,-0.379492,-0.488249,-1.109839,0.226669,-0.254306


# Baseline Random Forest

In [8]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import roc_auc_score

baseline_rf = RandomForestClassifier(n_jobs=-1,random_state=42,n_estimators=4000,criterion='entropy',max_features='log2',
                                    bootstrap=True, min_samples_split=3,min_samples_leaf=1,oob_score=True)

baseline_rf.fit(X_train,y_train)
y_pred_rf = baseline_rf.predict_proba(X_train)[:,1]

 
gini_score = 2*roc_auc_score(y_train, y_pred_rf)-1
print ("GINI DEVELOPMENT=", gini_score)

GINI DEVELOPMENT= 1.0
