In [1]:
import pickle
import copy
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib
%matplotlib inline
import matplotlib.pyplot as plt
import xgboost as xgb
from xgboost import XGBClassifier
from sklearn.metrics import roc_auc_score
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from statsmodels.stats.outliers_influence import variance_inflation_factor
import warnings
warnings.filterwarnings("ignore")

In [3]:
# Mount Google Drive at /content/drive (Colab-only step); the data files
# read and written by this notebook live under that mount point.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [4]:
# Read the merged train and test frames from the mounted Drive folder.

DATA_DIR = '/content/drive/My Drive/Colab Notebooks/ieee-fraud-detection/Data'

train_data = pd.read_pickle(DATA_DIR + '/merged_train.pkl')
test_data = pd.read_pickle(DATA_DIR + '/merged_test.pkl')

In [5]:
def cat_num_features(df):

    '''
        Split the column names of `df` into categorical and numerical ones.

        A column counts as categorical when its name belongs to the fixed
        list built below (ProductCD, card1-card6, addr1-addr2, the two e-mail
        domains, M1-M9, DeviceType, DeviceInfo, id_12-id_38) and it is
        present in `df`. Every remaining column except 'isFraud' is reported
        as numerical.

        Returns a tuple (catf, numf) of two lists of column names; catf
        follows the order of the fixed list, numf the column order of `df`.
    '''

    # Full list of the given categorical feature names
    known_categoricals = (
        ['ProductCD']
        + ['card' + str(k) for k in range(1, 7)]
        + ['addr1', 'addr2', 'P_emaildomain', 'R_emaildomain']
        + ['M' + str(k) for k in range(1, 10)]
        + ['DeviceType', 'DeviceInfo']
        + ['id_' + str(k) for k in range(12, 39)]
    )

    # Keep only the categorical names that exist in this dataframe
    available = set(df.columns.values)
    catf = [name for name in known_categoricals if name in available]

    # Everything else, apart from the target, is treated as numerical
    excluded = set(catf) | {'isFraud'}
    numf = []
    for name in df.columns:
        if name not in excluded:
            numf.append(name)

    return (catf, numf)

In [6]:
def label_encode(X_train, X_cv, catf):

  '''
    Utility Function to label-encode Categorical Features.

    For every column name in `catf`, the column is cast to str in both
    frames, a LabelEncoder is fitted on the X_train column only, and both
    columns are replaced by the resulting integer codes. Categories that
    occur in X_cv but not in X_train are encoded as -1.

    Both frames are modified in place and also returned as the tuple
    (X_train, X_cv).
  '''

  for f in catf:

    X_train[f] = X_train[f].astype(str)
    X_cv[f] = X_cv[f].astype(str)

    le = LabelEncoder()
    X_train[f] = le.fit_transform(X_train[f])

    # LabelEncoder.transform raises on labels it has not seen, so the CV / Test
    # column is encoded through a lookup table instead. classes_ is the sorted
    # array of training labels, so a label's position is its code.
    mapping = dict(zip(le.classes_, range(len(le.classes_))))

    # Series.map leaves categories missing from the table as NaN; those become
    # the -1 sentinel. One vectorised pass replaces the per-row Python lookup.
    X_cv[f] = X_cv[f].map(mapping).fillna(-1).astype('int64')

  return (X_train, X_cv)

In [7]:
# https://www.kaggle.com/cdeotte/eda-for-columns-v-and-id

def reduce_group(grps, df=None):
    '''
        Pick one representative column for every sub-group in `grps`.

        grps : dict mapping a parent column name to a collection (usually a
               set) of child column names.
        df   : DataFrame that holds those columns. Defaults to the
               notebook-level `train_data`, so existing calls of the form
               reduce_group(grps) keep working.

        For each (parent, children) entry the column with the largest number
        of distinct non-null values (Series.nunique) is chosen. On a tie the
        parent wins over any child, and among children the first one in
        sorted name order wins.

        Prints the chosen columns and returns them as a list with one name
        per entry, in the iteration order of `grps`.
    '''
    if df is None:
        df = train_data

    chosen = []
    for parent, children in grps.items():
        best_name = parent
        best_unique = df[parent].nunique()

        # Sets of strings iterate in an order that can change between
        # interpreter runs, so sort to make tie-breaking reproducible.
        for child in sorted(children):
            n = df[child].nunique()
            if n > best_unique:
                best_unique = n
                best_name = child

        chosen.append(best_name)

    print('Use these', chosen)

    return chosen

In [8]:
# Collect the categorical / numerical column names of the training frame and
# the positional index of every categorical column.

catf, numf = cat_num_features(train_data)
categorical_feature_indices = list(map(train_data.columns.get_loc, catf))

Removing redundant features

We will remove every feature that, in either the train or the test set:

- has more than 90% missing values,
- has the same constant value in all of its rows, or
- has one single value (missing values included) in more than 90% of its rows.

In [9]:
def _redundant_features(df):
    '''
        Scan every column of `df` and return three lists of column names:
        columns with more than 90% missing values, columns with at most one
        distinct non-null value, and columns whose most frequent value
        (NaN counted as a value) covers more than 90% of the rows.
    '''
    n_rows = len(df)
    high_null, one_value, constant_value = [], [], []

    for f in df.columns:
        column = df[f]
        if column.isna().sum()/n_rows > 0.9:
            high_null.append(f)
        if column.nunique()<=1:
            one_value.append(f)
        # value_counts sorts by frequency, so the first entry is the top share
        if column.value_counts(dropna=False, normalize=True).values[0] > 0.9:
            constant_value.append(f)

    return high_null, one_value, constant_value


(high_null_features_train,
 one_value_features_train,
 constant_value_features_train) = _redundant_features(train_data)

(high_null_features_test,
 one_value_features_test,
 constant_value_features_test) = _redundant_features(test_data)

In [10]:
# Union of every redundant column found in either frame.
redundant = set()
for names in (high_null_features_train, high_null_features_test,
              one_value_features_train, one_value_features_test,
              constant_value_features_train, constant_value_features_test):
  redundant.update(names)

# The target column must survive even if it matched one of the criteria.
redundant.discard('isFraud')

drop_cols = list(redundant)

train_data.drop(columns=drop_cols, inplace=True)
test_data.drop(columns=drop_cols, inplace=True)

In [11]:
# Recompute the categorical / numerical column names and the categorical
# column positions on the current columns of the training frame.

catf, numf = cat_num_features(train_data)
categorical_feature_indices = list(map(train_data.columns.get_loc, catf))

Removing inter-correlated features

In [12]:
# Names V1 .. V339 that are present in train_data, in numeric order.
v_features = [name for name in map("V{}".format, range(1,340)) if name in train_data.columns]

In [13]:
# Based on the EDA, the V features fall into groups that share the same
# number of missing values; bucket the column names by that count.

v_grps = {}

for key, val in train_data[v_features].isna().sum().items():
  v_grps.setdefault(val, []).append(key)

In [14]:
# Show each missing-value count next to the V features that share it.

print("\n\nMissing Values Count    ->    Corresponding Group\n\n")
for missing_count, grp in v_grps.items():
  print("{} -> {} \n\n".format(missing_count, grp))



Missing Values Count    ->    Corresponding Group


279287 -> ['V1', 'V2', 'V3', 'V4', 'V5', 'V6', 'V7', 'V8', 'V9', 'V10', 'V11'] 


76073 -> ['V12', 'V13', 'V15', 'V16', 'V17', 'V18', 'V19', 'V20', 'V21', 'V22', 'V29', 'V30', 'V31', 'V32', 'V33', 'V34'] 


168969 -> ['V35', 'V36', 'V37', 'V38', 'V39', 'V40', 'V41', 'V42', 'V43', 'V44', 'V45', 'V46', 'V47', 'V48', 'V49', 'V50', 'V51', 'V52'] 


77096 -> ['V53', 'V54', 'V56', 'V57', 'V58', 'V59', 'V60', 'V61', 'V62', 'V63', 'V64', 'V69', 'V70', 'V71', 'V72', 'V73', 'V74'] 


89164 -> ['V75', 'V76', 'V78', 'V79', 'V80', 'V81', 'V82', 'V83', 'V84', 'V85', 'V87', 'V90', 'V91', 'V92', 'V93', 'V94'] 


314 -> ['V95', 'V96', 'V97', 'V99', 'V100', 'V126', 'V127', 'V128', 'V130', 'V131'] 


508595 -> ['V138', 'V139', 'V140', 'V141', 'V142', 'V146', 'V147', 'V148', 'V149', 'V153', 'V154', 'V155', 'V156', 'V157', 'V158', 'V161', 'V162', 'V163'] 


508589 -> ['V143', 'V144', 'V145', 'V150', 'V151', 'V152', 'V159', 'V160', 'V164', 'V165', 'V166'

In [15]:
# Correlation heat map for every group of V features that share the same
# number of missing values.

for missing_count, grp in v_grps.items():

    # Bigger groups get a bigger canvas so the annotated cells stay readable.
    if(len(grp)<=12):
        size=(10,10)
    elif(len(grp)<=17):
        size=(15,15)
    elif(len(grp)<=22):
        size=(20,20)
    elif(len(grp)<=27):
        size=(25,25)
    else:
        size=(30,30)

    # Compute the correlation matrix once and reuse it for mask and plot.
    grp_corr = train_data[grp].corr()

    # Mask the upper triangle (diagonal included): the matrix is symmetric.
    # The builtin bool is used because the np.bool alias was removed in NumPy 1.24.
    mask = np.triu(np.ones_like(grp_corr, dtype=bool))

    plt.figure(figsize=size)
    sns.heatmap(np.round(grp_corr, 2), mask=mask, vmin=-1, vmax=1, annot=True, cmap='BrBG')
    plt.title('\nCorrelation Heat Map of V_features having {} missing values\n'.format(missing_count))
    plt.show()

Output hidden; open in https://colab.research.google.com to view.

In [16]:
# For every missing-value group, map each V feature (parent) to the set of
# other features of the same group whose correlation with it is at least 0.75.
# Only positive correlations pass this test; strongly negative ones do not.

groups_with_subgroups = []

for missing_count, grp in v_grps.items():

  parent_dict = {parent: set() for parent in grp}
  corr = train_data[grp].corr()

  for row in corr.iterrows():
    parent = row[0]
    for child,parent_child_corr in row[1].items():
      # Compare labels by value: `is not` tests object identity, which is not
      # guaranteed to agree with equality for two equal strings.
      if(parent != child and parent_child_corr>=0.75):
        parent_dict[parent].add(child)

  groups_with_subgroups.append(parent_dict)

In [17]:
# Turn the overlapping neighbour sets into disjoint sub-groups. Features are
# walked in order; each feature that is not yet claimed becomes a parent and
# claims all of its correlated features that are not yet claimed either.

visited = {name: False for name in map("V{}".format, range(1,340)) if name in train_data.columns}
final_grps = []

for grps in groups_with_subgroups:

  grp = {}

  for parent, children in grps.items():

    # Already absorbed by an earlier parent (or is one itself)
    if visited[parent]:
      continue

    unclaimed = {child for child in children if not visited[child]}
    visited.update(dict.fromkeys(unclaimed, True))

    grp[parent] = unclaimed
    visited[parent] = True

  final_grps.append(grp)

In [18]:
# Display the sub-groups: one dict per missing-value group, mapping each
# parent feature to the set of child features assigned to it.
final_grps


[{'V1': set(),
  'V2': {'V3'},
  'V4': {'V5'},
  'V6': {'V7'},
  'V8': {'V9'},
  'V10': {'V11'}},
 {'V12': {'V13'},
  'V15': {'V16', 'V17', 'V18', 'V21', 'V22', 'V31', 'V32', 'V33', 'V34'},
  'V19': {'V20'},
  'V29': {'V30'}},
 {'V35': {'V36'},
  'V37': {'V38'},
  'V39': {'V40', 'V42', 'V43', 'V50', 'V51'},
  'V41': set(),
  'V44': {'V45'},
  'V46': {'V47'},
  'V48': {'V49'},
  'V52': set()},
 {'V53': {'V54'},
  'V56': set(),
  'V57': {'V58', 'V59', 'V60', 'V63', 'V64', 'V71', 'V72', 'V73', 'V74'},
  'V61': {'V62'},
  'V69': {'V70'}},
 {'V75': {'V76'},
  'V78': set(),
  'V79': {'V81', 'V84', 'V85', 'V92', 'V93', 'V94'},
  'V80': set(),
  'V82': {'V83'},
  'V87': set(),
  'V90': {'V91'}},
 {'V95': {'V126', 'V127', 'V128', 'V96', 'V97'},
  'V99': {'V100'},
  'V130': set(),
  'V131': set()},
 {'V138': set(),
  'V139': {'V140'},
  'V141': {'V142'},
  'V146': {'V147'},
  'V148': {'V149', 'V153', 'V154', 'V155', 'V156', 'V157', 'V158'},
  'V161': {'V162', 'V163'}},
 {'V143': {'V164', 'V165'}

In [19]:
# The sub-groups found above are now reduced to one feature each: from every
# sub-group only the feature with the most unique values is kept, on the
# reasoning that it holds the most information.

chosen_v_features = []

for grps in final_grps:
  chosen_v_features.extend(reduce_group(grps))

Use these ['V1', 'V3', 'V4', 'V6', 'V8', 'V11']
Use these ['V13', 'V17', 'V20', 'V30']
Use these ['V36', 'V37', 'V40', 'V41', 'V44', 'V47', 'V48', 'V52']
Use these ['V54', 'V56', 'V60', 'V62', 'V70']
Use these ['V76', 'V78', 'V81', 'V80', 'V82', 'V87', 'V91']
Use these ['V127', 'V99', 'V130', 'V131']
Use these ['V138', 'V139', 'V142', 'V147', 'V158', 'V162']
Use these ['V165', 'V160', 'V166']
Use these ['V203', 'V207', 'V173', 'V176', 'V183', 'V187', 'V205', 'V216', 'V215']
Use these ['V169', 'V171', 'V174', 'V175', 'V180', 'V185', 'V188', 'V195', 'V198', 'V210', 'V209']
Use these ['V274', 'V223', 'V264', 'V263', 'V230', 'V235', 'V240', 'V241', 'V257', 'V252', 'V258', 'V260', 'V262', 'V265', 'V268', 'V277']
Use these ['V220', 'V221', 'V234', 'V238', 'V250', 'V256', 'V271']
Use these ['V307', 'V285', 'V291', 'V303', 'V310', 'V312']
Use these ['V282', 'V283', 'V289', 'V315', 'V314']
Use these ['V332', 'V325', 'V326', 'V328', 'V336', 'V335', 'V339', 'V338']


In [20]:
# Correlation heat map of the V features that were kept.

plt.figure(figsize=(40,40))
sns.heatmap(np.round(train_data[chosen_v_features].corr(), 2), vmin=-1, vmax=1, annot=True, cmap='BrBG')
# The title is a fixed string, so nothing is formatted into it and the cell
# does not rely on a loop variable left over from an earlier cell.
plt.title('\nCorrelation Heat Map of remaining V_features\n')
plt.show()

Output hidden; open in https://colab.research.google.com to view.

In [21]:
# V features that were not picked as the representative of their sub-group.
chosen_lookup = set(chosen_v_features)
not_chosen_v_fetaures = [f for f in v_features if f not in chosen_lookup]

In [22]:
# Remove the V features that were not chosen from both frames, in place.
for frame in (train_data, test_data):
  frame.drop(columns=not_chosen_v_fetaures, inplace=True)

In [23]:
# Report the shape of both frames after the basic data cleaning.

border = "*"*35

print(border)
print("\n Train Data Shape : {} \n".format(train_data.shape),
      "\n Test Data Shape : {} \n".format(test_data.shape),
      sep="\n")
print(border)

***********************************

 Train Data Shape : (590540, 187) 


 Test Data Shape : (506691, 186) 

***********************************


In [24]:
# Write the cleaned train and test frames back to the mounted Drive folder.

DATA_DIR = '/content/drive/My Drive/Colab Notebooks/ieee-fraud-detection/Data'

train_data.to_pickle(DATA_DIR + '/basic_clean_train.pkl')
test_data.to_pickle(DATA_DIR + '/basic_clean_test.pkl')