In [1]:
import os
import time
import warnings
from timeit import default_timer as timer

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
import sklearn
import sklearn.metrics as sm
from imblearn.over_sampling import SMOTE
from sklearn.decomposition import PCA
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report, f1_score
from sklearn.model_selection import GridSearchCV, train_test_split
from sklearn.preprocessing import MinMaxScaler, StandardScaler
from tqdm import tqdm

In [12]:
def read_parquet(path='', cols=None):
    """
        Load statement-level data from a parquet file into a pandas dataframe.

        Parameters
        -----------
        path : str
            Path to training data in parquet format.
        cols : list of str, optional
            Subset of columns to load. ``None`` (default) loads every column.
            The subset does not have to contain ``customer_ID`` or ``S_2``.

        Returns
        ----------
        Pandas DataFrame
            ``customer_ID`` (when loaded) is replaced by the int64 value of its
            last 16 hex characters, ``S_2`` (when loaded) is parsed to datetime,
            and missing values are filled with -127.

        """
    # LOAD DATAFRAME
    # columns=None is pandas' own default, so a single call covers both cases.
    dataFrame = pd.read_parquet(path, columns=cols)

    # `cols` may leave out the key/date columns, so only convert what was loaded.
    if 'customer_ID' in dataFrame.columns:
        # Replace the string ID by the integer value of its last 16 hex characters.
        dataFrame['customer_ID'] = dataFrame['customer_ID'].str[-16:].apply(int, base=16).astype('int64')
    if 'S_2' in dataFrame.columns:
        dataFrame['S_2'] = pd.to_datetime(dataFrame['S_2'])

    # -127 is the sentinel used for missing values throughout the notebook.
    dataFrame = dataFrame.fillna(-127)
    print('shape of data:', dataFrame.shape)

    return dataFrame


# Location of the statement-level training data (relative to the working directory).
TRAIN_PATH = 'train.parquet'
print('Reading train data...')
train = read_parquet(path=TRAIN_PATH)

Reading train data...
shape of data: (5531451, 190)


In [14]:
def feature_engineering(df):
    """
    Aggregate statement-level rows into one feature row per customer.

    Numeric columns are summarised with mean/std/min/max/last and categorical
    columns with count/last/nunique. Result columns are named
    ``<column>_<aggregation>``.

    :param df: pandas dataframe of train dataset; must contain ``customer_ID``.
        ``S_2`` is not aggregated. ``last`` depends on the row order within
        each customer, and the frame is not re-sorted here.
    :return: feature engineered data, indexed by ``customer_ID``
    :raises ValueError: if ``df`` has no feature columns to aggregate

    """
    known_cat_features = ["B_30", "B_38", "D_114", "D_116", "D_117", "D_120", "D_126", "D_63", "D_64", "D_66", "D_68"]
    feature_cols = [c for c in df.columns if c not in ('customer_ID', 'S_2')]
    if not feature_cols:
        raise ValueError('feature_engineering: no feature columns to aggregate')

    # Only aggregate categorical columns that are actually present, so a frame
    # loaded with a column subset does not fail with a KeyError.
    cat_features = [c for c in known_cat_features if c in feature_cols]
    num_features = [c for c in feature_cols if c not in known_cat_features]

    grouped = df.groupby("customer_ID")
    parts = []

    if num_features:
        num_agg = grouped[num_features].agg(['mean', 'std', 'min', 'max', 'last'])
        # Flatten the (column, aggregation) MultiIndex into "column_aggregation".
        num_agg.columns = ['_'.join(x) for x in num_agg.columns]
        parts.append(num_agg)

    if cat_features:
        cat_agg = grouped[cat_features].agg(['count', 'last', 'nunique'])
        cat_agg.columns = ['_'.join(x) for x in cat_agg.columns]
        parts.append(cat_agg)

    df = pd.concat(parts, axis=1)
    del parts
    print('shape after engineering', df.shape)

    return df


# Not idempotent: `train` is rebound from statement-level rows to the
# per-customer aggregates, so re-read the data before running this cell again.
train = feature_engineering(train)


shape after engineering (458913, 918)


In [15]:
# ADD TARGETS
# NOTE(review): absolute, machine-specific path, whereas TRAIN_PATH is relative;
# consider keeping both files in one configurable data directory.
output_var = (
    pd.read_csv('C:/Users/shant/Downloads/train_labels.csv')
    # Same hex -> int64 encoding of customer_ID as read_parquet, so the keys line up.
    .assign(customer_ID=lambda labels: labels['customer_ID'].str[-16:].apply(int, base=16).astype('int64'))
    .set_index('customer_ID')
)
train = train.merge(output_var, how='left', left_index=True, right_index=True)
train['target'] = train['target'].astype('int8')
del output_var

# Order rows by customer and turn the customer_ID index back into the first column.
train = train.sort_index().reset_index()

# FEATURES
# Every column between the leading customer_ID and the trailing target.
FEATURES = train.columns[1:-1]
print(f'There are {len(FEATURES)} features!')

There are 918 features!


In [16]:
# Display the names of the model input columns (train.columns[1:-1]).
FEATURES

Index(['P_2_mean', 'P_2_std', 'P_2_min', 'P_2_max', 'P_2_last', 'D_39_mean',
       'D_39_std', 'D_39_min', 'D_39_max', 'D_39_last',
       ...
       'D_63_nunique', 'D_64_count', 'D_64_last', 'D_64_nunique', 'D_66_count',
       'D_66_last', 'D_66_nunique', 'D_68_count', 'D_68_last', 'D_68_nunique'],
      dtype='object', length=918)

In [17]:
# Names of the columns that still contain missing values.
# The null counts are computed once instead of scanning the whole frame twice.
train.isnull().sum().loc[lambda null_counts: null_counts != 0].index

Index(['P_2_std', 'D_39_std', 'B_1_std', 'B_2_std', 'R_1_std', 'S_3_std',
       'D_41_std', 'B_3_std', 'D_42_std', 'D_43_std',
       ...
       'D_136_std', 'D_137_std', 'D_138_std', 'D_139_std', 'D_140_std',
       'D_141_std', 'D_142_std', 'D_143_std', 'D_144_std', 'D_145_std'],
      dtype='object', length=177)

In [18]:
# Drop every customer that has at least one missing feature value.
# NOTE(review): the cell above lists only *_std columns as containing nulls, so this
# presumably removes customers with a single statement (std of one value is NaN).
# Confirm that excluding them is intended.
shape_before_remove = len(train)
train = train.dropna(axis=0, how='any')
removed_count = abs(shape_before_remove - len(train))
print('Number of records removed: ', removed_count)
print('% of customers removed from data: ', removed_count * 100 / shape_before_remove)

Number of records removed:  5120
% of customers removed from data:  1.1156798783211632


In [19]:
# Separate the label vector from the feature matrix; customer_ID is an
# identifier, not a predictor, so it is dropped together with the target.
y = train['target']
X = train.drop(columns=['customer_ID', 'target'])
print(X.shape, y.shape, sep='\n')

(453793, 918)
(453793,)


### Random Forest model on Imbalanced Dataset

The Random Forest model was trained several times with the parameters listed below, varying the number of trees from 100 to 500 in increments of 100 to determine the optimal number of trees.  

criterion = ‘gini’: Gini impurity is used to measure the quality of each split.  

max_features = ‘sqrt’: The number of features considered at each split is the square root of the total number of features. 

Bootstrap = True:  Samples are drawn with replacement 

random_state = 1: Fixes the random seed used for bootstrap sampling and feature selection, so that runs are reproducible

It was observed that changing the number of trees did not affect the test performance significantly. Hence, the number of trees was set to 100, which is the least computationally expensive option while still allowing the model to generalize well.

In [20]:
# Hold out 30% of the customers for evaluation; the fixed seed keeps the
# partition identical between runs. (train_test_split is imported in the
# notebook's top import cell.)
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, random_state=42
)

In [22]:
# Baseline random forest trained on the original (imbalanced) training split.
# RandomForestClassifier is imported in the notebook's top import cell.
clf = RandomForestClassifier(
    n_estimators=100,
    criterion='gini',
    max_features='sqrt',
    bootstrap=True,
    random_state=1,
    n_jobs=2,
    verbose=True,
)
clf.fit(X_train, y_train)

# Hard class predictions for the held-out split.
y_pred = clf.predict(X_test)

[Parallel(n_jobs=2)]: Using backend ThreadingBackend with 2 concurrent workers.
[Parallel(n_jobs=2)]: Done  46 tasks      | elapsed:  3.8min
[Parallel(n_jobs=2)]: Done 100 out of 100 | elapsed:  8.5min finished
[Parallel(n_jobs=2)]: Using backend ThreadingBackend with 2 concurrent workers.
[Parallel(n_jobs=2)]: Done  46 tasks      | elapsed:    2.2s
[Parallel(n_jobs=2)]: Done 100 out of 100 | elapsed:    4.6s finished


In [23]:
# Per-class precision/recall/F1 of the baseline model on the held-out split.
# classification_report is imported in the notebook's top import cell.
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.93      0.93      0.93    100948
           1       0.80      0.79      0.80     35190

    accuracy                           0.90    136138
   macro avg       0.86      0.86      0.86    136138
weighted avg       0.89      0.90      0.90    136138



Evaluating the trained Random Forest model on the test set shows 93% recall for class 0 and 79% recall for class 1, with an overall accuracy of 90%.

### Attempt for Grid Search for Optimal Hyper-parameters

To improve performance further, we attempted a grid search to find the optimal hyper-parameters.

# GridSearchCV, RandomForestClassifier, accuracy_score and f1_score are
# imported in the notebook's top import cell.
param_grid = {
    'n_estimators': [10, 20, 40, 60, 80],
    'criterion': ['gini'],
    'max_depth': [5, 7, 10, 15, 20, 50],
    'max_features': ['sqrt'],
    'bootstrap': [True],
}
# 5 x 6 = 30 parameter combinations, each cross-validated on 5 folds = 150 fits.
# The seed makes the forests (and therefore the selected parameters) reproducible.
rf_grid = RandomForestClassifier(random_state=1)
grid_search = GridSearchCV(rf_grid, param_grid, n_jobs=5, cv=5, verbose=True)
grid_search.fit(X_train, y_train)
# Bare expressions in the middle of a cell are not displayed, so print them.
print("Best parameters:", grid_search.best_params_)
print("Best estimator:", grid_search.best_estimator_)

y_pred1 = grid_search.predict(X_test)

# Call the metric functions directly: the previous `metrics.` prefix referred to
# a name that was never imported and would have raised NameError after the fit.
print("Accuracy:", accuracy_score(y_test, y_pred1))
print("F1-score:", f1_score(y_test, y_pred1))

The search kept running for more than 24 hours without producing any results, so we decided to interrupt the process.

### Random Forest model with SMOTE

The dataset has imbalanced classes; to reduce the resulting bias towards the majority class, we balance the class distribution with SMOTE (Synthetic Minority Oversampling Technique). Applying SMOTE to the whole dataset would place synthetic samples in the test set, which leads to overfitting and an over-optimistic performance estimate. It is therefore applied to the training set only, while the untouched test set provides the estimate of operational performance.

In [24]:
# Oversample the minority class on the training split only; X_test / y_test are
# left untouched. SMOTE and RandomForestClassifier are imported in the notebook's
# top import cell.
# The sampler is named `smote` (not `sm`) so it does not shadow the
# `sklearn.metrics as sm` alias, and it is seeded so the synthetic samples are
# reproducible between runs.
smote = SMOTE(random_state=42)
X_train_oversampled, y_train_oversampled = smote.fit_resample(X_train, y_train)

# Same forest configuration as the baseline model, trained on the balanced data.
clf_sm = RandomForestClassifier(
    n_estimators=100,
    criterion='gini',
    max_features='sqrt',
    bootstrap=True,
    random_state=1,
    n_jobs=2,
    verbose=True,
)
clf_sm.fit(X_train_oversampled, y_train_oversampled)

# Hard class predictions for the original (non-resampled) held-out split.
y_pred_sm = clf_sm.predict(X_test)

[Parallel(n_jobs=2)]: Using backend ThreadingBackend with 2 concurrent workers.
[Parallel(n_jobs=2)]: Done  46 tasks      | elapsed:  7.2min
[Parallel(n_jobs=2)]: Done 100 out of 100 | elapsed: 13.3min finished
[Parallel(n_jobs=2)]: Using backend ThreadingBackend with 2 concurrent workers.
[Parallel(n_jobs=2)]: Done  46 tasks      | elapsed:    2.4s
[Parallel(n_jobs=2)]: Done 100 out of 100 | elapsed:    4.8s finished


In [25]:
# Per-class precision/recall/F1 of the SMOTE-trained model on the held-out split.
# classification_report is imported in the notebook's top import cell.
print(classification_report(y_test, y_pred_sm))

              precision    recall  f1-score   support

           0       0.94      0.90      0.92    100948
           1       0.76      0.85      0.80     35190

    accuracy                           0.89    136138
   macro avg       0.85      0.88      0.86    136138
weighted avg       0.90      0.89      0.89    136138



The model trained on the SMOTE-balanced data clearly shows improved recall for class 1 (85%, up from 79%), but reduced recall for class 0 (90%, down from 93%) and a slightly lower accuracy of 89% on the test set. The f1-score for class 1 is unchanged at 0.80 compared with the earlier model, because the gain in recall is offset by lower precision (0.76 vs. 0.80). Overall, SMOTE trades some True Negatives for more True Positives: the macro-average recall rises from 0.86 to 0.88. 