# Lab | Random Forests

For this lab, you will be using the CSV files provided in the files_for_lab folder

## Instructions

- Apply the Random Forests algorithm but this time only by upscaling the data using SMOTE
- Note that since SMOTE works on numerical data only, we will first encode the categorical variables in this case

In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split


In [2]:
numerical = pd.read_csv('files_for_lab/numerical.csv')
categorical = pd.read_csv('files_for_lab/categorical.csv')
targets = pd.read_csv('files_for_lab/target.csv')

## Targets

In [3]:
targets['TARGET_B'].value_counts()

0    90569
1     4843
Name: TARGET_B, dtype: int64

In [4]:
targets['TARGET_D'].value_counts()

0.00     90569
10.00      941
15.00      591
20.00      577
5.00       503
         ...  
4.50         1
55.00        1
18.25        1
16.87        1
48.00        1
Name: TARGET_D, Length: 71, dtype: int64

#### Note that since SMOTE works on numerical data only, we will first encode the categorical variables in this case

## Encode categorical variables

In [5]:
categorical_encoded = pd.get_dummies(categorical,drop_first=True)
categorical_encoded.head()

Unnamed: 0,CLUSTER,DATASRCE,DOMAIN_B,ODATEW_YR,ODATEW_MM,DOB_YR,DOB_MM,MINRDATE_YR,MINRDATE_MM,MAXRDATE_YR,...,RFA_2A_E,RFA_2A_F,RFA_2A_G,GEOCODE2_B,GEOCODE2_C,GEOCODE2_D,DOMAIN_A_R,DOMAIN_A_S,DOMAIN_A_T,DOMAIN_A_U
0,36,3,2,89,1,37,12,92,8,94,...,1,0,0,0,1,0,0,0,1,0
1,14,3,1,94,1,52,2,93,10,95,...,0,0,1,0,0,0,0,1,0,0
2,43,3,2,90,1,0,2,91,11,92,...,1,0,0,0,1,0,1,0,0,0
3,44,3,2,87,1,28,1,87,11,94,...,1,0,0,0,1,0,1,0,0,0
4,16,3,2,86,1,20,1,93,10,96,...,0,1,0,0,0,0,0,1,0,0


In [6]:
data = pd.concat([categorical_encoded,numerical,targets],axis=1)
data

Unnamed: 0,CLUSTER,DATASRCE,DOMAIN_B,ODATEW_YR,ODATEW_MM,DOB_YR,DOB_MM,MINRDATE_YR,MINRDATE_MM,MAXRDATE_YR,...,MAXRAMNT,LASTGIFT,TIMELAG,AVGGIFT,CONTROLN,HPHONE_D,RFA_2F,CLUSTER2,TARGET_B,TARGET_D
0,36,3,2,89,1,37,12,92,8,94,...,12.0,10.0,4,7.741935,95515,0,4,39,0,0.0
1,14,3,1,94,1,52,2,93,10,95,...,25.0,25.0,18,15.666667,148535,0,2,1,0,0.0
2,43,3,2,90,1,0,2,91,11,92,...,16.0,5.0,12,7.481481,15078,1,4,60,0,0.0
3,44,3,2,87,1,28,1,87,11,94,...,11.0,10.0,9,6.812500,172556,1,4,41,0,0.0
4,16,3,2,86,1,20,1,93,10,96,...,15.0,15.0,14,6.864865,7112,1,2,26,0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
95407,27,3,2,96,1,0,2,96,2,96,...,25.0,25.0,9,25.000000,184568,0,1,12,0,0.0
95408,24,3,1,96,1,50,1,96,3,96,...,20.0,20.0,9,20.000000,122706,1,1,2,0,0.0
95409,30,3,3,95,1,38,1,96,3,95,...,10.0,10.0,3,8.285714,189641,1,3,34,0,0.0
95410,24,2,1,86,1,40,5,90,11,96,...,21.0,18.0,4,12.146341,4693,1,4,11,1,18.0


In [7]:
X = data.drop('TARGET_B',axis = 1)
y = data['TARGET_B']

In [8]:
y.value_counts()

0    90569
1     4843
Name: TARGET_B, dtype: int64

## Upscaling the data using SMOTE

In [9]:
from imblearn.over_sampling import SMOTE
sm = SMOTE()
X_sm, y_sm = sm.fit_resample(X, y)

In [10]:
y_sm.value_counts()

0    90569
1    90569
Name: TARGET_B, dtype: int64

## Train Test split on upscaled data

In [11]:
X_train, X_test, y_train, y_test = train_test_split(X_sm, y_sm, test_size=0.4, random_state=0)

## Apply the Random Forests algorithm

In [12]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import KFold
from sklearn.metrics import f1_score


kf = KFold(n_splits=2)  ## Apply cross validation
results = []

for train_index, test_index in kf.split(X_sm):
    X_train, X_test = X_sm.loc[train_index, :], X_sm.loc[test_index, :]
    y_train, y_test = y_sm[train_index], y_sm[test_index]
    
    model = RandomForestClassifier(n_estimators = 10,random_state = 24)
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)
    results.append(f1_score(y_test, y_pred))
        
print("Accuracy of Random Forest Model is: ", np.mean(results))


Accuracy of Random Forest Model is:  0.9546183050585612
