# Balanced Random Forest Claasifier

In [1]:
import warnings
warnings.filterwarnings('ignore')

In [2]:
import numpy as np
import pandas as pd
from pathlib import Path
from collections import Counter

In [3]:
from sklearn.metrics import balanced_accuracy_score
from sklearn.metrics import confusion_matrix
from imblearn.metrics import classification_report_imbalanced

### Read the CSV and Perform Basic Data Cleaning

In [4]:
# Load the data
file_path = Path('../dropout.csv')
df = pd.read_csv(file_path)
df.head()



Unnamed: 0,ID,mother_occupation,father_occupation,gender,Daytime_evening_attendance,Target,marital_status,Nacionality,Application_mode,Course,Previous_qualification,Mother_qualification,Father_qualification
0,0,6,10,1,1,Dropout,1,1,8,2,1,13,10
1,1,4,4,1,1,Graduate,1,1,6,11,1,1,3
2,2,10,10,1,1,Dropout,1,1,1,5,1,22,27
3,3,6,4,0,1,Graduate,1,1,8,15,1,23,27
4,4,10,10,0,0,Graduate,2,1,12,3,1,22,28


In [5]:
# Replace 'dropout' with 0 and 'graduate' with 1
df['Target'] = df['Target'].replace({'Dropout': 0, 'Graduate': 1})

df.head()

Unnamed: 0,ID,mother_occupation,father_occupation,gender,Daytime_evening_attendance,Target,marital_status,Nacionality,Application_mode,Course,Previous_qualification,Mother_qualification,Father_qualification
0,0,6,10,1,1,0,1,1,8,2,1,13,10
1,1,4,4,1,1,1,1,1,6,11,1,1,3
2,2,10,10,1,1,0,1,1,1,5,1,22,27
3,3,6,4,0,1,1,1,1,8,15,1,23,27
4,4,10,10,0,0,1,2,1,12,3,1,22,28


In [6]:
df.dtypes

ID                            int64
mother_occupation             int64
father_occupation             int64
gender                        int64
Daytime_evening_attendance    int64
Target                        int64
marital_status                int64
Nacionality                   int64
Application_mode              int64
Course                        int64
Previous_qualification        int64
Mother_qualification          int64
Father_qualification          int64
dtype: object

In [7]:
# Create our features
X = df.drop(columns=['Target'])

# Create our target
y = df['Target']



In [9]:
# Check the balance of our target values
y.value_counts()

1    2209
0    1421
Name: Target, dtype: int64

In [10]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(X, 
                                                    y, 
                                                    random_state=1, 
                                                    stratify=y)

Counter(y_train)

Counter({1: 1656, 0: 1066})

In [11]:
# Resample the training data with the BalancedRandomForestClassifier

from imblearn.ensemble import BalancedRandomForestClassifier
brf_model = BalancedRandomForestClassifier(n_estimators=100, random_state=1)

# Fitting the model
brf_model = brf_model.fit(X_train, y_train)

# Making predictions using the testing data
predictions = brf_model.predict(X_test)

In [12]:
# Calculated the balanced accuracy score
# YOUR CODE HERE
from sklearn.metrics import balanced_accuracy_score

y_pred = brf_model.predict(X_test)
balanced_accuracy_score(y_test, y_pred)

0.6575783816825

In [13]:
from sklearn.metrics import confusion_matrix, accuracy_score, classification_report

In [14]:
# Display the confusion matrix
# YOUR CODE HEREconfusion_matrix(y_test, y_pred)
confusion_matrix(y_test, predictions)

array([[230, 125],
       [184, 369]])

In [15]:
# Print the imbalanced classification report
print(classification_report_imbalanced(y_test, y_pred))

                   pre       rec       spe        f1       geo       iba       sup

          0       0.56      0.65      0.67      0.60      0.66      0.43       355
          1       0.75      0.67      0.65      0.70      0.66      0.43       553

avg / total       0.67      0.66      0.66      0.66      0.66      0.43       908

