# Student Dropout Resampling

In [24]:
import warnings
# NOTE(review): with no category or module argument this filter silences
# every warning for the rest of the session, so any warning raised by a
# later cell (for example from pandas or scikit-learn) will not be shown.
# Consider narrowing it to the specific categories that are known noise.
warnings.filterwarnings('ignore')

In [25]:
import numpy as np
import pandas as pd
from pathlib import Path
from collections import Counter

In [26]:
from sklearn.metrics import balanced_accuracy_score
from sklearn.metrics import confusion_matrix
from imblearn.metrics import classification_report_imbalanced

# Read the CSV and Perform Basic Data Cleaning

In [27]:
# Columns kept from the raw CSV: ten predictors followed by the label column.
columns = [
    "mother_occupation",
    "father_occupation",
    "gender",
    "Daytime_evening_attendance",
    "marital_status",
    "Application_mode",
    "Course",
    "Previous_qualification",
    "Mother_qualification",
    "Father_qualification",
    "Target",
]

# Name of the label column, wrapped in a list.
target = ["Target"]

In [28]:
# Path to the dataset; it is relative, so it is resolved against the
# directory the notebook kernel is running in.
file_path = Path('./dropout.csv')

In [29]:
# Read the raw CSV into a DataFrame using pandas' default parsing options.
df = pd.read_csv(file_path)

In [30]:
# Keep only the columns listed in `columns`; the copy detaches the result
# from the frame that was read from disk.
df = df[columns].copy()

In [31]:
# Renumber the rows 0..n-1 and discard the previous index.
df = df.reset_index(drop=True)

In [32]:
# Encode the label as integers: 'Dropout' -> 0, 'Graduate' -> 1.
# The explicit cast avoids depending on pandas implicitly downcasting the
# object column after `replace` (a deprecated behaviour), and it raises
# right here if any label was left unmapped instead of failing later.
df['Target'] = df['Target'].replace({'Dropout': 0, 'Graduate': 1}).astype('int64')

df.head()

Unnamed: 0,mother_occupation,father_occupation,gender,Daytime_evening_attendance,marital_status,Application_mode,Course,Previous_qualification,Mother_qualification,Father_qualification,Target
0,6,10,1,1,1,8,2,1,13,10,0
1,4,4,1,1,1,6,11,1,1,3,1
2,10,10,1,1,1,1,5,1,22,27,0
3,6,4,0,1,1,8,15,1,23,27,1
4,10,10,0,0,2,12,3,1,22,28,1


# Split the Data into Training and Testing

In [33]:
# Build the feature matrix: drop the label column, then one-hot encode the
# listed columns. Every other column is passed through unchanged.
# NOTE(review): Course and the *_qualification columns stay as integer codes;
# confirm they are meant to be treated as numeric rather than nominal.
X = pd.get_dummies(
    df.drop(columns=["Target"]),
    columns=[
        "mother_occupation",
        "father_occupation",
        "gender",
        "marital_status",
        "Application_mode",
    ],
)

In [34]:
# Create our target: the "Target" column of df as a Series.
y = df["Target"]

In [35]:
# Display the encoded feature matrix to inspect the generated dummy columns.
X

Unnamed: 0,Daytime_evening_attendance,Course,Previous_qualification,Mother_qualification,Father_qualification,mother_occupation_1,mother_occupation_2,mother_occupation_3,mother_occupation_4,mother_occupation_5,...,Application_mode_9,Application_mode_10,Application_mode_11,Application_mode_12,Application_mode_13,Application_mode_14,Application_mode_15,Application_mode_16,Application_mode_17,Application_mode_18
0,1,2,1,13,10,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,1,11,1,1,3,0,0,0,1,0,...,0,0,0,0,0,0,0,0,0,0
2,1,5,1,22,27,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,1,15,1,23,27,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,3,1,22,28,0,0,0,0,0,...,0,0,0,1,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3625,1,15,1,1,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3626,1,15,1,1,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3627,1,12,1,22,27,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3628,1,9,1,22,27,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [36]:
# Show the dtype of every column in the cleaned frame.
df.dtypes

mother_occupation             int64
father_occupation             int64
gender                        int64
Daytime_evening_attendance    int64
marital_status                int64
Application_mode              int64
Course                        int64
Previous_qualification        int64
Mother_qualification          int64
Father_qualification          int64
Target                        int64
dtype: object

In [37]:
# Only the last expression of a cell is rendered automatically, so print the
# target dtype explicitly before showing the feature dtypes.
print(f"y dtype: {y.dtypes}")
X.dtypes

Daytime_evening_attendance    int64
Course                        int64
Previous_qualification        int64
Mother_qualification          int64
Father_qualification          int64
                              ...  
Application_mode_14           uint8
Application_mode_15           uint8
Application_mode_16           uint8
Application_mode_17           uint8
Application_mode_18           uint8
Length: 102, dtype: object

In [38]:
# Summary statistics (count, mean, std, min, quartiles, max) per feature column.
X.describe()

Unnamed: 0,Daytime_evening_attendance,Course,Previous_qualification,Mother_qualification,Father_qualification,mother_occupation_1,mother_occupation_2,mother_occupation_3,mother_occupation_4,mother_occupation_5,...,Application_mode_9,Application_mode_10,Application_mode_11,Application_mode_12,Application_mode_13,Application_mode_14,Application_mode_15,Application_mode_16,Application_mode_17,Application_mode_18
count,3630.0,3630.0,3630.0,3630.0,3630.0,3630.0,3630.0,3630.0,3630.0,3630.0,...,3630.0,3630.0,3630.0,3630.0,3630.0,3630.0,3630.0,3630.0,3630.0,3630.0
mean,0.887603,9.935537,2.552617,12.558678,16.663636,0.039394,0.023967,0.066116,0.074931,0.184573,...,0.029752,0.000275,0.000275,0.18292,0.018182,0.065289,0.043251,0.012672,0.006887,0.000275
std,0.315897,4.340715,3.95244,9.006183,10.993025,0.194557,0.152967,0.248518,0.263316,0.388004,...,0.169926,0.016598,0.016598,0.386654,0.133627,0.24707,0.203449,0.111871,0.082713,0.016598
min,0.0,1.0,1.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,1.0,6.0,1.0,2.0,3.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
50%,1.0,11.0,1.0,13.0,14.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
75%,1.0,13.0,1.0,22.0,27.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
max,1.0,17.0,17.0,29.0,34.0,1.0,1.0,1.0,1.0,1.0,...,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0


In [39]:
# Check the balance of our target values by counting the rows in each class
# (0 and 1 are the integer codes assigned to 'Dropout' and 'Graduate').
y.value_counts()

1    2209
0    1421
Name: Target, dtype: int64

In [40]:
from sklearn.model_selection import train_test_split

# Stratified split with a fixed seed, so both partitions keep the class
# proportions of y and the split is repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    stratify=y,
    random_state=1,
)

# Class counts in the training partition.
Counter(y_train)

Counter({1: 1656, 0: 1066})

# Oversampling

### Naive Random Oversampling

In [41]:
# Implement random oversampling.
# Only the training split is resampled; X_test / y_test are not passed to the
# sampler. random_state fixes the sampler's seed.
from imblearn.over_sampling import RandomOverSampler
ros = RandomOverSampler(random_state=1)
X_resampled, y_resampled = ros.fit_resample(X_train, y_train)
# Class counts after resampling.
Counter(y_resampled)

Counter({1: 1656, 0: 1656})

In [42]:
# Train the Logistic Regression model using the resampled data
from sklearn.linear_model import LogisticRegression

# NOTE(review): max_iter is left at its default and the features are not
# scaled. Because warnings are globally ignored earlier in this notebook, a
# ConvergenceWarning would not be visible here; confirm the solver converges.
model = LogisticRegression(solver='lbfgs', random_state=1)
# fit() is the cell's last expression, so its return value is displayed.
model.fit(X_resampled, y_resampled)

LogisticRegression(random_state=1)

In [43]:
# Calculate the balanced accuracy score on the test split, which was not
# resampled. balanced_accuracy_score is already imported in the metrics
# import cell at the top of the notebook.
y_pred = model.predict(X_test)
balanced_accuracy_score(y_test, y_pred)

0.6692101978962381

In [44]:
# Display the confusion matrix (rows: true class, columns: predicted class).
# Reuses the y_pred computed for the balanced accuracy score, and the
# confusion_matrix import from the metrics import cell at the top.
confusion_matrix(y_test, y_pred)

array([[219, 136],
       [154, 399]])

In [45]:
# Print the imbalanced classification report. print() is used because the
# report is a multi-line string. classification_report_imbalanced is already
# imported in the metrics import cell at the top of the notebook.
print(classification_report_imbalanced(y_test, y_pred))

                   pre       rec       spe        f1       geo       iba       sup

          0       0.59      0.62      0.72      0.60      0.67      0.44       355
          1       0.75      0.72      0.62      0.73      0.67      0.45       553

avg / total       0.68      0.68      0.66      0.68      0.67      0.45       908

