In [304]:
import pandas as pd
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
import pathlib as pl
from imblearn.over_sampling import SMOTE
from sklearn.metrics import accuracy_score,confusion_matrix,classification_report

In [305]:
# Processed dataset location, relative to the notebook's working directory.
path = pl.Path('../../data/processed/data_set.csv')

# Read the whole CSV into memory and preview the first five rows.
df = pd.read_csv(filepath_or_buffer=path)

df.head(5)

Unnamed: 0,student_id,age,gender,stratum,residence,civil_status,total_repetitions,deserter,average
0,1,37,1,2,0,0,1,1,3.233333
1,2,35,1,2,1,2,0,1,4.64
2,3,57,1,2,0,2,8,1,4.16
3,4,31,1,2,0,0,0,1,4.0
4,5,32,1,2,0,0,1,0,4.02


In [306]:
# Number of missing values in each column (isnull is the pandas alias of isna).
print(df.isnull().sum())

student_id           0
age                  0
gender               0
stratum              0
residence            0
civil_status         0
total_repetitions    0
deserter             0
average              4
dtype: int64


In [307]:
# Replace every missing value in the frame with 0; the null count above shows
# that only `average` (4 rows) is affected.
# NOTE(review): 0 then acts as a real grade average for those rows - confirm
# this is intended rather than e.g. dropping the rows or imputing a median.
df = df.fillna(value=0)

In [308]:
# Show column dtypes and non-null counts to confirm no nulls remain after the fill.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 460 entries, 0 to 459
Data columns (total 9 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   student_id         460 non-null    int64  
 1   age                460 non-null    int64  
 2   gender             460 non-null    int64  
 3   stratum            460 non-null    int64  
 4   residence          460 non-null    int64  
 5   civil_status       460 non-null    int64  
 6   total_repetitions  460 non-null    int64  
 7   deserter           460 non-null    int64  
 8   average            460 non-null    float64
dtypes: float64(1), int64(8)
memory usage: 32.5 KB


In [309]:
# Frequency of each target class (0 / 1) in the full dataset.
count = df.loc[:, 'deserter'].value_counts()

count.head(5)

deserter
0    280
1    180
Name: count, dtype: int64

# Unbalanced classes [0.7/0.3 train/test split]

In [310]:
# Features: every column except the target and the row identifier.
X = df.drop(columns=['deserter', 'student_id'])
# Target: the binary `deserter` flag.
y = df['deserter']

In [311]:
# Hold out 30% of the rows for testing; the fixed seed keeps the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    random_state=42,
)

# Logistic regression with a raised iteration cap.
model = LogisticRegression(max_iter=5000)

model.fit(X=X_train, y=y_train)

0,1,2
,penalty,'l2'
,dual,False
,tol,0.0001
,C,1.0
,fit_intercept,True
,intercept_scaling,1
,class_weight,
,random_state,
,solver,'lbfgs'
,max_iter,5000


In [312]:
# Predicted class labels for the held-out test rows.
y_pred = model.predict(X=X_test)

In [313]:
# Report test-set metrics for the model fitted above.
# The confusion matrix and the classification report both print as multi-line
# text, so each label goes on its own line; printing the label in front of the
# value shifts the first row and breaks the column alignment.
print('Accuracy Score', accuracy_score(y_test, y_pred))
print()
print('Confusion matrix')
print(confusion_matrix(y_test, y_pred))
print()
print('Classification Report')
print(classification_report(y_test, y_pred))

Accuracy Score 0.7246376811594203

Confusion matrix [[69 14]
 [24 31]]

Classification Report               precision    recall  f1-score   support

           0       0.74      0.83      0.78        83
           1       0.69      0.56      0.62        55

    accuracy                           0.72       138
   macro avg       0.72      0.70      0.70       138
weighted avg       0.72      0.72      0.72       138



# Balanced classes via random undersampling [0.7/0.3 train/test split]

In [314]:
# Separate the rows by class; class 0 is the larger one (280 vs 180 above).
df_majority = df.loc[df['deserter'].eq(0)]
df_minority = df.loc[df['deserter'].eq(1)]

# Randomly keep only as many majority rows as there are minority rows.
# NOTE(review): this undersampling runs before the train/test split, so the
# test set is drawn from the reduced frame as well; the SMOTE section further
# down resamples the training split only.
df_majority_reduced = df_majority.sample(
    n=df_minority.shape[0],
    random_state=42,
)

In [315]:
# Stack the reduced majority rows on the minority rows, then shuffle all rows
# (frac=1 draws every row once) and renumber the index from 0.
df_balanced = (
    pd.concat([df_majority_reduced, df_minority])
    .sample(frac=1, random_state=42)
    .reset_index(drop=True)
)

In [316]:
# Check that both classes now have the same number of rows.
print("Distribution after undersampling")
df_balanced.loc[:, 'deserter'].value_counts()

Distribution after undersampling


deserter
1    180
0    180
Name: count, dtype: int64

In [317]:
# Features and target taken from the undersampled frame.
X = df_balanced.drop(columns=['deserter', 'student_id'])
y = df_balanced['deserter']

In [318]:
# 70/30 split of the undersampled data with a fixed seed.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    random_state=42,
)

# Same estimator configuration as in the unbalanced run.
model = LogisticRegression(max_iter=5000)

model.fit(X=X_train, y=y_train)

0,1,2
,penalty,'l2'
,dual,False
,tol,0.0001
,C,1.0
,fit_intercept,True
,intercept_scaling,1
,class_weight,
,random_state,
,solver,'lbfgs'
,max_iter,5000


In [319]:
# Predicted class labels for the test portion of the undersampled data.
y_pred = model.predict(X=X_test)

In [320]:
# Report test-set metrics for the model fitted on the undersampled data.
# The confusion matrix and the classification report both print as multi-line
# text, so each label goes on its own line; printing the label in front of the
# value shifts the first row and breaks the column alignment.
print('Accuracy Score', accuracy_score(y_test, y_pred))
print()
print('Confusion matrix')
print(confusion_matrix(y_test, y_pred))
print()
print('Classification Report')
print(classification_report(y_test, y_pred))

Accuracy Score 0.7407407407407407

Confusion matrix [[44  7]
 [21 36]]

Classification Report               precision    recall  f1-score   support

           0       0.68      0.86      0.76        51
           1       0.84      0.63      0.72        57

    accuracy                           0.74       108
   macro avg       0.76      0.75      0.74       108
weighted avg       0.76      0.74      0.74       108



# Unbalanced classes [0.8/0.2 train/test split]

In [321]:
# Back to the full (unbalanced) frame: all columns except target and identifier.
X = df.drop(columns=['deserter', 'student_id'])
y = df['deserter']

In [322]:
# Hold out 20% of the rows for testing; the fixed seed keeps the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

# Logistic regression with a raised iteration cap.
model = LogisticRegression(max_iter=5000)

model.fit(X=X_train, y=y_train)

0,1,2
,penalty,'l2'
,dual,False
,tol,0.0001
,C,1.0
,fit_intercept,True
,intercept_scaling,1
,class_weight,
,random_state,
,solver,'lbfgs'
,max_iter,5000


In [323]:
# Predicted class labels for the 20% held-out rows.
y_pred = model.predict(X=X_test)

In [324]:
# Report test-set metrics for the model fitted above.
# The confusion matrix and the classification report both print as multi-line
# text, so each label goes on its own line; printing the label in front of the
# value shifts the first row and breaks the column alignment.
print('Accuracy Score', accuracy_score(y_test, y_pred))
print()
print('Confusion matrix')
print(confusion_matrix(y_test, y_pred))
print()
print('Classification Report')
print(classification_report(y_test, y_pred))

Accuracy Score 0.7065217391304348

Confusion matrix [[41 15]
 [12 24]]

Classification Report               precision    recall  f1-score   support

           0       0.77      0.73      0.75        56
           1       0.62      0.67      0.64        36

    accuracy                           0.71        92
   macro avg       0.69      0.70      0.70        92
weighted avg       0.71      0.71      0.71        92



# Balanced classes via random undersampling [0.8/0.2 train/test split]

In [325]:
# Separate the rows by class; class 0 is the larger one (280 vs 180 above).
df_majority = df.loc[df['deserter'].eq(0)]
df_minority = df.loc[df['deserter'].eq(1)]

# Randomly keep only as many majority rows as there are minority rows. The
# seed matches the earlier undersampling cell, so the same rows are selected.
# NOTE(review): this undersampling runs before the train/test split, so the
# test set is drawn from the reduced frame as well; the SMOTE section further
# down resamples the training split only.
df_majority_reduced = df_majority.sample(
    n=df_minority.shape[0],
    random_state=42,
)

In [326]:
# Stack the reduced majority rows on the minority rows, then shuffle all rows
# (frac=1 draws every row once) and renumber the index from 0.
df_balanced = (
    pd.concat([df_majority_reduced, df_minority])
    .sample(frac=1, random_state=42)
    .reset_index(drop=True)
)

In [327]:
# Check that both classes now have the same number of rows.
print("Distribution after undersampling")
df_balanced.loc[:, 'deserter'].value_counts()

Distribution after undersampling


deserter
1    180
0    180
Name: count, dtype: int64

In [328]:
# Features and target taken from the undersampled frame.
X = df_balanced.drop(columns=['deserter', 'student_id'])
y = df_balanced['deserter']

In [329]:
# 80/20 split of the undersampled data with a fixed seed.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

# Same estimator configuration as in the other runs.
model = LogisticRegression(max_iter=5000)

model.fit(X=X_train, y=y_train)

0,1,2
,penalty,'l2'
,dual,False
,tol,0.0001
,C,1.0
,fit_intercept,True
,intercept_scaling,1
,class_weight,
,random_state,
,solver,'lbfgs'
,max_iter,5000


In [330]:
# Predicted class labels for the test portion of the undersampled data.
y_pred = model.predict(X=X_test)

In [331]:
# Report test-set metrics for the model fitted on the undersampled data.
# The confusion matrix and the classification report both print as multi-line
# text, so each label goes on its own line; printing the label in front of the
# value shifts the first row and breaks the column alignment.
print('Accuracy Score', accuracy_score(y_test, y_pred))
print()
print('Confusion matrix')
print(confusion_matrix(y_test, y_pred))
print()
print('Classification Report')
print(classification_report(y_test, y_pred))

Accuracy Score 0.6944444444444444

Confusion matrix [[22  9]
 [13 28]]

Classification Report               precision    recall  f1-score   support

           0       0.63      0.71      0.67        31
           1       0.76      0.68      0.72        41

    accuracy                           0.69        72
   macro avg       0.69      0.70      0.69        72
weighted avg       0.70      0.69      0.70        72



# Using SMOTE for Class Balancing

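SMOTE (Synthetic Minority Oversampling Technique) balances the classes by generating synthetic minority-class examples, each interpolated between an existing minority sample and one of its nearest minority-class neighbors.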

In [332]:
# Rebuild features and target from the full, unbalanced frame.
X = df.drop(columns=['deserter', 'student_id'])
y = df['deserter']

# Count each class once and reuse the result for both the table and the ratio.
class_counts = y.value_counts()

print("Original class distribution:")
print(class_counts)
print(f"Class ratio: {class_counts[0] / class_counts[1]:.2f}:1")

Original class distribution:
deserter
0    280
1    180
Name: count, dtype: int64
Class ratio: 1.56:1


In [333]:
# Split before resampling so that SMOTE only ever sees the training rows and
# the test set is left as it is.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    random_state=42,
)

print("Training set class distribution before SMOTE:")
print(y_train.value_counts())

Training set class distribution before SMOTE:
deserter
0    197
1    125
Name: count, dtype: int64


In [334]:
# Oversample the training split with SMOTE; the seed makes the synthetic
# samples repeatable.
smote = SMOTE(random_state=42)
X_train_smote, y_train_smote = smote.fit_resample(
    X_train,
    y_train,
)

print("Training set class distribution after SMOTE:")
print(pd.Series(y_train_smote).value_counts())
print("New class ratio: 1:1 (perfectly balanced)")

Training set class distribution after SMOTE:
deserter
0    197
1    197
Name: count, dtype: int64
New class ratio: 1:1 (perfectly balanced)


In [335]:
# Fit a logistic regression on the SMOTE-resampled training data.
model_smote = LogisticRegression(max_iter=5000)
model_smote.fit(X=X_train_smote, y=y_train_smote)

print("Model trained successfully with SMOTE-balanced data!")

Model trained successfully with SMOTE-balanced data!


In [336]:
# Predict on the untouched test split with the SMOTE-trained model.
y_pred_smote = model_smote.predict(X_test)

# The estimator is a LogisticRegression, so the heading names it as such
# (it previously said "SVM", which no cell in this notebook trains).
print('Logistic Regression with SMOTE Results')
print('Accuracy Score:', accuracy_score(y_test, y_pred_smote))
print()
print('Confusion Matrix:')
print(confusion_matrix(y_test, y_pred_smote))
print()
print('Classification Report:')
print(classification_report(y_test, y_pred_smote))

SVM with SMOTE Results
Accuracy Score: 0.7246376811594203

Confusion Matrix:
[[62 21]
 [17 38]]

Classification Report:
              precision    recall  f1-score   support

           0       0.78      0.75      0.77        83
           1       0.64      0.69      0.67        55

    accuracy                           0.72       138
   macro avg       0.71      0.72      0.72       138
weighted avg       0.73      0.72      0.73       138



## Comparison: SMOTE vs Other Techniques

SMOTE has several advantages over simple undersampling:

1. **No information loss**: Unlike undersampling, SMOTE doesn't discard any majority class examples
2. **Synthetic data generation**: Creates realistic synthetic examples by interpolating between minority class neighbors  
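3. **Better generalization**: Often leads to better model performance on unseen data, particularly for the minority class (in the 0.7/0.3 runs above, accuracy stayed at 0.72 while recall for class 1 rose from 0.56 to 0.69)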
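4. **Preserves data patterns**: Keeps every original sample and places synthetic points only between existing minority-class samples, so the original feature distribution characteristics are maintained; only the class proportions of the training set change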