In [1]:
import pandas as pd
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import train_test_split
import pathlib as pl
from imblearn.over_sampling import SMOTE
from sklearn.metrics import accuracy_score,confusion_matrix,classification_report

In [2]:
# Relative location of the processed dataset (resolved against the kernel's
# current working directory).
path = pl.Path('../../data/processed') / 'data_set.csv'

# Read the CSV into the working frame used by the later cells.
df = pd.read_csv(filepath_or_buffer=path)

# Show the first five rows as the cell output.
df.head(5)

Unnamed: 0,student_id,age,gender,stratum,residence,civil_status,total_repetitions,deserter,average
0,1,37,1,2,0,0,1,1,3.233333
1,2,35,1,2,1,2,0,1,4.64
2,3,57,1,2,0,2,8,1,4.16
3,4,31,1,2,0,0,0,1,4.0
4,5,32,1,2,0,0,1,0,4.02


In [3]:
# Count the missing values in each column and print the per-column totals.
missing_per_column = df.isna().sum()
print(missing_per_column)

student_id           0
age                  0
gender               0
stratum              0
residence            0
civil_status         0
total_repetitions    0
deserter             0
average              4
dtype: int64


In [4]:
# Replace every missing value in the frame with 0.
# NOTE(review): the isna() summary printed earlier reports NaNs only in `average`;
# confirm that 0 is a sensible stand-in for a missing grade average (rather than,
# e.g., the column median), since it feeds a distance-based model later on.
df = df.fillna(value=0)

In [5]:
# Print the frame's column names, non-null counts, dtypes and memory usage.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 460 entries, 0 to 459
Data columns (total 9 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   student_id         460 non-null    int64  
 1   age                460 non-null    int64  
 2   gender             460 non-null    int64  
 3   stratum            460 non-null    int64  
 4   residence          460 non-null    int64  
 5   civil_status       460 non-null    int64  
 6   total_repetitions  460 non-null    int64  
 7   deserter           460 non-null    int64  
 8   average            460 non-null    float64
dtypes: float64(1), int64(8)
memory usage: 32.5 KB


In [6]:
# Tally how many rows fall into each `deserter` class.
count = df.loc[:, 'deserter'].value_counts()

# Display the tally as the cell output.
count.head(n=5)

deserter
0    280
1    180
Name: count, dtype: int64

# Unbalanced classes [0.7/0.3 train/test split]

In [7]:
# Features: every column except the target and the row identifier.
X = df.drop(columns=['deserter', 'student_id'])
# Target: the `deserter` label.
y = df.loc[:, 'deserter']

In [8]:
# Hold out 30% of the rows for evaluation; the fixed seed keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, random_state=42
)

# 5-nearest-neighbours classifier, all other settings left at their defaults.
# NOTE(review): the features are passed in unscaled (no scaler is applied in this
# notebook), so wide-range columns such as `age` presumably dominate the distance
# computation - consider standardising before fitting.
model = KNeighborsClassifier(n_neighbors=5)

# Fit on the training portion; fit() returns the estimator, which the cell displays.
model.fit(X=X_train, y=y_train)

0,1,2
,n_neighbors,5
,weights,'uniform'
,algorithm,'auto'
,leaf_size,30
,p,2
,metric,'minkowski'
,metric_params,
,n_jobs,


In [9]:
# Predict the class of every held-out row with the fitted classifier.
y_pred = model.predict(X=X_test)

In [10]:
# Evaluate the held-out predictions: overall accuracy, confusion matrix and the
# per-class precision/recall/F1 report.
print('Accuracy Score', accuracy_score(y_true=y_test, y_pred=y_pred))
print()
# The matrix and the report span several lines, so each label goes on its own
# line; printing label and value together shifts the first row / header to the
# right and breaks the column alignment.
print('Confusion matrix')
print(confusion_matrix(y_true=y_test, y_pred=y_pred))
print()
print('Classification Report')
print(classification_report(y_true=y_test, y_pred=y_pred))

Accuracy Score 0.6521739130434783

Confusion matrix [[61 22]
 [26 29]]

Classification Report               precision    recall  f1-score   support

           0       0.70      0.73      0.72        83
           1       0.57      0.53      0.55        55

    accuracy                           0.65       138
   macro avg       0.63      0.63      0.63       138
weighted avg       0.65      0.65      0.65       138



# Balanced classes via undersampling [0.7/0.3 train/test split]

In [11]:
# Separate the rows by class label: 0 is the larger class, 1 the smaller one.
# NOTE(review): this undersampling works on the full frame, i.e. before any
# train/test split is made.
df_majority = df.loc[df['deserter'].eq(0)]
df_minority = df.loc[df['deserter'].eq(1)]

# Randomly keep as many majority rows as there are minority rows (seeded).
df_majority_reduced = df_majority.sample(n=df_minority.shape[0], random_state=42)

In [12]:
# Stack the reduced majority class on top of the minority class, then shuffle
# the rows (seeded) and renumber the index from 0.
df_balanced = (
    pd.concat([df_majority_reduced, df_minority], axis=0)
    .sample(frac=1, random_state=42)
    .reset_index(drop=True)
)

In [13]:
# Confirm the class counts of the undersampled frame.
print("Distribution after undersampling")
df_balanced.loc[:, 'deserter'].value_counts()

Distribution after undersampling


deserter
1    180
0    180
Name: count, dtype: int64

In [14]:
# Features and target taken from the undersampled frame.
# NOTE(review): because X/y come from `df_balanced`, any split made from them
# draws its test rows from the balanced data as well, not from the original
# class distribution.
X = df_balanced.drop(columns=['deserter', 'student_id'])
y = df_balanced.loc[:, 'deserter']

In [15]:
# Hold out 30% of the rows for evaluation; the fixed seed keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, random_state=42
)

# 5-nearest-neighbours classifier, all other settings left at their defaults.
model = KNeighborsClassifier(n_neighbors=5)

# Fit on the training portion; fit() returns the estimator, which the cell displays.
model.fit(X=X_train, y=y_train)

0,1,2
,n_neighbors,5
,weights,'uniform'
,algorithm,'auto'
,leaf_size,30
,p,2
,metric,'minkowski'
,metric_params,
,n_jobs,


In [16]:
# Predict the class of every held-out row with the fitted classifier.
y_pred = model.predict(X=X_test)

In [17]:
# Evaluate the held-out predictions: overall accuracy, confusion matrix and the
# per-class precision/recall/F1 report.
print('Accuracy Score', accuracy_score(y_true=y_test, y_pred=y_pred))
print()
# The matrix and the report span several lines, so each label goes on its own
# line; printing label and value together shifts the first row / header to the
# right and breaks the column alignment.
print('Confusion matrix')
print(confusion_matrix(y_true=y_test, y_pred=y_pred))
print()
print('Classification Report')
print(classification_report(y_true=y_test, y_pred=y_pred))

Accuracy Score 0.8055555555555556

Confusion matrix [[41 10]
 [11 46]]

Classification Report               precision    recall  f1-score   support

           0       0.79      0.80      0.80        51
           1       0.82      0.81      0.81        57

    accuracy                           0.81       108
   macro avg       0.80      0.81      0.81       108
weighted avg       0.81      0.81      0.81       108



# Unbalanced classes [0.8/0.2 train/test split]

In [18]:
# Back to the original (unbalanced) frame: features are every column except the
# target and the row identifier; the target is the `deserter` label.
X = df.drop(columns=['deserter', 'student_id'])
y = df.loc[:, 'deserter']

In [19]:
# Hold out 20% of the rows for evaluation; the fixed seed keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# 5-nearest-neighbours classifier, all other settings left at their defaults.
model = KNeighborsClassifier(n_neighbors=5)

# Fit on the training portion; fit() returns the estimator, which the cell displays.
model.fit(X=X_train, y=y_train)

0,1,2
,n_neighbors,5
,weights,'uniform'
,algorithm,'auto'
,leaf_size,30
,p,2
,metric,'minkowski'
,metric_params,
,n_jobs,


In [20]:
# Predict the class of every held-out row with the fitted classifier.
y_pred = model.predict(X=X_test)

In [21]:
# Evaluate the held-out predictions: overall accuracy, confusion matrix and the
# per-class precision/recall/F1 report.
print('Accuracy Score', accuracy_score(y_true=y_test, y_pred=y_pred))
print()
# The matrix and the report span several lines, so each label goes on its own
# line; printing label and value together shifts the first row / header to the
# right and breaks the column alignment.
print('Confusion matrix')
print(confusion_matrix(y_true=y_test, y_pred=y_pred))
print()
print('Classification Report')
print(classification_report(y_true=y_test, y_pred=y_pred))

Accuracy Score 0.5978260869565217

Confusion matrix [[35 21]
 [16 20]]

Classification Report               precision    recall  f1-score   support

           0       0.69      0.62      0.65        56
           1       0.49      0.56      0.52        36

    accuracy                           0.60        92
   macro avg       0.59      0.59      0.59        92
weighted avg       0.61      0.60      0.60        92



# Balanced classes via undersampling [0.8/0.2 train/test split]

In [22]:
# Separate the rows by class label: 0 is the larger class, 1 the smaller one.
# NOTE(review): this undersampling works on the full frame, i.e. before any
# train/test split is made.
df_majority = df.loc[df['deserter'].eq(0)]
df_minority = df.loc[df['deserter'].eq(1)]

# Randomly keep as many majority rows as there are minority rows (seeded).
df_majority_reduced = df_majority.sample(n=df_minority.shape[0], random_state=42)

In [23]:
# Stack the reduced majority class on top of the minority class, then shuffle
# the rows (seeded) and renumber the index from 0.
df_balanced = (
    pd.concat([df_majority_reduced, df_minority], axis=0)
    .sample(frac=1, random_state=42)
    .reset_index(drop=True)
)

In [24]:
# Confirm the class counts of the undersampled frame.
print("Distribution after undersampling")
df_balanced.loc[:, 'deserter'].value_counts()

Distribution after undersampling


deserter
1    180
0    180
Name: count, dtype: int64

In [25]:
# Features and target taken from the undersampled frame.
# NOTE(review): because X/y come from `df_balanced`, any split made from them
# draws its test rows from the balanced data as well, not from the original
# class distribution.
X = df_balanced.drop(columns=['deserter', 'student_id'])
y = df_balanced.loc[:, 'deserter']

In [26]:
# Hold out 20% of the rows for evaluation; the fixed seed keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# 5-nearest-neighbours classifier, all other settings left at their defaults.
model = KNeighborsClassifier(n_neighbors=5)

# Fit on the training portion; fit() returns the estimator, which the cell displays.
model.fit(X=X_train, y=y_train)

0,1,2
,n_neighbors,5
,weights,'uniform'
,algorithm,'auto'
,leaf_size,30
,p,2
,metric,'minkowski'
,metric_params,
,n_jobs,


In [27]:
# Predict the class of every held-out row with the fitted classifier.
y_pred = model.predict(X=X_test)

In [28]:
# Evaluate the held-out predictions: overall accuracy, confusion matrix and the
# per-class precision/recall/F1 report.
print('Accuracy Score', accuracy_score(y_true=y_test, y_pred=y_pred))
print()
# The matrix and the report span several lines, so each label goes on its own
# line; printing label and value together shifts the first row / header to the
# right and breaks the column alignment.
print('Confusion matrix')
print(confusion_matrix(y_true=y_test, y_pred=y_pred))
print()
print('Classification Report')
print(classification_report(y_true=y_test, y_pred=y_pred))

Accuracy Score 0.8055555555555556

Confusion matrix [[23  8]
 [ 6 35]]

Classification Report               precision    recall  f1-score   support

           0       0.79      0.74      0.77        31
           1       0.81      0.85      0.83        41

    accuracy                           0.81        72
   macro avg       0.80      0.80      0.80        72
weighted avg       0.80      0.81      0.80        72



# Using SMOTE for Class Balancing

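SMOTE (Synthetic Minority Over-sampling Technique) balances the classes by generating synthetic minority-class examples, each one interpolated between an existing minority sample and one of its nearest minority-class neighbors.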

In [29]:
# Rebuild features and target from the original (unbalanced) frame.
X = df.drop(columns=['deserter', 'student_id'])
y = df.loc[:, 'deserter']

# Tally the classes once and reuse the result for both the table and the ratio.
class_counts = y.value_counts()
print("Original class distribution:")
print(class_counts)
# Ratio of class 0 rows to class 1 rows, shown with two decimals.
print(f"Class ratio: {class_counts[0] / class_counts[1]:.2f}:1")

Original class distribution:
deserter
0    280
1    180
Name: count, dtype: int64
Class ratio: 1.56:1


In [30]:
# Split before resampling: SMOTE must only ever see the training portion, so the
# 30% test set keeps the original class distribution.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, random_state=42
)

# Class counts of the training labels prior to oversampling.
train_counts_before = y_train.value_counts()
print("Training set class distribution before SMOTE:")
print(train_counts_before)

Training set class distribution before SMOTE:
deserter
0    197
1    125
Name: count, dtype: int64


In [31]:
# Oversample the training data with SMOTE (seeded for reproducibility).
smote = SMOTE(random_state=42)
X_train_smote, y_train_smote = smote.fit_resample(X_train, y_train)

# Class counts of the resampled training labels.
train_counts_after = pd.Series(y_train_smote).value_counts()
print("Training set class distribution after SMOTE:")
print(train_counts_after)
# This line is a fixed message; it is not computed from the counts above.
print("New class ratio: 1:1 (perfectly balanced)")

Training set class distribution after SMOTE:
deserter
0    197
1    197
Name: count, dtype: int64
New class ratio: 1:1 (perfectly balanced)


In [32]:
# Build a 5-nearest-neighbours classifier and fit it on the SMOTE-resampled
# training data in one step (fit() returns the estimator itself).
model_smote = KNeighborsClassifier(n_neighbors=5).fit(X_train_smote, y_train_smote)

print("Model trained successfully with SMOTE-balanced data!")

Model trained successfully with SMOTE-balanced data!


In [None]:
# Score the SMOTE-trained classifier on the untouched test split.
y_pred_smote = model_smote.predict(X=X_test)

print('KNN with SMOTE Results')
print('Accuracy Score:', accuracy_score(y_true=y_test, y_pred=y_pred_smote))
print()
# Multi-line results are printed under their own label line to keep columns aligned.
print('Confusion Matrix:')
print(confusion_matrix(y_true=y_test, y_pred=y_pred_smote))
print()
print('Classification Report:')
print(classification_report(y_true=y_test, y_pred=y_pred_smote))

KNN with SMOTE Results
Accuracy Score: 0.6811594202898551

Confusion Matrix:
[[57 26]
 [18 37]]

Classification Report:
              precision    recall  f1-score   support

           0       0.76      0.69      0.72        83
           1       0.59      0.67      0.63        55

    accuracy                           0.68       138
   macro avg       0.67      0.68      0.67       138
weighted avg       0.69      0.68      0.68       138



## Comparison: SMOTE vs Other Techniques

SMOTE has several advantages over simple undersampling:

1. **No information loss**: Unlike undersampling, SMOTE doesn't discard any majority class examples
2. **Synthetic data generation**: Creates realistic synthetic examples by interpolating between minority class neighbors  
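3. **Potentially better generalization**: Can lead to better model performance on unseen data, although this is not guaranteed and should be verified on a test set that was not resampled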
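4. **Preserves data patterns**: Keeps every original sample, and the synthetic points lie between existing minority samples, so they stay within the region of feature space the minority class already occupies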