In [2]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report
from sklearn.preprocessing import LabelEncoder

from sklearn.feature_selection import SelectKBest, f_classif

In [3]:
train_df = pd.read_csv("train.csv")
test_df = pd.read_csv("test.csv")
submission_df = pd.read_csv("sample_submission.csv")
train_df

Unnamed: 0,id,Marital status,Application mode,Application order,Course,Daytime/evening attendance,Previous qualification,Previous qualification (grade),Nacionality,Mother's qualification,...,Curricular units 2nd sem (credited),Curricular units 2nd sem (enrolled),Curricular units 2nd sem (evaluations),Curricular units 2nd sem (approved),Curricular units 2nd sem (grade),Curricular units 2nd sem (without evaluations),Unemployment rate,Inflation rate,GDP,Target
0,0,1,1,1,9238,1,1,126.0,1,1,...,0,6,7,6,12.428571,0,11.1,0.6,2.02,Graduate
1,1,1,17,1,9238,1,1,125.0,1,19,...,0,6,9,0,0.000000,0,11.1,0.6,2.02,Dropout
2,2,1,17,2,9254,1,1,137.0,1,3,...,0,6,0,0,0.000000,0,16.2,0.3,-0.92,Dropout
3,3,1,1,3,9500,1,1,131.0,1,19,...,0,8,11,7,12.820000,0,11.1,0.6,2.02,Enrolled
4,4,1,1,2,9500,1,1,132.0,1,19,...,0,7,12,6,12.933333,0,7.6,2.6,0.32,Graduate
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
76513,76513,1,17,1,9254,1,1,121.0,1,19,...,0,6,8,5,10.600000,0,13.9,-0.3,0.79,Graduate
76514,76514,1,1,6,9254,1,1,125.0,1,1,...,0,6,9,6,13.875000,0,9.4,-0.8,-3.12,Graduate
76515,76515,5,17,1,9085,1,1,138.0,1,37,...,0,5,8,5,11.400000,1,9.4,-0.8,-3.12,Enrolled
76516,76516,1,1,3,9070,1,1,136.0,1,38,...,0,6,0,0,0.000000,0,7.6,2.6,0.32,Dropout


In [4]:
le = LabelEncoder()
for column in train_df.select_dtypes(include=['object']).columns:
    train_df[column] = le.fit_transform(train_df[column])
for column in test_df.select_dtypes(include=['object']).columns:
    test_df[column] = le.transform(test_df[column])

In [5]:
target_le = LabelEncoder()
train_df['Target'] = target_le.fit_transform(train_df['Target'])

In [6]:
train_df['Target']

0        2
1        0
2        0
3        1
4        2
        ..
76513    2
76514    2
76515    1
76516    0
76517    2
Name: Target, Length: 76518, dtype: int64

In [7]:
X = train_df.drop(columns=['Target', 'id'])
y = train_df['Target']
X_test = test_df.drop(columns=['id'])

In [11]:
from imblearn.over_sampling import SMOTE

In [12]:
smote = SMOTE(random_state=42)
X_resampled, y_resampled = smote.fit_resample(X, y)

In [13]:
selector = SelectKBest(f_classif, k=20)
X_resampled_selected = selector.fit_transform(X_resampled, y_resampled)
X_test_selected = selector.transform(X_test)

In [14]:
X_train, X_val, y_train, y_val = train_test_split(X_resampled_selected, y_resampled, test_size=0.2, random_state=42, stratify=y_resampled)

In [15]:
rf_clf = RandomForestClassifier(n_estimators=100, max_depth=10, random_state=42, n_jobs=-1)
rf_clf.fit(X_train, y_train)

In [16]:
y_val_pred = rf_clf.predict(X_val)
print("Validation Accuracy:", accuracy_score(y_val, y_val_pred))
print(classification_report(y_val, y_val_pred))

Validation Accuracy: 0.8242994947175012
              precision    recall  f1-score   support

           0       0.92      0.78      0.85      7256
           1       0.74      0.82      0.78      7257
           2       0.83      0.88      0.85      7257

    accuracy                           0.82     21770
   macro avg       0.83      0.82      0.83     21770
weighted avg       0.83      0.82      0.83     21770



In [17]:
rf_clf.fit(X_resampled_selected, y_resampled)

In [18]:
test_preds = rf_clf.predict(X_test_selected)
test_preds_labels = target_le.inverse_transform(test_preds)

In [19]:
num_to_label = {0: "Dropout", 1: "Enrolled", 2: "Graduate"}

In [20]:
test_preds_labels = [num_to_label[pred] for pred in test_preds]

In [21]:
submission_df['Target'] = test_preds_labels

In [22]:
submission_df

Unnamed: 0,id,Target
0,76518,Dropout
1,76519,Graduate
2,76520,Graduate
3,76521,Enrolled
4,76522,Enrolled
...,...,...
51007,127525,Dropout
51008,127526,Dropout
51009,127527,Dropout
51010,127528,Dropout


In [23]:
submission_df.to_csv('Anmol_submission1.csv', index=False)