In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

In [2]:
train_df = pd.read_csv('Dataset/train.csv', encoding='utf8')
test_df = pd.read_csv('Dataset/test.csv', encoding='utf8')
sample_df = pd.read_csv('Dataset/sample_submission.csv', encoding='utf8')
train_df.shape, test_df.shape, sample_df.shape

((76518, 38), (51012, 37), (51012, 2))

In [3]:
train_df.head(5)

Unnamed: 0,id,Marital status,Application mode,Application order,Course,Daytime/evening attendance,Previous qualification,Previous qualification (grade),Nacionality,Mother's qualification,...,Curricular units 2nd sem (credited),Curricular units 2nd sem (enrolled),Curricular units 2nd sem (evaluations),Curricular units 2nd sem (approved),Curricular units 2nd sem (grade),Curricular units 2nd sem (without evaluations),Unemployment rate,Inflation rate,GDP,Target
0,0,1,1,1,9238,1,1,126.0,1,1,...,0,6,7,6,12.428571,0,11.1,0.6,2.02,Graduate
1,1,1,17,1,9238,1,1,125.0,1,19,...,0,6,9,0,0.0,0,11.1,0.6,2.02,Dropout
2,2,1,17,2,9254,1,1,137.0,1,3,...,0,6,0,0,0.0,0,16.2,0.3,-0.92,Dropout
3,3,1,1,3,9500,1,1,131.0,1,19,...,0,8,11,7,12.82,0,11.1,0.6,2.02,Enrolled
4,4,1,1,2,9500,1,1,132.0,1,19,...,0,7,12,6,12.933333,0,7.6,2.6,0.32,Graduate


In [4]:
test_df.head(5)

Unnamed: 0,id,Marital status,Application mode,Application order,Course,Daytime/evening attendance,Previous qualification,Previous qualification (grade),Nacionality,Mother's qualification,...,Curricular units 1st sem (without evaluations),Curricular units 2nd sem (credited),Curricular units 2nd sem (enrolled),Curricular units 2nd sem (evaluations),Curricular units 2nd sem (approved),Curricular units 2nd sem (grade),Curricular units 2nd sem (without evaluations),Unemployment rate,Inflation rate,GDP
0,76518,1,1,1,9500,1,1,141.0,1,3,...,0,0,8,0,0,0.0,0,13.9,-0.3,0.79
1,76519,1,1,1,9238,1,1,128.0,1,1,...,0,0,6,6,6,13.5,0,11.1,0.6,2.02
2,76520,1,1,1,9238,1,1,118.0,1,1,...,0,0,6,11,5,11.0,0,15.5,2.8,-4.06
3,76521,1,44,1,9147,1,39,130.0,1,1,...,0,3,8,14,5,11.0,0,8.9,1.4,3.51
4,76522,1,39,1,9670,1,1,110.0,1,1,...,0,0,6,9,4,10.666667,2,7.6,2.6,0.32


In [5]:
train_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 76518 entries, 0 to 76517
Data columns (total 38 columns):
 #   Column                                          Non-Null Count  Dtype  
---  ------                                          --------------  -----  
 0   id                                              76518 non-null  int64  
 1   Marital status                                  76518 non-null  int64  
 2   Application mode                                76518 non-null  int64  
 3   Application order                               76518 non-null  int64  
 4   Course                                          76518 non-null  int64  
 5   Daytime/evening attendance                      76518 non-null  int64  
 6   Previous qualification                          76518 non-null  int64  
 7   Previous qualification (grade)                  76518 non-null  float64
 8   Nacionality                                     76518 non-null  int64  
 9   Mother's qualification                 

In [6]:
train_df.Target.value_counts()

Target
Graduate    36282
Dropout     25296
Enrolled    14940
Name: count, dtype: int64

In [7]:
target_mapping = {'Graduate': 0, 'Dropout': 1, 'Enrolled': 2}
train_df.Target = train_df.Target.map(target_mapping)
train_df.head(3)

Unnamed: 0,id,Marital status,Application mode,Application order,Course,Daytime/evening attendance,Previous qualification,Previous qualification (grade),Nacionality,Mother's qualification,...,Curricular units 2nd sem (credited),Curricular units 2nd sem (enrolled),Curricular units 2nd sem (evaluations),Curricular units 2nd sem (approved),Curricular units 2nd sem (grade),Curricular units 2nd sem (without evaluations),Unemployment rate,Inflation rate,GDP,Target
0,0,1,1,1,9238,1,1,126.0,1,1,...,0,6,7,6,12.428571,0,11.1,0.6,2.02,0
1,1,1,17,1,9238,1,1,125.0,1,19,...,0,6,9,0,0.0,0,11.1,0.6,2.02,1
2,2,1,17,2,9254,1,1,137.0,1,3,...,0,6,0,0,0.0,0,16.2,0.3,-0.92,1


In [8]:
X = train_df.drop(columns=['Target'])
y = train_df.Target

In [9]:
print(train_df.shape)
print(test_df.shape)

(76518, 38)
(51012, 37)


In [10]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
X_train.head(3)

Unnamed: 0,id,Marital status,Application mode,Application order,Course,Daytime/evening attendance,Previous qualification,Previous qualification (grade),Nacionality,Mother's qualification,...,Curricular units 1st sem (without evaluations),Curricular units 2nd sem (credited),Curricular units 2nd sem (enrolled),Curricular units 2nd sem (evaluations),Curricular units 2nd sem (approved),Curricular units 2nd sem (grade),Curricular units 2nd sem (without evaluations),Unemployment rate,Inflation rate,GDP
12065,12065,1,44,1,9085,1,39,150.0,1,37,...,0,1,6,21,5,12.777778,0,16.2,0.3,-0.92
17210,17210,1,17,2,9254,1,1,127.0,1,1,...,0,0,6,12,3,11.0,0,15.5,2.8,-4.06
60954,60954,1,1,1,9773,1,1,140.0,1,37,...,0,0,6,6,5,12.2,0,7.6,2.6,0.32


In [11]:
unique_courses = train_df['Course'].unique()

# Create a dictionary mapping each unique course code to a unique integer
course_mapping = {course: idx for idx, course in enumerate(unique_courses)}

# Map the 'Course' column using the course_mapping dictionary
train_df['Course'] = train_df['Course'].map(course_mapping)

# Display the first three rows
train_df['Course'].value_counts()

Course
2     12074
5      8214
0      7935
8      7741
1      5425
4      5373
9      4760
12     4057
6      3733
15     3281
7      3198
11     3004
3      2859
10     2438
13     1606
14      746
16       72
17        1
18        1
Name: count, dtype: int64

In [12]:
from sklearn.preprocessing import RobustScaler,PowerTransformer
scaler = RobustScaler()
pt = PowerTransformer(standardize=False)

In [13]:
X_test_final = test_df

In [14]:
X_train_s = scaler.fit_transform(X_train)
X_test_s = scaler.transform(X_test)
X_s = scaler.fit_transform(X)
X_test_final_s = scaler.transform(X_test_final)
X_train_tf = pt.fit_transform(X_train_s)
X_test_tf = pt.transform(X_test_s)
X_tfm = pt.fit_transform(X_s)
X_test_tfm = pt.transform(X_test_final_s)

In [15]:
from sklearn.model_selection import train_test_split,StratifiedKFold
from sklearn.metrics import  log_loss

from sklearn.ensemble import VotingClassifier
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier
from lightgbm import LGBMClassifier
from sklearn.metrics import accuracy_score
from sklearn.metrics import roc_auc_score
from catboost import CatBoostClassifier
from sklearn.model_selection import cross_val_score

In [16]:
xgb = XGBClassifier()
xgb.fit(X_train_tf, y_train)

# Evaluate the final model
y_pred_xg = xgb.predict(X_test_tf)
acc_xg = accuracy_score(y_test, y_pred_xg)
print("Final Accuracy:", acc_xg)

Final Accuracy: 0.832919498170413


In [17]:
lgbm = LGBMClassifier()
lgbm.fit(X_train_tf, y_train)

# Evaluate the final model
y_pred_lg = lgbm.predict(X_test_tf)
acc_lg = accuracy_score(y_test, y_pred_lg)
print("Final Accuracy:", acc_lg)

[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.002582 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 1572
[LightGBM] [Info] Number of data points in the train set: 61214, number of used features: 37
[LightGBM] [Info] Start training from score -0.746287
[LightGBM] [Info] Start training from score -1.105333
[LightGBM] [Info] Start training from score -1.635907
Final Accuracy: 0.8320700470465238


In [18]:
rf = RandomForestClassifier()
rf.fit(X_train_tf, y_train)

# Evaluate the final model
y_pred_rf = rf.predict(X_test_tf)
acc_rf = accuracy_score(y_test, y_pred_rf)
print("Final Accuracy:", acc_rf)

Final Accuracy: 0.8240329325666492


In [19]:
from sklearn.ensemble import StackingClassifier
estimators = [
    ('RF',rf),
    ('XGB',xgb),
    ('LGBM',lgbm)
]

vote_model = VotingClassifier(estimators=estimators)
vote_model.fit(X_tfm,y)

[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.003840 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 1583
[LightGBM] [Info] Number of data points in the train set: 76518, number of used features: 37
[LightGBM] [Info] Start training from score -0.746204
[LightGBM] [Info] Start training from score -1.106880
[LightGBM] [Info] Start training from score -1.633484


In [20]:
y_pred = vote_model.predict(X_test_tfm)
y_pred

array([1, 0, 0, ..., 1, 1, 1], dtype=int64)

In [21]:
# Reverse mapping dictionary
reverse_mapping = {0: 'Graduate', 1: 'Dropout', 2: 'Enrolled'}

# Transform the y_pred array
y_pred_transformed = np.vectorize(reverse_mapping.get)(y_pred)

# Display the transformed predictions
print(y_pred_transformed)

['Dropout' 'Graduate' 'Graduate' ... 'Dropout' 'Dropout' 'Dropout']


In [23]:
sub = pd.DataFrame({'id': test_df['id'], 'Target': y_pred_transformed})
sub

Unnamed: 0,id,Target
0,76518,Dropout
1,76519,Graduate
2,76520,Graduate
3,76521,Enrolled
4,76522,Enrolled
...,...,...
51007,127525,Dropout
51008,127526,Dropout
51009,127527,Dropout
51010,127528,Dropout


In [24]:
sub.to_csv("Prediction/baseline.csv",index=False)