### Imports and data loading

In [1]:
import pandas as pd

# Read the competition's train and test CSVs; the order of the paths in the
# tuple decides which frame is bound to which name (train first, then test).
dfTrain, dfTest = (
    pd.read_csv(csv_path)
    for csv_path in (
        "playground-series-s4e6/train.csv",
        "playground-series-s4e6/test.csv",
    )
)

In [2]:
# Summarise the training frame: column names, non-null counts and dtypes.
dfTrain.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 76518 entries, 0 to 76517
Data columns (total 38 columns):
 #   Column                                          Non-Null Count  Dtype  
---  ------                                          --------------  -----  
 0   id                                              76518 non-null  int64  
 1   Marital status                                  76518 non-null  int64  
 2   Application mode                                76518 non-null  int64  
 3   Application order                               76518 non-null  int64  
 4   Course                                          76518 non-null  int64  
 5   Daytime/evening attendance                      76518 non-null  int64  
 6   Previous qualification                          76518 non-null  int64  
 7   Previous qualification (grade)                  76518 non-null  float64
 8   Nacionality                                     76518 non-null  int64  
 9   Mother's qualification                 

In [3]:
# Preview the first rows of the training data.
dfTrain.head()

Unnamed: 0,id,Marital status,Application mode,Application order,Course,Daytime/evening attendance,Previous qualification,Previous qualification (grade),Nacionality,Mother's qualification,...,Curricular units 2nd sem (credited),Curricular units 2nd sem (enrolled),Curricular units 2nd sem (evaluations),Curricular units 2nd sem (approved),Curricular units 2nd sem (grade),Curricular units 2nd sem (without evaluations),Unemployment rate,Inflation rate,GDP,Target
0,0,1,1,1,9238,1,1,126.0,1,1,...,0,6,7,6,12.428571,0,11.1,0.6,2.02,Graduate
1,1,1,17,1,9238,1,1,125.0,1,19,...,0,6,9,0,0.0,0,11.1,0.6,2.02,Dropout
2,2,1,17,2,9254,1,1,137.0,1,3,...,0,6,0,0,0.0,0,16.2,0.3,-0.92,Dropout
3,3,1,1,3,9500,1,1,131.0,1,19,...,0,8,11,7,12.82,0,11.1,0.6,2.02,Enrolled
4,4,1,1,2,9500,1,1,132.0,1,19,...,0,7,12,6,12.933333,0,7.6,2.6,0.32,Graduate


In [4]:
# Split the training frame into predictors and label column.
# NOTE(review): only `Target` is removed, so the `id` column remains in
# X_train and is fed to every model below — confirm that this is intended.
X_train = dfTrain.drop('Target', axis=1)
y_train = dfTrain.loc[:, 'Target']

In [5]:
# List the distinct label values found in the target column.
y_train.unique()

array(['Graduate', 'Dropout', 'Enrolled'], dtype=object)

The target has three classes:
- Graduate
- Dropout
- Enrolled

Next, we encode these labels as integers so that the models can use them.

In [6]:
from sklearn.preprocessing import LabelEncoder

# Learn the label vocabulary from the raw target column, then map that same
# column to integer codes. The fitted encoder is kept because the prediction
# cell uses it to turn codes back into label strings.
label_encoder = LabelEncoder().fit(dfTrain['Target'])
y_train = label_encoder.transform(dfTrain['Target'])
y_train

array([2, 0, 0, ..., 1, 0, 2])

In [37]:
# Pair every class label with the integer code the fitted encoder assigns to it.
label_mapping = {
    label: code
    for label, code in zip(
        label_encoder.classes_,
        label_encoder.transform(label_encoder.classes_),
    )
}
label_mapping

{'Dropout': 0, 'Enrolled': 1, 'Graduate': 2}

**using Logistic Regression (Multinomial)**

In [7]:
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import make_pipeline

# Logistic regression on standardised features.
# The explicit `multi_class='multinomial'` argument is deprecated in recent
# scikit-learn releases and has been dropped: with the lbfgs solver and more
# than two classes the multinomial loss is already what the estimator uses,
# so the fitted model is unchanged.
logisticRegression = make_pipeline(
    StandardScaler(),
    LogisticRegression(solver='lbfgs'),
)

# Fit on the full training set; the cross-validation cell clones the pipeline
# and refits it per fold, so it does not depend on this fit.
logisticRegression.fit(X_train, y_train)

In [8]:
from sklearn.model_selection import cross_val_score

# 5-fold cross-validation of the logistic-regression pipeline; the per-fold
# scores are averaged into one figure.
scores = cross_val_score(
    estimator=logisticRegression,
    X=X_train,
    y=y_train,
    cv=5,
)
print("Accuracy:", scores.mean())

Accuracy: 0.8166052662511056


**using Decision Tree Classifier**

In [9]:
from sklearn.tree import DecisionTreeClassifier

# Decision tree on standardised features.
# random_state is pinned (same seed as the LinearSVC cell) so that the fitted
# tree, and the cross-validated accuracy reported for it, are reproducible
# across kernel restarts.
dTCModel = make_pipeline(
    StandardScaler(),
    DecisionTreeClassifier(random_state=42),
)

dTCModel.fit(X_train, y_train)

In [10]:
# 5-fold cross-validation of the decision-tree pipeline, reported as the mean
# of the fold scores.
scores = cross_val_score(
    estimator=dTCModel,
    X=X_train,
    y=y_train,
    cv=5,
)
print("Accuracy:", scores.mean())

Accuracy: 0.6930140086441374


**using Random Forest Classifier**

In [11]:
from sklearn.ensemble import RandomForestClassifier

# Random forest with default hyper-parameters.
# random_state is pinned because the forest's bootstrap samples and feature
# sub-sampling are random; without a seed the reported accuracy changes on
# every run. (The variable name is kept as-is since the next cell uses it.)
randomForstClassifier = RandomForestClassifier(random_state=42)
randomForstClassifier.fit(X_train, y_train)

In [12]:
# 5-fold cross-validation of the random forest, reported as the mean of the
# fold scores.
scores = cross_val_score(
    estimator=randomForstClassifier,
    X=X_train,
    y=y_train,
    cv=5,
)
print("Accuracy:", scores.mean())

Accuracy: 0.8248385915194378


**using SVM**

In [13]:
from sklearn.svm import LinearSVC, SVC
from scipy.stats import loguniform, uniform

# Linear SVM on standardised features, with a fixed seed.
lin_clf = make_pipeline(StandardScaler(), LinearSVC(dual=False, random_state=42))

# Sampling distributions for an SVM hyper-parameter search.
# NOTE(review): nothing in the visible notebook consumes `param_distrib`, and
# its `svc__` prefix does not match the `linearsvc` step name that
# make_pipeline gives the estimator above — confirm whether a kernel-SVC
# search was intended here.
param_distrib = {
    "svc__gamma": loguniform(0.001, 0.1),
    "svc__C": uniform(1, 10),
}

lin_clf.fit(X_train, y_train)

In [14]:
# 5-fold cross-validation of the linear SVM pipeline, reported as the mean of
# the fold scores.
scores = cross_val_score(
    estimator=lin_clf,
    X=X_train,
    y=y_train,
    cv=5,
)
print("Accuracy:", scores.mean())

Accuracy: 0.8067252366459583


**using KNN**

In [15]:
from sklearn.neighbors import KNeighborsClassifier

# k-nearest-neighbours on standardised features. The pipeline's fit() returns
# the pipeline itself, so building and fitting are chained and the fitted
# pipeline is displayed as the cell output.
knnModel = make_pipeline(StandardScaler(), KNeighborsClassifier()).fit(X_train, y_train)
knnModel

In [16]:
# 5-fold cross-validation of the KNN pipeline, reported as the mean of the
# fold scores.
scores = cross_val_score(
    estimator=knnModel,
    X=X_train,
    y=y_train,
    cv=5,
)
print("Accuracy:", scores.mean())

Accuracy: 0.7751901210464116


**using Gradient Boosting Classifier**

In [17]:
from sklearn.ensemble import GradientBoostingClassifier

# Gradient boosting with default hyper-parameters.
# random_state is pinned (same seed as the LinearSVC cell) so that repeated
# runs of the notebook produce the same fitted ensemble and the same
# cross-validated accuracy.
gradboost_clf = GradientBoostingClassifier(random_state=42)
gradboost_clf.fit(X_train, y_train)

In [18]:
# 5-fold cross-validation of the gradient-boosting model, reported as the mean
# of the fold scores.
scores = cross_val_score(
    estimator=gradboost_clf,
    X=X_train,
    y=y_train,
    cv=5,
)
print("Accuracy:", scores.mean())

Accuracy: 0.7687095560768487


In [24]:
from sklearn.model_selection import RandomizedSearchCV

# Randomised hyper-parameter search for the random forest.
# Both the forest and the search are seeded: otherwise the sampled candidate
# settings, the forests grown for them, and therefore best_score_ /
# best_estimator_ differ on every run of the notebook.
rf_model = RandomForestClassifier(random_state=42)

# Candidate values; the search samples combinations from these lists rather
# than trying all of them (the logged run evaluated 10 candidates).
param_grid = {
    'n_estimators': [50, 100, 200],
    'max_depth': [None, 10, 20, 30],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4],
    'bootstrap': [True, False]
}

rnd_search_rf = RandomizedSearchCV(
    rf_model,
    param_grid,
    cv=5,
    n_jobs=-1,
    verbose=2,
    scoring='accuracy',
    random_state=42,
)

rnd_search_rf.fit(X_train, y_train)
# Estimator refitted with the best-scoring parameter setting.
best_rf_model = rnd_search_rf.best_estimator_

Fitting 5 folds for each of 10 candidates, totalling 50 fits
[CV] END bootstrap=True, max_depth=10, min_samples_leaf=2, min_samples_split=10, n_estimators=100; total time=   6.7s
[CV] END bootstrap=True, max_depth=10, min_samples_leaf=2, min_samples_split=10, n_estimators=100; total time=   6.9s
[CV] END bootstrap=True, max_depth=10, min_samples_leaf=2, min_samples_split=10, n_estimators=100; total time=   6.9s
[CV] END bootstrap=True, max_depth=10, min_samples_leaf=2, min_samples_split=10, n_estimators=100; total time=   7.0s
[CV] END bootstrap=True, max_depth=10, min_samples_leaf=2, min_samples_split=10, n_estimators=100; total time=   7.0s
[CV] END bootstrap=True, max_depth=20, min_samples_leaf=4, min_samples_split=10, n_estimators=100; total time=  10.4s
[CV] END bootstrap=True, max_depth=20, min_samples_leaf=4, min_samples_split=10, n_estimators=100; total time=  10.4s
[CV] END bootstrap=True, max_depth=20, min_samples_leaf=4, min_samples_split=10, n_estimators=100; total time=  1

In [26]:
# Mean cross-validated accuracy of the best parameter setting found by the search.
rnd_search_rf.best_score_

0.8267989017729647

In [27]:
# Preview the first rows of the test data (same columns as train, without Target).
dfTest.head()

Unnamed: 0,id,Marital status,Application mode,Application order,Course,Daytime/evening attendance,Previous qualification,Previous qualification (grade),Nacionality,Mother's qualification,...,Curricular units 1st sem (without evaluations),Curricular units 2nd sem (credited),Curricular units 2nd sem (enrolled),Curricular units 2nd sem (evaluations),Curricular units 2nd sem (approved),Curricular units 2nd sem (grade),Curricular units 2nd sem (without evaluations),Unemployment rate,Inflation rate,GDP
0,76518,1,1,1,9500,1,1,141.0,1,3,...,0,0,8,0,0,0.0,0,13.9,-0.3,0.79
1,76519,1,1,1,9238,1,1,128.0,1,1,...,0,0,6,6,6,13.5,0,11.1,0.6,2.02
2,76520,1,1,1,9238,1,1,118.0,1,1,...,0,0,6,11,5,11.0,0,15.5,2.8,-4.06
3,76521,1,44,1,9147,1,39,130.0,1,1,...,0,3,8,14,5,11.0,0,8.9,1.4,3.51
4,76522,1,39,1,9670,1,1,110.0,1,1,...,0,0,6,9,4,10.666667,2,7.6,2.6,0.32


In [39]:
from pathlib import Path

# Score the test set using exactly the columns (and column order) the models
# were trained on, so a stray or reordered column in dfTest cannot silently
# change what the model sees; a missing column fails here with a KeyError.
X_test = dfTest[X_train.columns]
test_predictions = rnd_search_rf.predict(X_test)

# Turn the integer class codes back into the original label strings.
test_predictions_labels = label_encoder.inverse_transform(test_predictions)

output_df = pd.DataFrame({
    'id': dfTest['id'],
    'Target': test_predictions_labels
})

output_file_path = 'playground-series-s4e6/doc/prediction_final.csv'
# to_csv does not create missing directories, so make sure the target folder
# exists before writing (no-op when it is already there).
Path(output_file_path).parent.mkdir(parents=True, exist_ok=True)
output_df.to_csv(output_file_path, index=False)