In [99]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import accuracy_score, classification_report
from imblearn.over_sampling import RandomOverSampler
from imblearn.over_sampling import SMOTE
from sklearn.compose import ColumnTransformer
from imblearn.under_sampling import RandomUnderSampler
from imblearn.pipeline import Pipeline
from collections import Counter
import numpy as np

In [100]:
import pandas as pd
from sklearn.preprocessing import OneHotEncoder, LabelEncoder, StandardScaler
from sklearn.compose import ColumnTransformer
from sklearn.model_selection import train_test_split

# Assuming the file 'dropout.csv' is in the same directory as the notebook
# and has the structure as described by the user
# Read the dataset
dropout_data = pd.read_csv("dropout.csv", delimiter=';')

# The user wants to apply OneHotEncoding to the categorical features
categorical_features = ['Marital status',
                        'Application mode', 
                        'Course',
                        'Daytime/evening attendance\t', 
                        'Previous qualification',
                        'Displaced',
                        'Debtor', 
                        'Tuition fees up to date', 
                        'Gender', 
                        'Scholarship holder']

numeric_features = ['Application order', 
                    'Age at enrollment', 
                    'Admission grade',
                    'Previous qualification (grade)',
                    'Curricular units 1st sem (credited)',
                    'Curricular units 1st sem (enrolled)',
                    'Curricular units 1st sem (evaluations)',
                    'Curricular units 1st sem (approved)',
                    'Curricular units 1st sem (grade)',
                    'Curricular units 2nd sem (credited)',
                    'Curricular units 2nd sem (enrolled)',
                    'Curricular units 2nd sem (evaluations)',
                    'Curricular units 2nd sem (approved)',
                    'Curricular units 2nd sem (grade)', 
                    'GDP']

# Convert the 'Target' column into numeric values
label_encoder = LabelEncoder()
dropout_data['Target'] = label_encoder.fit_transform(dropout_data['Target'])

# Selecting features
X = dropout_data[categorical_features + numeric_features]
y = dropout_data['Target']
# print(X)

# Creating the preprocessor for one hot encoding and scaling
preprocessor = ColumnTransformer(
    transformers=[
        ('num', StandardScaler(), numeric_features),
        ('cat', OneHotEncoder(handle_unknown='ignore'), categorical_features)
    ])

# Split the data into training and testing sets
preprocessor.fit_transform(X)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=0)

# Apply transformations
# X = preprocessor.fit_transform(X)
# X_test = preprocessor.transform(X_test)

# Checking the transformed features
X_train.shape, X_test.shape


((3539, 25), (885, 25))

In [101]:
from imblearn.over_sampling import RandomOverSampler
from imblearn.over_sampling import SMOTE,SMOTENC
from imblearn.under_sampling import RandomUnderSampler
from imblearn.pipeline import Pipeline
from collections import Counter
import numpy as np
print("Original Full Dataset:", Counter(y))
print("Original Dataset:", Counter(y_train))
over = SMOTENC(categorical_features=list(range(len(categorical_features))), sampling_strategy='auto',random_state=21)
pipeline = Pipeline(steps=[('o', over)])
X_train, y_train = pipeline.fit_resample(X_train, y_train)

X_train = preprocessor.transform(X_train)
X_test = preprocessor.transform(X_test)
print("Balanced Dataset:", Counter(y_train))

X_train.shape, X_test.shape

Original Full Dataset: Counter({2: 2209, 0: 1421, 1: 794})
Original Dataset: Counter({2: 1759, 0: 1137, 1: 643})
Balanced Dataset: Counter({2: 1759, 1: 1759, 0: 1759})


((5277, 85), (885, 85))

In [102]:
import pandas as pd
# import plotly.express as px
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score,f1_score,precision_score,recall_score

from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import AdaBoostClassifier
from xgboost import XGBClassifier
from sklearn import svm
from sklearn.metrics import accuracy_score, classification_report

from sklearn.model_selection import GridSearchCV, RandomizedSearchCV
from sklearn.ensemble import VotingClassifier

In [103]:
import warnings
warnings.filterwarnings("ignore")
arr = []
for n_estimators in [100,150,175,200]:
    for max_depth in [3,5,None]:
        for learning_rate in [0.3,0.4]:
            m = XGBClassifier(learning_rate = learning_rate,n_estimators=n_estimators,max_depth=max_depth, tree_method = "hist", device = "cuda")
            m.fit(X_train,y_train)
            y_pred = m.predict(X_test)
            # print(accuracy_score(y_test,y_pred))
            arr.append(((n_estimators,max_depth,learning_rate),accuracy_score(y_test,y_pred)))
print(max(arr,key= lambda x:x[1]))

((100, 5, 0.4), 0.7875706214689265)


In [104]:
rfc = RandomForestClassifier(n_estimators=175,random_state=48)
xbc = XGBClassifier(learning_rate = 0.4,n_estimators=100,max_depth=5, tree_method = "hist", device = "cuda")
ens = VotingClassifier(estimators=[('rfc', rfc), ('xbc',xbc)], voting='soft')

In [105]:
ens.fit(X_train, y_train)

In [106]:
y_pred = ens.predict(X_test)
print("Voting Classifier Accuracy\t\t :",round(accuracy_score(y_test,y_pred)*100,5),"%")
print(classification_report(y_test,y_pred,target_names=['Dropout','Enrolled','Graduate']))

Voting Classifier Accuracy		 : 79.20904 %
              precision    recall  f1-score   support

     Dropout       0.84      0.77      0.80       284
    Enrolled       0.52      0.55      0.53       151
    Graduate       0.86      0.89      0.87       450

    accuracy                           0.79       885
   macro avg       0.74      0.74      0.74       885
weighted avg       0.79      0.79      0.79       885



In [107]:
from sklearn.model_selection import KFold
from imblearn.over_sampling import RandomOverSampler
from imblearn.under_sampling import RandomUnderSampler
from imblearn.pipeline import Pipeline
from imblearn.over_sampling import SMOTE
from collections import Counter
import pickle
from sklearn.metrics import confusion_matrix
import numpy as np

kf = KFold(n_splits=10)
kf.get_n_splits(X)
VCAcc = []
rfcAcc = []
xbcAcc = []
for i, (train_index, test_index) in enumerate(kf.split(X)):
    print(f"Fold {i+1}:")
    X_train = X.iloc[train_index]
    y_train = y.iloc[train_index]
    X_test = X.iloc[test_index]
    y_test = y.iloc[test_index]

    over = SMOTENC(categorical_features=list(range(len(categorical_features))), sampling_strategy='auto',random_state=42)
    pipeline = Pipeline(steps=[('o', over)])
    X_train, y_train = pipeline.fit_resample(X_train, y_train)

    X_train = preprocessor.transform(X_train)
    X_test = preprocessor.transform(X_test)
    
    rfc = RandomForestClassifier(n_estimators=175,random_state=48)
    xbc = XGBClassifier(learning_rate = 0.4,n_estimators=100,max_depth=5, tree_method = "hist", device = "cuda")
    VC = VotingClassifier(estimators=[('rfc', rfc), ('xbc',xbc)], voting='soft')

    VC.fit(X_train, y_train)
    y_pred = VC.predict(X_test)
    print("Voting Classifier Accuracy :",round(accuracy_score(y_test,y_pred)*100,2),"%")
    VCAcc.append(round(accuracy_score(y_test,y_pred)*100,2))

print(f"Voting Classifier Accuracy:{VCAcc} Average: {sum(VCAcc)/len(VCAcc)}")

Fold 1:
Voting Classifier Accuracy : 77.88 %
Fold 2:
Voting Classifier Accuracy : 74.49 %
Fold 3:
Voting Classifier Accuracy : 80.36 %
Fold 4:
Voting Classifier Accuracy : 77.65 %
Fold 5:
Voting Classifier Accuracy : 80.09 %
Fold 6:
Voting Classifier Accuracy : 74.66 %
Fold 7:
Voting Classifier Accuracy : 75.79 %
Fold 8:
Voting Classifier Accuracy : 79.86 %
Fold 9:
Voting Classifier Accuracy : 74.89 %
Fold 10:
Voting Classifier Accuracy : 74.66 %
Voting Classifier Accuracy:[77.88, 74.49, 80.36, 77.65, 80.09, 74.66, 75.79, 79.86, 74.89, 74.66] Average: 77.03299999999999
