# IMPORTANT NOTICE

## This notebook requires a significant amount of time to run due to the extensive training of multiple models and optimization processes.

In [None]:
import pandas as pd

# Never truncate columns when a frame is rendered, then read the raw student records.
pd.options.display.max_columns = None
raw_data = pd.read_csv('data/student_data.csv')
raw_data

# Data Cleaning and Preprocessing

### Checking for missing values

In [None]:
# Normalise two column headers: fix the 'Nacionality' spelling and drop the
# trailing tab character from the attendance column name.
raw_data.rename(
    {'Nacionality': 'Nationality', 'Daytime/evening attendance\t': 'Daytime/evening attendance'},
    axis='columns',
    inplace=True,
)

In [None]:
# Number of missing values in each column (isna is the canonical name of isnull).
raw_data.isna().sum()

In [None]:
# Work on an independent copy that excludes rows whose Target is 'Enrolled',
# then encode the remaining outcomes as 1 = Graduate, 0 = Dropout.
data = raw_data.loc[raw_data['Target'] != 'Enrolled'].copy()
data['Target'] = data['Target'].map({'Graduate': 1, 'Dropout': 0})
data

In [None]:
# Inspect the encoded target column.
data.loc[:, 'Target']

### Visualization

In [None]:
import seaborn as sns
import matplotlib.pyplot as plt

In [None]:
# Target column: class balance of the encoded outcome.

sns.set(rc={'figure.figsize': (10, 5)})
ax = sns.countplot(x=data['Target'])

# Add legend.
# The swatches are taken from the bars that were actually drawn (one per
# target value, in axis order) instead of from the raw default palette, so
# the legend colours always match the plot even when seaborn desaturates or
# recolours the bars.
legend_labels = {'Dropout': 0, 'Graduate': 1}
handles = list(ax.patches)[:len(legend_labels)]
ax.legend(handles, legend_labels.keys())

# Show the plot
plt.show()

In [None]:
# Row counts per 'Course' value, split by the encoded Target.
sns.set(rc={'figure.figsize': (12, 8)})
ax = sns.countplot(data=data, x='Course', hue='Target')

# Replace the numeric hue labels (0 / 1) in the legend with readable names.
legend_labels = {0: 'Dropout', 1: 'Graduate'}
handles = ax.get_legend_handles_labels()[0]
ax.legend(handles, list(legend_labels.values()))

# Render the figure
plt.show()

In [None]:
# Row counts per number of approved 1st-semester curricular units, split by the encoded Target.
sns.set(rc={'figure.figsize': (12, 8)})
ax = sns.countplot(data=data, x='Curricular units 1st sem (approved)', hue='Target')

# Replace the numeric hue labels (0 / 1) in the legend with readable names.
legend_labels = {0: 'Dropout', 1: 'Graduate'}
handles = ax.get_legend_handles_labels()[0]
ax.legend(handles, list(legend_labels.values()))

# Render the figure
plt.show()

In [None]:
# Row counts per number of approved 2nd-semester curricular units, split by the encoded Target.
sns.set(rc={'figure.figsize': (12, 8)})
ax = sns.countplot(data=data, x='Curricular units 2nd sem (approved)', hue='Target')

# Replace the numeric hue labels (0 / 1) in the legend with readable names.
legend_labels = {0: 'Dropout', 1: 'Graduate'}
handles = ax.get_legend_handles_labels()[0]
ax.legend(handles, list(legend_labels.values()))

# Render the figure
plt.show()

In [None]:
# Row counts per 'Daytime/evening attendance' value, split by the encoded Target.
sns.set(rc={'figure.figsize': (12, 8)})
ax = sns.countplot(data=data, x='Daytime/evening attendance', hue='Target')

# Replace the numeric hue labels (0 / 1) in the legend with readable names.
legend_labels = {0: 'Dropout', 1: 'Graduate'}
handles = ax.get_legend_handles_labels()[0]
ax.legend(handles, list(legend_labels.values()))

# Label the two x positions with the meaning given to each code.
tick_labels = {0: '0 - Evening', 1: '1 - Daytime'}
ax.set_xticks(list(tick_labels))
ax.set_xticklabels(list(tick_labels.values()))

# Render the figure
plt.show()

In [None]:
# Row counts per 'Gender' value, split by the encoded Target.
sns.set(rc={'figure.figsize': (12, 8)})
ax = sns.countplot(data=data, x='Gender', hue='Target')

# Replace the numeric hue labels (0 / 1) in the legend with readable names.
legend_labels = {0: 'Dropout', 1: 'Graduate'}
handles = ax.get_legend_handles_labels()[0]
ax.legend(handles, list(legend_labels.values()))

# Label the two x positions with the meaning given to each code.
tick_labels = {0: '0 - Female', 1: '1 - Male'}
ax.set_xticks(list(tick_labels))
ax.set_xticklabels(list(tick_labels.values()))

# Render the figure
plt.show()

In [None]:
# Row counts per 'Scholarship holder' value, split by the encoded Target.
sns.set(rc={'figure.figsize': (12, 8)})
ax = sns.countplot(data=data, x='Scholarship holder', hue='Target')

# Replace the numeric hue labels (0 / 1) in the legend with readable names.
legend_labels = {0: 'Dropout', 1: 'Graduate'}
handles = ax.get_legend_handles_labels()[0]
ax.legend(handles, list(legend_labels.values()))

# Label the two x positions with the meaning given to each code.
tick_labels = {0: 'No', 1: 'Yes'}
ax.set_xticks(list(tick_labels))
ax.set_xticklabels(list(tick_labels.values()))

# Render the figure
plt.show()

# Correlation

In [None]:
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

# Pairwise correlation of all columns; keep only each feature's correlation
# with the encoded target (the target's correlation with itself is dropped).
correlation_matrix = data.corr()
correlation_with_target = correlation_matrix.loc[:, 'Target'].drop(labels='Target')

# Bar chart: one bar per feature, height = correlation with the target.
plt.figure(figsize=(10, 6))
sns.barplot(x=correlation_with_target.index, y=correlation_with_target.values)
plt.gca().set(title='Correlation with Target Column', xlabel='Feature', ylabel='Correlation')
plt.xticks(rotation=90)
plt.show()

# Report the feature whose correlation has the largest absolute value.
most_correlated_feature = correlation_with_target.abs().idxmax()
correlation_value = correlation_with_target.loc[most_correlated_feature]
discussion = f"The most interesting correlation is between '{most_correlated_feature}' and the target column with a correlation value of {correlation_value:.2f}.\nThis indicates a strong relationship between these two variables."
print(discussion)

##### Discussion

The most interesting correlation is between 'Curricular units 2nd sem (approved)' and the target column with a correlation value of 0.66. This indicates a strong relationship between the number of curricular units approved in the second semester and the student's graduation or dropout status.

A correlation value of 0.66 suggests a positive association between the number of approved curricular units in the second semester and the likelihood of a student being a graduate rather than a dropout. As the number of approved curricular units in the second semester increases, there is a higher probability of the student being classified as a graduate rather than a dropout.

This correlation implies that students who successfully complete more curricular units in the second semester are more likely to graduate rather than drop out. It indicates that academic performance and progress in the second semester play a significant role in determining the student's outcome.

Additionally, it is worth noting that a similar pattern can be observed for the correlation between the number of curricular units approved in the first semester and the student's graduation or dropout status. Although the correlation value for the first semester may differ, it also suggests a positive association between the number of approved curricular units in the first semester and the likelihood of being a graduate rather than a dropout.

This consistency across both semesters reinforces the importance of early academic success and progress in predicting a student's outcome. It suggests that students who perform well and achieve a higher number of approved curricular units in both the first and second semesters are more likely to graduate. On the other hand, students who struggle or have fewer approved curricular units in either semester may be at a higher risk of dropping out.

# Model training

In [None]:
from sklearn.metrics import accuracy_score

In [None]:
# Feature matrix = every column except the label; label vector = the encoded Target.
X = data.drop(columns='Target')
y = data.loc[:, 'Target']

In [None]:
# First five feature rows.
X.head(5)

In [None]:
# First five labels.
y.head(5)

In [None]:
from sklearn.model_selection import train_test_split

In [None]:
# Hold out 20% of the rows for testing; the fixed seed keeps the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

In [None]:
# Shapes of the four split parts, in the order train/test features then train/test labels.
tuple(part.shape for part in (X_train, X_test, y_train, y_test))

### Training

##### Random Forest

In [None]:
from sklearn.ensemble import RandomForestClassifier

# Baseline random forest with default hyper-parameters.
# A fixed random_state makes the forest's internal randomness repeatable, so
# the accuracies and feature importances reported below are the same on every
# Restart & Run All (42 matches the seed already used for the train/test split).
rf = RandomForestClassifier(random_state=42)
rf.fit(X_train, y_train)

# Accuracy on the rows the model was fitted on.
y_train_pred_rf = rf.predict(X_train)
training_acc_rf = accuracy_score(y_train, y_train_pred_rf)

# Accuracy on the held-out rows.
y_test_pred_rf = rf.predict(X_test)
testing_acc_rf = accuracy_score(y_test, y_test_pred_rf)

In [None]:
# Training accuracy of the baseline RandomForest (share of X_train rows predicted correctly)
training_acc_rf

In [None]:
# Testing accuracy of the baseline RandomForest (share of held-out X_test rows predicted correctly)
testing_acc_rf

### Visualize RandomForest Feature Importances

In [None]:
# Pair each training column with its importance in the fitted forest,
# ordered from most to least important.
feature_importance_df = (
    pd.DataFrame({'Feature': X_train.columns, 'Importance': rf.feature_importances_})
    .sort_values(by='Importance', ascending=False)
)

# Horizontal bar chart of the importances.
plt.figure(figsize=(10, 6))
sns.barplot(data=feature_importance_df, x='Importance', y='Feature')
plt.gca().set(title='Feature Importances - Random Forest', xlabel='Importance', ylabel='Feature')
plt.show()

# Print the accuracy scores
print(f'Training Accuracy: {training_acc_rf}')
print(f'Testing Accuracy: {testing_acc_rf}')

In [None]:
from sklearn.model_selection import GridSearchCV

##### GridSearchCV optimization for RandomForest

In [None]:
# Define the parameter grid
param_grid = {
    'n_estimators': [100, 200, 300],  # Number of trees in the forest
    'max_depth': [None, 5, 10],       # Maximum depth of each tree
    'min_samples_split': [2, 5, 10],  # Minimum number of samples required to split an internal node
    'min_samples_leaf': [1, 2, 4]     # Minimum number of samples required to be at a leaf node
}

# Create the GridSearchCV object.
# The grid holds 3 * 3 * 3 * 3 = 81 candidates, each fitted on 5 folds.
# n_jobs=-1 runs those independent fits on all available CPU cores; it only
# affects wall-clock time, not which candidate is selected.
grid_search = GridSearchCV(rf, param_grid, cv=5, n_jobs=-1)

# Fit the data to find the best parameters
grid_search.fit(X_train, y_train)

# Get the best estimator (refit on all of X_train with the winning parameters)
best_rf = grid_search.best_estimator_

# Make predictions on the training and testing sets
y_train_pred_rf_gs = best_rf.predict(X_train)
y_test_pred_rf_gs = best_rf.predict(X_test)

# Calculate accuracy scores
training_acc_rf_gs = accuracy_score(y_train, y_train_pred_rf_gs)
testing_acc_rf_gs = accuracy_score(y_test, y_test_pred_rf_gs)

In [None]:
# Training accuracy of the RandomForest selected by GridSearchCV
training_acc_rf_gs

In [None]:
# Testing accuracy of the RandomForest selected by GridSearchCV (held-out X_test rows)
testing_acc_rf_gs

### Visualize RandomForest (GridSearchCV) Feature Importances

In [None]:
# Pair each training column with its importance in the tuned forest,
# ordered from most to least important.
feature_importance_df_gs = (
    pd.DataFrame({'Feature': X_train.columns, 'Importance': best_rf.feature_importances_})
    .sort_values(by='Importance', ascending=False)
)

# Horizontal bar chart of the tuned model's importances.
plt.figure(figsize=(10, 6))
sns.barplot(data=feature_importance_df_gs, x='Importance', y='Feature')
plt.gca().set(
    title='Feature Importances - Random Forest (GridSearchCV)',
    xlabel='Importance',
    ylabel='Feature',
)
plt.show()

# Print the accuracy scores
print(f'Training Accuracy (GridSearchCV): {training_acc_rf_gs}')
print(f'Testing Accuracy (GridSearchCV): {testing_acc_rf_gs}')

##### Decision Tree

In [None]:
from sklearn.tree import DecisionTreeClassifier

# Baseline decision tree with default hyper-parameters.
# scikit-learn permutes the candidate features at every split, so an unseeded
# tree can differ between runs; fixing random_state makes the fit repeatable
# (42 matches the seed already used for the train/test split).
dt = DecisionTreeClassifier(random_state=42)
dt.fit(X_train, y_train)

# Accuracy on the rows the model was fitted on.
y_train_pred_dt = dt.predict(X_train)
training_acc_dt = accuracy_score(y_train, y_train_pred_dt)

# Accuracy on the held-out rows.
y_test_pred_dt = dt.predict(X_test)
testing_acc_dt = accuracy_score(y_test, y_test_pred_dt)

In [None]:
# Training accuracy of the baseline DecisionTree (share of X_train rows predicted correctly)
training_acc_dt

In [None]:
# Testing accuracy of the baseline DecisionTree (share of held-out X_test rows predicted correctly)
testing_acc_dt

##### GridSearchCV optimization for DecisionTreeClassifier

In [None]:
# Candidate hyper-parameters for the tree:
#   max_depth          - maximum depth of the tree
#   min_samples_split  - minimum samples required to split an internal node
#   min_samples_leaf   - minimum samples required at a leaf node
param_grid = dict(
    max_depth=[None, 5, 10],
    min_samples_split=[2, 5, 10],
    min_samples_leaf=[1, 2, 4],
)

# Exhaustive 5-fold search over the grid, fitted on the training split.
grid_search = GridSearchCV(estimator=dt, param_grid=param_grid, cv=5)
grid_search.fit(X_train, y_train)

# Winning estimator from the search.
best_dt = grid_search.best_estimator_

# Predict on each split and score it straight away.
y_train_pred_dt_gs = best_dt.predict(X_train)
training_acc_dt_gs = accuracy_score(y_train, y_train_pred_dt_gs)

y_test_pred_dt_gs = best_dt.predict(X_test)
testing_acc_dt_gs = accuracy_score(y_test, y_test_pred_dt_gs)

In [None]:
# Training accuracy of the DecisionTree selected by GridSearchCV
training_acc_dt_gs

In [None]:
# Testing accuracy of the DecisionTree selected by GridSearchCV (held-out X_test rows)
testing_acc_dt_gs

##### Logistic Regression

In [None]:
from sklearn.linear_model import LogisticRegression

# Baseline logistic regression with scikit-learn defaults.
# NOTE(review): the features are not scaled before fitting - check this cell's
# output for a ConvergenceWarning and confirm the default iteration cap suffices.
lr = LogisticRegression().fit(X_train, y_train)

# Predict on both splits, then score each one.
y_train_pred_lr, y_test_pred_lr = lr.predict(X_train), lr.predict(X_test)
training_acc_lr = accuracy_score(y_train, y_train_pred_lr)
testing_acc_lr = accuracy_score(y_test, y_test_pred_lr)

In [None]:
# Training accuracy of the baseline LogisticRegression (share of X_train rows predicted correctly)
training_acc_lr

In [None]:
# Testing accuracy of the baseline LogisticRegression (share of held-out X_test rows predicted correctly)
testing_acc_lr

##### GridSearchCV optimization for LogisticRegression

In [None]:
# Define the parameter grid.
# C is the inverse of the regularization strength; penalty is the norm used.
# The grid is split by penalty because scikit-learn's default 'lbfgs' solver
# does not support 'l1': searched with the default solver, every l1 candidate
# fails to fit and is scored as NaN (or raises, depending on error_score).
# 'liblinear' supports the l1 penalty; the l2 candidates keep the estimator's
# own solver so their results are unchanged.
param_grid = [
    {'C': [0.1, 1, 10], 'penalty': ['l2']},
    {'C': [0.1, 1, 10], 'penalty': ['l1'], 'solver': ['liblinear']},
]

# Create the GridSearchCV object
grid_search = GridSearchCV(lr, param_grid, cv=5)

# Fit the data to find the best parameters
grid_search.fit(X_train, y_train)

# Get the best estimator (refit on all of X_train with the winning parameters)
best_lr = grid_search.best_estimator_

# Make predictions on the training and testing sets
y_train_pred_lr_gs = best_lr.predict(X_train)
y_test_pred_lr_gs = best_lr.predict(X_test)

# Calculate accuracy scores
training_acc_lr_gs = accuracy_score(y_train, y_train_pred_lr_gs)
testing_acc_lr_gs = accuracy_score(y_test, y_test_pred_lr_gs)

In [None]:
# Training accuracy of the LogisticRegression selected by GridSearchCV
training_acc_lr_gs

In [None]:
# Testing accuracy of the LogisticRegression selected by GridSearchCV (held-out X_test rows)
testing_acc_lr_gs

##### Support Vector Machine (SVM)

In [None]:
from sklearn.svm import SVC

# Baseline support vector classifier with scikit-learn defaults.
svm = SVC().fit(X_train, y_train)

# Predict on both splits, then score each one.
y_train_pred_svm, y_test_pred_svm = svm.predict(X_train), svm.predict(X_test)
training_acc_svm = accuracy_score(y_train, y_train_pred_svm)
testing_acc_svm = accuracy_score(y_test, y_test_pred_svm)

In [None]:
# Training accuracy of the baseline SVC (share of X_train rows predicted correctly)
training_acc_svm

In [None]:
# Testing accuracy of the baseline SVC (share of held-out X_test rows predicted correctly)
testing_acc_svm

##### GridSearchCV optimization for Support Vector Machine (SVM)

In [None]:
# Define the parameter grid.
# C is the regularization parameter, kernel the kernel function and gamma the
# kernel coefficient. gamma is ignored by the linear kernel, so crossing it
# with 'linear' only repeats identical fits; the grid is therefore split so
# gamma is searched for the 'rbf' kernel only.
param_grid = [
    {'kernel': ['linear'], 'C': [0.1, 1, 10]},
    {'kernel': ['rbf'], 'C': [0.1, 1, 10], 'gamma': ['scale', 'auto']},
]

# Create the GridSearchCV object.
# n_jobs=-1 runs the independent candidate/fold fits on all available CPU
# cores; it only affects wall-clock time, not which candidate is selected.
grid_search = GridSearchCV(svm, param_grid, cv=5, n_jobs=-1)

# Fit the data to find the best parameters
grid_search.fit(X_train, y_train)

# Get the best estimator (refit on all of X_train with the winning parameters)
best_svm = grid_search.best_estimator_

# Make predictions on the training and testing sets
y_train_pred_svm_gs = best_svm.predict(X_train)
y_test_pred_svm_gs = best_svm.predict(X_test)

# Calculate accuracy scores
training_acc_svm_gs = accuracy_score(y_train, y_train_pred_svm_gs)
testing_acc_svm_gs = accuracy_score(y_test, y_test_pred_svm_gs)

In [None]:
# Training accuracy of the SVC selected by GridSearchCV
training_acc_svm_gs

In [None]:
# Testing accuracy of the SVC selected by GridSearchCV (held-out X_test rows)
testing_acc_svm_gs

### Save Scores in DataFrame

In [None]:
# Collect the scores row-wise (model name, training accuracy, testing accuracy)
# and transpose them into the column-oriented dictionary the DataFrame is built from.
accuracy_scores = dict(zip(
    ('Model', 'Training Accuracy', 'Testing Accuracy'),
    map(list, zip(
        ('RandomForest', training_acc_rf, testing_acc_rf),
        ('RandomForestGridSearch', training_acc_rf_gs, testing_acc_rf_gs),
        ('DecisionTree', training_acc_dt, testing_acc_dt),
        ('DecisionTreeGridSearch', training_acc_dt_gs, testing_acc_dt_gs),
        ('LogisticRegression', training_acc_lr, testing_acc_lr),
        ('LogisticRegressionGridSearch', training_acc_lr_gs, testing_acc_lr_gs),
        ('SVM', training_acc_svm, testing_acc_svm),
        ('SVMGridSearch', training_acc_svm_gs, testing_acc_svm_gs),
    )),
))

# One row per model, one column per metric.
accuracy_df = pd.DataFrame(accuracy_scores)

# Display the accuracy DataFrame
print(accuracy_df)

### Graph Model Performances

In [None]:
import matplotlib.pyplot as plt

# Accuracy scores for each model
models = ['RandomForest', 'RandomForestGridSearch', 'DecisionTree', 'DecisionTreeGridSearch',
          'LogisticRegression', 'LogisticRegressionGridSearch', 'SVM', 'SVMGridSearch']
training_acc = [training_acc_rf, training_acc_rf_gs, training_acc_dt, training_acc_dt_gs,
                training_acc_lr, training_acc_lr_gs, training_acc_svm, training_acc_svm_gs]
testing_acc = [testing_acc_rf, testing_acc_rf_gs, testing_acc_dt, testing_acc_dt_gs,
               testing_acc_lr, testing_acc_lr_gs, testing_acc_svm, testing_acc_svm_gs]

# Plotting the performance.
# The two series are drawn side by side. Plotting both at the same x positions
# draws the testing bars over the training bars, so a training bar is only
# visible where it exceeds the testing one and is hidden entirely otherwise.
bar_width = 0.4
positions = list(range(len(models)))
train_x = [p - bar_width / 2 for p in positions]
test_x = [p + bar_width / 2 for p in positions]

plt.figure(figsize=(10, 6))
plt.bar(train_x, training_acc, width=bar_width, label='Training Accuracy')
plt.bar(test_x, testing_acc, width=bar_width, label='Testing Accuracy')
plt.title('Model Performance - Training vs Testing Accuracy')
plt.xlabel('Model')
plt.ylabel('Accuracy')
plt.ylim([0, 1.1])
# One tick per model, centred under its pair of bars.
plt.xticks(positions, models, rotation=45, ha='right')
plt.legend()
plt.show()


# Additional RandomForest (GridSearchCV) Validation

In [None]:
from sklearn.model_selection import cross_val_score, cross_val_predict
from sklearn.metrics import confusion_matrix

In [None]:
# 5-fold cross-validation of the tuned forest on the full feature matrix and labels.
cv_scores = cross_val_score(estimator=best_rf, X=X, y=y, cv=5)

# Summarise the per-fold accuracies by their mean and standard deviation.
mean_accuracy, std_accuracy = cv_scores.mean(), cv_scores.std()

In [None]:
# Out-of-fold prediction for every row, from 10-fold cross-validation.
# NOTE(review): this uses 10 folds while the accuracy summary uses 5 - confirm the difference is intentional.
cv_predictions = cross_val_predict(estimator=best_rf, X=X, y=y, cv=10)

In [None]:
# Confusion matrix of the true labels against the cross-validated predictions.
confusion_mat = confusion_matrix(y_true=y, y_pred=cv_predictions)

# Print the heading and the matrix on separate lines.
print("Confusion Matrix:", confusion_mat, sep="\n")

In [None]:
# Report the cross-validation summary (mean to 2 decimals, spread to 6).
print(f"Mean Accuracy: {mean_accuracy:.2f}")
print(f"Standard Deviation of Accuracy: {std_accuracy:.6f}")

# Save Model

In [None]:
import pickle
import os

# Destination of the serialized model.
file_path = "out/predict_student_success.pkl"

# Make sure the containing folder ("out") exists before writing.
os.makedirs(os.path.dirname(file_path), exist_ok=True)

# Serialize the tuned random forest with pickle and write the bytes to disk.
with open(file_path, 'wb') as f:
    f.write(pickle.dumps(best_rf))