In [1]:

# IMPORTANT: RUN THIS CELL IN ORDER TO IMPORT YOUR KAGGLE DATA SOURCES
# TO THE CORRECT LOCATION (/kaggle/input) IN YOUR NOTEBOOK,
# THEN FEEL FREE TO DELETE THIS CELL.
# NOTE: THIS NOTEBOOK ENVIRONMENT DIFFERS FROM KAGGLE'S PYTHON
# ENVIRONMENT SO THERE MAY BE MISSING LIBRARIES USED BY YOUR
# NOTEBOOK.

import os
import sys
from tempfile import NamedTemporaryFile
from urllib.request import urlopen
from urllib.parse import unquote, urlparse
from urllib.error import HTTPError
from zipfile import ZipFile
import tarfile
import shutil

CHUNK_SIZE = 40960
DATA_SOURCE_MAPPING = 'higher-education-predictors-of-student-retention:https%3A%2F%2Fstorage.googleapis.com%2Fkaggle-data-sets%2F2780494%2F4802354%2Fbundle%2Farchive.zip%3FX-Goog-Algorithm%3DGOOG4-RSA-SHA256%26X-Goog-Credential%3Dgcp-kaggle-com%2540kaggle-161607.iam.gserviceaccount.com%252F20241011%252Fauto%252Fstorage%252Fgoog4_request%26X-Goog-Date%3D20241011T101308Z%26X-Goog-Expires%3D259200%26X-Goog-SignedHeaders%3Dhost%26X-Goog-Signature%3D3ccd00405fd0e26b23ff2784eb85c0a5de8a417653c4f16a0f57bbc426e975ba7b58a4b8903299eb9f7aa470ccba6d1dfcb801a1442f7bcdf39d0696d99bd2ec61bb624b5ba1db297d17bdd2e4449cbd08502094754d02da59bc4db96c37102085ec3ec4527010a354badf5645cd65ac51a37f72d9a9c3b290a200351b8970572bf6cce5508d14937ead34e59f474568a7c35c06cde422d0cd73e161cc30f234c08bccd88d6ec79b4edd0b99f361981197e25a57a7234661dc7008249462d5c8ebc768d9dd03f31999b582887192b5372194469916fe78f769c396ddceded69bef01c554728f09cf84f4a1a2d93536a1173d3a0d633a18c310feb4d31b597b37'

KAGGLE_INPUT_PATH='/kaggle/input'
KAGGLE_WORKING_PATH='/kaggle/working'
KAGGLE_SYMLINK='kaggle'

!umount /kaggle/input/ 2> /dev/null
shutil.rmtree('/kaggle/input', ignore_errors=True)
os.makedirs(KAGGLE_INPUT_PATH, 0o777, exist_ok=True)
os.makedirs(KAGGLE_WORKING_PATH, 0o777, exist_ok=True)

try:
  os.symlink(KAGGLE_INPUT_PATH, os.path.join("..", 'input'), target_is_directory=True)
except FileExistsError:
  pass
try:
  os.symlink(KAGGLE_WORKING_PATH, os.path.join("..", 'working'), target_is_directory=True)
except FileExistsError:
  pass

# Download every configured data source and unpack it below KAGGLE_INPUT_PATH.
# Each entry of DATA_SOURCE_MAPPING has the form "<directory>:<url-encoded URL>".
for data_source_mapping in DATA_SOURCE_MAPPING.split(','):
    # Split only on the first ':' so the unpacking cannot fail on extra colons.
    directory, download_url_encoded = data_source_mapping.split(':', 1)
    download_url = unquote(download_url_encoded)
    filename = urlparse(download_url).path
    destination_path = os.path.join(KAGGLE_INPUT_PATH, directory)
    try:
        with urlopen(download_url) as fileres, NamedTemporaryFile() as tfile:
            # The Content-Length header can be missing; fall back to a plain
            # byte counter instead of crashing on int(None).
            content_length = fileres.headers['content-length']
            total_length = int(content_length) if content_length else 0
            print(f'Downloading {directory}, {content_length} bytes compressed')
            dl = 0
            data = fileres.read(CHUNK_SIZE)
            while len(data) > 0:
                dl += len(data)
                tfile.write(data)
                if total_length > 0:
                    done = min(50, int(50 * dl / total_length))
                    sys.stdout.write(f"\r[{'=' * done}{' ' * (50-done)}] {dl} bytes downloaded")
                else:
                    sys.stdout.write(f"\r{dl} bytes downloaded")
                sys.stdout.flush()
                data = fileres.read(CHUNK_SIZE)
            # Make sure every byte is on disk (the tar branch reopens the file
            # by name) and rewind before handing the file to an extractor.
            tfile.flush()
            tfile.seek(0)
            if filename.endswith('.zip'):
                with ZipFile(tfile) as zfile:
                    zfile.extractall(destination_path)
            else:
                # Do not name the handle `tarfile`: that would rebind the
                # module and break the next loop iteration.
                with tarfile.open(tfile.name) as tar_archive:
                    tar_archive.extractall(destination_path)
            print(f'\nDownloaded and uncompressed: {directory}')
    except HTTPError as e:
        print(f'Failed to load (likely expired) {download_url} to path {destination_path} ({e})')
        continue
    except OSError as e:
        print(f'Failed to load {download_url} to path {destination_path} ({e})')
        continue

print('Data source import complete.')


Downloading higher-education-predictors-of-student-retention, 89332 bytes compressed
Data source import complete.


# Story Telling

In a bustling city, a vibrant university attracted diverse students, each with unique backgrounds and challenges. A rich dataset captured their stories, highlighting factors like marital status, application modes, and courses chosen, reflecting their aspirations. This data revealed patterns of success and struggle, particularly showing that single and evening students faced more obstacles. Insights from the dataset guided the university to create tailored support systems, such as mentorship and flexible schedules. Ultimately, this transformed the data into a living narrative, empowering students in their pursuit of education and dreams.

# Importing

In [2]:
import numpy as np
import pandas as pd
import plotly.graph_objects as go
import plotly.figure_factory as ff
import matplotlib.pyplot as plt
import plotly.express as px
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import confusion_matrix, accuracy_score
from sklearn.svm import SVC
from sklearn.ensemble import AdaBoostClassifier
from sklearn.datasets import make_classification
from sklearn.ensemble import GradientBoostingClassifier
import xgboost as xgb
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score
from sklearn.preprocessing import LabelEncoder
import matplotlib.pyplot as plt
from imblearn.over_sampling import SMOTE
from imblearn.under_sampling import RandomUnderSampler
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import precision_score, recall_score
from sklearn.linear_model import LogisticRegression
import json
import os
from sklearn.model_selection import GridSearchCV
import warnings
warnings.filterwarnings('ignore')

In [3]:
df = pd.read_csv('/kaggle/input/higher-education-predictors-of-student-retention/dataset.csv')
df.info()

FileNotFoundError: [Errno 2] No such file or directory: '/kaggle/input/higher-education-predictors-of-student-retention/dataset.csv'

# EDA

In [None]:
df.head()

## Data Quality Assessment

In [None]:
df.info()

In [None]:
df.isnull().sum()

In [None]:
df.duplicated().sum()

In [None]:
for col in df.columns:
    print(f"Column {col} value counts percentage : ")
    print((df[col].value_counts())/len(df))
    print(f"Total value counts Percentage in {col} : ")
    print((df[col].value_counts().sum())/len(df))

**The data looks pretty fine without missing values so far.**

In [None]:
df.describe().T

## Data Visualization

In [None]:
num_cols = df.select_dtypes(include=np.number).columns
numerical_data = df[num_cols]

In [None]:
 # Set the size of the heatmap
plt.figure(figsize=(32, 28))
# Create a heatmap
sns.heatmap(numerical_data.corr(), annot=True, fmt=".2f", cmap='coolwarm', square=True, cbar_kws={"shrink": .8})
# Set titles and labels
plt.title('Correlation Heatmap')
plt.xticks(rotation=45)
plt.yticks(rotation=45)
plt.tight_layout()  # Adjust layout to prevent clipping of tick-labels
# Show the plot
plt.show()

In [None]:
# One figure per numeric column: histogram, box plot and KDE side by side.
# Iterating a DataFrame yields its column names, so the former row slice
# `numerical_data[1:]` never skipped a column; iterate the columns explicitly.
for col in numerical_data.columns:
    fig, (hist_ax, box_ax, kde_ax) = plt.subplots(1, 3, figsize=(15, 5))

    # Histogram
    df[col].plot.hist(bins=30, color='skyblue', edgecolor='black', ax=hist_ax)
    hist_ax.set_xlabel(col)
    hist_ax.set_ylabel('Frequency')
    hist_ax.set_title(f'Histogram of {col}')

    # Boxplot
    sns.boxplot(x=df[col], color='skyblue', width=0.5, ax=box_ax)
    box_ax.set_xlabel(col)
    box_ax.set_ylabel('Value')
    box_ax.set_title(f'Boxplot of {col}')

    # KDE plot
    sns.kdeplot(df[col], color='skyblue', fill=True, ax=kde_ax)
    kde_ax.set_xlabel(col)
    kde_ax.set_ylabel('Density')
    kde_ax.set_title(f'KDE of {col}')

    fig.tight_layout()
    plt.show()
    # Release the figure so a long column list does not pile up open figures.
    plt.close(fig)

In [None]:
labels = df['Target'].value_counts().index
values = df['Target'].value_counts().values

plt.pie(values, labels = labels, colors = ['lightsalmon', 'skyblue', 'wheat'], autopct = '%1.0f%%')
plt.title('Proportion of the Labels');

In [None]:
# Target by Marital Status
fig = px.histogram(df, x='Marital status', color='Target', barmode='group', nbins=20, title='Target by Marital Status')
fig.update_layout(xaxis_title='Marital Status', yaxis_title='Count')
fig.show()

In [None]:
# Target by Curricular units 1st sem (grade)
fig = px.histogram(df, x='Curricular units 1st sem (grade)', color='Target', barmode='group', nbins=20, title='Target by 1st Sem Grades')
fig.update_layout(xaxis_title='Curricular Units 1st Sem (Grade)', yaxis_title='Count')
fig.show()

In [None]:
# Target by Previous qualification
fig = px.histogram(df, x='Previous qualification', color='Target', barmode='group', nbins=20, title='Target by Previous qualification')
fig.update_layout(xaxis_title='Previous qualification', yaxis_title='Count')
fig.show()

In [None]:
# Target by Gender
fig = px.histogram(df, x='Gender', color='Target', barmode='group', nbins=20, title='Dropout by Gender')
fig.update_layout(xaxis_title='Gender', yaxis_title='Count')
fig.show()

In [None]:
# Target by Scholarship holder
fig = px.histogram(df, x='Scholarship holder', color='Target', barmode='group', nbins=20, title='Dropout by Scholarship holder')
fig.update_layout(xaxis_title='Scholarship holder', yaxis_title='Count')
fig.show()

# Data Preprocessing

## Encoding

In [None]:
# Re-encode the three qualification columns with one shared code table so the
# same qualification gets the same integer in every column.
qualification_columns = ['Previous qualification',
                         "Mother's qualification",
                         "Father's qualification"]

# Collect the distinct values of every column, in column order.
observed_values = []
for column_name in qualification_columns:
    observed_values.extend(df[column_name].unique().tolist())
unique_values = set(observed_values)

# Create the mapping: each distinct value -> its position in the set iteration
value_to_index = dict(zip(unique_values, range(len(unique_values))))

# Apply the mapping to the relevant columns
for column_name in qualification_columns:
    df[column_name] = df[column_name].map(value_to_index)

In [None]:
# Re-encode both parental occupation columns with one shared code table.
occupation_columns = ["Mother's occupation", "Father's occupation"]

# Collect the distinct values of every column, in column order.
observed_values = []
for column_name in occupation_columns:
    observed_values.extend(df[column_name].unique().tolist())
unique_values = set(observed_values)

# Create the mapping: each distinct value -> its position in the set iteration
value_to_index = dict(zip(unique_values, range(len(unique_values))))

# Apply the mapping to the relevant columns
for column_name in occupation_columns:
    df[column_name] = df[column_name].map(value_to_index)

## Column Reduction

In [None]:
df = df.drop(['Nacionality', 'International', 'Educational special needs'], axis = 1)

In [None]:
df['I/U Ratio']=df['Inflation rate']/df['Unemployment rate']
df.drop(columns=['Inflation rate','Unemployment rate'],inplace=True)

In [None]:
from sklearn.preprocessing import OrdinalEncoder
df['Target'] = OrdinalEncoder(categories = [['Dropout', 'Enrolled', 'Graduate']]).fit_transform(df[['Target']])

In [None]:
plt.figure(figsize = (15, 15))
plt.rcParams.update({'font.size': 8})
sns.heatmap(df.drop('Target', axis = 1).corr(method = 'spearman'), vmin = -1, vmax = 1, center = 0,
            cmap = 'coolwarm', fmt = '.2f', linewidths = .2, annot = True);

In [None]:
df['Curricular units avg credited'] = df[['Curricular units 1st sem (credited)', 'Curricular units 2nd sem (credited)']].mean(axis = 1)
df['Curricular avg enrolled'] = df[['Curricular units 1st sem (enrolled)', 'Curricular units 2nd sem (enrolled)']].mean(axis=1)
df['Curricular avg evaluations'] = df[['Curricular units 1st sem (evaluations)', 'Curricular units 2nd sem (evaluations)']].mean(axis=1)
df['Curricular avg approved'] = df[['Curricular units 1st sem (approved)', 'Curricular units 2nd sem (approved)']].mean(axis=1)
df['Curricular avg grade'] = df[['Curricular units 1st sem (grade)', 'Curricular units 2nd sem (grade)']].mean(axis=1)
df['Curricular avg without evaluations'] = df[['Curricular units 1st sem (without evaluations)', 'Curricular units 2nd sem (without evaluations)']].mean(axis=1)

In [None]:
num_features = df[['Age at enrollment', 'Curricular units avg credited', 'Curricular avg enrolled',
                              'Curricular avg evaluations', 'Curricular avg approved',
                              'Curricular avg grade', 'Curricular avg without evaluations', 'I/U Ratio',
                              'GDP', 'Target']]

plt.figure(figsize = (10, 8))
plt.rcParams.update({'font.size': 8})
sns.heatmap(num_features.corr(method = 'spearman'), vmin = -1, vmax = 1, center = 0,
            cmap = 'coolwarm', fmt = '.2f', linewidths = .2, annot = True);

In [None]:
# Box plots of four features split by the encoded Target
# (0 = Dropout, 1 = Enrolled, 2 = Graduate).
# Each panel is configured through its own axis; previously panels 3 and 4
# were styled through `ax2`, leaving them without titles or tick labels.
boxplot_panels = [
    # (feature column, box colour, panel title)
    ('Curricular avg approved', 'lightgreen', 'Average Approved Curricular Units'),
    ('Curricular avg grade', 'lightsalmon', 'Average Grade of Curricular Units'),
    ('Curricular avg enrolled', 'lightsalmon', 'Average Enrolled Curricular Units'),
    ('Age at enrollment', 'lightsalmon', 'Age at Enrollment'),
]

fig, (ax1, ax2, ax3, ax4) = plt.subplots(nrows = 1, ncols = 4, figsize = (16, 4))
for ax, (feature, colour, panel_title) in zip((ax1, ax2, ax3, ax4), boxplot_panels):
    sns.boxplot(data = df, x = 'Target', y = feature, color = colour, ax = ax)
    ax.set_title(panel_title)
    ax.set_xlabel("")
    ax.set_xticks([0, 1, 2])
    ax.set_xticklabels(['Drop Out', 'Enrolled', 'Graduate'])
fig.tight_layout();

## Outliers

In [None]:
# Outlier removal: graduated students (Target == 2) should not have zero
# average approved units, zero average enrolled units or a zero average grade,
# so such rows are dropped one condition at a time.
df = df.drop(df.loc[(df['Curricular avg approved'] == 0) & (df['Target'] == 2)].index)
df = df.drop(df.loc[(df['Curricular avg enrolled'] == 0) & (df['Target'] == 2)].index)
df = df.drop(df.loc[(df['Curricular avg grade'] == 0) & (df['Target'] == 2)].index)

## Skewness

In [None]:
# Detect right-skewed columns and compress them with log1p.
# NOTE: this cell mutates `df` in place, so running it twice applies the
# transform twice; re-run the notebook from the top instead.
SKEW_THRESHOLD = 0.5

skewed_features = {}   # column name -> sample skewness
skewed_columns = []    # columns that actually received the log transform

for feature in df.columns:
    skewness = df[feature].skew()
    skewed_features[feature] = skewness
    if skewness > SKEW_THRESHOLD:
        print(f"{feature} is right skewed with skewness: {skewness}")

for feature, skewness in skewed_features.items():
    if not skewness > SKEW_THRESHOLD:
        continue
    if feature == 'Target':
        # Never transform the label column; later cells compare it to 0/1/2.
        continue
    if df[feature].min() <= -1:
        # log1p(x) is undefined for x <= -1 and would inject NaN/-inf values.
        print(f"Skipping log transformation for {feature}: contains values <= -1")
        continue
    skewed_columns.append(feature)
    df[feature] = np.log1p(df[feature])

print("Log transformation applied to right-skewed features.")

# Modeling

In [None]:
Conclusions = []
Conclusions.append(['Model Name', 'Train','Validation' , 'Test'])

def Evaluate(model_name, y_train_pred, y_val_pred, y_test_pred):
    """Print accuracy and a classification report for each data split.

    The true labels are read from the notebook-level variables ``y_train``,
    ``y_val`` and ``y_test`` at call time.

    Args:
        model_name: Label stored as the first element of the returned row.
        y_train_pred: Predictions for the training split.
        y_val_pred: Predictions for the validation split.
        y_test_pred: Predictions for the test split.

    Returns:
        ``[model_name, train_accuracy, validation_accuracy, test_accuracy]``.
    """
    splits = (
        ("Train", y_train, y_train_pred),
        ("Validation", y_val, y_val_pred),
        ("Test", y_test, y_test_pred),
    )

    # Accuracy Scores
    accuracies = [accuracy_score(y_true, y_pred) for _, y_true, y_pred in splits]
    for (split_name, _, _), split_accuracy in zip(splits, accuracies):
        print(f"{split_name} Accuracy: {split_accuracy}")

    # Classification Reports
    for split_name, y_true, y_pred in splits:
        print("=================================================================================================")
        print(f"Classification Report for {split_name} Set:\n", classification_report(y_true, y_pred, zero_division=0))

    return [model_name, *accuracies]

## Split and Scale

In [None]:
X = df.drop('Target', axis=1)
y = df['Target']

# 70 % train, then the remaining 30 % is halved into validation and test.
X_train, X_temp, y_train, y_temp = train_test_split(X, y, test_size=0.3, random_state=42)
X_val, X_test, y_val, y_test = train_test_split(X_temp, y_temp, test_size=0.5, random_state=42)

# Collapse the three encoded classes into a binary label:
# Dropout (0) -> 1, Graduate (2) -> 0, Enrolled (1) stays 1.
y_train = y_train.replace([0, 2], [1, 0])
y_val = y_val.replace([0, 2], [1, 0])
y_test = y_test.replace([0, 2], [1, 0])

from sklearn.preprocessing import StandardScaler , RobustScaler
Standard = StandardScaler()

# Fit the scaler on the training split only, then reuse those statistics for
# validation and test. Re-fitting on each split scales every split differently
# and leaks held-out statistics into the evaluation.
X_train = pd.DataFrame(Standard.fit_transform(X_train),columns=X_train.columns)
X_val = pd.DataFrame(Standard.transform(X_val),columns=X_train.columns)
X_test = pd.DataFrame(Standard.transform(X_test),columns=X_train.columns)

## Logistic Regression

In [None]:
log_reg_model = LogisticRegression(max_iter=10000)
log_reg_model.fit(X_train, y_train)

y_train_pred = log_reg_model.predict(X_train)
y_val_pred = log_reg_model.predict(X_val)
y_test_pred = log_reg_model.predict(X_test)

Conclusions.append(Evaluate('Logistic Regression', y_train_pred, y_val_pred, y_test_pred))

In [None]:
cm = confusion_matrix(y_test, y_test_pred)
sns.heatmap(cm, cmap='Blues', annot=True, fmt='.0f')

## KNN

In [None]:
from sklearn.metrics import classification_report

knn_model = KNeighborsClassifier(n_neighbors=11)
knn_model.fit(X_train, y_train)

y_train_pred = knn_model.predict(X_train)
y_val_pred = knn_model.predict(X_val)
y_test_pred = knn_model.predict(X_test)

Conclusions.append( Evaluate('KNN', y_train_pred, y_val_pred, y_test_pred) )

In [None]:
cm = confusion_matrix(y_test, y_test_pred)
sns.heatmap(cm, cmap='Blues', annot=True, fmt='.0f')

## SVM

In [None]:
from sklearn.svm import SVC
svm_classifier = SVC(kernel='linear', random_state=42)

svm_classifier.fit(X_train, y_train.values.ravel())

y_train_pred = svm_classifier.predict(X_train)
y_val_pred = svm_classifier.predict(X_val)
y_test_pred = svm_classifier.predict(X_test)

Conclusions.append( Evaluate('SVM', y_train_pred, y_val_pred, y_test_pred) )

In [None]:
cm = confusion_matrix(y_test, y_test_pred)
sns.heatmap(cm, cmap='Blues', annot=True, fmt='.0f')

## Random Forest Classifier

In [None]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, accuracy_score

rf_clf = RandomForestClassifier(n_estimators=100, random_state=42)
rf_clf.fit(X_train, y_train)

y_train_pred = rf_clf.predict(X_train)
y_val_pred = rf_clf.predict(X_val)
y_test_pred = rf_clf.predict(X_test)

Conclusions.append( Evaluate('Random Forest', y_train_pred, y_val_pred, y_test_pred) )

In [None]:
cm = confusion_matrix(y_test, y_test_pred)
sns.heatmap(cm, cmap='Blues', annot=True, fmt='.0f')

## Naive Bayes

In [None]:
from sklearn.metrics import classification_report, accuracy_score
from sklearn.naive_bayes import GaussianNB

nb_classifier = GaussianNB()
nb_classifier.fit(X_train, y_train.values.ravel())

y_train_pred = nb_classifier.predict(X_train)
y_val_pred = nb_classifier.predict(X_val)
y_test_pred = nb_classifier.predict(X_test)

Conclusions.append( Evaluate('Naive Bayes', y_train_pred, y_val_pred, y_test_pred) )

In [None]:
cm = confusion_matrix(y_test, y_test_pred)
sns.heatmap(cm, cmap='Blues', annot=True, fmt='.0f')

## Decision Tree

In [None]:
from sklearn.metrics import classification_report, accuracy_score
from sklearn.tree import DecisionTreeClassifier

clf = DecisionTreeClassifier()
clf.fit(X_train, y_train)

y_train_pred = clf.predict(X_train)
y_val_pred = clf.predict(X_val)
y_test_pred = clf.predict(X_test)

Conclusions.append( Evaluate('Decision Tree', y_train_pred, y_val_pred, y_test_pred) )

In [None]:
cm = confusion_matrix(y_test, y_test_pred)
sns.heatmap(cm, cmap='Blues', annot=True, fmt='.0f')

## AdaBoost Classifier

In [None]:
n_estimators = 100
clf = AdaBoostClassifier(n_estimators=n_estimators, algorithm="SAMME", random_state=0)
clf.fit(X_train, y_train)

y_train_pred = clf.predict(X_train)
y_val_pred = clf.predict(X_val)
y_test_pred = clf.predict(X_test)

Conclusions.append( Evaluate('AdaBoost Classifier', y_train_pred, y_val_pred, y_test_pred) )

In [None]:
cm = confusion_matrix(y_test, y_test_pred)
sns.heatmap(cm, cmap='Blues', annot=True, fmt='.0f')

### AdaBoost Classifier with Grid Search

In [None]:
clf = AdaBoostClassifier(random_state=42)

parameters_grid = {
    'n_estimators': [50, 100, 200],
    'algorithm': ['SAMME', 'SAMME.R']
}

grid_search = GridSearchCV(clf, parameters_grid, cv=5, scoring='accuracy')
grid_search.fit(X_train, y_train)

best_ada_model = grid_search.best_estimator_
print(f'Best AdaBoost Classifier parameters: {grid_search.best_params_}')

y_train_pred = best_ada_model.predict(X_train)
y_val_pred = best_ada_model.predict(X_val)
y_test_pred = best_ada_model.predict(X_test)

Conclusions.append( Evaluate('AdaBoost Classifier with Grid Search', y_train_pred, y_val_pred, y_test_pred) )

In [None]:
cm = confusion_matrix(y_test, y_test_pred)
sns.heatmap(cm, cmap='Blues', annot=True, fmt='.0f')

## Gradient Boosting Classifier

In [None]:
n_estimators = 100
max_depth = 3

clf = GradientBoostingClassifier(n_estimators=n_estimators, max_depth=max_depth, random_state=0)
clf.fit(X_train, y_train)

# Predict with THIS model for all three splits. Previously Evaluate received
# the predictions left over from the preceding AdaBoost grid-search cell.
y_train_pred = clf.predict(X_train)
y_val_pred = clf.predict(X_val)
y_test_pred = clf.predict(X_test)

# Original variable names, kept so any cell that still reads them keeps working.
y_pred_gb_test = y_test_pred
y_pred_gb_train = y_train_pred

Conclusions.append( Evaluate('Gradient Boosting Classifier', y_train_pred, y_val_pred, y_test_pred) )

In [None]:
cm = confusion_matrix(y_test, y_test_pred)
sns.heatmap(cm, cmap='Blues', annot=True, fmt='.0f')

### Gradient Boost Classifier with Grid Search

In [None]:
clf = GradientBoostingClassifier(random_state=0)

parameters_grid = {
    'n_estimators': [50, 100, 200],
    'max_depth': [3, 5, 7]
}

grid_search = GridSearchCV(clf, parameters_grid, cv=5, scoring='accuracy')
grid_search.fit(X_train, y_train)

best_gb_model = grid_search.best_estimator_
print(f'Best Gradient Boosting parameters: {grid_search.best_params_}')

y_test_pred = best_gb_model.predict(X_test)
y_train_pred = best_gb_model.predict(X_train)
y_val_pred = best_gb_model.predict(X_val)

Conclusions.append( Evaluate('GradientBoost Classifier with Grid Search', y_train_pred, y_val_pred, y_test_pred) )

In [None]:
cm = confusion_matrix(y_test, y_test_pred)
sns.heatmap(cm, cmap='Blues', annot=True, fmt='.0f')

## XGB Classifier

In [None]:
n_estimators = 100
max_depth = 3
gamma = 0
learning_rate = 0.1

model = xgb.XGBClassifier(learning_rate=learning_rate, n_estimators=n_estimators, max_depth=max_depth, gamma=gamma, random_state=42)
model.fit(X_train, y_train)

y_train_pred = model.predict(X_train)
y_test_pred = model.predict(X_test)
y_val_pred = model.predict(X_val)

Conclusions.append( Evaluate('XGBoost Classifier', y_train_pred, y_val_pred, y_test_pred) )

In [None]:
cm = confusion_matrix(y_test, y_test_pred)
sns.heatmap(cm, cmap='Blues', annot=True, fmt='.0f')

### XGB Classifier with Grid Search

In [None]:
model = xgb.XGBClassifier(random_state=42)

paramters_grid = {
    'n_estimators': [50, 100, 200],
    'max_depth': [3, 5, 7],
    'learning_rate': [0.01, 0.1, 0.2],
    'gamma': [0, 0.1, 0.2]
}

grid_search = GridSearchCV(model, paramters_grid, cv=5, scoring='accuracy')
grid_search.fit(X_train, y_train)

best_xgb_model = grid_search.best_estimator_
print(f'Best XGBoost parameters: {grid_search.best_params_}')

y_train_pred = best_xgb_model.predict(X_train)
y_test_pred = best_xgb_model.predict(X_test)
y_val_pred = best_xgb_model.predict(X_val)

Conclusions.append( Evaluate('XGBoost Classifier with Grid Search', y_train_pred, y_val_pred, y_test_pred) )

In [None]:
cm = confusion_matrix(y_test, y_test_pred)
sns.heatmap(cm, cmap='Blues', annot=True, fmt='.0f')

In [None]:
# Conclusions[0] is the header row; every following row is
# [model name, train accuracy, validation accuracy, test accuracy].
result_rows = Conclusions[1:]
model_names = [row[0] for row in result_rows]
train_accuracies = [row[1] for row in result_rows]
val_accuracies = [row[2] for row in result_rows]
test_accuracies = [row[3] for row in result_rows]

# Draw the three series side by side. Plotting them at the same x positions
# makes the later bars cover the earlier ones.
positions = np.arange(len(model_names))
bar_width = 0.27

fig, ax = plt.subplots(figsize=(12, 8))
ax.bar(positions - bar_width, train_accuracies, width=bar_width, label='Train Accuracy')
ax.bar(positions, val_accuracies, width=bar_width, label='Validation Accuracy')
ax.bar(positions + bar_width, test_accuracies, width=bar_width, label='Test Accuracy')

# Add labels and title
ax.set_xlabel('Model Name')
ax.set_ylabel('Accuracy')
ax.set_title('Model Performance Comparison')
ax.legend()

# Rotate x-axis labels for better readability
ax.set_xticks(positions)
ax.set_xticklabels(model_names, rotation=90)

# Show the plot
plt.show()

# Clustering

In [None]:
# Robust (median / IQR) scaling for the clustering experiments.
# NOTE(review): X_train, X_val and X_test were already overwritten with their
# StandardScaler output earlier in the notebook, so this scales standardized
# data a second time - confirm that is intended.
Robust= RobustScaler()
# Fit on the training split only and reuse its statistics for the other splits.
X_train_Robust = pd.DataFrame(Robust.fit_transform(X_train),columns=X_train.columns)
X_val_Robust = pd.DataFrame(Robust.transform(X_val),columns=X_train.columns)
X_test_Robust = pd.DataFrame(Robust.transform(X_test),columns=X_train.columns)

## KMeans

In [None]:
from sklearn.cluster import KMeans
# Elbow curve for KMeans on the standard-scaled training features
# (no PCA is applied at this point).
iner=[]
for k in range(2,20):
  # A fixed seed makes the inertia curve reproducible across runs.
  kmeans = KMeans(n_clusters=k, random_state=42)
  kmeans.fit(X_train)
  iner.append(kmeans.inertia_)
plt.style.use("fivethirtyeight")
plt.plot(range(2,20),iner)
plt.xticks(range(2,20))
plt.xlabel('number of clusters')
plt.ylabel('inertia')
plt.show()

In [None]:
# Elbow curve for KMeans on the robust-scaled training features.
# `iner` is reused by the knee-detection cell below.
iner=[]
for k in range(2,20):
  # A fixed seed makes the inertia curve (and the detected elbow) reproducible.
  kmeans = KMeans(n_clusters=k, random_state=42)
  kmeans.fit(X_train_Robust)
  iner.append(kmeans.inertia_)
plt.style.use("fivethirtyeight")
plt.plot(range(2,20),iner)
plt.xticks(range(2,20))
plt.xlabel('number of clusters')
plt.ylabel('inertia')
plt.show()

In [None]:
!pip install kneed
from kneed import KneeLocator
k1=KneeLocator(range(2,20),iner , curve='convex', direction= 'decreasing')
k1.elbow

plt.style.use("fivethirtyeight")
plt.plot(range(2,20),iner)
plt.xticks(range(2,20))
plt.xlabel('number of clusters')
plt.ylabel('List')
plt.axvline(x=k1.elbow, color='b', label= 'axvline-full height', ls= '--')
plt.show()

In [None]:
from sklearn.metrics import silhouette_score
# Silhouette coefficient of KMeans for k = 2..19 on the robust-scaled features.
silhouette_coefficients =[]
for k in range(2,20):  # the silhouette score needs at least 2 clusters
    # A fixed seed makes the scores reproducible across runs.
    kmeans=KMeans(n_clusters=k, random_state=42)
    kmeans.fit(X_train_Robust)
    score= silhouette_score(X_train_Robust, kmeans.labels_)
    silhouette_coefficients.append(score)

In [None]:
plt.style.use("fivethirtyeight")
plt.plot(range(2,20),silhouette_coefficients)
plt.xticks(range(2,20))
plt.xlabel('number of clusters')
plt.ylabel('silhouette coefficients')
plt.show()

In [None]:
# Final KMeans model with 6 clusters; the seed keeps the cluster assignment
# (and therefore the plot that uses these labels) stable across runs.
kmeans= KMeans(n_clusters=6, random_state=42)
y_kmeans= pd.DataFrame(kmeans.fit_predict(X_train_Robust))
centroids=pd.DataFrame(kmeans.cluster_centers_)

In [None]:
from sklearn.decomposition import PCA
pca3=PCA(n_components=3)
Clusterd_data=pd.DataFrame(pca3.fit_transform(X_train_Robust),columns=['Feature 1','Feature 2','Feature 3'])
Clusterd_data['Label']=y_kmeans

In [None]:
import plotly.express as px
# Plotly renders its own figure; the former plt.figure(...) call only produced
# an empty matplotlib figure in the cell output.
fig=px.scatter_3d(Clusterd_data,x='Feature 1', y='Feature 2',z='Feature 3', color='Label',title='Kmeans' )
fig.show()

## Ward Linkage

In [None]:
from sklearn.cluster import AgglomerativeClustering

silhouette_coefficients =[]
for k in range(2,20):#1 is the worse
    agg_clustering = AgglomerativeClustering(n_clusters=k)
    agg_clustering.fit(X_train_Robust)
    score= silhouette_score(X_train_Robust, agg_clustering.labels_)
    silhouette_coefficients.append(score)
plt.style.use("fivethirtyeight")
plt.plot(range(2,20),silhouette_coefficients)
plt.xticks(range(2,20))
plt.xlabel('number of clusters')
plt.ylabel('silhouette coefficients')
plt.show()

In [None]:
from sklearn.manifold import TSNE

agg_clustering = AgglomerativeClustering(n_clusters=4)
labels = pd.DataFrame(agg_clustering.fit_predict(X_train_Robust))
Agg=X_train_Robust.copy()
tsne = TSNE(n_components=3, random_state=42)
Agg=pd.DataFrame(tsne.fit_transform(Agg),columns=['Feature 1','Feature 2','Feature 3'])
Agg['Labels']=labels

In [None]:
# Plotly renders its own figure; the former plt.figure(...) call only produced
# an empty matplotlib figure in the cell output.
px.scatter_3d(Agg,x='Feature 1', y='Feature 2',z='Feature 3', color='Labels',title='AGG -Ward' )

## Average Linkage

In [None]:
silhouette_coefficients =[]
for k in range(2,20):#1 is the worse
    agg_clustering = AgglomerativeClustering(n_clusters=k,linkage='average')
    agg_clustering.fit(X_train_Robust)
    score= silhouette_score(X_train_Robust, agg_clustering.labels_)
    silhouette_coefficients.append(score)
plt.style.use("fivethirtyeight")
plt.plot(range(2,20),silhouette_coefficients)
plt.xticks(range(2,20))
plt.xlabel('number of clusters')
plt.ylabel('silhouette coefficients')
plt.show()

In [None]:
agg_clustering = AgglomerativeClustering(n_clusters=3,linkage='average')
labels = pd.DataFrame(agg_clustering.fit_predict(X_train_Robust))
Agg=X_train_Robust.copy()
tsne = TSNE(n_components=3, random_state=42)
Agg=pd.DataFrame(tsne.fit_transform(Agg),columns=['Feature 1','Feature 2','Feature 3'])
Agg['Labels']=labels

In [None]:
# Plotly renders its own figure; the former plt.figure(...) call only produced
# an empty matplotlib figure in the cell output.
px.scatter_3d(Agg,x='Feature 1', y='Feature 2',z='Feature 3', color='Labels',title='AGG - Average' )

## Complete Linkage

In [None]:
silhouette_coefficients =[]
for k in range(2,20):
    agg_clustering = AgglomerativeClustering(n_clusters=k,linkage='complete')
    agg_clustering.fit(X_train_Robust)
    score= silhouette_score(X_train_Robust, agg_clustering.labels_)
    silhouette_coefficients.append(score)
plt.style.use("fivethirtyeight")
plt.plot(range(2,20),silhouette_coefficients)
plt.xticks(range(2,20))
plt.xlabel('number of clusters')
plt.ylabel('silhouette coefficients')
plt.show()

In [None]:
agg_clustering = AgglomerativeClustering(n_clusters=7,linkage='complete')
labels = pd.DataFrame(agg_clustering.fit_predict(X_train_Robust))
Agg=X_train_Robust.copy()
tsne = TSNE(n_components=3, random_state=42)
Agg=pd.DataFrame(tsne.fit_transform(Agg),columns=['Feature 1','Feature 2','Feature 3'])
Agg['Labels']=labels

In [None]:
# Plotly renders its own figure; the former plt.figure(...) call only produced
# an empty matplotlib figure in the cell output.
px.scatter_3d(Agg,x='Feature 1', y='Feature 2',z='Feature 3', color='Labels',title='AGG - Complete' )

# Conclusion

In [None]:
Conclusions = pd.DataFrame(Conclusions[1:], columns=Conclusions[0])
Conclusions

In [None]:
#Sort conclusions from greater to lower in test
Conclusions = Conclusions.sort_values(by='Test', ascending=False)
Conclusions