<a href="https://colab.research.google.com/github/aymenchibouti/doctorat/blob/main/XGB_LIGHT_MLP_VOTING.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
# Install required libraries (if not installed)
!pip install xgboost

import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from xgboost import XGBClassifier
from sklearn.model_selection import GridSearchCV

# Load the dataset
data = pd.read_csv('model1_210_features.csv')

# Drop non-numeric columns
data_cleaned = data.drop(columns=['username', 'course_id', 'enrollment_id'])
data_cleaned = data_cleaned.fillna(data_cleaned.mean())  # Fill missing values

# Separate features and target
X = data_cleaned.drop(columns=['dropout'])
y = data_cleaned['dropout']

# Split the data into training and test sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Define the model
xgb_model = XGBClassifier(random_state=42)

# Define hyperparameters to tune
param_grid = {
    'n_estimators': [100, 200],
    'learning_rate': [0.01, 0.1, 0.2],
    'max_depth': [3, 5, 7],
    'subsample': [0.8, 1.0],
}

# Apply GridSearchCV
grid_search = GridSearchCV(estimator=xgb_model, param_grid=param_grid, cv=3, verbose=1, n_jobs=-1)
grid_search.fit(X_train, y_train)

# Get the best parameters and model
best_xgb_model = grid_search.best_estimator_

# Evaluate the model
y_pred_xgb = best_xgb_model.predict(X_test)
accuracy_xgb = accuracy_score(y_test, y_pred_xgb)
print(f"Optimized XGBoost Accuracy: {accuracy_xgb}")


Fitting 3 folds for each of 36 candidates, totalling 108 fits
Optimized XGBoost Accuracy: 0.8654029615496288


In [2]:
# Install required libraries (if not installed)
!pip install lightgbm

import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
import lightgbm as lgb
from sklearn.model_selection import GridSearchCV

# Load the dataset
data = pd.read_csv('model1_210_features.csv')

# Drop non-numeric columns
data_cleaned = data.drop(columns=['username', 'course_id', 'enrollment_id'])
data_cleaned = data_cleaned.fillna(data_cleaned.mean())  # Fill missing values

# Separate features and target
X = data_cleaned.drop(columns=['dropout'])
y = data_cleaned['dropout']

# Split the data into training and test sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Define the model
lgb_model = lgb.LGBMClassifier(random_state=42)

# Define hyperparameters to tune
param_grid = {
    'n_estimators': [100, 200],
    'learning_rate': [0.01, 0.1, 0.2],
    'max_depth': [3, 5, 7],
    'subsample': [0.8, 1.0],
    'colsample_bytree': [0.8, 1.0]
}

# Apply GridSearchCV
grid_search = GridSearchCV(estimator=lgb_model, param_grid=param_grid, cv=3, verbose=1, n_jobs=-1)
grid_search.fit(X_train, y_train)

# Get the best parameters and model
best_lgb_model = grid_search.best_estimator_

# Evaluate the model
y_pred_lgb = best_lgb_model.predict(X_test)
accuracy_lgb = accuracy_score(y_test, y_pred_lgb)
print(f"Optimized LightGBM Accuracy: {accuracy_lgb}")


Fitting 3 folds for each of 72 candidates, totalling 216 fits
[LightGBM] [Info] Number of positive: 76533, number of negative: 19900
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.152360 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 9582
[LightGBM] [Info] Number of data points in the train set: 96433, number of used features: 210
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.793639 -> initscore=1.347002
[LightGBM] [Info] Start training from score 1.347002
Optimized LightGBM Accuracy: 0.8654859181218632


In [3]:
# Install required libraries (if not installed)
!pip install scikit-learn

import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score
from sklearn.model_selection import GridSearchCV

# Load the dataset
data = pd.read_csv('model1_210_features.csv')

# Drop non-numeric columns
data_cleaned = data.drop(columns=['username', 'course_id', 'enrollment_id'])
data_cleaned = data_cleaned.fillna(data_cleaned.mean())  # Fill missing values

# Separate features and target
X = data_cleaned.drop(columns=['dropout'])
y = data_cleaned['dropout']

# Split the data into training and test sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Define the model
rf_model = RandomForestClassifier(random_state=42)

# Define hyperparameters to tune
param_grid = {
    'n_estimators': [100, 200],
    'max_depth': [3, 5, 7],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4]
}

# Apply GridSearchCV
grid_search = GridSearchCV(estimator=rf_model, param_grid=param_grid, cv=3, verbose=1, n_jobs=-1)
grid_search.fit(X_train, y_train)

# Get the best parameters and model
best_rf_model = grid_search.best_estimator_

# Evaluate the model
y_pred_rf = best_rf_model.predict(X_test)
accuracy_rf = accuracy_score(y_test, y_pred_rf)
print(f"Optimized Random Forest Accuracy: {accuracy_rf}")


Fitting 3 folds for each of 54 candidates, totalling 162 fits
Optimized Random Forest Accuracy: 0.8488116471027417


In [4]:
# Install required libraries (if not installed)
!pip install scikit-learn

import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.neural_network import MLPClassifier
from sklearn.metrics import accuracy_score
from sklearn.model_selection import GridSearchCV

# Load the dataset
data = pd.read_csv('model1_210_features.csv')

# Drop non-numeric columns
data_cleaned = data.drop(columns=['username', 'course_id', 'enrollment_id'])
data_cleaned = data_cleaned.fillna(data_cleaned.mean())  # Fill missing values

# Separate features and target
X = data_cleaned.drop(columns=['dropout'])
y = data_cleaned['dropout']

# Split the data into training and test sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Define the model
mlp_model = MLPClassifier(random_state=42, max_iter=300)

# Define hyperparameters to tune
param_grid = {
    'hidden_layer_sizes': [(50,), (100,), (150,)],
    'activation': ['relu', 'tanh'],
    'solver': ['adam', 'sgd'],
    'learning_rate': ['constant', 'adaptive'],
    'learning_rate_init': [0.001, 0.01]
}

# Apply GridSearchCV
grid_search = GridSearchCV(estimator=mlp_model, param_grid=param_grid, cv=3, verbose=1, n_jobs=-1)
grid_search.fit(X_train, y_train)

# Get the best parameters and model
best_mlp_model = grid_search.best_estimator_

# Evaluate the model
y_pred_mlp = best_mlp_model.predict(X_test)
accuracy_mlp = accuracy_score(y_test, y_pred_mlp)
print(f"Optimized MLP Accuracy: {accuracy_mlp}")


Fitting 3 folds for each of 48 candidates, totalling 144 fits


KeyboardInterrupt: 

In [None]:
from sklearn.ensemble import StackingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier

# Define base models
base_learners = [
    ('rf', RandomForestClassifier(n_estimators=100, random_state=42)),
    ('xgb', XGBClassifier(n_estimators=100, random_state=42)),
    ('mlp', MLPClassifier(max_iter=300, random_state=42))
]

# Define the meta-model
meta_model = LogisticRegression()

# Create the Stacking model
stacking_model = StackingClassifier(estimators=base_learners, final_estimator=meta_model)

# Train the model
stacking_model.fit(X_train, y_train)

# Predict and evaluate
y_pred_stacking = stacking_model.predict(X_test)
accuracy_stacking = accuracy_score(y_test, y_pred_stacking)
print(f"Stacking Classifier Accuracy: {accuracy_stacking}")
