<a href="https://colab.research.google.com/github/aymenchibouti/doctorat/blob/main/autoencoder.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [3]:
import pandas as pd
import numpy as np
from datetime import datetime
from sklearn.model_selection import train_test_split, GridSearchCV, RandomizedSearchCV
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.metrics import classification_report, roc_auc_score, confusion_matrix, ConfusionMatrixDisplay, RocCurveDisplay, PrecisionRecallDisplay
from keras import layers, models, backend as K
import matplotlib.pyplot as plt
import seaborn as sns
from xgboost import XGBClassifier, plot_importance

#------------------ Load Data ------------------

# Enrollment table; later merged on 'enrollment_id' and used for 'course_id' / 'username'.
enrollment_data = pd.read_csv('enrollment_train.csv')
# NOTE(review): this file name contains a space and the spelling "spliting", while the
# error captured at the end of this notebook export mentions 'log_train.csv' — confirm
# which file is actually intended.
log_data = pd.read_csv('log_train spliting.csv')
# header=None: the file is read as having no header row, so the two column names are supplied here.
truth_data = pd.read_csv('truth_train.csv', header=None, names=['enrollment_id', 'dropout'])

#------------------ Feature Engineering ------------------

def extract_time_features(df):
    """Add hour-of-day and day-of-week columns derived from ``df['time']``.

    The frame is modified in place: ``time`` is replaced by its parsed
    datetime form, and ``hour`` and ``dayofweek`` (Monday == 0) are added.

    Args:
        df: DataFrame with a ``time`` column that ``pd.to_datetime`` can parse.

    Returns:
        The same DataFrame object that was passed in.
    """
    timestamps = pd.to_datetime(df['time'])
    df['time'] = timestamps
    # Derive both calendar features from the already-parsed series.
    df['hour'] = timestamps.dt.hour
    df['dayofweek'] = timestamps.dt.dayofweek
    return df

def _first_mode(values):
    """Return a single most-frequent value of *values* (NaN if there is none).

    ``Series.mode()`` returns *all* tied modes, sorted ascending. Using it
    directly as a groupby aggregator therefore yields array-valued cells (or
    an error, depending on the pandas version) for groups with ties, which
    cannot be filled or scaled later. Taking the first entry always gives one
    scalar per group.
    """
    modes = values.mode()
    return modes.iloc[0] if not modes.empty else np.nan


log_data = extract_time_features(log_data)

# One row per enrollment. Named aggregation produces the flat column names
# directly: event_count, source_nunique, object_nunique, hour_mean, hour_std,
# dayofweek_mode.
agg_features = (
    log_data.groupby('enrollment_id')
    .agg(
        event_count=('event', 'count'),
        source_nunique=('source', 'nunique'),
        object_nunique=('object', 'nunique'),
        hour_mean=('hour', 'mean'),
        # Sample std is NaN for enrollments with a single log row.
        hour_std=('hour', 'std'),
        dayofweek_mode=('dayofweek', _first_mode),
    )
    .reset_index()
)

#------------------ Merge with Enrollment and Truth Data ------------------

# Left joins keep every enrollment row; enrollments without matching log
# aggregates get NaN in those columns (filled with 0 further down).
data = (
    enrollment_data
    .merge(agg_features, on='enrollment_id', how='left')
    .merge(truth_data, on='enrollment_id', how='left')
)

# Encode categorical features as integer codes, one encoder per column
le_course = LabelEncoder()
le_user = LabelEncoder()
data['course_id_enc'] = le_course.fit_transform(data['course_id'])
data['username_enc'] = le_user.fit_transform(data['username'])

feature_cols = [
    'event_count',
    'source_nunique',
    'object_nunique',
    'hour_mean',
    'hour_std',
    'dayofweek_mode',
    'course_id_enc',
    'username_enc',
]
X = data[feature_cols].fillna(0)
y = data['dropout']

#------------------ Variational Autoencoder ------------------

# NOTE(review): the scaler and the VAE are fitted on every row of X, i.e. also
# on rows that end up in the test split created later in this script.
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)
input_dim = X_scaled.shape[1]
encoding_dim = 5

# Encoder network: one hidden layer, then the mean and log-variance of the
# approximate posterior over the latent code.
inputs = layers.Input(shape=(input_dim,))
h = layers.Dense(16, activation='relu')(inputs)
z_mean = layers.Dense(encoding_dim)(h)
z_log_var = layers.Dense(encoding_dim)(h)


def sampling(args):
    """Draw z = mean + exp(0.5 * log_var) * eps with eps ~ N(0, I).

    Args:
        args: pair ``(mean, log_var)`` of tensors with ``encoding_dim`` columns.

    Returns:
        A tensor of sampled latent codes with the same shape as ``mean``.
    """
    mean, log_var = args
    epsilon = K.random_normal(shape=(K.shape(mean)[0], encoding_dim))
    return mean + K.exp(0.5 * log_var) * epsilon


# The stochastic sample is only used on the training path through the decoder.
z = layers.Lambda(sampling, output_shape=(encoding_dim,))([z_mean, z_log_var])

# Feature extractor: expose z_mean rather than the sampled z. Predicting from
# the sampled z would add fresh random noise to the latent features on every
# call; z_mean is deterministic for fixed weights.
encoder = models.Model(inputs, z_mean)

decoder_h = layers.Dense(16, activation='relu')
decoder_mean = layers.Dense(input_dim)
decoded = decoder_mean(decoder_h(z))

vae = models.Model(inputs, decoded)

# Loss = mean squared reconstruction error + KL term (averaged over latent dims).
reconstruction_loss = K.mean(K.square(decoded - inputs))
kl_loss = -0.5 * K.mean(1 + z_log_var - K.square(z_mean) - K.exp(z_log_var))
vae_loss = reconstruction_loss + kl_loss

# The loss is attached to the model itself, so compile() receives no loss argument.
vae.add_loss(vae_loss)
vae.compile(optimizer='adam')
vae.fit(X_scaled, X_scaled, epochs=50, batch_size=32, verbose=0)

latent_features = encoder.predict(X_scaled)
latent_df = pd.DataFrame(latent_features, columns=[f'latent_{i}' for i in range(encoding_dim)])

#------------------ Combine Features ------------------

# Place the VAE latent columns next to the original feature columns.
# X's index is reset so both frames share a 0..n-1 index before the column-wise concat.
X_combined = pd.concat([X.reset_index(drop=True), latent_df], axis=1)

#------------------ Train-Test Split ------------------

# 80/20 split, stratified on the label, with a fixed seed.
X_train, X_test, y_train, y_test = train_test_split(X_combined, y, test_size=0.2, stratify=y, random_state=42)

#------------------ Hyperparameter Tuning (Expanded) ------------------

# Candidate values sampled by the randomized search (20 draws, 3-fold CV each).
param_grid = dict(
    n_estimators=[100, 300, 500],
    max_depth=[3, 5, 7, 9],
    learning_rate=[0.01, 0.05, 0.1, 0.2],
    subsample=[0.7, 0.8, 1],
    colsample_bytree=[0.7, 0.8, 1],
)

# NOTE(review): use_label_encoder is deprecated in recent XGBoost releases —
# confirm against the installed version.
base_classifier = XGBClassifier(use_label_encoder=False, eval_metric='logloss')

random_search = RandomizedSearchCV(
    estimator=base_classifier,
    param_distributions=param_grid,
    n_iter=20,
    scoring='roc_auc',
    cv=3,
    verbose=1,
    random_state=42,
    n_jobs=-1,
)
random_search.fit(X_train, y_train)

# Keep the best candidate found by the search for evaluation and plotting.
xgb = random_search.best_estimator_
print("Best Parameters:", random_search.best_params_)

# ------------------ Evaluation ------------------
# Hard class predictions from the tuned model on the held-out split.
y_pred = xgb.predict(X_test)
# Column 1 of predict_proba is used as the ranking score for ROC AUC
# (presumably the probability of dropout == 1 — confirm the label encoding).
y_prob = xgb.predict_proba(X_test)[:, 1]

print("Classification Report:\n", classification_report(y_test, y_pred))
print("ROC AUC Score:", roc_auc_score(y_test, y_prob))

# ------------------ Visualization ------------------
# Confusion Matrix
# Confusion matrix of the hard predictions on the held-out split
cm = confusion_matrix(y_test, y_pred)
cm_display = ConfusionMatrixDisplay(confusion_matrix=cm)
cm_display.plot()
plt.title("Confusion Matrix")
plt.show()

# ROC curve, then precision-recall curve: both follow the same plotting recipe
for curve_display, curve_title in (
    (RocCurveDisplay, "ROC Curve"),
    (PrecisionRecallDisplay, "Precision-Recall Curve"),
):
    curve_display.from_estimator(xgb, X_test, y_test)
    plt.title(curve_title)
    plt.show()

# Feature importance (ten highest-ranked features)
plot_importance(xgb, max_num_features=10)
plt.title("Top 10 Feature Importances")
plt.show()

# Latent space visualization: the label column is added to latent_df only
# here, after latent_df has been used to build the model features.
latent_df['dropout'] = y.reset_index(drop=True)
sns.pairplot(latent_df, hue='dropout', diag_kind='kde')
plt.suptitle('VAE Latent Space by Dropout Status', y=1.02)
plt.show()


FileNotFoundError: [Errno 2] No such file or directory: 'log_train.csv'