<a href="https://colab.research.google.com/github/aymenchibouti/doctorat/blob/main/lstm.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import classification_report, confusion_matrix
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import LSTM, Dense, Masking
from tensorflow.keras.utils import to_categorical

# Load the three raw CSV inputs from /mnt/data.
# The truth file is read with header=None, so its two columns are named here.
enrollment_df = pd.read_csv(
    '/mnt/data/enrollment_train.csv',
)
log_df = pd.read_csv(
    '/mnt/data/log_train spliting.csv',
)
truth_df = pd.read_csv(
    '/mnt/data/truth_train.csv',
    names=['enrollment_id', 'dropout'],
    header=None,
)

# Preprocess log file: turn each event timestamp into a 1-based day offset
# measured from the earliest logged event of the same enrollment.
log_df['time'] = pd.to_datetime(log_df['time'])

# One row per enrollment holding its first timestamp as 'start_time'.
min_time_per_enrollment = (
    log_df.groupby('enrollment_id')['time']
    .min()
    .rename('start_time')
    .reset_index()
)

# Attach the start time to every log row, derive the day index, then discard
# the helper column again.
log_df = pd.merge(log_df, min_time_per_enrollment, on='enrollment_id')
log_df['day'] = log_df['time'].sub(log_df['start_time']).dt.days.add(1)
log_df = log_df.drop(columns=['start_time'])

# Create event count per day.
# Every log row contributes 1, so summing 'count' yields the number of events.
log_df['count'] = 1

# Stage 1: one row per (enrollment_id, day), one column per event type.
pivot_df = log_df.pivot_table(
    values='count',
    index=['enrollment_id', 'day'],
    columns='event',
    aggfunc='sum',
    fill_value=0,
).reset_index()

# Stage 2: one row per enrollment_id; columns become (event, day) pairs.
pivot_wide_df = pivot_df.pivot_table(columns='day', index='enrollment_id')

# Flatten the two-level column labels into "<event>_day<day>" strings.
flat_names = []
for (event, day) in pivot_wide_df.columns:
    flat_names.append(f"{event}_day{day}")
pivot_wide_df.columns = flat_names

pivot_wide_df = pivot_wide_df.reset_index()

# Merge all data: start from the enrollment table and left-join the labels and
# the wide per-day event counts, so every enrollment row is kept.
merged_df = pd.merge(enrollment_df, truth_df, how='left', on='enrollment_id')
final_df = pd.merge(merged_df, pivot_wide_df, how='left', on='enrollment_id')

# Prepare X and y.
# y is the 'dropout' column; X is everything except the identifier columns and
# the label, with missing counts (no activity recorded) replaced by 0.
y = final_df['dropout']
X = final_df.drop(
    columns=['enrollment_id', 'username', 'course_id', 'dropout'],
)
X = X.fillna(0)

# Reshape X to 3D for LSTM (samples, timesteps, features_per_timestep)
n_timesteps = 30  # assuming maximum 30 days

# Extract event names from columns.
# Feature columns are named "<event>_day<N>". An event name may itself contain
# underscores (e.g. an event called "page_close"), so the name is taken as
# everything before the LAST "_day" rather than before the first "_";
# otherwise the rebuilt column name below would never match and that event's
# channel would silently stay all-zero. Columns that do not follow the
# "<event>_day<digits>" pattern are ignored.
events = sorted({
    name
    for name, sep, suffix in (col.rpartition('_day') for col in X.columns)
    if sep and name and suffix.isdigit()
})

# Derive the feature dimension from the data instead of hard-coding it, so the
# array below always has exactly one channel per discovered event type.
n_features = len(events)  # number of event types

# Create a 3D array, zero-initialised so that (event, day) combinations with
# no matching column stay 0. Days beyond n_timesteps are not copied.
X_3d = np.zeros((X.shape[0], n_timesteps, n_features))
for i, event in enumerate(events):
    for day in range(1, n_timesteps + 1):
        col_name = f"{event}_day{day}"
        if col_name in X.columns:
            # day is 1-based in the column name, 0-based on the time axis.
            X_3d[:, day - 1, i] = X[col_name].values

# Split into train/test: 20% of the samples are held out, with a fixed
# random_state so the partition is the same on every run.
X_train, X_test, y_train, y_test = train_test_split(
    X_3d,
    y,
    random_state=42,
    test_size=0.2,
)

# Define LSTM model: Masking -> LSTM(64) -> single sigmoid output unit.
# The Masking layer uses 0 as its mask value; the LSTM returns only its final
# output (return_sequences=False), which feeds the sigmoid unit.
model = Sequential([
    Masking(mask_value=0., input_shape=(n_timesteps, n_features)),
    LSTM(64, activation='tanh', return_sequences=False),
    Dense(1, activation='sigmoid'),
])

# Compile model: binary cross-entropy loss, Adam optimizer, accuracy metric.
model.compile(
    loss='binary_crossentropy',
    optimizer='adam',
    metrics=['accuracy'],
)

# Train model: 10 epochs, batches of 32, with validation_split=0.2 so a fifth
# of the training data is used for validation.
model.fit(
    X_train,
    y_train,
    validation_split=0.2,
    batch_size=32,
    epochs=10,
)

# Evaluate model on the held-out test split.
# The sigmoid outputs are turned into 0/1 labels with a 0.5 threshold.
y_pred_prob = model.predict(X_test)
above_threshold = y_pred_prob > 0.5
y_pred = above_threshold.astype(int)

# Print the confusion matrix first, then the per-class report.
print(confusion_matrix(y_test, y_pred))

print(classification_report(y_test, y_pred))
