<a href="https://colab.research.google.com/github/aymenchibouti/doctorat/blob/main/lstm2.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import classification_report, confusion_matrix
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import LSTM, Dense, Masking
from tensorflow.keras.utils import to_categorical




In [2]:
# Load the files
enrollment_df = pd.read_csv('enrollment_train.csv')
log_df = pd.read_csv('log_train spliting.csv')
truth_df = pd.read_csv('truth_train.csv', header=None, names=['enrollment_id', 'dropout'])

In [3]:
# Preprocess log file
log_df['time'] = pd.to_datetime(log_df['time'])
min_time_per_enrollment = log_df.groupby('enrollment_id')['time'].min().reset_index().rename(columns={'time': 'start_time'})
log_df = log_df.merge(min_time_per_enrollment, on='enrollment_id')
log_df['day'] = (log_df['time'] - log_df['start_time']).dt.days + 1
log_df.drop(columns=['start_time'], inplace=True)

In [4]:
# Create event count per day
log_df['count'] = 1
pivot_df = log_df.pivot_table(index=['enrollment_id', 'day'], columns='event', values='count', aggfunc='sum', fill_value=0).reset_index()
pivot_wide_df = pivot_df.pivot_table(index='enrollment_id', columns='day')
pivot_wide_df.columns = [f"{event}_day{day}" for (event, day) in pivot_wide_df.columns]
pivot_wide_df = pivot_wide_df.reset_index()

In [5]:
# Merge all data
merged_df = enrollment_df.merge(truth_df, on='enrollment_id', how='left')
final_df = merged_df.merge(pivot_wide_df, on='enrollment_id', how='left')

In [6]:
# Prepare X and y
X = final_df.drop(columns=['enrollment_id', 'username', 'course_id', 'dropout']).fillna(0)
y = final_df['dropout']

In [7]:
# Reshape X to 3D for LSTM (samples, timesteps, features_per_timestep)
n_features = 7  # number of event types
n_timesteps = 30  # assuming maximum 30 days


In [9]:
# Extract event names from columns
events = sorted(list(set(col.split('_')[0] for col in X.columns)))

In [10]:
events

['access', 'discussion', 'navigate', 'page', 'problem', 'video', 'wiki']

In [11]:
# Create a 3D array
X_3d = np.zeros((X.shape[0], n_timesteps, n_features))
for i, event in enumerate(events):
    for day in range(1, n_timesteps + 1):
        col_name = f"{event}_day{day}"
        if col_name in X.columns:
            X_3d[:, day - 1, i] = X[col_name].values

In [12]:
# Split into train/test
X_train, X_test, y_train, y_test = train_test_split(X_3d, y, test_size=0.2, random_state=42)

In [16]:
X_train

array([[[0., 0., 0., ..., 0., 0., 0.],
        [0., 0., 0., ..., 0., 0., 0.],
        [0., 0., 0., ..., 0., 0., 0.],
        ...,
        [0., 0., 0., ..., 0., 0., 0.],
        [0., 0., 0., ..., 0., 0., 0.],
        [0., 0., 0., ..., 0., 0., 0.]],

       [[0., 0., 0., ..., 0., 0., 0.],
        [0., 0., 0., ..., 0., 0., 0.],
        [0., 0., 0., ..., 0., 0., 0.],
        ...,
        [0., 0., 0., ..., 0., 0., 0.],
        [0., 0., 0., ..., 0., 0., 0.],
        [0., 0., 0., ..., 0., 0., 0.]],

       [[0., 0., 0., ..., 0., 0., 0.],
        [0., 0., 0., ..., 0., 0., 0.],
        [0., 0., 0., ..., 0., 0., 0.],
        ...,
        [0., 0., 0., ..., 0., 0., 0.],
        [0., 0., 0., ..., 0., 0., 0.],
        [0., 0., 0., ..., 0., 0., 0.]],

       ...,

       [[0., 0., 0., ..., 0., 0., 0.],
        [0., 0., 0., ..., 0., 0., 0.],
        [0., 0., 0., ..., 0., 0., 0.],
        ...,
        [0., 0., 0., ..., 0., 0., 0.],
        [0., 0., 0., ..., 0., 0., 0.],
        [0., 0., 0., ..., 0., 0.

In [13]:
# Define LSTM model
model = Sequential()
model.add(Masking(mask_value=0., input_shape=(n_timesteps, n_features)))
model.add(LSTM(64, activation='tanh', return_sequences=False))
model.add(Dense(1, activation='sigmoid'))

  super().__init__(**kwargs)


In [14]:
# Compile model
model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])


In [15]:
# Train model
model.fit(X_train, y_train, epochs=10, batch_size=32, validation_split=0.2)


Epoch 1/10
[1m1719/1719[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m38s[0m 21ms/step - accuracy: 0.7493 - loss: 0.6301 - val_accuracy: 0.7398 - val_loss: 0.5737
Epoch 2/10
[1m1719/1719[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m39s[0m 20ms/step - accuracy: 0.7498 - loss: 0.5629 - val_accuracy: 0.7403 - val_loss: 0.5726
Epoch 3/10
[1m1719/1719[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m42s[0m 20ms/step - accuracy: 0.7513 - loss: 0.5604 - val_accuracy: 0.7403 - val_loss: 0.5726
Epoch 4/10
[1m1719/1719[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m35s[0m 20ms/step - accuracy: 0.7486 - loss: 0.5630 - val_accuracy: 0.7403 - val_loss: 0.5726
Epoch 5/10
[1m1719/1719[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m42s[0m 21ms/step - accuracy: 0.7482 - loss: 0.5634 - val_accuracy: 0.7403 - val_loss: 0.5727
Epoch 6/10
[1m1719/1719[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m35s[0m 20ms/step - accuracy: 0.7481 - loss: 0.5639 - val_accuracy: 0.7404 - val_loss: 0.5728
Epoc

<keras.src.callbacks.history.History at 0x7f4787938e50>

In [17]:
# Evaluate model
y_pred_prob = model.predict(X_test)
y_pred = (y_pred_prob > 0.5).astype(int)

[1m538/538[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 6ms/step


In [18]:
print(confusion_matrix(y_test, y_pred))
print(classification_report(y_test, y_pred))

[[  133  4262]
 [   33 12757]]
              precision    recall  f1-score   support

           0       0.80      0.03      0.06      4395
           1       0.75      1.00      0.86     12790

    accuracy                           0.75     17185
   macro avg       0.78      0.51      0.46     17185
weighted avg       0.76      0.75      0.65     17185

