<a href="https://colab.research.google.com/github/aymenchibouti/doctorat/blob/main/CNN_Model2.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import classification_report, confusion_matrix, roc_curve, auc, precision_recall_curve
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Conv1D, MaxPooling1D, Flatten, Dense, Dropout
from tensorflow.keras.callbacks import EarlyStopping, ReduceLROnPlateau



In [None]:
# ---------------------------------------------------------------------------
# Load the three raw CSV inputs.
# ---------------------------------------------------------------------------
enrollment_df = pd.read_csv('enrollment_train.csv')
log_df = pd.read_csv('log_train spliting.csv')
# The truth file is read without a header row, so its two columns are named here.
truth_df = pd.read_csv(
    'truth_train.csv',
    header=None,
    names=['enrollment_id', 'dropout'],
)

# ---------------------------------------------------------------------------
# Turn each log timestamp into a 1-based day offset, counted from the first
# logged event of the same enrollment.
# ---------------------------------------------------------------------------
log_df['time'] = pd.to_datetime(log_df['time'])
min_time_per_enrollment = (
    log_df.groupby('enrollment_id')['time']
    .min()
    .reset_index()
    .rename(columns={'time': 'start_time'})
)
log_df = pd.merge(log_df, min_time_per_enrollment, on='enrollment_id')
log_df['day'] = (log_df['time'] - log_df['start_time']).dt.days + 1
# start_time was only needed to compute the offset.
log_df = log_df.drop(columns=['start_time'])

# ---------------------------------------------------------------------------
# Count events per (enrollment, day, event type), then spread the days across
# columns so every enrollment becomes one row of "<event>_day<N>" columns.
# ---------------------------------------------------------------------------
log_df['count'] = 1
pivot_df = pd.pivot_table(
    log_df,
    index=['enrollment_id', 'day'],
    columns='event',
    values='count',
    aggfunc='sum',
    fill_value=0,
).reset_index()
pivot_wide_df = pd.pivot_table(pivot_df, index='enrollment_id', columns='day')
# Flatten the (event, day) column MultiIndex into plain string labels.
pivot_wide_df.columns = [
    f"{event_name}_day{day_number}"
    for event_name, day_number in pivot_wide_df.columns
]
pivot_wide_df = pivot_wide_df.reset_index()

# ---------------------------------------------------------------------------
# Attach the dropout label and the wide event counts to every enrollment.
# Left joins keep enrollments that have no label or no log rows (as NaN).
# ---------------------------------------------------------------------------
merged_df = pd.merge(enrollment_df, truth_df, on='enrollment_id', how='left')
final_df = pd.merge(merged_df, pivot_wide_df, on='enrollment_id', how='left')

# ---------------------------------------------------------------------------
# Flat feature matrix (identifier and label columns removed, gaps as 0)
# and the target column.
# ---------------------------------------------------------------------------
y = final_df['dropout']
X = final_df.drop(
    columns=['enrollment_id', 'username', 'course_id', 'dropout'],
).fillna(0)

In [None]:
# Build the CNN input tensor from final_df, which must already hold one row per
# enrollment with wide "<event>_day<N>" count columns.

# Number of days (time steps) kept per enrollment and the event types (features).
n_timesteps = 30  # 30 days for each student
# NOTE(review): these names must match the values of the `event` column in the
# log file exactly; an unmatched name yields an all-zero feature channel.
event_types = ['access', 'problem', 'wiki', 'discussion', 'navigate', 'page_close', 'video']

# Column labels in day-major / event-minor order, so a plain reshape of the
# selected columns produces the (samples, day, event) layout.
day_event_cols = [
    f"{event}_day{day}"
    for day in range(1, n_timesteps + 1)
    for event in event_types
]

# Make silently-missing event types visible instead of training on a channel
# that is zero for every sample.
available_cols = set(final_df.columns)
missing_events = [
    event
    for event in event_types
    if not any(f"{event}_day{day}" in available_cols for day in range(1, n_timesteps + 1))
]
if missing_events:
    print(f"Warning: no '<event>_day<N>' columns found for event types: {missing_events}")

# Vectorised construction of the 3D array (samples, time_steps, event_types):
#   * reindex() selects the wanted columns and creates NaN columns for any
#     (event, day) pair that never occurs in the data;
#   * fillna(0) turns both those absent columns and per-row gaps (days without
#     activity, enrollments without log rows) into zero counts, so no NaN
#     reaches the scaler or the network;
#   * rows are taken positionally, independent of final_df's index labels.
event_data = (
    final_df.reindex(columns=day_event_cols)
    .fillna(0)
    .to_numpy(dtype=float)
    .reshape(len(final_df), n_timesteps, len(event_types))
)

# Features and target variable
X = event_data  # Shape: (samples, time_steps, features)
y = final_df['dropout']  # Binary classification for dropout (0 or 1)

# Split BEFORE scaling so the test partition never influences the scaler's
# mean/variance (fitting on the full dataset leaks test statistics into training).
X_train_raw, X_test_raw, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# StandardScaler works on 2D input, so the (samples, time_steps) axes are
# collapsed into rows for scaling and the 3D shape is restored afterwards.
# Each event type is therefore standardised across all samples and days.
n_features = X.shape[-1]
scaler = StandardScaler()
X_train = scaler.fit_transform(
    X_train_raw.reshape(-1, n_features)
).reshape(X_train_raw.shape)
X_test = scaler.transform(
    X_test_raw.reshape(-1, n_features)
).reshape(X_test_raw.shape)

# Full-dataset views kept for any later cell that still refers to them; they
# are scaled with the training-set statistics, not refitted.
X_reshaped = X.reshape(-1, n_features)
X_scaled = scaler.transform(X_reshaped).reshape(X.shape)

# Define CNN model for Model (2): two Conv1D -> MaxPooling1D -> Dropout stages
# that convolve along the day axis, followed by a small dense head with a
# sigmoid output for binary dropout prediction.
model = Sequential([
    Conv1D(64, kernel_size=3, activation='relu',
           input_shape=(n_timesteps, len(event_types))),
    MaxPooling1D(pool_size=2),
    Dropout(0.2),  # Dropout to avoid overfitting

    Conv1D(128, kernel_size=3, activation='relu'),
    MaxPooling1D(pool_size=2),
    Dropout(0.2),  # Dropout to avoid overfitting

    Flatten(),  # Flatten the output for the dense layers
    Dense(64, activation='relu'),
    Dropout(0.3),  # Dropout in dense layer

    Dense(1, activation='sigmoid'),  # Output layer for binary classification
])

# Compile the model
model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])

# Stop once validation loss has not improved for 5 epochs and roll the model
# back to the best epoch's weights; without restore_best_weights the model
# would keep the weights from the last (already degraded) epoch.
early_stopping = EarlyStopping(
    monitor='val_loss',
    patience=5,
    restore_best_weights=True,
)

# Halve the learning rate when validation loss plateaus for 3 epochs
lr_scheduler = ReduceLROnPlateau(monitor='val_loss', factor=0.5, patience=3, min_lr=1e-6)

# Train the model; 20% of the training data is held out for validation
history = model.fit(
    X_train,
    y_train,
    epochs=50,
    batch_size=32,
    validation_split=0.2,
    callbacks=[early_stopping, lr_scheduler],
)

# Evaluate model performance on the held-out test set.
# y_prob holds the sigmoid outputs (predicted dropout probabilities);
# y_pred holds the hard 0/1 labels obtained with a 0.5 threshold.
y_prob = model.predict(X_test)
y_pred = (y_prob > 0.5).astype(int)

# Threshold-dependent metrics use the hard labels.
print(confusion_matrix(y_test, y_pred))
print(classification_report(y_test, y_pred))

# Threshold-independent curves must be computed from the continuous scores:
# feeding them the 0/1 labels collapses each curve to a single operating point
# and yields a misleading AUC.
y_score = y_prob.ravel()

# Plot ROC Curve
fpr, tpr, _ = roc_curve(y_test, y_score)
roc_auc = auc(fpr, tpr)
plt.plot(fpr, tpr, label=f'ROC curve (area = {roc_auc:.2f})')
plt.plot([0, 1], [0, 1], linestyle='--')  # chance-level diagonal
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('ROC Curve')
plt.legend()
plt.show()

# Plot Precision-Recall Curve
precision, recall, _ = precision_recall_curve(y_test, y_score)
plt.plot(recall, precision, marker='.')
plt.xlabel('Recall')
plt.ylabel('Precision')
plt.title('Precision-Recall Curve')
plt.show()
