<a href="https://colab.research.google.com/github/aymenchibouti/doctorat/blob/main/cnnAE.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [6]:
import pandas as pd
import numpy as np

# ------------------------
# 1. Load Raw Datasets
# ------------------------
enrollments = pd.read_csv("enrollment_train.csv")
logs = pd.read_csv("log_train.csv")
labels = pd.read_csv("truth_train.csv", header=None, names=["enrollment_id", "label"])

# ------------------------
# 2. Normalize & Parse Time
# ------------------------
logs['time'] = pd.to_datetime(logs['time'])
min_time = logs.groupby('enrollment_id')['time'].min().rename("start_time")
logs = logs.merge(min_time, on='enrollment_id')
logs['day'] = (logs['time'] - logs['start_time']).dt.days

# Keep only the first 30 days
logs = logs[(logs['day'] >= 0) & (logs['day'] < 30)]

# ------------------------
# 3. Define Event Types
# ------------------------
event_types = ['problem', 'video', 'access', 'wiki', 'discussion', 'navigate', 'page_close']
logs = logs[logs['event'].isin(event_types)]

# ------------------------
# 4. Count Events per Day
# ------------------------
pivot = (
    logs.groupby(['enrollment_id', 'day', 'event'])
    .size()
    .unstack(fill_value=0)
    .reindex(columns=event_types, fill_value=0)
    .reset_index()
)

# Fill in missing (enrollment_id, day) pairs
all_enrollments = enrollments['enrollment_id'].unique()
all_days = np.arange(30)
full_index = pd.MultiIndex.from_product([all_enrollments, all_days], names=['enrollment_id', 'day'])
pivot = pivot.set_index(['enrollment_id', 'day']).reindex(full_index, fill_value=0).reset_index()

# ------------------------
# 5. Flatten into 210-D Vector
# ------------------------
flattened = []
for eid, group in pivot.groupby('enrollment_id'):
    vector = group[event_types].values.flatten()
    flattened.append((eid, vector))

feature_df = pd.DataFrame(flattened, columns=["enrollment_id", "feature_vector"])

# ------------------------
# 6. Merge with Labels
# ------------------------
feature_df = feature_df.merge(labels, on="enrollment_id", how="left")
feature_df['label'] = feature_df['label'].fillna(0).astype(int)  # treat missing as non-dropout

# ------------------------
# 7. Save or Use
# ------------------------
# To CSV:
feature_df.to_csv("preprocessed_features.csv", index=False)

# To continue in-memory:
print(feature_df.head())


   enrollment_id                                     feature_vector  label
0              1  [8, 0, 3, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, ...      0
1              3  [14, 1, 7, 0, 0, 4, 2, 0, 0, 0, 0, 0, 0, 0, 0,...      0
2              4  [5, 2, 18, 0, 0, 4, 2, 0, 0, 0, 0, 0, 0, 0, 0,...      0
3              5  [0, 0, 0, 0, 0, 2, 0, 16, 10, 28, 0, 2, 8, 13,...      0
4              6  [2, 2, 12, 0, 0, 5, 2, 0, 0, 0, 0, 0, 0, 0, 0,...      0


In [5]:
import numpy as np
import pandas as pd
import tensorflow as tf
from tensorflow.keras import layers, models, Input
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, roc_auc_score

# -------------------------------
# 1. Load and Preprocess Dataset
# -------------------------------
# Assuming 'features_df' is already in memory and includes:
# ['enrollment_id', 'feature_vector', 'label']

# If you're loading from CSV, use:
features_df = pd.read_csv('preprocessed_features.csv')
#features_df['feature_vector'] = features_df['feature_vector'].apply(eval)

features_df['feature_vector'] = features_df['feature_vector'].apply(
    lambda x: np.fromstring(x.strip("[]"), sep=' ')
)


# Step 1: Convert feature_vector to a numpy array
X = np.array([np.array(vec).reshape(30, 7) for vec in features_df['feature_vector']])
y = features_df['label'].fillna(0).astype(int).values

# Step 2: Train-test split
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

# -------------------------------
# 2. Build CNNAE-LSTM Model
# -------------------------------

# Encoder
def build_cnn_encoder(input_shape):
    inputs = Input(shape=input_shape)
    x = layers.Conv1D(64, kernel_size=5, activation='relu', padding='same')(inputs)
    x = layers.MaxPooling1D(pool_size=2)(x)
    x = layers.Conv1D(32, kernel_size=5, activation='relu', padding='same')(x)
    x = layers.MaxPooling1D(pool_size=2)(x)
    encoded = layers.Conv1D(16, kernel_size=5, activation='relu', padding='same')(x)
    return models.Model(inputs, encoded, name='cnn_encoder')

# Decoder
def build_cnn_decoder(input_shape):
    inputs = Input(shape=input_shape)
    x = layers.UpSampling1D(size=2)(inputs)
    x = layers.Conv1D(32, kernel_size=5, activation='relu', padding='same')(x)
    x = layers.UpSampling1D(size=2)(x)
    x = layers.Conv1D(64, kernel_size=5, activation='relu', padding='same')(x)
    decoded = layers.Conv1D(7, kernel_size=3, activation='sigmoid', padding='same')(x)
    return models.Model(inputs, decoded, name='cnn_decoder')

# Assemble CNNAE-LSTM
input_shape = (30, 7)
encoder = build_cnn_encoder(input_shape)
encoded_output = encoder.output

# Reshape for LSTM
reshaped = layers.Reshape((encoded_output.shape[1], encoded_output.shape[2]))(encoded_output)
lstm_out = layers.LSTM(128, activation='tanh', dropout=0.5)(reshaped)
dense = layers.Dense(64, activation='relu')(lstm_out)
dropout_pred = layers.Dense(1, activation='sigmoid', name='dropout_output')(dense)

# Decoder for reconstruction
decoder = build_cnn_decoder(encoded_output.shape[1:])
reconstructed = decoder(encoded_output)

# Final Model
model = models.Model(inputs=encoder.input, outputs=[reconstructed, dropout_pred])
model.compile(
    optimizer='adam',
    loss={'cnn_decoder': 'mse', 'dropout_output': 'binary_crossentropy'},
    loss_weights={'cnn_decoder': 0.5, 'dropout_output': 1.0},
    metrics={'dropout_output': ['accuracy']}
)

model.summary()

# -------------------------------
# 3. Train Model
# -------------------------------

history = model.fit(
    X_train,
    {'cnn_decoder': X_train, 'dropout_output': y_train},
    epochs=20,
    batch_size=128,
    validation_split=0.1,
    verbose=1
)

# -------------------------------
# 4. Evaluate Model
# -------------------------------

# Predict on test set
decoded_out, y_pred_prob = model.predict(X_test)
y_pred = (y_pred_prob > 0.5).astype(int)

# Metrics
print("\n📊 Classification Report:")
print(classification_report(y_test, y_pred))
print("✅ Test Accuracy:", np.mean(y_pred.flatten() == y_test))
print("🔥 AUC Score:", roc_auc_score(y_test, y_pred_prob))


Epoch 1/20


ValueError: Dimensions must be equal, but are 30 and 28 for '{{node compile_loss/mse/sub}} = Sub[T=DT_FLOAT](data_1, functional_1_1/cnn_decoder_1/conv1d_11_1/Sigmoid)' with input shapes: [?,30,7], [?,28,7].

In [7]:
import numpy as np
import pandas as pd
import tensorflow as tf
from tensorflow.keras import layers, models, Input
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, roc_auc_score

# ------------------------------
# 1. Load Preprocessed Features
# ------------------------------
# If you're loading from disk:
features_df = pd.read_csv("preprocessed_features.csv")
features_df['feature_vector'] = features_df['feature_vector'].apply(
    lambda x: np.fromstring(x.strip("[]"), sep=' ')
)

#features_df['feature_vector'] = features_df['feature_vector'].apply(eval)  # only if it's a real Python list string

# If already in memory (from previous steps):
# Assume features_df contains: ['enrollment_id', 'feature_vector', 'label']

# Convert to 30×7 matrices
X = np.array([np.array(vec).reshape(30, 7) for vec in features_df['feature_vector']])
y = features_df['label'].fillna(0).astype(int).values

# Split the data
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

# ------------------------------
# 2. Build CNNAE-LSTM Model
# ------------------------------

# Encoder
def build_encoder(input_shape):
    inputs = Input(shape=input_shape)
    x = layers.Conv1D(64, 3, activation='relu', padding='same')(inputs)
    x = layers.MaxPooling1D(2, padding='same')(x)  # 30 → 15
    encoded = layers.Conv1D(32, 3, activation='relu', padding='same')(x)
    return models.Model(inputs, encoded, name="cnn_encoder")

# Decoder
def build_decoder(encoded_shape):
    inputs = Input(shape=encoded_shape)
    x = layers.UpSampling1D(2)(inputs)             # 15 → 30
    x = layers.Conv1D(64, 3, activation='relu', padding='same')(x)
    decoded = layers.Conv1D(7, 3, activation='sigmoid', padding='same')(x)
    return models.Model(inputs, decoded, name="cnn_decoder")

# Combine into full model
input_shape = (30, 7)
encoder = build_encoder(input_shape)
encoded_output = encoder.output

# LSTM branch
x = layers.Reshape((encoded_output.shape[1], encoded_output.shape[2]))(encoded_output)
x = layers.LSTM(128, dropout=0.5)(x)
x = layers.Dense(64, activation='relu')(x)
dropout_pred = layers.Dense(1, activation='sigmoid', name='dropout_output')(x)

# Decoder branch
decoder = build_decoder(encoded_output.shape[1:])
reconstructed = decoder(encoded_output)

# Final model
model = models.Model(inputs=encoder.input, outputs=[reconstructed, dropout_pred])
model.compile(
    optimizer='adam',
    loss={'cnn_decoder': 'mse', 'dropout_output': 'binary_crossentropy'},
    loss_weights={'cnn_decoder': 0.5, 'dropout_output': 1.0},
    metrics={'dropout_output': ['accuracy']}
)

model.summary()

# ------------------------------
# 3. Train the Model
# ------------------------------
history = model.fit(
    X_train,
    {'cnn_decoder': X_train, 'dropout_output': y_train},
    validation_split=0.1,
    epochs=20,
    batch_size=128,
    verbose=1
)

# ------------------------------
# 4. Evaluate the Model
# ------------------------------
decoded_output, y_pred_prob = model.predict(X_test)
y_pred = (y_pred_prob > 0.5).astype(int)

print("\n📊 Classification Report:")
print(classification_report(y_test, y_pred))
print(f"✅ Accuracy: {(y_pred.flatten() == y_test).mean():.4f}")
print(f"🔥 AUC Score: {roc_auc_score(y_test, y_pred_prob):.4f}")


Epoch 1/20
[1m679/679[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m64s[0m 85ms/step - cnn_decoder_loss: 7.9580 - dropout_output_accuracy: 0.8537 - dropout_output_loss: 0.3865 - loss: 4.3655 - val_cnn_decoder_loss: 7.1371 - val_dropout_output_accuracy: 0.8648 - val_dropout_output_loss: 0.3488 - val_loss: 3.9298
Epoch 2/20
[1m679/679[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m73s[0m 73ms/step - cnn_decoder_loss: 7.8225 - dropout_output_accuracy: 0.8657 - dropout_output_loss: 0.3458 - loss: 4.2570 - val_cnn_decoder_loss: 7.1304 - val_dropout_output_accuracy: 0.8668 - val_dropout_output_loss: 0.3451 - val_loss: 3.9228
Epoch 3/20
[1m679/679[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m83s[0m 75ms/step - cnn_decoder_loss: 8.9913 - dropout_output_accuracy: 0.8637 - dropout_output_loss: 0.3479 - loss: 4.8434 - val_cnn_decoder_loss: 7.1283 - val_dropout_output_accuracy: 0.8685 - val_dropout_output_loss: 0.3416 - val_loss: 3.9181
Epoch 4/20
[1m679/679[0m [32m━━━━━━━━━━━━━━━━━━━━