<a href="https://colab.research.google.com/github/aymenchibouti/doctorat/blob/main/CNN_Transformer.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [3]:
import pandas as pd
import numpy as np
import tensorflow as tf
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, precision_score
from tensorflow.keras.layers import Conv1D, MaxPooling1D, Dense, Dropout, Flatten, Input, Concatenate, LayerNormalization, MultiHeadAttention
from tensorflow.keras.models import Model
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.losses import BinaryCrossentropy
from tensorflow.keras.preprocessing.sequence import pad_sequences

### Step 1: Data Loading ###

# Load the files (paths are relative to the notebook's working directory)
enrollment_train = pd.read_csv('enrollment_train.csv')
log_train = pd.read_csv('log_train.csv')
# truth_train.csv is read without a header row, so its two columns are named explicitly
truth_train = pd.read_csv('truth_train.csv', header=None)
truth_train.columns = ['enrollment_id', 'dropout']
course_meta = pd.read_csv('course_meta.csv')

### Step 2: Data Preprocessing ###

# Merge enrollment and truth files (left join keeps every enrollment row)
train_data = pd.merge(enrollment_train, truth_train, on='enrollment_id', how='left')

# Convert time column to datetime
log_train['time'] = pd.to_datetime(log_train['time'])

# Sort log by time to preserve interaction sequence
log_train = log_train.sort_values(by=['enrollment_id', 'time'])

# Convert 'event' and 'source' to integer codes.
# cat.codes is 0-based and uses -1 for missing values, while pad_sequences pads
# with 0. Shifting by 1 reserves 0 for "padding / missing" so a padded position
# is never confused with the first real category and no negative index reaches
# the Embedding layers.
# NOTE: the largest code must stay below the Embedding input_dim used by the
# model (20 for events, 5 for sources).
log_train['event_code'] = log_train['event'].astype('category').cat.codes + 1
log_train['source_code'] = log_train['source'].astype('category').cat.codes + 1

# Group logs by enrollment ID. Done after the code columns exist so the groups
# do not depend on the groupby object aliasing a later-mutated frame.
grouped_logs = log_train.groupby('enrollment_id')

# Function to aggregate event and source codes for each enrollment
def aggregate_sequences(group):
    """Collect one group's log rows into per-column Python lists.

    Args:
        group: DataFrame slice providing the 'event_code', 'source_code'
            and 'time' columns.

    Returns:
        dict with keys 'event_sequence', 'source_sequence' and
        'time_sequence'; each value is the matching column converted with
        ``tolist()``, in the group's row order.
    """
    key_to_column = (
        ('event_sequence', 'event_code'),
        ('source_sequence', 'source_code'),
        ('time_sequence', 'time'),
    )
    return {key: group[column].tolist() for key, column in key_to_column}



In [4]:
# Apply the aggregation to each enrollment.
# aggregate_sequences returns a plain dict, so groupby.apply produces a Series
# whose values are dicts (one per enrollment_id). Calling reset_index() on that
# directly would give columns ['enrollment_id', 0], not the sequence columns
# the following cells index by name, so expand the dicts into real columns.
aggregated = grouped_logs.apply(aggregate_sequences)
if isinstance(aggregated, pd.Series):
    aggregated = pd.DataFrame(aggregated.tolist(), index=aggregated.index)
sequences = aggregated.reset_index()

# Truncate or pad sequences to a fixed length
max_seq_length = 10



In [None]:
# Pad (with trailing zeros) or truncate (dropping trailing items) the event and
# source sequences so every row holds exactly max_seq_length codes
for seq_col in ('event_sequence', 'source_sequence'):
    padded = pad_sequences(
        sequences[seq_col],
        maxlen=max_seq_length,
        padding='post',
        truncating='post',
    )
    sequences[seq_col] = padded.tolist()



In [None]:

# Attach the per-enrollment sequences to the enrollment data.
# Left join: every enrollment row is kept, matched on enrollment_id.
train_data = train_data.merge(sequences, how='left', on='enrollment_id')


In [None]:

# Bring the course metadata columns into the main training data.
# Left join on course_id keeps every training row.
train_data = train_data.merge(course_meta, how='left', on='course_id')


In [None]:

# Feature engineering: course duration in whole days (end_time - start_time)
course_end = pd.to_datetime(train_data['end_time'])
course_start = pd.to_datetime(train_data['start_time'])
train_data['course_duration'] = (course_end - course_start).dt.days



In [None]:
# Number of log rows per enrollment, stored as 'total_interactions'
interaction_counts = (
    log_train
    .groupby('enrollment_id')
    .size()
    .reset_index(name='total_interactions')
)
# Left join keeps every training row; rows without a count get NaN here
train_data = train_data.merge(interaction_counts, how='left', on='enrollment_id')



In [None]:
# Prepare input sequences (event and source sequences).
# The sequences were attached with a left merge, so an enrollment without any
# log rows carries NaN instead of a list; np.stack would then fail on mixed
# shapes. Such rows are replaced by an all-zero (all-padding) sequence, which
# mirrors the fillna(0) applied to the static features.
_empty_sequence = [0] * max_seq_length
X_sequences = np.stack([
    seq if isinstance(seq, list) else _empty_sequence
    for seq in train_data['event_sequence']
])
X_sources = np.stack([
    seq if isinstance(seq, list) else _empty_sequence
    for seq in train_data['source_sequence']
])


In [None]:

# Prepare static features (total interactions, course duration);
# missing values are replaced by 0 before converting to a NumPy array
static_columns = ['total_interactions', 'course_duration']
X_static = train_data[static_columns].fillna(0).values


In [None]:

# Prepare output labels: the 'dropout' column as a NumPy array
dropout_labels = train_data['dropout']
y = dropout_labels.values


In [None]:

# Split the data into training (80%) and testing (20%) sets.
# All arrays are passed to one call so their rows are shuffled identically;
# the fixed random_state makes the split reproducible.
split_arrays = train_test_split(
    X_sequences,
    X_static,
    y,
    test_size=0.2,
    random_state=42,
)
(X_train_seq, X_test_seq,
 X_train_static, X_test_static,
 y_train, y_test) = split_arrays



In [None]:
### Step 3: Build CNN-Transformer Model ###

# Hyperparameters
seq_length = 10            # must equal the padded sequence length
num_static_features = 2    # total_interactions, course_duration
embedding_dim = 32
num_heads = 4
ff_dim = 32                # NOTE: currently not used by any layer below
dropout_rate = 0.2

# Input for event sequence (sequence of interaction events)
event_input = Input(shape=(seq_length,), name='event_input')
# Input for source sequence (sequence of sources - browser/server)
source_input = Input(shape=(seq_length,), name='source_input')
# Input for static features (e.g., total interactions, course duration)
static_input = Input(shape=(num_static_features,), name='static_input')

# Embedding layers for event and source sequences.
# input_dim is the vocabulary size: every integer code fed to the layer must be
# in [0, input_dim). The deprecated `input_length` argument is omitted; the
# sequence length is already fixed by the Input shapes above.
event_embedding = tf.keras.layers.Embedding(input_dim=20, output_dim=embedding_dim)(event_input)
source_embedding = tf.keras.layers.Embedding(input_dim=5, output_dim=embedding_dim)(source_input)

# Concatenate the embeddings along the feature axis
combined_seq = Concatenate()([event_embedding, source_embedding])

# CNN layer to capture local patterns in sequences
cnn_layer = Conv1D(filters=64, kernel_size=3, activation='relu', padding='same')(combined_seq)
cnn_layer = MaxPooling1D(pool_size=2)(cnn_layer)

# Flattened copy of the CNN output; it is concatenated with the attention
# branch below (the attention layer itself consumes the unflattened cnn_layer)
flattened_seq = Flatten()(cnn_layer)

# Self-attention over the CNN feature sequence (query = value = cnn_layer),
# followed by layer normalization and dropout
transformer_encoder = MultiHeadAttention(num_heads=num_heads, key_dim=embedding_dim)(cnn_layer, cnn_layer)
transformer_encoder = LayerNormalization()(transformer_encoder)
transformer_encoder = Dropout(dropout_rate)(transformer_encoder)

# Combine CNN and Transformer outputs
combined_transformer = Concatenate()([flattened_seq, Flatten()(transformer_encoder)])

# Fully connected layers
dense_layer = Dense(64, activation='relu')(combined_transformer)
dense_layer = Dropout(dropout_rate)(dense_layer)

# Concatenate static features
combined_with_static = Concatenate()([dense_layer, static_input])

# Final dense layer for classification (sigmoid -> probability of the positive class)
output = Dense(1, activation='sigmoid')(combined_with_static)

# Model definition; input order here is the order expected by fit/predict
model = Model(inputs=[event_input, source_input, static_input], outputs=output)

# Compile the model
model.compile(optimizer=Adam(learning_rate=0.001), loss=BinaryCrossentropy(), metrics=['accuracy'])

# Print model summary
model.summary()

### Step 4: Train the Model ###

# The train/test split above covers the event sequences, static features and
# labels but not X_sources, so the source input used to be fed the event
# sequences a second time. Split X_sources with the same test_size and
# random_state: for arrays of equal length this reproduces the same shuffled
# indices, keeping rows aligned with X_train_seq / X_test_seq.
X_train_src, X_test_src = train_test_split(X_sources, test_size=0.2, random_state=42)
assert len(X_train_src) == len(X_train_seq), "source/event train rows are misaligned"
assert len(X_test_src) == len(X_test_seq), "source/event test rows are misaligned"

# Train the model; inputs follow the order [event_input, source_input, static_input]
history = model.fit(
    [X_train_seq, X_train_src, X_train_static],
    y_train,
    validation_split=0.2,
    epochs=10,
    batch_size=64
)

### Step 5: Evaluate the Model ###

# Make predictions on the test set
y_pred_prob = model.predict([X_test_seq, X_test_src, X_test_static])
# Convert probabilities to binary output; ravel() turns the (n, 1) model output
# into the 1-D label vector the metric functions compare against y_test
y_pred = (y_pred_prob > 0.5).astype(int).ravel()

# Calculate accuracy and precision
accuracy = accuracy_score(y_test, y_pred)
precision = precision_score(y_test, y_pred)

print(f"Test Accuracy: {accuracy}")
print(f"Test Precision: {precision}")