In [20]:
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np
import ast

In [21]:
df = pd.read_csv('../Datasets/amixem_20251023.csv')
df['categories'] = df['categories'].apply(ast.literal_eval)
df['tags'] = df['tags'].apply(ast.literal_eval)
df['upload_date'] = pd.to_datetime(df['upload_date'], format='%Y%m%d')
df = df.sort_values('timestamp')

In [22]:
# Features temporelles
df['day_of_week'] = df['upload_date'].dt.dayofweek
df['hour'] = df['upload_date'].dt.hour
df['month'] = df['upload_date'].dt.month

# Features cycliques
df['day_sin'] = np.sin(2 * np.pi * df['day_of_week'] / 7)
df['day_cos'] = np.cos(2 * np.pi * df['day_of_week'] / 7)

# Contexte historique
df['avg_views_last_5'] = df['view_count'].rolling(5).mean()
df['days_since_last'] = df['timestamp'].diff() / 86400

In [23]:
from sklearn.preprocessing import MultiLabelBinarizer

# Encoder tags
mlb_tags = MultiLabelBinarizer()
tags_encoded = mlb_tags.fit_transform(df['tags'])

# Garder top-50
from collections import Counter
all_tags = [tag for tags in df['tags'] for tag in tags]
top_tags = [tag for tag, _ in Counter(all_tags).most_common(50)]
df['tags_filtered'] = df['tags'].apply(lambda x: [t for t in x if t in top_tags])

ModuleNotFoundError: No module named 'sklearn'

In [None]:
# Uniquement les videos de 2021 et apres
df_2021 = df[df['upload_date'] >= '2021-01-01']

# Split chronologique
print(f"Total videos since the beginning: {len(df)}")

train_idx = int(n * 0.9)
val_idx = int(n * 0.95)

train = df_2021[:train_idx]
val = df_2021[train_idx:val_idx]
test = df_2021[val_idx:]

print(f"Train: {train['upload_date'].min()} → {train['upload_date'].max()}")
print(f"Test: {test['upload_date'].min()} → {test['upload_date'].max()}")

Total videos since the beginning: 830
Train: 2021-01-02 00:00:00 → 2025-03-02 00:00:00
Test: 2025-06-29 00:00:00 → 2025-10-19 00:00:00


In [None]:
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import LSTM, Dense, Dropout

# Préparer séquences
def create_sequences(data, seq_length=10):
    X, y = [], []
    for i in range(len(data) - seq_length):
        X.append(data[i:i+seq_length])
        y.append(data[i+seq_length])
    return np.array(X), np.array(y)

# Modèle LSTM
model = Sequential([
    LSTM(128, return_sequences=True, input_shape=(seq_length, n_features)),
    Dropout(0.2),
    LSTM(64),
    Dropout(0.2),
    Dense(32, activation='relu'),
    Dense(n_outputs)  # duration + categories + tags
])

model.compile(optimizer='adam', loss='mse')
model.fit(X_train, y_train, epochs=50, validation_data=(X_val, y_val))

NameError: name 'seq_length' is not defined