In [2]:
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np
import ast

In [None]:
df = pd.read_csv('../Datasets/amixem_20251219.csv')
df['categories'] = df['categories'].apply(ast.literal_eval)
df['tags'] = df['tags'].apply(ast.literal_eval)
df['upload_date'] = pd.to_datetime(df['upload_date'], format='%Y%m%d')
df = df.sort_values('timestamp')

# Sort by date
df = df.sort_values('upload_date')
df.head()

Unnamed: 0,nid,title,thumbnail,description,channel_id,duration,view_count,age_limit,categories,tags,upload_date,timestamp,fulltitle,is_live,was_live,comment_count,webpage_url,likes,dislikes,sponsored
829,xX8iRavbXxE,► Premiers pas sur Borderlands 2 - Gameplay dé...,https://i.ytimg.com/vi/xX8iRavbXxE/maxresdefau...,Premier épidose (et essai d'enregistrement) du...,UCgvqvBoSHB1ctlyyhoHrGwQ,3500,347004,0,[Gaming],"[Borderlands 2, Borderlands, PC, Coop, co op, ...",2012-11-12,1352681950,► Premiers pas sur Borderlands 2 - Gameplay dé...,False,False,4100,https://www.youtube.com/watch?v=xX8iRavbXxE,10801,267,False
828,CH2s3kJuyVo,► Battlefield 3 : Gameplay à Tour Ziba [AEK-97...,https://i.ytimg.com/vi/CH2s3kJuyVo/maxresdefau...,Enorme partie multi sur Battlefield 3 avec Cra...,UCgvqvBoSHB1ctlyyhoHrGwQ,2255,47399,0,[Gaming],"[Battlefield 3, Tour Ziba, fr, Français, Coop,...",2012-11-14,1352887496,► Battlefield 3 : Gameplay à Tour Ziba [AEK-97...,False,False,234,https://www.youtube.com/watch?v=CH2s3kJuyVo,1290,40,False
827,5g0j29G2w58,► Les 10 meilleures planques de ZIBA TOWER [Ba...,https://i.ytimg.com/vi/5g0j29G2w58/maxresdefau...,Se planquer sur la map la plus nerveuse de Bat...,UCgvqvBoSHB1ctlyyhoHrGwQ,2036,16353,0,[Gaming],"[gameplay, tour ziba, Battlefield 3, Let's Pla...",2012-11-28,1354068885,► Les 10 meilleures planques de ZIBA TOWER [Ba...,False,False,97,https://www.youtube.com/watch?v=5g0j29G2w58,513,19,False
826,sSywkj82fwk,► FAR CRY 3 : Gameplay découverte [FR],https://i.ytimg.com/vi/sSywkj82fwk/maxresdefau...,Découverte de FAR CRY 3 !\nAu programme : \n1....,UCgvqvBoSHB1ctlyyhoHrGwQ,1442,37284,0,[Gaming],"[Fraps, gameplay, GTX 460, Let's play, Far Cry...",2012-12-05,1354703422,► FAR CRY 3 : Gameplay découverte [FR],False,False,121,https://www.youtube.com/watch?v=sSywkj82fwk,781,26,False
825,o4pLOu8gJQo,► Gameplay déjanté sur Battlefield 3 Aftermath...,https://i.ytimg.com/vi/o4pLOu8gJQo/maxresdefau...,Allez deuxième Gameplay avec crackPHI sur une ...,UCgvqvBoSHB1ctlyyhoHrGwQ,1577,11637,0,[Gaming],"[fraps, Gameplay, Aftermath, gtx 460, Let's pl...",2012-12-06,1354837440,► Gameplay déjanté sur Battlefield 3 Aftermath...,False,False,64,https://www.youtube.com/watch?v=o4pLOu8gJQo,361,12,False


In [22]:
# Features temporelles
df['day_of_week'] = df['upload_date'].dt.dayofweek
df['hour'] = df['upload_date'].dt.hour
df['month'] = df['upload_date'].dt.month

# Features cycliques
df['day_sin'] = np.sin(2 * np.pi * df['day_of_week'] / 7)
df['day_cos'] = np.cos(2 * np.pi * df['day_of_week'] / 7)

# Contexte historique
df['avg_views_last_5'] = df['view_count'].rolling(5).mean()
df['days_since_last'] = df['timestamp'].diff() / 86400

In [23]:
from sklearn.preprocessing import MultiLabelBinarizer

# Encoder tags
mlb_tags = MultiLabelBinarizer()
tags_encoded = mlb_tags.fit_transform(df['tags'])

# Garder top-50
from collections import Counter
all_tags = [tag for tags in df['tags'] for tag in tags]
top_tags = [tag for tag, _ in Counter(all_tags).most_common(50)]
df['tags_filtered'] = df['tags'].apply(lambda x: [t for t in x if t in top_tags])

ModuleNotFoundError: No module named 'sklearn'

In [None]:
# Uniquement les videos de 2021 et apres
df_2021 = df[df['upload_date'] >= '2021-01-01']

# Split chronologique
print(f"Total videos since the beginning: {len(df)}")

train_idx = int(n * 0.9)
val_idx = int(n * 0.95)

train = df_2021[:train_idx]
val = df_2021[train_idx:val_idx]
test = df_2021[val_idx:]

print(f"Train: {train['upload_date'].min()} → {train['upload_date'].max()}")
print(f"Test: {test['upload_date'].min()} → {test['upload_date'].max()}")

Total videos since the beginning: 830
Train: 2021-01-02 00:00:00 → 2025-03-02 00:00:00
Test: 2025-06-29 00:00:00 → 2025-10-19 00:00:00


In [None]:
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import LSTM, Dense, Dropout

# Préparer séquences
def create_sequences(data, seq_length=10):
    X, y = [], []
    for i in range(len(data) - seq_length):
        X.append(data[i:i+seq_length])
        y.append(data[i+seq_length])
    return np.array(X), np.array(y)

# Modèle LSTM
model = Sequential([
    LSTM(128, return_sequences=True, input_shape=(seq_length, n_features)),
    Dropout(0.2),
    LSTM(64),
    Dropout(0.2),
    Dense(32, activation='relu'),
    Dense(n_outputs)  # duration + categories + tags
])

model.compile(optimizer='adam', loss='mse')
model.fit(X_train, y_train, epochs=50, validation_data=(X_val, y_val))

NameError: name 'seq_length' is not defined