My DLS 2024 Finals Solution for Problem E. Просмотры видео. I have used simple tokenizer mixed with Embedding + LSTM for text feature, preprocessed date features and categorial feature, then using FC layer to get a result.

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import torch
import torch.nn as nn
from sklearn.model_selection import train_test_split
from torch.utils.data import Dataset, DataLoader
import re
from sklearn.metrics import mean_absolute_error
from sklearn.preprocessing import LabelEncoder, StandardScaler
from collections import Counter
from nltk import word_tokenize
from datetime import datetime
import warnings
warnings.filterwarnings('ignore')

In [None]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
train = pd.read_csv('drive/MyDrive/youtube_train.csv')
test = pd.read_csv('drive/MyDrive/youtube_test.csv')
X, y = train.drop(columns=['ViewCount']), train['ViewCount']
X.shape

(5849, 5)

In [None]:
X.head(5)

Unnamed: 0,id,Channel,Subtitles,PublishDate,Category
0,0,toplesofficial,('Я миллениал и я не занимаюсь сексом так част...,2017-12-31T01:35:44-08:00,Education
1,1,postnauka,"(""сервитуты это одно из прямых наследие римско...",2014-09-04T08:24:47-07:00,Science & Technology
2,2,NaukaPRO,('[музыка] существуют сенсоры измеряющие магни...,2023-11-23T21:00:08-08:00,Science & Technology
3,3,postnauka,('в наше время только ленивые не говорит о том...,2014-06-23T03:49:31-07:00,Science & Technology
4,4,user-rb8ux1no6j,('дорогие друзья это честный рекламный ролик н...,2020-10-09T01:30:04-07:00,Education


In [None]:
class VideoDatasetPreprocessor:
    def __init__(self, max_subtitle_len=1000):
        self.max_subtitle_len = max_subtitle_len
        self.word2idx = None
        self.idx2word = None
        self.category_encoder = LabelEncoder()
        self.scaler = StandardScaler()

    def clean_text(self, text):
        if pd.isna(text) or not isinstance(text, str):
            return ""
        text = re.sub(r'\s+', ' ', str(text))
        text = text.lower().strip()
        text = re.sub(r'[^\w\s]', '', text)
        return text

    def tokenize(self, text):
        try:
            return word_tokenize(text)
        except:
            return text.split()

    def build_vocabulary(self, texts):
        all_tokens = []
        for text in texts:
            cleaned = self.clean_text(text)
            tokens = self.tokenize(cleaned)
            all_tokens.extend(tokens)

        word_counts = Counter(all_tokens)
        most_common = word_counts.most_common()

        self.word2idx = {'<PAD>': 0, '<UNK>': 1}
        self.idx2word = {0: '<PAD>', 1: '<UNK>'}

        for idx, (word, _) in enumerate(most_common, start=2):
            self.word2idx[word] = idx
            self.idx2word[idx] = word

    def encode_text(self, text):
        cleaned = self.clean_text(text)
        tokens = self.tokenize(cleaned)

        tokens = tokens[:self.max_subtitle_len]

        indices = [self.word2idx.get(token, 1) for token in tokens]

        if len(indices) < self.max_subtitle_len:
            indices += [0] * (self.max_subtitle_len - len(indices))

        return indices

    def process_dates(self, dates):
        date_features = []
        for date_str in dates:
            try:
                dt = datetime.fromisoformat(date_str.replace('Z', '+00:00'))

                # Извлекаем компоненты
                year = dt.year
                month = dt.month
                day = dt.day
                dayofweek = dt.weekday()
                hour = dt.hour

                # Циклические признаки для периодичности
                month_sin = np.sin(2 * np.pi * month / 12)
                month_cos = np.cos(2 * np.pi * month / 12)
                day_sin = np.sin(2 * np.pi * day / 31)
                day_cos = np.cos(2 * np.pi * day / 31)
                hour_sin = np.sin(2 * np.pi * hour / 24)
                hour_cos = np.cos(2 * np.pi * hour / 24)

                date_features.append([
                    year, month, day, dayofweek, hour,
                    month_sin, month_cos, day_sin, day_cos, hour_sin, hour_cos
                ])
            except:
                # Если дата не парсится, добавляем нули
                date_features.append([0] * 11)

        return np.array(date_features, dtype=np.float32)

    def fit_transform(self, df):
        print("Building vocabulary")
        self.build_vocabulary(df['Subtitles'].values)

        print("Encoding texts")
        encoded_texts = [self.encode_text(text) for text in df['Subtitles'].values]

        print("Encoding categorical features...")
        encoded_categories = self.category_encoder.fit_transform(df['Category'].fillna('unknown')).reshape(-1, 1)

        print("Processing dates...")
        date_features = self.process_dates(df['PublishDate'].values)
        print(f"Date features shape: {date_features.shape}")  # Должно быть (n_samples, 11)

        print("Preparing targets...")
        targets = np.log1p(df['ViewCount'].values).reshape(-1, 1)

        numeric_features = encoded_categories
        numeric_features = self.scaler.fit_transform(numeric_features)

        return {
            'text_indices': torch.LongTensor(encoded_texts),
            'date_features': torch.FloatTensor(date_features),
            'targets': torch.FloatTensor(targets),
            'categories': torch.LongTensor(encoded_categories.flatten())
        }

In [None]:
class MyPredictor(nn.Module):
    def __init__(self, n_categories, vocab_size, n_date_features=11, emb_dim=128, hidden_dim=256):
        super(MyPredictor, self).__init__()
        self.embedding = nn.Embedding(vocab_size, emb_dim)

        self.lstm = nn.LSTM(emb_dim, hidden_dim, num_layers=2, dropout=0.3, bidirectional=True)

        self.categorial_emb = nn.Embedding(n_categories, 16)
        total_features = 2 * hidden_dim + n_date_features + 16

        print(f"Total features dimension: {total_features}")

        self.fc_layers = nn.Sequential(
            nn.Linear(total_features, 256),
            nn.ReLU(),
            nn.Dropout(0.3),
            nn.Linear(256, 128),
            nn.ReLU(),
            nn.Dropout(0.2),
            nn.Linear(128, 64),
            nn.ReLU(),
            nn.Linear(64, 1)
        )

    def forward(self, dates, text, category):
        batch_size = text.size(0)  # 32

        # 1. Эмбеддинги для текста
        embedded = self.embedding(text)  # (32, 1000, 128)

        # 2. Пропускаем через LSTM
        lstm_out, (hidden, cell) = self.lstm(embedded)  # lstm_out: (32, 1000, 512)

        # 3. Используем средний пулинг по всем временным шагам
        # Это часто работает лучше, чем последнее состояние
        text_features = torch.mean(lstm_out, dim=1)  # (32, 512)

        # 4. Эмбеддинг категории
        category_features = self.categorial_emb(category)  # (32, 16)

        # 5. Объединяем все признаки
        combined = torch.cat([text_features, dates, category_features], dim=1)  # (32, 512+11+16=539)

        # 6. Предсказание
        output = self.fc_layers(combined)  # (32, 1)

        return output.squeeze()  # (32,)

In [None]:
preprocessor = VideoDatasetPreprocessor()
train_data = preprocessor.fit_transform(train)

Building vocabulary
Encoding texts
Encoding categorical features...
Processing dates...
Preparing targets...


In [None]:
indices = np.arange(len(train_data['targets']))
train_idx, val_idx = train_test_split(indices, test_size=0.2)

train_new = {k: v[train_idx] for k, v in train_data.items()}
val_new = {k: v[val_idx] for k, v in train_data.items()}

print(f"Train text_indices shape: {train_new['text_indices'].shape}")
print(f"Train date_features shape: {train_new['date_features'].shape}")
print(f"Train targets shape: {train_new['targets'].shape}")
print(f"Train categories shape: {train_new['categories'].shape}")

class MyDataset(Dataset):
    def __init__(self, data):
        self.text = data['text_indices']
        self.y = data['targets']
        self.categories = data['categories']
        self.dates = data['date_features']

    def __len__(self):
        return len(self.y)

    def __getitem__(self, idx):
        return {
            'text': self.text[idx],
            'numeric': self.categories[idx],
            'date': self.dates[idx],
            'target': self.y[idx]
        }



Train text_indices shape: torch.Size([4679, 1000])
Train date_features shape: torch.Size([4679, 11])
Train targets shape: torch.Size([4679, 1])
Train categories shape: torch.Size([4679])


In [None]:
train_dataset = MyDataset(train_new)
val_dataset = MyDataset(val_new)

train_loader = DataLoader(train_dataset, batch_size=32, shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=32, shuffle=True)

In [None]:
iterator = iter(train_dataset)
a = next(iterator)
print(a['text'].shape)

batch = iter(train_loader)
print(next(batch)['text'].shape)

torch.Size([1000])
torch.Size([32, 1000])


In [None]:
def train_epoch(model, train_loader, criterion, opt, epoch, device):
    epoch_loss = []
    model.train()

    for batch in train_loader:
        text = batch['text'].to(device)
        category = batch['numeric'].to(device)
        dates = batch['date'].to(device)
        y = batch['target'].to(device)


        y_pred = model(dates, text, category)
        loss = criterion(y, y_pred)
        loss.backward()
        opt.step()
        opt.zero_grad()

        epoch_loss.append(loss.item())

    print(f'{epoch=}')
    print('TRAIN LOSS', np.mean(epoch_loss))
    return np.mean(epoch_loss)


def validate_epoch(model, val_loader, criterion, epoch, device):
    epoch_loss = []
    model.eval()

    with torch.no_grad():
        for batch in val_loader:
            text = batch['text'].to(device)
            category = batch['numeric'].to(device)
            dates = batch['date'].to(device)
            y = batch['target'].to(device)

            y_pred = model(dates, text, category)
            loss = criterion(y, y_pred)

            epoch_loss.append(loss.item())

    print(f'{epoch=}')
    print('VAL LOSS', np.mean(epoch_loss))
    return np.mean(epoch_loss)

In [None]:
print(len(preprocessor.word2idx))
criterion = nn.L1Loss()
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model = MyPredictor(
    vocab_size=len(preprocessor.word2idx),
    n_categories=len(preprocessor.category_encoder.classes_)
    ).to(device)
opt = torch.optim.Adam(model.parameters(), lr=1e-4)

train_losses = []
val_losses = []
for epoch in range(10):
    print('Starting epoch', epoch + 1)
    train_epoch_loss = train_epoch(model, train_loader, criterion, opt, epoch, device)
    val_epoch_loss = validate_epoch(model, val_loader, criterion, epoch, device)
    train_losses.append(train_epoch_loss)
    val_losses.append(val_epoch_loss)

476822
Total features dimension: 539
Starting epoch 1
epoch=0
TRAIN LOSS 2.084507692427862
epoch=0
VAL LOSS 1.5292059150902
Starting epoch 2
epoch=1
TRAIN LOSS 1.5648709413956623
epoch=1
VAL LOSS 1.5687569041509886
Starting epoch 3
epoch=2
TRAIN LOSS 1.5086471645199522
epoch=2
VAL LOSS 1.5981265467566412
Starting epoch 4
epoch=3
TRAIN LOSS 1.482549013329201
epoch=3
VAL LOSS 1.6640638080803123
Starting epoch 5
epoch=4
TRAIN LOSS 1.4812767716492115
epoch=4
VAL LOSS 1.7254987246281392
Starting epoch 6
epoch=5
TRAIN LOSS 1.4650103486314112
epoch=5
VAL LOSS 1.55348793235985
Starting epoch 7
epoch=6
TRAIN LOSS 1.4626837441710387
epoch=6
VAL LOSS 1.5776487862741626
Starting epoch 8
epoch=7
TRAIN LOSS 1.4589193901237176
epoch=7
VAL LOSS 1.6756690611710419
Starting epoch 9
epoch=8
TRAIN LOSS 1.4580334875048424
epoch=8
VAL LOSS 1.5492989308125265
Starting epoch 10
epoch=9
TRAIN LOSS 1.451703253246489
epoch=9
VAL LOSS 1.591172078171292


In [None]:
mean_error = []
with torch.no_grad():
    for batch in val_loader:
        text = batch['text'].to(device)
        category = batch['numeric'].to(device)
        dates = batch['date'].to(device)
        y = batch['target'].to(device)

        y_pred = model(dates, text, category)
        loss = mean_absolute_error(y.cpu().numpy(), y_pred.cpu().numpy())

        mean_error.append(loss)
print(np.mean(mean_error))

1.5919744839539398
