In [38]:
import numpy as np
import pandas as pd
from tqdm.auto import tqdm
import datetime
import time
import torch
from transformers import BertTokenizer, BertModel
from sklearn.metrics import accuracy_score, f1_score
from catboost import Pool, CatBoostRegressor, CatBoostClassifier
from sklearn.model_selection import train_test_split
from sklearn.utils.class_weight import compute_class_weight

In [2]:
path = '/kaggle/input/rutube-videos/'
data = pd.read_csv(path + 'train_events.csv')
video = pd.read_csv(path + 'video_info_v2.csv')
targets = pd.read_csv(path + 'train_targets.csv')

In [3]:
merged_data = pd.merge(data, video, on='rutube_video_id')
merged_data = pd.merge(merged_data, targets, on='viewer_uid')
merged_data['duration'] = merged_data['duration'].apply(lambda x: x / 1000)

In [4]:
merged_data['sex'] = merged_data['sex'].apply(lambda x: 1 if x == 'female' else 0)

In [6]:
merged_data.head()

Unnamed: 0,event_timestamp,region,ua_device_type,ua_client_type,ua_os,ua_client_name,total_watchtime,rutube_video_id,viewer_uid,title,category,duration,author_id,age,sex,age_class
0,2024-06-01 06:40:58+03:00,Chelyabinsk,desktop,browser,Windows,Yandex Browser,1883,video_133074,10067243,Папа с особенностями. Мужское / Женское. Выпус...,Телепередачи,2456.534,1009219,20,1,0
1,2024-06-01 19:33:24+03:00,Bashkortostan Republic,smartphone,mobile app,Android,Rutube,512,video_362960,10245341,Comedy Club: Мальдивы | Андрей Бебуришвили,Юмор,519.211,1006760,40,1,2
2,2024-06-01 21:30:43+03:00,St.-Petersburg,desktop,browser,Windows,Chrome,5647,video_96775,10894333,"Новая Битва экстрасенсов, 24 сезон, 11 выпуск",Телепередачи,5518.28,1009257,23,0,1
3,2024-06-01 23:03:42+03:00,Moscow,smartphone,mobile app,Android,Rutube,1521,video_161610,10029092,Сергей Орлов-снял дом!!!,Разное,1522.069,1058671,41,0,3
4,2024-06-01 22:48:09+03:00,Moscow,smartphone,mobile app,Android,Rutube,71,video_116245,10452976,Ищем сокровища в Полевском | Уральская Флоренц...,Путешествия,1249.92,1020020,38,1,2


In [103]:
def to_timestamp(s: str):
    ttp = datetime.datetime.strptime(s, '%Y-%m-%d %H:%M:%S%z').timetuple()
    return time.mktime(ttp)

In [8]:
merged_data['event_datetime'] = merged_data['event_timestamp']
merged_data['event_timestamp'] = merged_data['event_timestamp'].apply(to_timestamp)

In [35]:
mentions = {}
sessions = {}
count_sessions = []
mean_duration = []
srtd = merged_data.sort_values('event_timestamp')
for uid in tqdm(merged_data['viewer_uid'].unique()):
    mentions[uid] = srtd['event_timestamp'][srtd['viewer_uid'] == uid].tolist()
    first_idx, last_idx = 0, 0
    sessions[uid] = []
    duration = 0
    for i, mention in enumerate(mentions[uid]):
        if mention - mentions[uid][last_idx] <= 1800:
            last_idx = i
        else:
            sessions[uid].append([mentions[uid][first_idx], mentions[uid][last_idx]])
            duration += mentions[uid][last_idx] - mentions[uid][first_idx]
            first_idx = i
            last_idx = i
    sessions[uid].append([mentions[uid][first_idx], mentions[uid][last_idx]])
    mean_duration.append(duration / len(sessions[uid]))
    count_sessions.append(len(sessions[uid]))

  0%|          | 0/180012 [00:00<?, ?it/s]

In [10]:
def get_day(s: str):
    return datetime.datetime.strptime(s, '%Y-%m-%d %H:%M:%S%z').day

In [38]:
num_days_online = []
srtd = merged_data.sort_values('event_datetime')
for uid in tqdm(merged_data['viewer_uid'].unique()):
    events = srtd['event_datetime'][srtd['viewer_uid'] == uid].tolist()
    num_days = 0
    now_day = 1
    for event in events:
        if get_day(event) != now_day:
            num_days += 1
            now_day = get_day(event)
    num_days_online.append(num_days)

  0%|          | 0/180012 [00:00<?, ?it/s]

In [45]:
features = pd.DataFrame({'id': merged_data['viewer_uid'].unique().tolist()})
gb = merged_data.groupby('viewer_uid')
features['all_time'] = gb['total_watchtime'].agg('sum').reset_index(drop=True)
features['mean_per_session'] = mean_duration
features['count_sessions'] = count_sessions
features['mean_per_day'] = features['all_time'] / num_days_online

In [46]:
features['common_category'] = gb['category'].agg(pd.Series.mode).apply(lambda x: x if not isinstance(x, np.ndarray) else x[0]).reset_index(drop=True)

In [47]:
features.head()

Unnamed: 0,id,all_time,mean_per_session,count_sessions,mean_per_day,common_category
0,10067243,68757,35.102564,39,4044.529412,Телепередачи
1,10245341,15220,41.258065,62,563.703704,Сериалы
2,10894333,16501,0.0,4,5500.333333,Наука
3,10029092,212,0.0,9,30.285714,Технологии и интернет
4,10452976,971,167.185185,27,74.692308,Аниме


In [14]:
most_popular = []
gb = merged_data.groupby(['viewer_uid', 'category'])['total_watchtime'].agg('sum')
for uid in tqdm(merged_data['viewer_uid'].unique()):
    maxval, bestcat = 0, ''
    for cat in gb[uid].index:
        if gb[uid][cat] > maxval:
            maxval = gb[uid][cat]
            bestcat = cat
    most_popular.append(bestcat)

  0%|          | 0/180012 [00:00<?, ?it/s]

In [48]:
features['most_watched_category'] = most_popular

In [49]:
features['age_class'] = merged_data.groupby('viewer_uid')['age_class'].agg(lambda x: x.iloc[0]).reset_index(drop=True)
features['sex'] = merged_data.groupby('viewer_uid')['sex'].agg(lambda x: x.iloc[0]).reset_index(drop=True)
features['fav_author'] = merged_data.groupby('viewer_uid')['author_id'].agg(pd.Series.mode).apply(lambda x: x if not isinstance(x, np.ndarray) else x[0]).reset_index(drop=True)

In [57]:
features.to_csv('top_features.csv', index=False)

In [5]:
features = pd.read_csv('/kaggle/input/rutube-videos/top_features.csv')

In [28]:
features['region'] = merged_data.groupby('viewer_uid')['region'].agg(lambda x: x.iloc[0]).reset_index(drop=True)

  features['region'] = merged_data.groupby('viewer_uid')['region'].agg(lambda x: x.iloc[0]).reset_index(drop=True)


In [74]:
merged_data['%wt'] = merged_data['total_watchtime']/merged_data['duration']
merged_data['watched_2x'] = merged_data['%wt'] > 2
merged_data['watched_3x'] = merged_data['%wt'] > 3
merged_data['watched_10x'] = merged_data['%wt'] > 10
merged_data['watched_50x'] = merged_data['%wt'] > 50
merged_data['watched_100x'] = merged_data['%wt'] > 100
merged_data['watched_95'] = merged_data['%wt'] > 0.95
merged_data['watched_90'] = merged_data['%wt'] > 0.90
merged_data['watched_80'] = merged_data['%wt'] > 0.80
merged_data['watched_50'] = merged_data['%wt'] > 0.50

merged_data['skipped_20'] = merged_data['%wt'] < 0.20
merged_data['skipped_10'] = merged_data['%wt'] < 0.10
merged_data['skipped_5'] = merged_data['%wt'] < 0.05

In [75]:
x1 = merged_data.groupby('viewer_uid')['watched_2x'].apply(lambda x: x.sum() / len(x)) 
x2 = merged_data.groupby('viewer_uid')['watched_3x'].apply(lambda x: x.sum() / len(x)) 
x3 = merged_data.groupby('viewer_uid')['watched_10x'].apply(lambda x: x.sum() / len(x)) 
x4 = merged_data.groupby('viewer_uid')['watched_50x'].apply(lambda x: x.sum() / len(x)) 
x5 = merged_data.groupby('viewer_uid')['watched_100x'].apply(lambda x: x.sum() / len(x)) 
x6 = merged_data.groupby('viewer_uid')['watched_95'].apply(lambda x: x.sum() / len(x)) 
x7 = merged_data.groupby('viewer_uid')['watched_90'].apply(lambda x: x.sum() / len(x)) 
x8 = merged_data.groupby('viewer_uid')['watched_80'].apply(lambda x: x.sum() / len(x)) 
x9 = merged_data.groupby('viewer_uid')['watched_50'].apply(lambda x: x.sum() / len(x)) 
x10 = merged_data.groupby('viewer_uid')['skipped_20'].apply(lambda x: x.sum() / len(x)) 
x11 = merged_data.groupby('viewer_uid')['skipped_10'].apply(lambda x: x.sum() / len(x)) 
x12 = merged_data.groupby('viewer_uid')['skipped_5'].apply(lambda x: x.sum() / len(x))

In [76]:
watched_features = x1 
for df in [x2, x3, x4, x5, x6, x7, x8, x9, x10, x11, x12]: 
    watched_features = pd.merge(watched_features, df, on='viewer_uid') 
watched_features = watched_features.reset_index()
len(watched_features)

180012

In [14]:
texts = merged_data['title'].unique().tolist()

# Параметры для батчевой обработки
batch_size = 500  # Размер батча
embeddings = {}

# Получение эмбеддингов батчами
with torch.no_grad():  # Отключаем градиенты для оптимизации памяти
    for i in tqdm(range(0, len(texts), batch_size)):
        batch_texts = texts[i:i + batch_size]
        inputs = tokenizer(batch_texts, return_tensors='pt', padding=True).to(torch.device('cuda'))
        outputs = bert_model(**inputs)
        
        # Берем эмбеддинг [CLS] токена для каждого текста в батче
        cls_embeddings = outputs.last_hidden_state.mean(1).cpu().numpy()
        for idx, emb in enumerate(cls_embeddings):
            embeddings[texts[i + idx]] = emb

  0%|          | 0/1252 [00:00<?, ?it/s]

In [8]:
tokenizer = BertTokenizer.from_pretrained('DeepPavlov/rubert-base-cased')
bert_model = BertModel.from_pretrained('DeepPavlov/rubert-base-cased', device_map='cuda:0')

tokenizer_config.json:   0%|          | 0.00/24.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/1.65M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/642 [00:00<?, ?B/s]



pytorch_model.bin:   0%|          | 0.00/714M [00:00<?, ?B/s]

Some weights of the model checkpoint at DeepPavlov/rubert-base-cased were not used when initializing BertModel: ['cls.predictions.bias', 'cls.predictions.decoder.bias', 'cls.predictions.decoder.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.dense.weight', 'cls.seq_relationship.bias', 'cls.seq_relationship.weight']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [20]:
cls_embeddings.shape

(30, 768)

In [25]:
features['mean_title_embedding'] = merged_data.groupby('viewer_uid')['title'].apply(lambda x: np.array([embeddings[a] for a in x])).agg(lambda x: np.mean(x, 0)).reset_index(drop=True)
features[[f'feature_{x}' for x in range(768)]] = np.stack(features['mean_title_embedding'].values)
features.drop('mean_title_embedding', axis=1, inplace=True)

  features[[f'feature_{x}' for x in range(768)]] = np.stack(features['mean_title_embedding'].values)
  features[[f'feature_{x}' for x in range(768)]] = np.stack(features['mean_title_embedding'].values)
  features[[f'feature_{x}' for x in range(768)]] = np.stack(features['mean_title_embedding'].values)
  features[[f'feature_{x}' for x in range(768)]] = np.stack(features['mean_title_embedding'].values)
  features[[f'feature_{x}' for x in range(768)]] = np.stack(features['mean_title_embedding'].values)
  features[[f'feature_{x}' for x in range(768)]] = np.stack(features['mean_title_embedding'].values)
  features[[f'feature_{x}' for x in range(768)]] = np.stack(features['mean_title_embedding'].values)
  features[[f'feature_{x}' for x in range(768)]] = np.stack(features['mean_title_embedding'].values)
  features[[f'feature_{x}' for x in range(768)]] = np.stack(features['mean_title_embedding'].values)
  features[[f'feature_{x}' for x in range(768)]] = np.stack(features['mean_title_embedding'

In [91]:
many_features = pd.merge(features, watched_features, left_index=True, right_index=True)

In [94]:
X_train_age, X_val_age, y_train_age, y_val_age = train_test_split(many_features.drop(['age_class', 
                                                                                 'sex', 
                                                                                 'id',
                                                                                'fav_author'], axis=1), 
                                                                  many_features.age_class)
cat_features = ['fav_author', 'common_category', 'region', 'most_watched_category']
train_pool_age = Pool(X_train_age, y_train_age, cat_features=cat_features)
val_pool_age = Pool(X_val_age, y_val_age, cat_features=cat_features)

In [95]:
model_age = CatBoostClassifier(task_type='GPU', iterations=1000, 
                               class_weights=compute_class_weight(class_weight="balanced", 
                                                                  classes=list(range(4)),
                                                                  y=features.age_class))
model_age.fit(train_pool_age, eval_set=val_pool_age, verbose=100)

Learning rate set to 0.172583
0:	learn: 1.3500038	test: 1.3507650	best: 1.3507650 (0)	total: 28.9ms	remaining: 28.8s
100:	learn: 1.1380474	test: 1.1908661	best: 1.1908661 (100)	total: 2.48s	remaining: 22.1s
200:	learn: 1.0896933	test: 1.1844968	best: 1.1843640 (198)	total: 4.87s	remaining: 19.4s
300:	learn: 1.0519484	test: 1.1857677	best: 1.1841032 (247)	total: 7.29s	remaining: 16.9s
400:	learn: 1.0195118	test: 1.1875628	best: 1.1841032 (247)	total: 9.73s	remaining: 14.5s
500:	learn: 0.9905956	test: 1.1915448	best: 1.1841032 (247)	total: 12.2s	remaining: 12.1s
600:	learn: 0.9651814	test: 1.1951682	best: 1.1841032 (247)	total: 14.6s	remaining: 9.71s
700:	learn: 0.9421339	test: 1.2003165	best: 1.1841032 (247)	total: 17s	remaining: 7.26s
800:	learn: 0.9208910	test: 1.2064250	best: 1.1841032 (247)	total: 19.4s	remaining: 4.83s
900:	learn: 0.8997781	test: 1.2120791	best: 1.1841032 (247)	total: 21.9s	remaining: 2.4s
999:	learn: 0.8806461	test: 1.2171153	best: 1.1841032 (247)	total: 24.3s	rem

<catboost.core.CatBoostClassifier at 0x7f18b554ee30>

In [97]:
X_train_sex, X_val_sex, y_train_sex, y_val_sex = train_test_split(features.drop(['age_class', 
                                                                                 'sex', 
                                                                                 'id'], axis=1), 
                                                                  features.sex)

train_pool_sex = Pool(X_train_sex, y_train_sex, cat_features=cat_features)
val_pool_sex = Pool(X_val_sex, y_val_sex, cat_features=cat_features)

In [98]:
model_sex = CatBoostClassifier(task_type='GPU', iterations=1000, loss_function='Logloss')
model_sex.fit(train_pool_sex, eval_set=val_pool_sex, verbose=100)

Learning rate set to 0.046929
0:	learn: 0.6813817	test: 0.6811203	best: 0.6811203 (0)	total: 34ms	remaining: 33.9s
100:	learn: 0.5236979	test: 0.5231268	best: 0.5231268 (100)	total: 3.21s	remaining: 28.6s
200:	learn: 0.5112291	test: 0.5140121	best: 0.5140121 (200)	total: 6.25s	remaining: 24.9s
300:	learn: 0.5036682	test: 0.5095266	best: 0.5095266 (300)	total: 9.23s	remaining: 21.4s
400:	learn: 0.4977658	test: 0.5069317	best: 0.5069317 (400)	total: 12.2s	remaining: 18.2s
500:	learn: 0.4930039	test: 0.5051512	best: 0.5051512 (500)	total: 15.2s	remaining: 15.1s
600:	learn: 0.4889117	test: 0.5039032	best: 0.5039032 (600)	total: 18.1s	remaining: 12s
700:	learn: 0.4852089	test: 0.5029737	best: 0.5029737 (700)	total: 21.1s	remaining: 9.01s
800:	learn: 0.4817801	test: 0.5022230	best: 0.5022230 (800)	total: 24.1s	remaining: 5.99s
900:	learn: 0.4785141	test: 0.5016798	best: 0.5016798 (900)	total: 27.1s	remaining: 2.98s
999:	learn: 0.4755397	test: 0.5012238	best: 0.5012238 (999)	total: 30.1s	rema

<catboost.core.CatBoostClassifier at 0x7f18b5529420>

In [100]:
y_pred_age = model_age.predict(X_val_age)
y_pred_sex = model_sex.predict(X_val_sex)

f1_weighted = f1_score(y_val_age, y_pred_age, average='weighted')
accuracy = accuracy_score(y_val_sex, y_pred_sex)

final_score = 0.7 * f1_weighted + 0.3 * accuracy
print(f'Weighted F1 = {f1_weighted:.4f} \nAccuracy = {accuracy:.4f} \nFinal Score = {final_score:.4f}')

Weighted F1 = 0.4273 
Accuracy = 0.7533 
Final Score = 0.5251


In [111]:
test = pd.read_csv('/kaggle/input/rutube-videos/test_events.csv')
test.head()

Unnamed: 0,event_timestamp,region,ua_device_type,ua_client_type,ua_os,ua_client_name,total_watchtime,rutube_video_id,viewer_uid
0,2024-06-01 15:26:44+03:00,Udmurtiya Republic,desktop,browser,Windows,Yandex Browser,2593,video_212730,22206
1,2024-06-01 12:25:29+03:00,Adygeya Republic,smartphone,mobile app,Android,Rutube,960,video_235114,34531
2,2024-06-01 17:23:12+03:00,Astrakhan Oblast,smartphone,mobile app,Android,Rutube,4695,video_26520,25830
3,2024-06-01 15:37:37+03:00,Khakasiya Republic,smartphone,browser,Android,Chrome Mobile,2490,video_465561,14838
4,2024-06-01 21:30:11+03:00,Moscow,smartphone,mobile app,Android,Rutube,1117,video_102934,13718


In [112]:
test_merged_data = pd.merge(test, video, on='rutube_video_id')
test_merged_data['duration'] = test_merged_data['duration'].apply(lambda x: x / 1000)

In [114]:
test_merged_data['event_datetime'] = test_merged_data['event_timestamp']
test_merged_data['event_timestamp'] = test_merged_data['event_timestamp'].apply(to_timestamp)

In [115]:
test_merged_data.head()

Unnamed: 0,event_timestamp,region,ua_device_type,ua_client_type,ua_os,ua_client_name,total_watchtime,rutube_video_id,viewer_uid,title,category,duration,author_id,event_datetime
0,1717256000.0,Udmurtiya Republic,desktop,browser,Windows,Yandex Browser,2593,video_212730,22206,Отчаянные Домохозяйки 7 сезон 21 серия,Сериалы,2610.785,1089828,2024-06-01 15:26:44+03:00
1,1717245000.0,Adygeya Republic,smartphone,mobile app,Android,Rutube,960,video_235114,34531,Вот как воевали США во Второй мировой войне!,Разное,1080.32,1009406,2024-06-01 12:25:29+03:00
2,1717263000.0,Astrakhan Oblast,smartphone,mobile app,Android,Rutube,4695,video_26520,25830,"Бесподобный мистер Фокс (мультфильм, 2009)",Фильмы,5208.416,1090779,2024-06-01 17:23:12+03:00
3,1717256000.0,Khakasiya Republic,smartphone,browser,Android,Chrome Mobile,2490,video_465561,14838,2- Вышивальные планы на 2024г,Хобби,2556.011,1017105,2024-06-01 15:37:37+03:00
4,1717277000.0,Moscow,smartphone,mobile app,Android,Rutube,1117,video_102934,13718,"Суперниндзя. Дети, 3 выпуск",Телепередачи,8061.64,1009210,2024-06-01 21:30:11+03:00


In [116]:
mentions = {}
sessions = {}
count_sessions = []
mean_duration = []
srtd = test_merged_data.sort_values('event_timestamp')
for uid in tqdm(test_merged_data['viewer_uid'].unique()):
    mentions[uid] = srtd['event_timestamp'][srtd['viewer_uid'] == uid].tolist()
    first_idx, last_idx = 0, 0
    sessions[uid] = []
    duration = 0
    for i, mention in enumerate(mentions[uid]):
        if mention - mentions[uid][last_idx] <= 1800:
            last_idx = i
        else:
            sessions[uid].append([mentions[uid][first_idx], mentions[uid][last_idx]])
            duration += mentions[uid][last_idx] - mentions[uid][first_idx]
            first_idx = i
            last_idx = i
    sessions[uid].append([mentions[uid][first_idx], mentions[uid][last_idx]])
    mean_duration.append(duration / len(sessions[uid]))
    count_sessions.append(len(sessions[uid]))

  0%|          | 0/60004 [00:00<?, ?it/s]

In [117]:
def get_day(s: str):
    return datetime.datetime.strptime(s, '%Y-%m-%d %H:%M:%S%z').day

In [118]:
num_days_online = []
srtd = test_merged_data.sort_values('event_datetime')
for uid in tqdm(test_merged_data['viewer_uid'].unique()):
    events = srtd['event_datetime'][srtd['viewer_uid'] == uid].tolist()
    num_days = 0
    now_day = 1
    for event in events:
        if get_day(event) != now_day:
            num_days += 1
            now_day = get_day(event)
    num_days_online.append(num_days)

  0%|          | 0/60004 [00:00<?, ?it/s]

In [119]:
test_features = pd.DataFrame({'id': test_merged_data['viewer_uid'].unique().tolist()})
gb = test_merged_data.groupby('viewer_uid')
test_features['all_time'] = gb['total_watchtime'].agg('sum').reset_index(drop=True)
test_features['mean_per_session'] = mean_duration
test_features['count_sessions'] = count_sessions
test_features['mean_per_day'] = test_features['all_time'] / num_days_online

In [120]:
test_features['common_category'] = gb['category'].agg(pd.Series.mode).apply(lambda x: x if not isinstance(x, np.ndarray) else x[0]).reset_index(drop=True)

In [121]:
test_features.head()

Unnamed: 0,id,all_time,mean_per_session,count_sessions,mean_per_day,common_category
0,22206,14419,4.733333,15,4806.333333,Интервью
1,34531,92,0.0,7,18.4,Интервью
2,25830,3470,471.071429,14,867.5,Интервью
3,14838,4478,0.0,3,2239.0,Аниме
4,13718,73422,315.666667,12,18355.5,Сериалы


In [122]:
most_popular = []
gb = test_merged_data.groupby(['viewer_uid', 'category'])['total_watchtime'].agg('sum')
for uid in tqdm(test_merged_data['viewer_uid'].unique()):
    maxval, bestcat = 0, ''
    for cat in gb[uid].index:
        if gb[uid][cat] > maxval:
            maxval = gb[uid][cat]
            bestcat = cat
    most_popular.append(bestcat)

  0%|          | 0/60004 [00:00<?, ?it/s]

In [123]:
test_features['most_watched_category'] = most_popular

In [124]:
test_features['fav_author'] = test_merged_data.groupby('viewer_uid')['author_id'].agg(pd.Series.mode).apply(lambda x: x if not isinstance(x, np.ndarray) else x[0]).reset_index(drop=True)

In [125]:
test_merged_data['%wt'] = test_merged_data['total_watchtime']/test_merged_data['duration']
test_merged_data['watched_2x'] = test_merged_data['%wt'] > 2
test_merged_data['watched_3x'] = test_merged_data['%wt'] > 3
test_merged_data['watched_10x'] = test_merged_data['%wt'] > 10
test_merged_data['watched_50x'] = test_merged_data['%wt'] > 50
test_merged_data['watched_100x'] = test_merged_data['%wt'] > 100
test_merged_data['watched_95'] = test_merged_data['%wt'] > 0.95
test_merged_data['watched_90'] = test_merged_data['%wt'] > 0.90
test_merged_data['watched_80'] = test_merged_data['%wt'] > 0.80
test_merged_data['watched_50'] = test_merged_data['%wt'] > 0.50

test_merged_data['skipped_20'] = test_merged_data['%wt'] < 0.20
test_merged_data['skipped_10'] = test_merged_data['%wt'] < 0.10
test_merged_data['skipped_5'] = test_merged_data['%wt'] < 0.05

In [126]:
x1 = test_merged_data.groupby('viewer_uid')['watched_2x'].apply(lambda x: x.sum() / len(x)) 
x2 = test_merged_data.groupby('viewer_uid')['watched_3x'].apply(lambda x: x.sum() / len(x)) 
x3 = test_merged_data.groupby('viewer_uid')['watched_10x'].apply(lambda x: x.sum() / len(x)) 
x4 = test_merged_data.groupby('viewer_uid')['watched_50x'].apply(lambda x: x.sum() / len(x)) 
x5 = test_merged_data.groupby('viewer_uid')['watched_100x'].apply(lambda x: x.sum() / len(x)) 
x6 = test_merged_data.groupby('viewer_uid')['watched_95'].apply(lambda x: x.sum() / len(x)) 
x7 = test_merged_data.groupby('viewer_uid')['watched_90'].apply(lambda x: x.sum() / len(x)) 
x8 = test_merged_data.groupby('viewer_uid')['watched_80'].apply(lambda x: x.sum() / len(x)) 
x9 = test_merged_data.groupby('viewer_uid')['watched_50'].apply(lambda x: x.sum() / len(x)) 
x10 = test_merged_data.groupby('viewer_uid')['skipped_20'].apply(lambda x: x.sum() / len(x)) 
x11 = test_merged_data.groupby('viewer_uid')['skipped_10'].apply(lambda x: x.sum() / len(x)) 
x12 = test_merged_data.groupby('viewer_uid')['skipped_5'].apply(lambda x: x.sum() / len(x))

In [127]:
test_watched_features = x1 
for df in [x2, x3, x4, x5, x6, x7, x8, x9, x10, x11, x12]: 
    test_watched_features = pd.merge(test_watched_features, df, on='viewer_uid') 
test_watched_features = test_watched_features.reset_index()
len(test_watched_features)

60004

In [128]:
texts = test_merged_data['title'].unique().tolist()

# Параметры для батчевой обработки
batch_size = 500  # Размер батча
embeddings = {}

# Получение эмбеддингов батчами
with torch.no_grad():  # Отключаем градиенты для оптимизации памяти
    for i in tqdm(range(0, len(texts), batch_size)):
        batch_texts = texts[i:i + batch_size]
        inputs = tokenizer(batch_texts, return_tensors='pt', padding=True).to(torch.device('cuda'))
        outputs = bert_model(**inputs)
        
        # Берем эмбеддинг [CLS] токена для каждого текста в батче
        cls_embeddings = outputs.last_hidden_state.mean(1).cpu().numpy()
        for idx, emb in enumerate(cls_embeddings):
            embeddings[texts[i + idx]] = emb

  0%|          | 0/140 [00:00<?, ?it/s]

In [129]:
test_features['mean_title_embedding'] = test_merged_data.groupby('viewer_uid')['title'].apply(lambda x: np.array([embeddings[a] for a in x])).agg(lambda x: np.mean(x, 0)).reset_index(drop=True)
test_features[[f'feature_{x}' for x in range(768)]] = np.stack(test_features['mean_title_embedding'].values)
test_features.drop('mean_title_embedding', axis=1, inplace=True)

  test_features['mean_title_embedding'] = test_merged_data.groupby('viewer_uid')['title'].apply(lambda x: np.array([embeddings[a] for a in x])).agg(lambda x: np.mean(x, 0)).reset_index(drop=True)
  test_features[[f'feature_{x}' for x in range(768)]] = np.stack(test_features['mean_title_embedding'].values)
  test_features[[f'feature_{x}' for x in range(768)]] = np.stack(test_features['mean_title_embedding'].values)
  test_features[[f'feature_{x}' for x in range(768)]] = np.stack(test_features['mean_title_embedding'].values)
  test_features[[f'feature_{x}' for x in range(768)]] = np.stack(test_features['mean_title_embedding'].values)
  test_features[[f'feature_{x}' for x in range(768)]] = np.stack(test_features['mean_title_embedding'].values)
  test_features[[f'feature_{x}' for x in range(768)]] = np.stack(test_features['mean_title_embedding'].values)
  test_features[[f'feature_{x}' for x in range(768)]] = np.stack(test_features['mean_title_embedding'].values)
  test_features[[f'feature_

In [130]:
test_many_features = many_features = pd.merge(test_features, test_watched_features, left_index=True, right_index=True)

In [132]:
test_many_features.head()

Unnamed: 0,id,all_time,mean_per_session,count_sessions,mean_per_day,common_category,most_watched_category,fav_author,feature_0,feature_1,...,watched_10x,watched_50x,watched_100x,watched_95,watched_90,watched_80,watched_50,skipped_20,skipped_10,skipped_5
0,22206,14419,4.733333,15,4806.333333,Интервью,Сериалы,1010000,-0.046415,-0.186828,...,0.0,0.0,0.0,0.333333,0.333333,0.333333,0.333333,0.222222,0.222222,0.222222
1,34531,92,0.0,7,18.4,Интервью,Разное,1011365,-0.340499,-0.329031,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0
2,25830,3470,471.071429,14,867.5,Интервью,Фильмы,1019549,0.552662,-0.405162,...,0.0,0.0,0.0,0.0,0.0,0.0,0.5,0.0,0.0,0.0
3,14838,4478,0.0,3,2239.0,Аниме,Хобби,1028890,0.205325,0.007161,...,0.0,0.0,0.0,1.0,1.0,1.0,1.0,0.0,0.0,0.0
4,13718,73422,315.666667,12,18355.5,Сериалы,Телепередачи,1084744,-0.249019,-0.49871,...,0.0,0.0,0.0,0.019231,0.038462,0.173077,0.576923,0.173077,0.096154,0.038462


In [133]:
test_many_features.drop(['id'], axis=1, inplace=True)

In [134]:
model_age.predict(test_many_features)

CatBoostError: Invalid type for cat_feature[non-default value idx=0,feature_idx=774]=-0.3888133466243744 : cat_features must be integer or string, real number values and NaN values should be converted to string.