In [1]:
import pandas as pd
import numpy as np
from tqdm.notebook import tqdm
from collections import Counter

from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split

from catboost import CatBoostClassifier
from sklearn.metrics import accuracy_score, f1_score

from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import PredefinedSplit

In [2]:
# Per-video statistics and metadata; joined onto train/test by `video_id` further down.
stat = pd.read_parquet(path='video_stat.parquet')
stat.head(n=1)

Unnamed: 0,video_id,v_pub_datetime,v_total_comments,v_year_views,v_month_views,v_week_views,v_day_views,v_likes,v_dislikes,v_duration,...,v_category_popularity_percent_7_days,v_category_popularity_percent_30_days,v_long_views_1_days,v_long_views_7_days,v_long_views_30_days,row_number,title,description,category_id,author_id
0,5b98a8f2-99b6-4730-b3a2-33fc6791eefd,2009-12-31 21:00:01,0,18947,23,23,23,62,6,42.025,...,0.917423,0.917423,6,6,6,1,найк,найк дрессура,Животные,e4bf220f-3c8a-4804-88ee-8f76303a0415


In [3]:
# Drop columns that hold one and the same value in every row: they carry no signal.
# nunique() counts distinct values per column in vectorised form (no Python-level pass over
# every row), and dropna=False makes a column that is entirely missing count as constant too.
# The columns are removed in a single call instead of mutating `stat` while looping over it.
n_distinct = stat.nunique(dropna=False)
constant_cols = n_distinct[n_distinct == 1].index.tolist()
stat = stat.drop(columns=constant_cols)

  0%|          | 0/43 [00:00<?, ?it/s]

In [4]:
# Number of columns left in `stat` after removing the constant ones.
stat.shape[1]

40

In [5]:
# Training interactions: one row per (user, video) view event with its watchtime.
train = pd.read_parquet(path='train.parquet')
train.head(n=1)

Unnamed: 0,event_timestamp,user_id,region,city,video_id,watchtime
0,2024-08-10 20:13:12+03:00,73dadec9-9189-43a8-a32f-906e549af7e0,13925c13-0e77-4548-acdc-09aa728e31d4,faeecbed-060a-4d40-be81-ce3bac90804c,98380888-2d8a-4d2f-9660-a00c3c5c0512,0


In [6]:
# Remove fully duplicated rows; the first occurrence is kept and the original index labels stay.
train = train.drop_duplicates()

In [32]:
# Test interactions: same layout as train but without `watchtime`.
test = pd.read_parquet(path='test.parquet')
test.head(n=1)

Unnamed: 0,event_timestamp,user_id,region,city,video_id
0,2024-08-10 11:45:52+03:00,00000a62-336d-4b71-bb8b-991c877ac678,e7a113d7-5be1-44c0-9e97-78fd27df336f,7a193a03-5883-4ef2-b69d-873f397ec104,62a44a87-29e5-464d-a9ff-78de16d4a7d7


In [8]:
# Submission template; the first CSV column is used as the row index.
sub = pd.read_csv(filepath_or_buffer='sample_submission.csv', index_col=0)
sub

Unnamed: 0,target
0,0
1,1
2,1
3,1
4,0
...,...
1334127,1
1334128,1
1334129,1
1334130,1


In [9]:
# Attach the per-video columns to every training row; rows whose video is absent from
# `stat` are kept (left join) with missing values in the video columns.
train = train.merge(stat, how='left', on='video_id')

In [33]:
# Same left join for the test rows, so train and test end up with the same video columns.
test = test.merge(stat, how='left', on='video_id')

In [11]:
# Binary target derived from watchtime:
#   * v_duration > 300  -> positive when watchtime exceeds 25% of v_duration
#   * v_duration <= 300 -> positive when watchtime exceeds 30
# Rows matching neither rule (e.g. missing v_duration) fall through to the default, 0.
duration = train['v_duration']
watched = train['watchtime']

is_long = duration > 300
is_short = duration <= 300

train['target'] = np.select(
    [is_long, is_short],
    [watched > 0.25 * duration, watched > 30],
    default=False,
).astype('int64')

# watchtime defines the label, so it must not remain among the features.
train = train.drop(columns=['watchtime'])

In [12]:
# Identifier, free-text and publication-date columns are not used as model features.
unused_columns = ['video_id', 'title', 'description', 'v_pub_datetime']
train = train.drop(columns=unused_columns)

In [13]:
# Columns that are label-encoded before training (the same list is applied to test).
cat_train = [
    'category_id',
    'author_id',
    'user_id',
    'city',
    'region',
]

In [14]:
# Gather the distinct values of every categorical column across BOTH train and test, so the
# encoder fitted on `lst` knows every label it will later be asked to transform.
# Only the unique values of each column are collected (vectorised `.unique()`), rather than
# appending every single row in a Python loop: the encoder only needs the distinct labels,
# so the resulting classes are the same while time and memory drop by orders of magnitude.
# `test` must already carry the merged `stat` columns (category_id, author_id) at this point.
distinct_labels = set()
for frame in (train, test):
    for column in cat_train:
        distinct_labels.update(frame[column].unique())

lst = list(distinct_labels)

In [15]:
# One shared encoder for all categorical columns, fitted on the labels gathered in `lst`.
le = LabelEncoder().fit(lst)

# Replace the raw labels in train with their integer codes, column by column.
for name in cat_train:
    train[name] = le.transform(train[name])

In [16]:
# Parse the event time, strip the timezone information and store it as naive datetime64[ns].
train_event_ts = pd.to_datetime(train['event_timestamp'])
train['event_timestamp'] = train_event_ts.dt.tz_localize(None).astype('datetime64[ns]')

In [17]:
# Class balance of the target.
Counter(train['target'])

Counter({1: 7468789, 0: 6685751})

In [18]:
# Separate the label from the feature matrix.
y = train['target']
X = train.drop(columns='target')

In [19]:
# Split the data into training and validation sets (80/20, stratified by the target).
# random_state is fixed so that the split, and every metric computed on it, is reproducible
# after a kernel restart.
X_train, X_val, y_train, y_val = train_test_split(
    X, y, test_size=0.2, stratify=y, random_state=42
)

In [20]:
# CatBoost classifier configured for GPU training: 1000 iterations, tree depth 11.
model = CatBoostClassifier(
    task_type='GPU',
    depth=11,
    iterations=1000,
    od_wait=500,
)

In [21]:
# Train the model, tracking the loss on the validation split (logged every 200 iterations).
model.fit(X_train, y_train, eval_set=(X_val, y_val), verbose=200)

# Evaluate on the validation split. Inference runs once and the predictions are reused for
# both metrics instead of calling model.predict(X_val) twice on the full validation set.
val_pred = model.predict(X_val)
accuracy_score(y_val, val_pred), f1_score(y_val, val_pred)

Learning rate set to 0.032206
0:	learn: 0.6827168	test: 0.6827026	best: 0.6827026 (0)	total: 434ms	remaining: 7m 13s
200:	learn: 0.5497213	test: 0.5499387	best: 0.5499387 (200)	total: 1m 1s	remaining: 4m 2s
400:	learn: 0.5472550	test: 0.5479317	best: 0.5479317 (400)	total: 2m 4s	remaining: 3m 5s
600:	learn: 0.5458469	test: 0.5469394	best: 0.5469394 (600)	total: 3m 5s	remaining: 2m 3s
800:	learn: 0.5447337	test: 0.5462962	best: 0.5462962 (800)	total: 4m 8s	remaining: 1m 1s
999:	learn: 0.5437757	test: 0.5458608	best: 0.5458608 (999)	total: 5m 9s	remaining: 0us
bestTest = 0.5458607715
bestIteration = 999


(0.7107804280464077, 0.7513470470651047)

In [22]:
# Predefined train/validation split for GridSearchCV (-1 = always in training, 0 = validation fold)
# ps = PredefinedSplit(test_fold=[-1 if i in X_train.index else 0 for i in X.index])
# ps

In [23]:
# model = GridSearchCV(
#     estimator=CatBoostClassifier(),
#     param_grid={
#         'verbose': [500],
#         'task_type': ['GPU'],
#         'depth': [7, 9, None],
#         # 'loss_function': ['Logloss', None],
#         # 'learning_rate': [1e-3, None],
#         # 'class_weights': [dict(enumerate(weights)), None]
#     },
#     scoring='f1',
#     verbose=52,
#     cv=ps, 
# )

In [24]:
# model.fit(X, y)

In [25]:
# params = model.best_params_
# params

In [26]:
# accuracy_score(y_val, model.predict(X_val)), f1_score(y_val, model.predict(X_val))

In [27]:
# model.save_model('cat.cbm', format="cbm")

In [28]:
# model.load_model('cat.cbm', format="cbm")

In [34]:
# Drop the same non-feature columns that were removed from train.
test = test.drop(columns=['video_id', 'title', 'description', 'v_pub_datetime'])

In [35]:
# Same timestamp normalisation as for train: parse, strip the timezone, keep naive datetime64[ns].
test_event_ts = pd.to_datetime(test['event_timestamp'])
test['event_timestamp'] = test_event_ts.dt.tz_localize(None).astype('datetime64[ns]')

In [36]:
# Encode the categorical columns of test with the encoder already fitted for train.
for name in cat_train:
    test[name] = le.transform(test[name])

In [37]:
# Predict on the test set. The columns are selected explicitly in the order of the training
# feature matrix `X`, so inference does not depend on `test` happening to have the same
# column layout as train; a missing feature fails loudly with a KeyError.
# The predictions are assigned positionally, so `test` must have one row per row of `sub`.
sub['target'] = model.predict(test[X.columns])
sub

Unnamed: 0,target
0,1
1,1
2,1
3,1
4,1
...,...
1334127,0
1334128,0
1334129,0
1334130,0


In [38]:
# Write the submission file; the row index is written as the first column.
sub.to_csv('sub_ml.csv', index=True)