In [23]:
from sqlalchemy import create_engine
from sqlalchemy.orm import sessionmaker, declarative_base
import pandas as pd
from config import DATABASE_URL

In [24]:
SQLALCHEMY_DATABASE_URL = DATABASE_URL

engine = create_engine(SQLALCHEMY_DATABASE_URL)
SessionLocal = sessionmaker(autocommit=False, autoflush=False, bind=engine)

Base = declarative_base()

In [25]:
# грузим информацию по юзерам
user_data_select = "SELECT * FROM public.user_data" 
user_data = pd.read_sql(user_data_select, engine)
# грузим информацию по постам
post_text_select = "SELECT * FROM public.post_text_df" 
post_text_data = pd.read_sql(post_text_select, engine)
# грузим информацию по фиду с учётом дисбаланса таргета
feed_data_select = """SELECT * FROM (
    SELECT * FROM public.feed_data WHERE target = 1 AND action != 'like' LIMIT 2500000
) AS subquery1
UNION ALL
SELECT * FROM (
    SELECT * FROM public.feed_data WHERE target = 0 AND action != 'like' LIMIT 2500000
) AS subquery2
LIMIT 5000000"""
feed_data = pd.read_sql(feed_data_select, engine)

In [26]:
# Обработка информации полльзователей с заменой на наиболее подходящие типы данных
def prepare_user_data(data: pd.DataFrame) -> pd.DataFrame:
    data["gender"] = data["gender"].astype(bool)
    data["os"] = data["os"].astype(bool)
    data["source"] = data["source"].astype(bool)
    # ONE HOT ENCODING стран
    tmp = pd.get_dummies(data["country"], prefix="country", drop_first=True)
    data = data.drop("country", axis=1)
    data = pd.concat([data, tmp], axis=1) 

    # ONE HOT ENCODING экспериментальных груп (было решено их оставить)
    tmp = pd.get_dummies(data["exp_group"], prefix="exp_group", drop_first=True)
    data = data.drop("exp_group", axis=1)
    data = pd.concat([data, tmp], axis=1)

    data["age"] = data["age"].astype("uint8")
    data["city"] = data["city"].astype("category")
    return data

prepared_user_data = user_data.copy()
prepared_user_data = prepare_user_data(prepared_user_data)

In [27]:
# prepared_user_data.to_sql(name="kzh_user_data", con=engine, index=False, if_exists="replace")
prepared_user_data.head(3)

Unnamed: 0,user_id,gender,age,city,os,source,country_Belarus,country_Cyprus,country_Estonia,country_Finland,country_Kazakhstan,country_Latvia,country_Russia,country_Switzerland,country_Turkey,country_Ukraine,exp_group_1,exp_group_2,exp_group_3,exp_group_4
0,200,True,34,Degtyarsk,True,True,0,0,0,0,0,0,1,0,0,0,0,0,1,0
1,201,False,37,Abakan,True,True,0,0,0,0,0,0,1,0,0,0,0,0,0,0
2,202,True,17,Smolensk,True,True,0,0,0,0,0,0,1,0,0,0,0,0,0,1


In [48]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import PCA
import numpy as np
# здесь применяем TF-IDF к тестам постов и сокращаем размер при помощи PCA
def prepare_post_data_base(data: pd.DataFrame) -> pd.DataFrame:
    vectorizer = TfidfVectorizer(min_df=0.025, dtype=np.float32)
    pca = PCA(n_components=32)

    tmp = pd.get_dummies(data["topic"], prefix="topic", drop_first=True)
    data = data.drop("topic", axis=1)
    data = pd.concat([data, tmp], axis=1)
    
    tmp = vectorizer.fit_transform(data["text"])
    tmp = pd.DataFrame(tmp.toarray())
    tmp = pd.DataFrame(pca.fit_transform(tmp), columns=["pca_0", "pca_1", "pca_2",
        "pca_3", "pca_4", "pca_5", "pca_6", "pca_7", "pca_8", "pca_9", "pca_10", 
        "pca_11", "pca_12", "pca_13", "pca_14", "pca_15", "pca_16", "pca_17", 
        "pca_18", "pca_19", "pca_20", "pca_21", "pca_22", "pca_23", "pca_24", 
        "pca_25", "pca_26", "pca_27", "pca_28", "pca_29", "pca_30", "pca_31"], 
        dtype=np.float16)
    data = data.drop("text", axis=1)
    data = pd.concat([data, tmp], axis=1)
    return data

prepared_post_data_base = post_text_data.copy()
prepared_post_data_base = prepare_post_data_base(prepared_post_data_base)

In [29]:
# prepared_post_data_base.to_sql(name="kzh_post_data_base", con=engine, index=False, if_exists="replace")
prepared_post_data_base.head(3)

Unnamed: 0,post_id,topic_covid,topic_entertainment,topic_movie,topic_politics,topic_sport,topic_tech,pca_0,pca_1,pca_2,...,pca_22,pca_23,pca_24,pca_25,pca_26,pca_27,pca_28,pca_29,pca_30,pca_31
0,1,0,0,0,0,0,0,-0.196655,-0.273438,0.050079,...,-0.03302,0.047882,0.020782,-0.016846,0.131836,-0.082275,-0.050507,-0.006615,-0.002239,0.039001
1,2,0,0,0,0,0,0,-0.207153,-0.306396,-0.008377,...,-0.0495,-0.025162,0.018646,-0.000712,0.021286,-0.105652,-0.024261,0.031403,-0.027313,-0.019104
2,3,0,0,0,0,0,0,-0.222656,-0.220703,0.088928,...,-0.10083,0.014076,0.027237,0.059448,0.079529,0.00371,0.022095,-0.027191,-0.05246,0.003906


In [30]:
def process_feed(data: pd.DataFrame) -> pd.DataFrame:
    # просто приводим к легковесным типам и разбиваем дату
    data["user_id"] = data["user_id"].astype('uint32')
    data["post_id"] = data["post_id"].astype('uint32')
    data["target"] = data["target"].astype('bool')
    data["timestamp"] = pd.to_datetime(data["timestamp"])
    data = data.sort_values(by="timestamp")
    data["month"] = data["timestamp"].dt.month.astype('uint8')
    data["day"] = data["timestamp"].dt.day.astype('uint8')
    data["hour"] = data["timestamp"].dt.hour.astype('uint8')
    data["minute"] = data["timestamp"].dt.minute.astype('uint8')
    data = data.drop(["timestamp","action"], axis=1)
    return data

prepared_feed_data = feed_data.copy()
prepared_feed_data = process_feed(prepared_feed_data)

In [31]:
prepared_feed_data.head(3)

Unnamed: 0,user_id,post_id,target,month,day,hour,minute
563612,1859,1498,True,10,1,6,1
1113000,8663,3837,True,10,1,6,1
328976,136194,1205,True,10,1,6,1


In [32]:
def merge_data(user_data: pd.DataFrame, post_data: pd.DataFrame, feed_data: pd.DataFrame) -> pd.DataFrame:
    # сращиваем наши таблицы
    merged = pd.merge(feed_data, user_data, on="user_id", how=("left"))
    merged = pd.merge(merged, post_data, on="post_id", how=("left"))
    return merged

train_table_base = merge_data(prepared_user_data, prepared_post_data_base, prepared_feed_data)
train_table_base.head(3)

Unnamed: 0,user_id,post_id,target,month,day,hour,minute,gender,age,city,...,pca_22,pca_23,pca_24,pca_25,pca_26,pca_27,pca_28,pca_29,pca_30,pca_31
0,1859,1498,True,10,1,6,1,False,19,Kyiv,...,-0.141724,-0.02565,-0.060822,0.090881,0.010231,-0.083313,-0.009613,-0.021179,-0.021072,0.080505
1,8663,3837,True,10,1,6,1,False,23,Volzhskiy,...,0.017715,-0.029526,-0.032959,0.019821,0.02121,0.073853,0.005337,0.058014,-0.01104,0.052643
2,136194,1205,True,10,1,6,1,False,24,Moscow,...,-0.034332,-0.050629,0.041321,0.080566,-0.018494,0.051727,-0.068542,-0.000511,0.04303,-0.037384


In [33]:
from sklearn.model_selection import train_test_split
X = train_table_base.drop(["target","user_id"], axis=1)
y = train_table_base["target"]
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.1, shuffle=False)

In [34]:
from catboost import CatBoostClassifier, Pool
pool_base = Pool(data=X_train, label=y_train, cat_features=["city", "post_id"])

In [35]:
import torch
cb_device = 'GPU' if torch.cuda.is_available() else 'CPU'

In [36]:
params = {'depth': 6,
          'random_seed': 42,
          'l2_leaf_reg': 1,
          'iterations': 1000,
          "learning_rate": 0.1,
          "bootstrap_type": 'Bernoulli',
          'grow_policy': 'Lossguide'
          } # лучшие параметры согласнно кросс валидации
catboost = CatBoostClassifier(**params, eval_metric = 'AUC:hints=skip_train~false', task_type=cb_device)
control_model = catboost.fit(pool_base)
control_model.save_model("model_control", format="cbm")

Default metric period is 5 because AUC is/are not implemented for GPU


0:	learn: 0.7276595	total: 152ms	remaining: 2m 32s
1:	total: 260ms	remaining: 2m 9s
2:	total: 368ms	remaining: 2m 2s
3:	total: 486ms	remaining: 2m 1s
4:	total: 591ms	remaining: 1m 57s
5:	learn: 0.7567231	total: 720ms	remaining: 1m 59s
6:	total: 825ms	remaining: 1m 56s
7:	total: 930ms	remaining: 1m 55s
8:	total: 1.03s	remaining: 1m 53s
9:	total: 1.14s	remaining: 1m 53s
10:	learn: 0.7641775	total: 1.27s	remaining: 1m 54s
11:	total: 1.37s	remaining: 1m 53s
12:	total: 1.48s	remaining: 1m 52s
13:	total: 1.58s	remaining: 1m 51s
14:	total: 1.69s	remaining: 1m 50s
15:	learn: 0.7687840	total: 1.81s	remaining: 1m 51s
16:	total: 1.92s	remaining: 1m 50s
17:	total: 2.02s	remaining: 1m 50s
18:	total: 2.12s	remaining: 1m 49s
19:	total: 2.23s	remaining: 1m 49s
20:	learn: 0.7721304	total: 2.35s	remaining: 1m 49s
21:	total: 2.45s	remaining: 1m 48s
22:	total: 2.55s	remaining: 1m 48s
23:	total: 2.65s	remaining: 1m 47s
24:	total: 2.75s	remaining: 1m 47s
25:	learn: 0.7751736	total: 2.88s	remaining: 1m 47s
2

Теперь вместо TF-IDF для векторизации используем токенезатор Роберты (или дистилберта/обычного берта)

In [37]:
import torch.nn as nn
from warnings import filterwarnings
filterwarnings('ignore')
from transformers import AutoTokenizer
from transformers import BertModel  # https://huggingface.co/docs/transformers/model_doc/bert#transformers.BertModel
from transformers import RobertaModel  # https://huggingface.co/docs/transformers/model_doc/roberta#transformers.RobertaModel
from transformers import DistilBertModel  # https://huggingface.co/docs/transformers/model_doc/distilbert#transformers.DistilBertModel

In [38]:
def get_model(model_name):
    assert model_name in ['bert', 'roberta', 'distilbert']
    
    checkpoint_names = {
        'bert': 'bert-base-cased',  # https://huggingface.co/bert-base-cased
        'roberta': 'roberta-base',  # https://huggingface.co/roberta-base
        'distilbert': 'distilbert-base-cased'  # https://huggingface.co/distilbert-base-cased
    }
    
    model_classes = {
        'bert': BertModel,
        'roberta': RobertaModel,
        'distilbert': DistilBertModel
    }
    
    return AutoTokenizer.from_pretrained(checkpoint_names[model_name]), model_classes[model_name].from_pretrained(checkpoint_names[model_name])

In [39]:
tokenizer, _ = get_model('roberta') # загружаем токенизатор
device = torch.device('cuda:0' if torch.cuda.is_available() else 'cpu')

Some weights of RobertaModel were not initialized from the model checkpoint at roberta-base and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [40]:
tokenized_text = post_text_data['text'].apply(lambda x: tokenizer.encode_plus(x, padding=True, add_special_tokens=True,
                                                                    max_length=512, pad_to_multiple_of=512, 
                                                                    return_token_type_ids=False, truncation=True)['input_ids'])
df_lists = pd.DataFrame(tokenized_text.tolist(), columns=[f'vec_{i}' for i in range(len(tokenized_text[0]))], dtype=np.uint16)

In [41]:
def prepare_post_data_emb(data: pd.DataFrame, embedings) -> pd.DataFrame:
    pca = PCA(n_components=32)

    tmp = pd.get_dummies(data["topic"], prefix="topic", drop_first=True)
    data = data.drop("topic", axis=1)
    data = pd.concat([data, tmp], axis=1)
    
    tmp = embedings
    tmp = pd.DataFrame(pca.fit_transform(tmp), columns=["pca_0", "pca_1", "pca_2", 
        "pca_3", "pca_4", "pca_5", "pca_6", "pca_7", "pca_8", "pca_9", "pca_10", 
        "pca_11", "pca_12", "pca_13", "pca_14", "pca_15", "pca_16", "pca_17", 
        "pca_18", "pca_19", "pca_20", "pca_21", "pca_22", "pca_23", "pca_24", 
        "pca_25", "pca_26", "pca_27", "pca_28", "pca_29", "pca_30", "pca_31"], 
        dtype=np.float16)
    data = data.drop("text", axis=1)
    data = pd.concat([data, tmp], axis=1)
    return data

prepared_post_data_emb = post_text_data.copy()
prepared_post_data_emb = prepare_post_data_emb(prepared_post_data_emb, df_lists)

In [42]:
# prepared_post_data_emb.to_sql(name="kzh_post_data_emb", con=engine, index=False, if_exists="replace")
prepared_post_data_emb.head(3)

Unnamed: 0,post_id,topic_covid,topic_entertainment,topic_movie,topic_politics,topic_sport,topic_tech,pca_0,pca_1,pca_2,...,pca_22,pca_23,pca_24,pca_25,pca_26,pca_27,pca_28,pca_29,pca_30,pca_31
0,1,0,0,0,0,0,0,23888.0,-28368.0,29888.0,...,-10376.0,-9216.0,-9648.0,4436.0,-9368.0,10584.0,1327.0,-332.75,-1332.0,4120.0
1,2,0,0,0,0,0,0,48448.0,58048.0,-21104.0,...,-16464.0,17184.0,-10072.0,17616.0,-18416.0,-1133.0,10680.0,9336.0,11960.0,-4500.0
2,3,0,0,0,0,0,0,53664.0,-29488.0,28736.0,...,-5056.0,-5588.0,-26176.0,7888.0,4984.0,13024.0,-8344.0,6972.0,31600.0,31040.0


In [43]:
train_table_emb = merge_data(prepared_user_data, prepared_post_data_emb, prepared_feed_data)
train_table_emb.head(3)

Unnamed: 0,user_id,post_id,target,month,day,hour,minute,gender,age,city,...,pca_22,pca_23,pca_24,pca_25,pca_26,pca_27,pca_28,pca_29,pca_30,pca_31
0,1859,1498,True,10,1,6,1,False,19,Kyiv,...,-686.0,-16048.0,26672.0,12240.0,-18560.0,19040.0,6764.0,4100.0,7608.0,-7876.0
1,8663,3837,True,10,1,6,1,False,23,Volzhskiy,...,-14520.0,-8320.0,-6264.0,332.75,6428.0,-10000.0,-1360.0,7996.0,3708.0,1211.0
2,136194,1205,True,10,1,6,1,False,24,Moscow,...,5136.0,-8008.0,-1304.0,-2568.0,-2718.0,15408.0,-6796.0,-9960.0,15896.0,31728.0


In [44]:
X = train_table_emb.drop(["target","user_id"], axis=1)
y = train_table_emb["target"]
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.1, shuffle=False)

In [45]:
pool_emb = Pool(data=X_train, label=y_train, cat_features=["city", "post_id"])

In [46]:
params = {'depth': 6,
          'random_seed': 42,
          'l2_leaf_reg': 1,
          'iterations': 1000,
          "learning_rate": 0.1,
          "bootstrap_type": 'Bernoulli',
          'grow_policy': 'Lossguide'
          } # лучшие найденные параметры
catboost = CatBoostClassifier(**params, eval_metric = 'AUC:hints=skip_train~false', task_type="GPU")
test_model = catboost.fit(pool_emb)
test_model.save_model("model_test", format="cbm")

Default metric period is 5 because AUC is/are not implemented for GPU


0:	learn: 0.7276595	total: 153ms	remaining: 2m 33s
1:	total: 267ms	remaining: 2m 13s
2:	total: 375ms	remaining: 2m 4s
3:	total: 493ms	remaining: 2m 2s
4:	total: 597ms	remaining: 1m 58s
5:	learn: 0.7567231	total: 725ms	remaining: 2m
6:	total: 827ms	remaining: 1m 57s
7:	total: 933ms	remaining: 1m 55s
8:	total: 1.04s	remaining: 1m 54s
9:	total: 1.15s	remaining: 1m 53s
10:	learn: 0.7641775	total: 1.28s	remaining: 1m 54s
11:	total: 1.38s	remaining: 1m 53s
12:	total: 1.49s	remaining: 1m 53s
13:	total: 1.6s	remaining: 1m 52s
14:	total: 1.71s	remaining: 1m 52s
15:	learn: 0.7687840	total: 1.84s	remaining: 1m 53s
16:	total: 1.95s	remaining: 1m 52s
17:	total: 2.06s	remaining: 1m 52s
18:	total: 2.16s	remaining: 1m 51s
19:	total: 2.26s	remaining: 1m 50s
20:	learn: 0.7721283	total: 2.39s	remaining: 1m 51s
21:	total: 2.49s	remaining: 1m 50s
22:	total: 2.59s	remaining: 1m 50s
23:	total: 2.69s	remaining: 1m 49s
24:	total: 2.8s	remaining: 1m 49s
25:	learn: 0.7751601	total: 2.92s	remaining: 1m 49s
26:	to

Модели сохраненны, данные загружены на сервер. Можно приступать к тестированнию