In [1]:
import pandas as pd
import numpy as np
from sqlalchemy import create_engine

from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import TfidfVectorizer

from catboost import CatBoostClassifier


SQLALCHEMY_DATABASE_URL = "postgresql://robot-startml-ro:pheiph0hahj1Vaif@postgres.lab.karpov.courses:6432/startml"

engine = create_engine(SQLALCHEMY_DATABASE_URL)

In [2]:
user_query = "SELECT * FROM public.user_data"
user_df = pd.read_sql(user_query, engine)

post_query = "SELECT * FROM public.post_text_df"
post_df = pd.read_sql(post_query, engine)

feed_query = "SELECT * FROM public.feed_data LIMIT 1000000"
feed_df = pd.read_sql(feed_query, engine)

In [3]:
user_df

Unnamed: 0,user_id,gender,age,country,city,exp_group,os,source
0,200,1,34,Russia,Degtyarsk,3,Android,ads
1,201,0,37,Russia,Abakan,0,Android,ads
2,202,1,17,Russia,Smolensk,4,Android,ads
3,203,0,18,Russia,Moscow,1,iOS,ads
4,204,0,36,Russia,Anzhero-Sudzhensk,3,Android,ads
...,...,...,...,...,...,...,...,...
163200,168548,0,36,Russia,Kaliningrad,4,Android,organic
163201,168549,0,18,Russia,Tula,2,Android,organic
163202,168550,1,41,Russia,Yekaterinburg,4,Android,organic
163203,168551,0,38,Russia,Moscow,3,iOS,organic


In [4]:
post_df

Unnamed: 0,post_id,text,topic
0,1,UK economy facing major risks\n\nThe UK manufa...,business
1,2,Aids and climate top Davos agenda\n\nClimate c...,business
2,3,Asian quake hits European shares\n\nShares in ...,business
3,4,India power shares jump on debut\n\nShares in ...,business
4,5,Lacroix label bought by US firm\n\nLuxury good...,business
...,...,...,...
7018,7315,"OK, I would not normally watch a Farrelly brot...",movie
7019,7316,I give this movie 2 stars purely because of it...,movie
7020,7317,I cant believe this film was allowed to be mad...,movie
7021,7318,The version I saw of this film was the Blockbu...,movie


In [5]:
feed_df

Unnamed: 0,timestamp,user_id,post_id,action,target
0,2021-12-09 20:29:15,10907,81,view,0
1,2021-12-09 20:29:54,10907,4187,view,0
2,2021-12-09 20:32:38,10907,5216,view,0
3,2021-12-09 20:34:52,10907,4762,view,0
4,2021-12-09 20:36:25,10907,5601,view,0
...,...,...,...,...,...
999995,2021-12-21 07:49:17,24321,6691,view,0
999996,2021-12-21 07:50:12,24321,5506,view,1
999997,2021-12-21 07:52:05,24321,5506,like,0
999998,2021-12-21 07:52:07,24321,2516,view,1


In [6]:
feed_df = feed_df[feed_df['action'] == 'view']

In [7]:
feed_df = feed_df.sort_values('timestamp').reset_index(drop=True)

In [8]:
total_data = feed_df.merge(user_df, how='left').merge(post_df, how='left')
total_data = total_data.drop(['timestamp', 'action', 'user_id', 'post_id'], axis=1)
total_data

Unnamed: 0,target,gender,age,country,city,exp_group,os,source,text,topic
0,0,1,22,Russia,Arkhangelsk,0,Android,ads,@JobaMoz celebrates #InternationalYouthDay2020...,covid
1,0,0,20,Russia,Moscow,0,Android,ads,People from all over the world travel to Wigto...,covid
2,0,0,31,Russia,Kemerovo,1,Android,ads,President of Brazil does not take #coronavirus...,covid
3,0,0,31,Russia,Kemerovo,1,Android,ads,"This movie is awful, I cant even be bothered t...",movie
4,0,1,22,Russia,Arkhangelsk,0,Android,ads,Women MPs reveal sexist taunts\n\nWomen MPs en...,politics
...,...,...,...,...,...,...,...,...,...,...
892528,0,0,34,Russia,Bogotol,2,iOS,ads,#Boeing says it is developing a hand-held wand...,covid
892529,1,1,19,Belarus,Mazyr,2,iOS,ads,Our Co-founder speaking about @PaymentGate e-R...,covid
892530,1,0,34,Russia,Bogotol,2,iOS,ads,Desailly backs Blues revenge trip\n\nMarcel De...,sport
892531,1,1,19,Belarus,Mazyr,2,iOS,ads,I dont know why I keep doing this to myself!! ...,movie


In [9]:
cat_cols = ['gender', 'age', 'country', 'city', 'exp_group', 'os', 'source', 'topic']
train_data = total_data[:700000]
test_data = total_data[700000:]

In [10]:
test_data

Unnamed: 0,target,gender,age,country,city,exp_group,os,source,text,topic
700000,0,1,16,Russia,Gus’-Khrustal’nyy,4,Android,ads,If we completely followed the Its my freedom. ...,covid
700001,0,1,20,Russia,Orsk,1,iOS,ads,Your coronavirus antibodies are disappearing. ...,covid
700002,0,0,17,Russia,Velikiy Novgorod,3,Android,ads,Wenger dejected as Arsenal slump\n\nArsenal ma...,sport
700003,0,1,44,Russia,Kamensk-Shakhtinskiy,4,Android,ads,The bots are working double overtime tonight. ...,covid
700004,0,0,35,Russia,Khasavyurt,4,Android,ads,COVID’s Cash Handling Dilemma Sees PFS EML Ste...,covid
...,...,...,...,...,...,...,...,...,...,...
892528,0,0,34,Russia,Bogotol,2,iOS,ads,#Boeing says it is developing a hand-held wand...,covid
892529,1,1,19,Belarus,Mazyr,2,iOS,ads,Our Co-founder speaking about @PaymentGate e-R...,covid
892530,1,0,34,Russia,Bogotol,2,iOS,ads,Desailly backs Blues revenge trip\n\nMarcel De...,sport
892531,1,1,19,Belarus,Mazyr,2,iOS,ads,I dont know why I keep doing this to myself!! ...,movie


In [11]:
train_data.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 700000 entries, 0 to 699999
Data columns (total 10 columns):
 #   Column     Non-Null Count   Dtype 
---  ------     --------------   ----- 
 0   target     700000 non-null  int64 
 1   gender     700000 non-null  int64 
 2   age        700000 non-null  int64 
 3   country    700000 non-null  object
 4   city       700000 non-null  object
 5   exp_group  700000 non-null  int64 
 6   os         700000 non-null  object
 7   source     700000 non-null  object
 8   text       700000 non-null  object
 9   topic      700000 non-null  object
dtypes: int64(4), object(6)
memory usage: 58.7+ MB


In [14]:
cat_cols = ['gender', 'country', 'city', 'exp_group', 'os', 'source', 'topic']
text_cols = ['text']
num_cols = ['age']
train_data = total_data[:700000]
test_data = total_data[700000:]
X_train = train_data.drop('target', axis=1)
y_train = train_data['target']
X_test = test_data.drop('target', axis=1)
y_test = test_data['target']

preprocessor = ColumnTransformer(
    transformers=[
        ('cat', OneHotEncoder(drop='first', handle_unknown='ignore'), cat_cols),
        ('text', TfidfVectorizer(max_features=1000), 'text'),
        ('num', StandardScaler(), num_cols)
    ],
    remainder='passthrough'
)

catboost = CatBoostClassifier()

pipeline = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('classifier', catboost)
])

# Train the model
pipeline.fit(X_train, y_train)

# Evaluate the model
train_score = pipeline.score(X_train, y_train)
test_score = pipeline.score(X_test, y_test)
print("Train Accuracy:", train_score)
print("Test Accuracy:", test_score)


Learning rate set to 0.168963
0:	learn: 0.5753606	total: 180ms	remaining: 2m 59s
1:	learn: 0.4994899	total: 369ms	remaining: 3m 4s
2:	learn: 0.4505136	total: 567ms	remaining: 3m 8s
3:	learn: 0.4188232	total: 729ms	remaining: 3m 1s
4:	learn: 0.3970255	total: 919ms	remaining: 3m 2s
5:	learn: 0.3830171	total: 1.1s	remaining: 3m 3s
6:	learn: 0.3735856	total: 1.29s	remaining: 3m 2s
7:	learn: 0.3674185	total: 1.48s	remaining: 3m 3s
8:	learn: 0.3625897	total: 1.68s	remaining: 3m 5s
9:	learn: 0.3599026	total: 1.88s	remaining: 3m 6s
10:	learn: 0.3570489	total: 2.07s	remaining: 3m 6s
11:	learn: 0.3555330	total: 2.29s	remaining: 3m 8s
12:	learn: 0.3545089	total: 2.48s	remaining: 3m 8s
13:	learn: 0.3535513	total: 2.68s	remaining: 3m 8s
14:	learn: 0.3528627	total: 2.88s	remaining: 3m 9s
15:	learn: 0.3523984	total: 3.08s	remaining: 3m 9s
16:	learn: 0.3521052	total: 3.27s	remaining: 3m 9s
17:	learn: 0.3517968	total: 3.47s	remaining: 3m 9s
18:	learn: 0.3504650	total: 3.67s	remaining: 3m 9s
19:	learn: 



Train Accuracy: 0.8851
Test Accuracy: 0.8610160336150167


In [15]:
catboost.save_model('catboost_model', format='cbm')

In [16]:
model_path = '/home/arthur/code/startml/recsys_project/models/catboost_model'
from_file = CatBoostClassifier()  # здесь не указываем параметры, которые были при обучении, в дампе модели все есть
from_file.load_model(model_path)

<catboost.core.CatBoostClassifier at 0x7f709f552e20>

In [19]:
from_file.predict(X_train)

CatBoostError: Bad value for num_feature[non_default_doc_idx=0,feature_idx=2]="Russia": Cannot convert 'b'Russia'' to float