In [1]:
import pandas as pd
import numpy as np
import json

In [2]:
data = pd.read_csv('tsn_concatenated_pos_neg.csv', index_col=0)

In [3]:
len(data)

63494

In [4]:
from sklearn.feature_extraction.text import TfidfVectorizer

In [9]:
transformer = TfidfVectorizer()
result = transformer.fit_transform(data['msg_text'])

In [11]:
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier, GradientBoostingClassifier
from sklearn.metrics import f1_score

In [13]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(result, data['reaction_type'] ,test_size=0.3)

In [14]:
models = [RandomForestClassifier(), AdaBoostClassifier(), GradientBoostingClassifier()]
for model in models:
    model.fit(X_train, y_train)
    print(f'Model {model}, f1 score:  {f1_score(y_true=y_test, y_pred = model.predict(X_test))}')

Model RandomForestClassifier(), f1 score:  0.8617758706057106
Model AdaBoostClassifier(), f1 score:  0.8321131027518304
Model GradientBoostingClassifier(), f1 score:  0.8415590684204174


In [29]:
pd.Series(models[0].feature_importances_, transformer.get_feature_names_out()).sort_values()[-20:]

та            0.002995
укриттях      0.003025
обстрілу      0.003027
українські    0.003069
що            0.003257
росіяни       0.003318
внаслідок     0.003420
до            0.003556
загиблих      0.003852
по            0.003987
обстріляли    0.004069
україни       0.004137
для           0.004382
на            0.005489
укриття       0.005621
зсу           0.006777
окупанти      0.006897
область       0.010278
повітряна     0.012770
тривога       0.027464
dtype: float64

In [30]:
bad_ids = y_test != models[0].predict(X_test)

In [38]:
bad_ids.loc[lambda x: x == True]

3169     True
59041    True
47673    True
38741    True
18365    True
         ... 
13572    True
11378    True
20396    True
36555    True
39790    True
Name: reaction_type, Length: 3350, dtype: bool

In [44]:
data.iloc[59041]

msg_text         київська область звільнена від росіян джерело ...
reaction_type                                                    1
Name: 59041, dtype: object

In [45]:
print(f1_score(y_true=y_train, y_pred = models[0].predict(X_train)))

0.9988498352730177


In [46]:
from sklearn.model_selection import RandomizedSearchCV

params = {'n_estimators' : [25, 50, 100, 200], 'max_depth' : [5, 10,  25, None], 'max_features' : ['sqrt', 'log2', 0.3, None]}

cross_validated_model = RandomizedSearchCV(RandomForestClassifier(), params, n_jobs = -1).fit(X_train, y_train)