In [77]:
import os
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline

from tqdm import tqdm

from sklearn.model_selection import train_test_split, RandomizedSearchCV
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier, VotingClassifier
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score
from xgboost import XGBClassifier
from sklearn.metrics import f1_score, make_scorer
f1 = make_scorer(f1_score , average='macro')

In [11]:
import torch
device = "cuda"

In [12]:
from train import DisinformationBERT
from data import create_classification_dataset

In [23]:
model = DisinformationBERT().to(device)

In [49]:
df = pd.read_csv("train.csv")
labels = df['Suspicious_Level'].astype(int)
texts = df['Content'].astype(str)
train_encodings = model.tokenizer(list(texts), truncation=True, padding=True, max_length=512)

In [50]:
bert_embeddings = []
for idx in tqdm(range(df.shape[0])):
    input_ids, attn_mask = train_encodings["input_ids"][idx], train_encodings["attention_mask"][idx]

    with torch.no_grad():
        outputs = model.model(
            torch.tensor(input_ids, device=device)[None],
            attention_mask=torch.tensor(attn_mask, device=device)[None]
        ).pooler_output.squeeze(0).cpu().numpy()

        bert_embeddings.append(outputs)

100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 587/587 [00:09<00:00, 63.48it/s]


In [51]:
df["bert_embeddings"] = bert_embeddings

In [52]:
df.head()

Unnamed: 0,ChannelName,ChannelId,MessageId,Date,EditDate,Content,Suspicious_Level,bert_embeddings
0,boris_rozhin,1101806611,91626,2023-07-08 16:11:34,2023-07-08 16:11:47,Работа наших бойцов к югу от Артемовска. Работ...,2,"[0.39974168, -0.12330109, -0.06272041, -0.0505..."
1,sashakots,1109403194,40853,2023-07-08 16:44:44,2023-07-08 16:44:58,"Анкара нарушила договорённости, отпустив глава...",1,"[0.4269635, 0.053162936, -0.14207593, -0.04553..."
2,swodki,1144180066,280668,2023-07-09 02:00:23,2023-07-09 02:05:53,ЭТО ЕДИНСТВЕННЫЙ СПОСОБ ПОМОЧЬ НАМ! \n\nПополн...,1,"[0.10688506, 0.15902989, 0.17515673, -0.238599..."
3,boris_rozhin,1101806611,91573,2023-07-08 02:07:05,2023-07-08 02:07:19,МТ-ЛБ с 32-зарядной авиационной пусковой устан...,1,"[0.39733493, -0.14960964, -0.115425706, 0.2744..."
4,swodki,1144180066,280695,2023-07-09 07:01:49,2023-07-09 07:05:08,ЭТО ЕДИНСТВЕННЫЙ СПОСОБ ПОМОЧЬ НАМ! \n\nПополн...,1,"[0.10688506, 0.15902989, 0.17515673, -0.238599..."


In [53]:
np.random.seed(42)

In [67]:
X, y = bert_embeddings, df["Suspicious_Level"]
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [70]:
rf_classifier = RandomForestClassifier(n_estimators=100, random_state=42, verbose=1)
gb_classifier = GradientBoostingClassifier(n_estimators=100, random_state=42, verbose=1)
xgb_classifier = XGBClassifier(objective='binary:logistic', use_label_encoder=False)

ensemble_classifier = VotingClassifier(
    estimators=[
        ('random_forest', rf_classifier),
        ('gradient_boosting', gb_classifier),
        ('xgboost', xgb_classifier)
    ],
    voting='soft'
)

In [71]:
param_dist_rf = {
    'n_estimators': [50, 100, 200],
    'max_depth': [None, 10, 20],
    'min_samples_split': [2, 5, 10]
}

param_dist_gb = {
    'n_estimators': [50, 100, 200],
    'learning_rate': [0.01, 0.1, 0.2],
    'max_depth': [3, 5, 10]
}

param_dist_xgb = {
    'n_estimators': [50, 100, 200],
    'learning_rate': [0.01, 0.1, 0.2],
    'max_depth': [3, 5, 10],
    'subsample': [0.8, 1.0],
    'colsample_bytree': [0.8, 1.0],
    'reg_alpha': [0, 0.1, 1.0],
    'reg_lambda': [0, 0.1, 1.0],
}

In [80]:
random_search_rf = RandomizedSearchCV(rf_classifier, param_distributions=param_dist_rf, n_iter=10, cv=5, scoring=f1)
random_search_gb = RandomizedSearchCV(gb_classifier, param_distributions=param_dist_gb, n_iter=10, cv=5, scoring=f1)
random_search_xgb = RandomizedSearchCV(xgb_classifier, param_distributions=param_dist_xgb, n_iter=10, cv=5, scoring=f1)

In [81]:
random_search_rf.fit(X_train, y_train)

[Parallel(n_jobs=1)]: Done  49 tasks      | elapsed:    1.7s
[Parallel(n_jobs=1)]: Done  49 tasks      | elapsed:    0.0s
[Parallel(n_jobs=1)]: Done  49 tasks      | elapsed:    1.7s
[Parallel(n_jobs=1)]: Done  49 tasks      | elapsed:    0.0s
[Parallel(n_jobs=1)]: Done  49 tasks      | elapsed:    1.7s
[Parallel(n_jobs=1)]: Done  49 tasks      | elapsed:    0.0s
[Parallel(n_jobs=1)]: Done  49 tasks      | elapsed:    1.6s
[Parallel(n_jobs=1)]: Done  49 tasks      | elapsed:    0.0s
[Parallel(n_jobs=1)]: Done  49 tasks      | elapsed:    1.6s
[Parallel(n_jobs=1)]: Done  49 tasks      | elapsed:    0.0s
[Parallel(n_jobs=1)]: Done  49 tasks      | elapsed:    1.7s
[Parallel(n_jobs=1)]: Done  49 tasks      | elapsed:    0.0s
[Parallel(n_jobs=1)]: Done  49 tasks      | elapsed:    1.7s
[Parallel(n_jobs=1)]: Done  49 tasks      | elapsed:    0.0s
[Parallel(n_jobs=1)]: Done  49 tasks      | elapsed:    1.7s
[Parallel(n_jobs=1)]: Done  49 tasks      | elapsed:    0.0s
[Parallel(n_jobs=1)]: Do

In [None]:
random_search_gb.fit(X_train, y_train)

      Iter       Train Loss   Remaining Time 
         1           0.2787           34.83s
         2           0.2717           34.13s
         3           0.2661           33.41s
         4           0.2613           32.72s
         5           0.2572           32.02s
         6           0.2535           31.31s
         7           0.2502           30.59s
         8           0.2472           29.88s
         9           0.2444           29.16s
        10           0.2418           28.46s
        20           0.2224           21.34s
        30           0.2002           14.19s
        40           0.1845            7.09s
        50           0.1718            0.00s
      Iter       Train Loss   Remaining Time 
         1           0.2871           34.86s
         2           0.2822           34.15s
         3           0.2780           33.47s
         4           0.2745           32.76s
         5           0.2712           32.05s
         6           0.2683           31.33s
        

In [None]:
random_search_xgb.fit(X_train, y_train)

In [None]:
best_params_rf = random_search_rf.best_params_
best_params_gb = random_search_gb.best_params_
best_params_xgb = random_search_xgb.best_params_

In [None]:
rf_classifier.set_params(**best_params_rf)
gb_classifier.set_params(**best_params_gb)
xgb_classifier.set_params(**best_params_xgb)

In [None]:
ensemble_classifier.fit(X, y)

In [None]:
df = pd.read_csv("test.csv")
texts = df['Content'].astype(str)
train_encodings = model.tokenizer(list(texts), truncation=True, padding=True, max_length=512)

In [None]:
bert_embeddings = []
for idx in tqdm(range(df.shape[0])):
    input_ids, attn_mask = train_encodings["input_ids"][idx], train_encodings["attention_mask"][idx]

    with torch.no_grad():
        outputs = model.model(
            torch.tensor(input_ids, device=device)[None],
            attention_mask=torch.tensor(attn_mask, device=device)[None]
        ).pooler_output.squeeze(0).cpu().numpy()

        bert_embeddings.append(outputs)

In [None]:
ensemble_predictions = ensemble_classifier.predict(bert_embeddings)

In [None]:
plt.hist(ensemble_predictions)

In [None]:
df = df.drop(columns=["Content", "ChannelId", "ChannelName", "Date", "EditDate"])
df["Suspicious_Level"] = ensemble_predictions
df.to_csv("second_submission.csv", index=False)

In [None]:
df2 = pd.read_csv("first_submission.csv")

In [None]:
plt.hist(df2["Suspicious_Level"])