In [1]:
import json
import pandas as pd


lines = []
with open('../ai-defence-summer-school-2024/messages/messages.jsonl') as f:
    lines = f.read().splitlines()

line_dicts = [json.loads(line) for line in lines]
df = pd.DataFrame(line_dicts)

print(df)

        source_id  message_id  \
0          253122  1415474509   
1          253122  1408345528   
2          253122  1394795522   
3          253122  1364623985   
4          253122  1364694325   
...           ...         ...   
373306     226841   776160744   
373307     226841   776160747   
373308     226841   775473825   
373309     226841   775473830   
373310     226841   775473839   

                                                     text  impressions  \
0       https://www.youtube.com/watch?v=eR9FIPXffUw&li...        945.0   
1       Плохо спится в белую петербургскую ночь. Котор...       1429.0   
2       «Давайте посмотрим внимательно, какую свободу ...       1158.0   
3       К добрым словам Владимира Гельмана в мой адрес...       1071.0   
4       Известный политолог, профессор университета Хе...       1087.0   
...                                                   ...          ...   
373306  Российские военные совместно с волонтерской ор...        512.0   
373307  При

In [2]:
import numpy as np
import pandas as pd
# import matplotlib.pyplot as plt
import nltk
import requests
import ast
import re
from nltk.corpus import stopwords
# from wordcloud import WordCloud
from tqdm.auto import tqdm
# from ftlangdetect import detect
from nltk import tokenize
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer

In [3]:
df.columns

Index(['source_id', 'message_id', 'text', 'impressions', 'reactions', 'shares',
       'comments', 'published_at', 'content_type'],
      dtype='object')

In [4]:
tqdm.pandas()

In [5]:
stopwords_ua_source_url = 'https://raw.githubusercontent.com/skupriienko/Ukrainian-Stopwords/master/stopwords_ua_list.txt'
stopwords_ru_source_url='https://raw.githubusercontent.com/stopwords-iso/stopwords-ru/master/stopwords-ru.txt'
results_ua = requests.get(stopwords_ua_source_url)
results_ru = requests.get(stopwords_ru_source_url)
stopwords_en = stopwords.words('english')
stopwords_ua = ast.literal_eval(results_ua.text)
stopwords_ru = results_ru.text.split('\n')
stopwords_all=stopwords_ua+stopwords_ru+stopwords_en

In [6]:
df.fillna(0, inplace=True)

# Remove comments
messages_df_preprocessed = df[
    (df.content_type != 'COMMENT') | (df.text=='')].drop_duplicates(subset=['source_id', 'text'])

# Remove only urls
url_regex=re.compile(r"http[s]*\S+$")
is_url = messages_df_preprocessed.text.progress_apply(lambda x: bool(url_regex.match(x)))
only_url = messages_df_preprocessed[is_url]
# messages_df_preprocessed.loc[:,'url_only'] = is_url
messages_df_preprocessed=messages_df_preprocessed[~is_url]


  0%|          | 0/365599 [00:00<?, ?it/s]

In [7]:
word_tokenizer = tokenize.RegexpTokenizer(r'\w+')

emoji_pattern = re.compile("["
        u"\U0001F600-\U0001F64F"  # emoticons
        u"\U0001F300-\U0001F5FF"  # symbols & pictographs
        u"\U0001F680-\U0001F6FF"  # transport & map symbols
        u"\U0001F1E0-\U0001F1FF"  # flags (iOS)
        u"\U00002500-\U00002BEF"  # chinese char
        u"\U00002702-\U000027B0"
        u"\U000024C2-\U0001F251"
        u"\U0001f926-\U0001f937"
        u"\U00010000-\U0010ffff"
        u"\u2640-\u2642" 
        u"\u2600-\u2B55"
        u"\u200d"
        u"\u23cf"
        u"\u23e9"
        u"\u231a"
        u"\ufe0f"  # dingbats
        u"\u3030"
                           "]+", flags=re.UNICODE)

In [8]:
def collapse_dots(input):
    input = re.sub("\.+", ".", input)
    all_collapsed = False
    while not all_collapsed:
        output = re.sub(r"\.(( )*)\.", ".", input)
        all_collapsed = input == output
        input = output
    return output

def preprocess_hard(input):
    words_list = []
    for token in word_tokenizer.tokenize(input):
        if token.lower() not in stopwords_all:
            words_list.append(token)
        result_str = " ".join(words_list)
    return result_str


def process(input, light=True):
    if not isinstance(input, str):
        return input
    
    input = " ".join(tokenize.sent_tokenize(input))
    
    # Remove links
    input = re.sub(r"http[s]*\S+$", "", input)
    # Remove new line tag
    input = re.sub(r"\n+", ". ", input)
    # Remove emojis
    input = re.sub(emoji_pattern, '', input)
    # Replace telegram link with dot
    input = re.sub(r"\bt\.me/\S+", ".", input)
    # Remove sentence containing 'subscribe'
    input = re.sub(r"\bПодпишитесь на\b.*?\.", '', input)
    # Remove underscore
    input = input.replace("_", "")
    # Remove symbol followed by dot
    for symb in ["!", ",", ":", ";", "?", "_"]:
        input = re.sub(rf"\{symb}\.", symb, input)
    # Remove hashtag
    input = re.sub(r"#\S+", "", input)
    # Remove user mentioning
    input = re.sub(r"@\S+", "", input)
    # Collapse dots
    input = collapse_dots(input)
    input = input.strip()
    
    if light:
        return input
    
    return preprocess_hard(input)

In [9]:
messages_df_preprocessed['processed_text'] = messages_df_preprocessed['text'].progress_apply(process)

  0%|          | 0/359539 [00:00<?, ?it/s]

In [10]:
train = pd.read_csv('../ai-defence-summer-school-2024/train.csv')

cols = ['source_id']

df_merged = messages_df_preprocessed.join(train.set_index(cols), on=cols, how='inner',)

### Setting up embeddings

In [11]:
import requests
import io

# response = requests.get('https://www.kaggleusercontent.com/kf/185976648/eyJhbGciOiJkaXIiLCJlbmMiOiJBMTI4Q0JDLUhTMjU2In0..tFnHZ6jACUsANpVvtgdM0w.5aq0Xu5wE4w2_5hpNENsZdbZp14panev7X8zGhHaY2zXJlQZ4Bk9UeKwGKo7ZGiB23UGJsPeeeNl5ZiKbOI_ZxgrN7S3uU0ft9Gvo9dp4CQ6wDbNRvt3fiJuz12um8zxHoTOex2JHvnamhtZQVVIZz7uOF_bFnB4xhZmLsoB-5ZoYMQs5rszEg6h3UGppxvcylY7AixezN3WVLOlzZAHWBNH4IGV0Vdc5nXmjmDoLeF2IO2li3Yaa3Oc61W42uGi9K43l9kYqwrc5mmb_9Z3McDc1kU1p_GKaSV9ov9RS2MwohZ7fURQNxKBYECquENcpgBdF02Z779dZC2a3Gh2l6Sgwqlxj9rk_UsmyJNepD7HzdLHE7CGnG2KvcN5pOmIGpvHstUYx1GGGx98TUUcQ5XvGqDCjUMT_eQ1x8jWOXVpbGMSMeQl18gNnA-zpWq9O9tMwYxZVgJB_qJq0xytjVtwiRY1Dkaxuk8HGpproOJWa2nY5I4ahueg-FfVXlICCQjxBuJxA9IfDOgwj-kn6xXqUgzIYEpk18ThaR85QVMEYThRx-NNqTADwMSC0z2zIhvJSsgvO1sSW6crihhoMwzkkFqbjwPKseRn6DlJsrdUkrAd4nD8UF5R1BUNLyOzP-xnlpiF3POnMycXEg1k3g.WYoIhfa8f264yozX8U4lJg/embeddings[0,%2080000].npy')
# response.raise_for_status()
# data = np.load(io.BytesIO(response.content))  # Works!
# embeddings_0_80000 = np.load(io.BytesIO(response.content), allow_pickle=True)

embeddings_0_80000 = np.load('./embeddings/embeddings[0, 80000].npy', allow_pickle=True)
embeddings_80000_100000 = np.load('./embeddings/embeddings[80000, 100000].npy', allow_pickle=True)
embeddings_100000_200000 = np.load('./embeddings/embeddings[100000, 200000].npy', allow_pickle=True)
embeddings_200000_300000 = np.load('./embeddings/embeddings[200000, 300000].npy', allow_pickle=True)
embeddings_300000_359539 = np.load('./embeddings/embeddings[300000, 359539].npy', allow_pickle=True)

In [12]:
df_embeddings = [*embeddings_0_80000, *embeddings_80000_100000, *embeddings_100000_200000, *embeddings_200000_300000, *embeddings_300000_359539]

to_drop_idx = [i for i,v in enumerate(df_embeddings) if v == None]

In [13]:
pip install xgboost

Note: you may need to restart the kernel to use updated packages.


In [14]:
df_embeddings_filtered = np.delete(np.asarray(df_embeddings, dtype=object), to_drop_idx)

In [15]:
messages_df_preprocessed['positional_idx'] = np.arange(0, len(messages_df_preprocessed), dtype=int)

In [16]:
messages_df_preprocessed_filtered = messages_df_preprocessed[~messages_df_preprocessed['positional_idx'].isin(to_drop_idx)]

### Embedding to dataframe conversion

In [17]:
# df_embeddings_filtered.reshape(357343, 1536)
df_embeddings_filtered_shaped = np.array([np.array(emb) for emb in df_embeddings_filtered])


In [22]:
pip install umap-learn

Note: you may need to restart the kernel to use updated packages.


In [None]:
import umap.umap_ as umap

umap_model = umap.UMAP(n_components=200, n_neighbors=30, min_dist=0.05)
umap_emb = umap_model.fit_transform(df_embeddings_filtered_shaped)

print(f"UMAP shape: {umap_emb.shape}")

In [51]:
from sklearn.decomposition import PCA

pca = PCA(n_components=200)
pca_result = pca.fit_transform(df_embeddings_filtered_shaped)
pca_result.shape[1]

200

In [52]:
embedding_df = pd.DataFrame(data=pca_result,
              index=messages_df_preprocessed_filtered['positional_idx'],
              columns=np.arange(0, pca_result.shape[1], dtype=int))

### Performing xgboost

In [22]:
from numpy import loadtxt
from xgboost import XGBClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report, f1_score


In [53]:
# messages_df_combined = pca_result

# messages_df_combined = pd.concat([messages_df_preprocessed_filtered, embedding_df], axis=1)
messages_df_combined = messages_df_preprocessed_filtered.join(embedding_df, on=embedding_df.index, how='inner',)
# messages_df_combined

In [54]:
train_merged = messages_df_combined.join(train.set_index(cols), on=cols, how='inner',)
train_merged._append(only_url)
train_merged.loc[:,'url_only'] = is_url
train_merged[np.arange(0, pca_result.shape[1], dtype=int)] = train_merged[np.arange(0, pca_result.shape[1], dtype=int)].fillna(0)
train_merged

Unnamed: 0,key_0,source_id,message_id,text,impressions,reactions,shares,comments,published_at,content_type,...,193,194,195,196,197,198,199,source_url,source_category,url_only
300,287,253091,1428333721,Порядка 150 КОИБов планирует использовать ЦИК ...,1979.0,16,2.0,0,1719396794000,POST,...,-0.024018,-0.007668,0.047855,-0.010914,-0.008679,-0.027585,0.029944,https://t.me/cikrf,AGGRESSIVE_INFORMATION,False
301,288,253091,1428079971,ЦИК утвердил порядок гашения избирательных бюл...,10291.0,57,58.0,0,1719396005000,POST,...,0.012351,0.009880,0.023404,-0.010884,-0.027824,0.003831,0.021761,https://t.me/cikrf,AGGRESSIVE_INFORMATION,False
302,289,253091,1428007536,Председатель ярославского избиркома Елена Нови...,2079.0,17,28.0,0,1719394221000,POST,...,0.006950,0.025981,0.072222,-0.022541,-0.035000,-0.017373,-0.033078,https://t.me/cikrf,AGGRESSIVE_INFORMATION,False
303,290,253091,1428079972,Мандат умершего Артура Чилингарова — депутата ...,2321.0,22,4.0,0,1719393577000,POST,...,-0.080406,-0.028902,0.003514,-0.014006,-0.011897,0.002467,0.024540,https://t.me/cikrf,AGGRESSIVE_INFORMATION,False
304,291,253091,1427996535,ЦИК обнаружил трех «экстремистов/террористов» ...,2573.0,51,23.0,0,1719390717000,POST,...,0.000281,-0.030960,0.064103,0.016541,0.049478,-0.009643,0.018859,https://t.me/cikrf,AGGRESSIVE_INFORMATION,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
373306,359534,226841,776160744,Российские военные совместно с волонтерской ор...,512.0,0,0.0,0,1652353295000,POST,...,-0.013271,0.000420,-0.037522,0.013517,-0.038129,0.030705,0.047601,https://t.me/nosovka_ru,RESTRAINED_INFORMATION,False
373307,359535,226841,776160747,Прибывшие после 19 февраля в Россию граждане У...,507.0,0,0.0,0,1652341976000,REPOST,...,0.007180,-0.003072,0.027861,0.005283,-0.051387,0.017876,-0.011673,https://t.me/nosovka_ru,RESTRAINED_INFORMATION,False
373308,359536,226841,775473825,"За сутки Россия эвакуировала более 1,1 тысячи ...",484.0,0,0.0,0,1652290329000,REPOST,...,-0.026706,0.002720,0.001931,0.005776,-0.003541,-0.005124,-0.037670,https://t.me/nosovka_ru,RESTRAINED_INFORMATION,False
373309,359537,226841,775473830,Более 17 тысяч украинских беженцев пересекли р...,462.0,0,0.0,0,1652283583000,REPOST,...,-0.012664,0.031050,-0.004434,-0.016118,0.005291,0.045519,-0.002293,https://t.me/nosovka_ru,RESTRAINED_INFORMATION,False


In [148]:
# messages_df_combined[['positional_idx', 'message_id']].to_csv('positional_message_id', sep=',', index=False, encoding='utf-8')

In [55]:
train_merged[['positional_idx', 'message_id']]

Unnamed: 0,positional_idx,message_id
300,287,1428333721
301,288,1428079971
302,289,1428007536
303,290,1428079972
304,291,1427996535
...,...,...
373306,359534,776160744
373307,359535,776160747
373308,359536,775473825
373309,359537,775473830


In [72]:
# train_merged[[0, 1]]
# train_merged['source_category'].astype('category').cat.codes
# train_merged['source_category'].astype('category')
# y, labels = train_merged['source_category'].factorize()

(array([0, 0, 0, ..., 1, 1, 1]),
 Index(['AGGRESSIVE_INFORMATION', 'RESTRAINED_INFORMATION', 'SPAM',
        'UNRECOGNIZED_REPUBLICS', 'VIOLENCE', 'RESTRAINED_MILITARY',
        'AGGRESSIVE_MILITARY', 'PERSONAL_INFORMATION',
        'COORDINATION_OF_ATTACKS', 'ENTITIES_PROMOTING_VIOLENCE_AND_HATE',
        'SAFE_CONTENT', 'CYBER_ATTACK_COORDINATION'],
       dtype='object'))

In [107]:
from sklearn.ensemble import RandomForestClassifier
X = train_merged[[*np.arange(0, pca_result.shape[1], dtype=int), *['impressions', 'reactions', 'comments', 'shares', 'content_type', 'url_only']]]
X['content_type'] = X['content_type'].astype('category').cat.codes
X.columns = X.columns.astype(str)
# y = train_merged['source_category'].astype('category').cat.codes
y, labels = train_merged['source_category'].factorize()
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=42)

# model = RandomForestClassifier(random_state=42, verbose=1, max_depth=150)
# Initialize the XGBoost classifier
model = XGBClassifier(tree_method="hist", device="cuda",
                      # booster='gblinear',
                      objective='softmax',
                      eval_metric='mlogloss',
                      # learning_rate=0.001,  
                      # colsample_bytree = 0.4,
                      # subsample = 0.8,
                      # n_estimators=3, 
                      # reg_alpha = 0.3,
                      # max_depth=100, 
                      # gamma=100
                     )

# Train the model
model.fit(X_train, y_train)

y_pred = model.predict(X_test)

# Evaluate the model
accuracy = accuracy_score(y_test, y_pred)
f1 = f1_score(y_test, y_pred, average='weighted')
report = classification_report(y_test, y_pred)

print(f'Accuracy: {accuracy}')
print(f'F1 Score: {f1}')
print('Classification Report:')
print(report)


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X['content_type'] = X['content_type'].astype('category').cat.codes


In [43]:
train_merged[['source_id', '']]

array([253091, 253091, 253091, ..., 226841, 226841, 226841])

In [108]:
test = pd.read_csv('../ai-defence-summer-school-2024/test.csv')

cols = ['source_id']

test

Unnamed: 0,source_id,source_url
0,241835,https://t.me/ViktorVikarchuk
1,228094,https://t.me/economikal
2,101992513,https://t.me/nesaharru
3,282,https://t.me/cpartisans
4,2049875,https://twitter.com/PiftonnaBonba
...,...,...
1381,316,https://t.me/annashafran
1382,144255865,https://twitter.com/nikolaim63
1383,243355,https://t.me/er_molnia
1384,233106,https://www.facebook.com/mocten.sinum.9


In [109]:
test_merged = messages_df_combined.join(test.set_index(cols), on=cols, how='right')
test_merged._append(only_url)
test_merged.loc[:,'url_only'] = is_url
test_merged[np.arange(0, pca_result.shape[1], dtype=int)] = test_merged[np.arange(0, pca_result.shape[1], dtype=int)].fillna(0)
test_merged

Unnamed: 0,key_0,source_id,message_id,text,impressions,reactions,shares,comments,published_at,content_type,...,192,193,194,195,196,197,198,199,source_url,url_only
351981.0,339002.0,241835,1.396473e+09,"🆕""Жуліки"" на дорогах і у ""владі""!🆕\n\n——\n📞Віт...",2377.0,0.0,18.0,3.0,1.719153e+12,POST,...,-0.006961,0.027114,-0.003060,-0.051223,0.005967,0.037065,-0.017474,0.055385,https://t.me/ViktorVikarchuk,False
351982.0,339003.0,241835,1.306202e+09,"25000 гр - СТРАХ, який сковує здоровий глузд.....",2056.0,0.0,10.0,1.0,1.718310e+12,POST,...,0.028833,0.005451,-0.054906,-0.032732,-0.021472,0.027195,0.007458,0.054663,https://t.me/ViktorVikarchuk,False
351984.0,339004.0,241835,1.206078e+09,ГІПНОЗ чи затурканість українців? Шукайте ПРИЧ...,189.0,0.0,1.0,1.0,1.717317e+12,POST,...,-0.006630,0.054910,0.002464,-0.014516,-0.008274,0.059987,0.015244,0.068917,https://t.me/ViktorVikarchuk,False
351986.0,339005.0,241835,1.135469e+09,"25000 гр- штраф, чи нове ШОУ з ГРОМАДЯНАМИ УКР...",269.0,0.0,9.0,1.0,1.716717e+12,POST,...,0.013517,-0.000977,-0.028863,-0.100356,0.019591,0.014624,-0.001303,0.074140,https://t.me/ViktorVikarchuk,False
351988.0,339006.0,241835,1.052381e+09,"🆕18 ТРАВНЯ- ""кінець світу""!? 5200 холодильникі...",306.0,0.0,8.0,0.0,1.715867e+12,POST,...,0.023396,-0.026254,-0.001691,-0.024780,0.012875,0.036351,0.001079,0.069059,https://t.me/ViktorVikarchuk,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
360827.0,347524.0,226879,1.365245e+09,В Херсоне паника в связи с эвакуацией и закрыт...,6786.0,125.0,56.0,0.0,1.718904e+12,POST,...,-0.004603,-0.009164,-0.002565,-0.024659,0.024085,-0.009301,-0.035202,-0.017836,https://t.me/VKhersone,False
360828.0,347525.0,226879,1.365012e+09,Противник работает над понижением порога ядерн...,6031.0,136.0,12.0,0.0,1.718901e+12,POST,...,0.001074,-0.035699,0.001464,0.016336,0.008742,0.007598,0.002372,0.008400,https://t.me/VKhersone,False
360829.0,347526.0,226879,1.364605e+09,Первое и окончательное слово в этом вопросе до...,5714.0,72.0,1.0,0.0,1.718895e+12,POST,...,0.014337,-0.046112,-0.036646,0.023646,0.021385,-0.003187,-0.009137,0.034506,https://t.me/VKhersone,False
360830.0,347527.0,226879,1.364520e+09,Херсонская область не нуждается в объединении ...,5807.0,134.0,5.0,0.0,1.718895e+12,POST,...,0.031831,0.013505,0.057655,-0.013585,0.043950,0.024264,-0.005616,-0.003383,https://t.me/VKhersone,False


In [110]:

model_ = XGBClassifier(tree_method="hist", device="cuda",
                      # booster='gblinear',
                      objective='softmax',
                      eval_metric='mlogloss',
                      # learning_rate=0.001,  
                      # colsample_bytree = 0.4,
                      # subsample = 0.8,
                      # n_estimators=3, 
                      # reg_alpha = 0.3,
                      # max_depth=100, 
                      # gamma=100
                     )

model_.fit(X, y)

In [114]:
X = test_merged[[*np.arange(0, pca_result.shape[1], dtype=int), *['impressions', 'reactions', 'comments', 'shares', 'content_type', 'url_only']]]
X['content_type'] = X['content_type'].astype('category').cat.codes
X['url_only'] = X['url_only'].astype('category').cat.codes

X.columns = X.columns.astype(str)

res = model_.predict(X)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X['content_type'] = X['content_type'].astype('category').cat.codes
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X['url_only'] = X['url_only'].astype('category').cat.codes


In [122]:
df = pd.DataFrame()
df['source_id'] = test_merged['source_id']
df['source_category'] = labels[res]
# df['res'] = df[]
# df
# df['source_category'] = labels[res]
# # test['source_category'] = y_test_categories

category_counts = df.groupby('source_id')['source_category'].value_counts()

majority_categories = category_counts.groupby('source_id').idxmax().apply(lambda x: x[1]).reset_index(name='source_category')
print(f'Predicted source categories:\n{majority_categories.source_category.value_counts()}')
# majority_categories.to_csv('submission.csv',index=False)

Predicted source categories:
source_category
AGGRESSIVE_INFORMATION    1255
RESTRAINED_INFORMATION      94
SAFE_CONTENT                36
UNRECOGNIZED_REPUBLICS       1
Name: count, dtype: int64


In [None]:
X = test_merged[[*np.arange(0, pca_result.shape[1], dtype=int), *['impressions', 'reactions', 'comments', 'shares', 'content_type']]]
X['content_type'] = X['content_type'].astype('category').cat.codes

# y = test_merged['source_category'].astype('category').cat.codes


# y_pred = model.predict(X)
# f1_score(y, y_pred, average='weighted')