In [1]:
import json
import pandas as pd


# Parse the JSON Lines dump: one JSON object per physical line.
# The file is iterated line by line rather than split with ``str.splitlines()``:
# ``splitlines`` also breaks on characters such as U+2028 or U+0085, which may
# appear unescaped inside a JSON string and would cut a record in half.
# The encoding is pinned so the result does not depend on the platform locale,
# and blank lines are skipped instead of crashing ``json.loads``.
with open('../ai-defence-summer-school-2024/messages/messages.jsonl', encoding='utf-8') as f:
    lines = [line.rstrip('\n') for line in f if line.strip()]

line_dicts = [json.loads(line) for line in lines]
df = pd.DataFrame(line_dicts)

print(df)

        source_id  message_id  \
0          253122  1415474509   
1          253122  1408345528   
2          253122  1394795522   
3          253122  1364623985   
4          253122  1364694325   
...           ...         ...   
373306     226841   776160744   
373307     226841   776160747   
373308     226841   775473825   
373309     226841   775473830   
373310     226841   775473839   

                                                     text  impressions  \
0       https://www.youtube.com/watch?v=eR9FIPXffUw&li...        945.0   
1       Плохо спится в белую петербургскую ночь. Котор...       1429.0   
2       «Давайте посмотрим внимательно, какую свободу ...       1158.0   
3       К добрым словам Владимира Гельмана в мой адрес...       1071.0   
4       Известный политолог, профессор университета Хе...       1087.0   
...                                                   ...          ...   
373306  Российские военные совместно с волонтерской ор...        512.0   
373307  При

In [2]:
import numpy as np
import pandas as pd
# import matplotlib.pyplot as plt
import nltk
import requests
import ast
import re
from nltk.corpus import stopwords
# from wordcloud import WordCloud
from tqdm.auto import tqdm
# from ftlangdetect import detect
from nltk import tokenize
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer

In [3]:
# Show the column labels of the loaded messages frame.
df.columns

Index(['source_id', 'message_id', 'text', 'impressions', 'reactions', 'shares',
       'comments', 'published_at', 'content_type'],
      dtype='object')

In [4]:
# Register tqdm with pandas; the later cells call ``progress_apply`` on Series.
tqdm.pandas()

In [5]:
# Stop-word sources. The Ukrainian file is parsed with ``ast.literal_eval``
# (i.e. it is expected to contain a Python list literal); the Russian file is
# read as one word per line; the English list comes from the NLTK corpus.
stopwords_ua_source_url = 'https://raw.githubusercontent.com/skupriienko/Ukrainian-Stopwords/master/stopwords_ua_list.txt'
stopwords_ru_source_url='https://raw.githubusercontent.com/stopwords-iso/stopwords-ru/master/stopwords-ru.txt'

# A timeout keeps the cell from hanging forever on a stalled connection.
results_ua = requests.get(stopwords_ua_source_url, timeout=30)
results_ru = requests.get(stopwords_ru_source_url, timeout=30)
# Fail loudly on an HTTP error instead of parsing an error page as stop words.
results_ua.raise_for_status()
results_ru.raise_for_status()

stopwords_en = stopwords.words('english')
stopwords_ua = ast.literal_eval(results_ua.text)
# ``splitlines`` copes with "\r\n" endings; blank entries are dropped.
stopwords_ru = [word.strip() for word in results_ru.text.splitlines() if word.strip()]
stopwords_all = stopwords_ua + stopwords_ru + stopwords_en

In [6]:
# Replace missing values with 0 in every column. Rebinding (instead of
# ``inplace=True``) keeps the statement chainable and safe to re-run.
df = df.fillna(0)

# Keep rows that are not comments (rows whose text is the empty string are kept
# regardless of type), then drop repeated texts within the same source.
keep_row = (df.content_type != 'COMMENT') | (df.text == '')
messages_df_preprocessed = df[keep_row].drop_duplicates(subset=['source_id', 'text'])

# Drop messages that consist of nothing but a URL: the text starts with "http"
# and contains no whitespace up to its end.
url_regex = re.compile(r"http[s]*\S+$")
is_url = messages_df_preprocessed.text.progress_apply(lambda x: bool(url_regex.match(x)))
# ``.copy()`` makes the result an independent frame, so the columns added to it
# in later cells do not raise pandas' SettingWithCopyWarning.
messages_df_preprocessed = messages_df_preprocessed[~is_url].copy()


  0%|          | 0/365599 [00:00<?, ?it/s]

In [8]:
# Regexp-based tokenizer built on the pattern \w+ (runs of word characters);
# used by preprocess_hard to split text into tokens.
word_tokenizer = tokenize.RegexpTokenizer(r'\w+')

# Character ranges (and single code points) that are stripped from message
# text. Several ranges overlap and some are very broad: U+24C2..U+1F251 and
# U+10000..U+10FFFF also cover CJK and every supplementary-plane character.
_EMOJI_CHAR_CLASS = "".join((
    "\U0001F600-\U0001F64F",  # emoticons
    "\U0001F300-\U0001F5FF",  # symbols & pictographs
    "\U0001F680-\U0001F6FF",  # transport & map symbols
    "\U0001F1E0-\U0001F1FF",  # regional indicators (flags)
    "\U00002500-\U00002BEF",  # box drawing .. miscellaneous symbols and arrows
    "\U00002702-\U000027B0",  # dingbats
    "\U000024C2-\U0001F251",  # circled M .. enclosed ideographic supplement
    "\U0001f926-\U0001f937",  # face palm .. shrug
    "\U00010000-\U0010ffff",  # all supplementary planes
    "\u2640-\u2642",          # female sign .. male sign
    "\u2600-\u2B55",          # miscellaneous symbols .. heavy large circle
    "\u200d",                 # zero width joiner
    "\u23cf",                 # eject symbol
    "\u23e9",                 # fast-forward symbol
    "\u231a",                 # watch
    "\ufe0f",                 # variation selector-16
    "\u3030",                 # wavy dash
))

# One or more consecutive characters from the class above.
emoji_pattern = re.compile("[" + _EMOJI_CHAR_CLASS + "]+", flags=re.UNICODE)

In [9]:
def collapse_dots(input):
    """Collapse dots that are adjacent or separated only by spaces into one dot.

    Args:
        input: Text to normalise.

    Returns:
        The text where every group such as ``"..."`` or ``". . ."`` is
        replaced by a single ``"."``. Spaces before the first dot and after
        the last dot of a group are left untouched.
    """
    # A raw string avoids the invalid "\." escape of a plain literal. One pass
    # is enough: the pattern consumes a dot plus every following dot that is
    # reachable through zero or more literal spaces.
    return re.sub(r"\.(?: *\.)+", ".", input)

def preprocess_hard(input):
    """Tokenize the text and drop stop words.

    Args:
        input: Text to tokenize with the module-level ``word_tokenizer``.

    Returns:
        The tokens whose lower-cased form is not in ``stopwords_all``, joined
        by single spaces. Returns ``""`` when no token survives, including
        for input without any tokens.
    """
    kept_tokens = [
        token
        for token in word_tokenizer.tokenize(input)
        if token.lower() not in stopwords_all
    ]
    # Join once, after the loop: joining inside the loop left the result
    # unbound for token-less input and redid the work on every iteration.
    return " ".join(kept_tokens)


def process(input, light=True):
    """Clean one raw message text.

    Args:
        input: The message text. Non-string values are returned unchanged.
        light: When true (default) return the cleaned text; otherwise pass the
            cleaned text through ``preprocess_hard`` as well.

    Returns:
        The cleaned string, or ``input`` itself when it is not a string.
    """
    if not isinstance(input, str):
        return input

    # Re-join the sentences found by the sentence tokenizer with single spaces.
    input = " ".join(tokenize.sent_tokenize(input))

    # Remove links anywhere in the text. The pattern used to be anchored with
    # "$", which removed a link only when it was the very last thing in the
    # message and left every other link in place.
    input = re.sub(r"http\S+", "", input)
    # Turn line breaks into sentence breaks
    input = re.sub(r"\n+", ". ", input)
    # Remove emojis
    input = emoji_pattern.sub('', input)
    # Replace telegram link with dot
    input = re.sub(r"\bt\.me/\S+", ".", input)
    # Remove everything from "Подпишитесь на" up to the next dot
    input = re.sub(r"\bПодпишитесь на\b.*?\.", '', input)
    # Remove underscore
    input = input.replace("_", "")
    # Drop a dot that directly follows other punctuation ("!." -> "!").
    # Underscores need no handling here: they were all removed just above.
    input = re.sub(r"([!,:;?])\.", r"\1", input)
    # Remove hashtag
    input = re.sub(r"#\S+", "", input)
    # Remove user mentioning
    input = re.sub(r"@\S+", "", input)
    # Collapse dots and trim surrounding whitespace
    input = collapse_dots(input).strip()

    if light:
        return input

    return preprocess_hard(input)

In [10]:
# Clean every message with process() (default light=True) and store the result
# in a new 'processed_text' column; progress_apply shows a tqdm progress bar.
messages_df_preprocessed['processed_text'] = messages_df_preprocessed['text'].progress_apply(process)

  0%|          | 0/359539 [00:00<?, ?it/s]

In [11]:
# Key shared by the messages frame and train.csv.
cols = ['source_id']

train = pd.read_csv('../ai-defence-summer-school-2024/train.csv')
train_by_source = train.set_index(cols)

# Inner join on source_id: only messages whose source_id occurs in train.csv
# are kept, each extended with the remaining train.csv columns.
df_merged = messages_df_preprocessed.join(train_by_source, on=cols, how='inner')

In [13]:
# Load the first embedding shard. The same file is loaded again, together with
# the other shards, in a later cell.
# NOTE(review): allow_pickle=True unpickles Python objects from the file, which
# can execute arbitrary code - only load files from a trusted source.
embeddings_0_80000 = np.load('./embeddings/embeddings[0, 80000].npy', allow_pickle=True)

In [84]:
import os
# Print the path of every file found below ./embeddings.
for folder, _subdirs, file_names in os.walk('./embeddings'):
    for file_name in file_names:
        file_path = os.path.join(folder, file_name)
        print(file_path)

./embeddings/embedding-fetcher.log
./embeddings/embeddings[0, 80000].npy
./embeddings/embeddings[0, 80000]_.npy
./embeddings/embeddings[200000, 300000].npy
./embeddings/embeddings[80000, 100000].npy
./embeddings/embeddings[100000, 200000].npy
./embeddings/embeddings[300000, 359539].npy


In [14]:
import requests
import io

# The shards were at one point downloaded with requests from a
# kaggleusercontent.com URL and read through io.BytesIO. That URL embedded an
# access token, so it is deliberately not kept in the notebook; the shards are
# read from the local ./embeddings directory instead.


def _load_embedding_shard(start, end):
    """Load the shard saved as ``./embeddings/embeddings[<start>, <end>].npy``.

    NOTE(review): ``allow_pickle=True`` unpickles Python objects, which can
    execute arbitrary code - only use it on files from a trusted source.
    """
    return np.load(f'./embeddings/embeddings[{start}, {end}].npy', allow_pickle=True)


embeddings_0_80000 = _load_embedding_shard(0, 80000)
embeddings_80000_100000 = _load_embedding_shard(80000, 100000)
embeddings_100000_200000 = _load_embedding_shard(100000, 200000)
embeddings_200000_300000 = _load_embedding_shard(200000, 300000)
embeddings_300000_359539 = _load_embedding_shard(300000, 359539)

In [15]:
# Concatenate the shards in ascending index order into one flat list.
df_embeddings = [
    *embeddings_0_80000,
    *embeddings_80000_100000,
    *embeddings_100000_200000,
    *embeddings_200000_300000,
    *embeddings_300000_359539,
]

# Positions that hold no embedding (stored as None). ``is None`` is an identity
# check; ``== None`` would be evaluated element-wise, and fail inside ``if``,
# should an entry ever be a NumPy array.
to_drop_idx = [i for i, v in enumerate(df_embeddings) if v is None]

In [16]:
# Number of entries across all shards, including the None placeholders.
len(df_embeddings)

359539

In [17]:
# Sanity check: number of entries in the first shard.
len(embeddings_0_80000)

80000

In [18]:
# Number of positions without an embedding.
len(to_drop_idx)

2196

In [60]:
# np.save('./embeddings/embeddings[0, 80000]_', np.asarray(embeddings_0_80000, dtype="object"))
#

In [19]:
pip install xgboost

Note: you may need to restart the kernel to use updated packages.


In [20]:
# from numpy import loadtxt
# from xgboost import XGBClassifier
# from sklearn.model_selection import train_test_split
# from sklearn.metrics import accuracy_score

In [21]:
# Convert the list to an object array and remove the positions listed in
# to_drop_idx (the entries that are None).
# NOTE(review): np.delete is called without an axis, so this relies on the
# object array being one-dimensional - confirm if the input ever changes.
df_embeddings_filtered = np.delete(np.asarray(df_embeddings, dtype=object), to_drop_idx)

In [79]:
# NOTE(review): the recorded output of this cell (299539) differs from the
# count shown for the same expression earlier (359539); it looks like a
# leftover from an out-of-order run - re-run the notebook top to bottom.
len(df_embeddings)

299539

In [22]:
# Length of the object array built from the unfiltered list.
len(np.asarray(df_embeddings, dtype=object))

359539

In [23]:
# Number of embeddings left after removing the None entries.
len(df_embeddings_filtered)

357343

In [59]:
# df_embeddings_filtered
# np.array(df_embeddings_filtered)
# pd.DataFrame(data=df_embeddings_filtered,    # values
#               index=data[1:,0],    # 1st column as index
#               columns=data[0,1:])

In [51]:
# train_filtered = train.drop(np.array(to_drop_idx),axis=1)
# train
# df_merged = df_final.join(train.set_index(cols), on=cols)
# df_embeddings_filtered
# df_filtered = messages_df_preprocessed.drop(np.array(to_drop_idx),axis=1)
# df
# messages_df_preprocessed[0:80000]
# Number the rows 0..n-1 by position so they can be matched against positions
# in the embeddings list.
# NOTE(review): assumes the embeddings are stored in the same row order as
# messages_df_preprocessed - TODO confirm against the embedding fetcher.
messages_df_preprocessed['positional_idx'] = np.arange(0, len(messages_df_preprocessed), dtype=int)

In [54]:
# Keep only the rows whose positional index is not among the positions that
# lack an embedding.
messages_df_preprocessed_filtered = messages_df_preprocessed[~messages_df_preprocessed['positional_idx'].isin(to_drop_idx)]

In [55]:
# Row count after filtering; compare with len(df_embeddings_filtered).
len(messages_df_preprocessed_filtered)

357343

### Embedding to dataframe conversion

In [73]:
# df_embeddings_filtered.reshape(357343, 1536)
# Convert every embedding to an array and stack them into a single array.
# NOTE(review): presumably yields a 2-D numeric array (one row per message);
# that only holds if all embeddings have the same length - TODO confirm.
df_embeddings_filtered_shaped = np.array([np.array(emb) for emb in df_embeddings_filtered])


In [79]:
from sklearn.decomposition import PCA

# Project the embeddings with PCA and show the shape of the result.
# NOTE(review): the recorded output shape (357343, 799) does not match
# n_components=2; the output looks stale from a run with another setting.
pca = PCA(n_components=2)
pca.fit_transform(df_embeddings_filtered_shaped).shape

(357343, 799)

In [None]:
# pd.DataFrame(data=df_embeddings_filtered,    # values
#               index=messages_df_preprocessed_filtered['positional_idx'],    # 1st column as index
#               columns=np.arange(0, len(df_embeddings_filtered), dtype=int))

In [None]:
# df_embeddings_filtered

### Performing xgboost