In [17]:
!pip install fasttext-langdetect tqdm swifter ipywidgets -q -U
!pip3 install ipywidgets==8.1.3 -q

In [18]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import nltk
import requests
import ast
import re
import swifter
from nltk.corpus import stopwords
from wordcloud import WordCloud
from tqdm.auto import tqdm
from ftlangdetect import detect
from nltk import tokenize
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import classification_report, accuracy_score
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler,OneHotEncoder
import scipy.sparse as sp
from collections import Counter
from imblearn.under_sampling import RandomUnderSampler
from imblearn.over_sampling import SMOTE

In [19]:
tqdm.pandas()

In [5]:
input_path = '/kaggle/input/ai-defence-summer-school-2024'
messages_filepath = input_path + '/messages/messages.jsonl'
train_filepath = input_path + '/train.csv'
test_filepath = input_path + '/test.csv'

In [6]:
pd.set_option('display.max_colwidth', None)

nltk.download('stopwords')
nltk.download('punkt')

[nltk_data] Downloading package stopwords to /usr/share/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to /usr/share/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

### Add stopwords

In [7]:
stopwords_ua_source_url = 'https://raw.githubusercontent.com/skupriienko/Ukrainian-Stopwords/master/stopwords_ua_list.txt'
stopwords_ru_source_url='https://raw.githubusercontent.com/stopwords-iso/stopwords-ru/master/stopwords-ru.txt'
results_ua = requests.get(stopwords_ua_source_url)
results_ru = requests.get(stopwords_ru_source_url)
stopwords_en = stopwords.words('english')
stopwords_ua = ast.literal_eval(results_ua.text)
stopwords_ru = results_ru.text.split('\n')
stopwords_all=stopwords_ua+stopwords_ru+stopwords_en

In [8]:
with open('stopwords.txt', 'w+') as f:
    for word in stopwords_all:
        f.write(f"{word}\n")

### Read messages

In [9]:
messages_df = pd.read_json(messages_filepath, lines=True)
messages_df.content_type.value_counts()

content_type
POST       269719
REPOST     102153
COMMENT      1439
Name: count, dtype: int64

### Messages preprocessing

In [10]:
messages_df.fillna(0, inplace=True)

# Remove duplicates
messages_df.drop_duplicates(subset=['source_id', 'text'], inplace=True)

# Remove only urls
url_regex=re.compile(r"http[s]*\S+$")
is_url = messages_df.text.progress_apply(lambda x: bool(url_regex.match(x)))
messages_df.loc[:,'url_only'] = is_url

# Add elapsed_minutes column
time_elapsed = messages_df.groupby('source_id')['published_at'].agg(lambda x: (x.max() - x.min()).total_seconds() / 60).reset_index()
time_elapsed.columns = ['source_id', 'elapsed_minutes']
messages_df = pd.merge(messages_df, time_elapsed, on='source_id', how='left')

# Detect language
# messages_df_preprocessed['lang']=messages_df_preprocessed.text.progress_apply(lambda x: detect(x.replace("\n"," "))['lang'])
# top_k_lang = messages_df_preprocessed.lang.value_counts().nlargest(13).index
# messages_df_preprocessed.lang = messages_df_preprocessed.lang.progress_apply(lambda x: x if x in top_k_lang else 'other')

  0%|          | 0/367031 [00:00<?, ?it/s]

### Merge train with messages

In [11]:
train_df = pd.read_csv(train_filepath)
test_df = pd.read_csv(test_filepath)

train_full_df = pd.merge(messages_df, train_df, on='source_id', how='inner')
train_full_df.source_category.value_counts()

source_category
AGGRESSIVE_INFORMATION                  118757
RESTRAINED_INFORMATION                   90939
RESTRAINED_MILITARY                       9396
AGGRESSIVE_MILITARY                       8463
SAFE_CONTENT                              8172
ENTITIES_PROMOTING_VIOLENCE_AND_HATE      6379
UNRECOGNIZED_REPUBLICS                    5570
SPAM                                      4258
VIOLENCE                                  2681
COORDINATION_OF_ATTACKS                   1917
PERSONAL_INFORMATION                      1592
CYBER_ATTACK_COORDINATION                  687
Name: count, dtype: int64

### Text preprocessing

In [12]:
word_tokenizer = tokenize.RegexpTokenizer(r'\w+')

emoji_pattern = re.compile("["
        u"\U0001F600-\U0001F64F"  # emoticons
        u"\U0001F300-\U0001F5FF"  # symbols & pictographs
        u"\U0001F680-\U0001F6FF"  # transport & map symbols
        u"\U0001F1E0-\U0001F1FF"  # flags (iOS)
        u"\U00002500-\U00002BEF"  # chinese char
        u"\U00002702-\U000027B0"
        u"\U000024C2-\U0001F251"
        u"\U0001f926-\U0001f937"
        u"\U00010000-\U0010ffff"
        u"\u2640-\u2642" 
        u"\u2600-\u2B55"
        u"\u200d"
        u"\u23cf"
        u"\u23e9"
        u"\u231a"
        u"\ufe0f"  # dingbats
        u"\u3030"
                           "]+", flags=re.UNICODE)

In [13]:
def collapse_dots(input):
    input = re.sub("\.+", ".", input)
    all_collapsed = False
    while not all_collapsed:
        output = re.sub(r"\.(( )*)\.", ".", input)
        all_collapsed = input == output
        input = output
    return output

def preprocess_hard(input):
    words_list = []
    result_str = ''
    for token in word_tokenizer.tokenize(input):
        if token.lower() not in stopwords_all:
            words_list.append(token)
        result_str = " ".join(words_list)
    return result_str


def process(input, light=False):
    if not isinstance(input, str):
        return input
    
    input = " ".join(tokenize.sent_tokenize(input))
    
    # Remove links
    input = re.sub(r"http\S+", "", input)
    # Remove new line tag
    input = re.sub(r"\n+", ". ", input)
    # Remove emojis
    input = re.sub(emoji_pattern, '', input)
    # Replace telegram link with dot
    input = re.sub(r"\bt\.me/\S+", "", input)
    # Remove sentence containing 'subscribe'
    # + подпишись
    #input = re.sub(r"\bПодпишитесь на\b.*?\.", '', input)
    # Remove underscore
    input = input.replace("_", "")
    # Remove symbol followed by dot
    for symb in ["!", ",", ":", ";", "?", "_"]:
        input = re.sub(rf"\{symb}\.", symb, input)
    # Remove hashtag
    input = re.sub(r"#\S+", "", input)
    # Remove user mentioning
    input = re.sub(r"@\S+", "", input)
    # Collapse dots
    input = collapse_dots(input)
    input = input.strip()
    
    if light:
        return input
    
    return preprocess_hard(input)

In [14]:
train_full_df['hard_text'] = train_full_df.text.swifter.apply(process)
train_full_df['light_text'] = train_full_df.text.swifter.apply(process, light=True)

Pandas Apply:   0%|          | 0/258811 [00:00<?, ?it/s]

Pandas Apply:   0%|          | 0/258811 [00:00<?, ?it/s]

In [15]:
train_full_df.to_csv('train_preprocessed.csv')

### Wordclouds

In [1]:
# # Original
# plt.figure(figsize = (20,20)) 
# plt.title(f"Original text")
# wc = WordCloud(max_words = 500 , width = 800 , height = 400).generate(" ".join(train_preprocessed_df.head(10000).text))
# plt.imshow(wc, interpolation = 'bilinear')

# Hard preprocessing
plt.figure(figsize = (20,20)) 
plt.title(f"Hard processed")
wc = WordCloud(max_words = 500 , width = 800 , height = 400).generate(" ".join(train_full_df.hard_text))
plt.imshow(wc, interpolation = 'bilinear')

# # Light preprocessing
# plt.figure(figsize = (20,20)) 
# plt.title(f"Light processed")
# wc = WordCloud(max_words = 500 , width = 800 , height = 400).generate(" ".join(test_set.light_text))
# plt.imshow(wc, interpolation = 'bilinear')

NameError: name 'plt' is not defined

## Train

In [133]:
train_full_df['category_id'], category_names = train_full_df.source_category.factorize()
num_features=['impressions', 'reactions', 'shares', 'comments', 'elapsed_minutes']
categorical_features = ['content_type', 'url_only']
train_full_df['first_split'] = train_full_df.source_category.progress_apply(lambda x: x if x in ['AGGRESSIVE_INFORMATION', 'RESTRAINED_INFORMATION'] else 'OTHER')

Pandas Apply:   0%|          | 0/258811 [00:00<?, ?it/s]

In [134]:
train, val = train_test_split(train_full_df, test_size=0.2, stratify=train_full_df.source_category, random_state=42)

X_train = train[num_features + categorical_features + ['hard_text']]
X_val = val[num_features + categorical_features + ['hard_text']]

y_train = train['source_category']
y_val = val['source_category']

#### Vectorize text

In [None]:
vect = TfidfVectorizer(
    lowercase=True,
    max_features=10000,
    ngram_range = (1,2),
    min_df=5
)
vect.fit(X_first_train.hard_text)

In [None]:
train_txt_tkz = vect.transform(X_train.hard_text)
val_txt_tkz = vect.transform(X_val.hard_text)

#### Encode features

In [None]:
scaler = StandardScaler()
encoder = OneHotEncoder(drop='first',dtype=np.int64)

scaler.fit(X_train[num_features])
encoder.fit(X_train[categorical_features])

train_num = scaler.transform(X_train[num_features])
val_num = scaler.transform(X_val[num_features])

train_cat = encoder.transform(X_train[categorical_features])
val_cat = encoder.transform(X_val[categorical_features])

X_train_processed = sp.hstack((train_txt_tkz, train_num, train_cat), format='csr')
X_val_processed = sp.hstack((val_txt_tkz, val_num, val_cat), format='csr')

In [None]:
num_features = X_train[num_features].columns
categorical_features_new = encoder.get_feature_names_out(X_train[categorical_features].columns)
text_features = [f'text_{i}' for i in range(train_txt_tkz.shape[1])]

# Combine all column names
all_columns = np.concatenate((text_features, num_features, categorical_features_new))

# Convert sparse matrices to DataFrames
X_train_df = pd.DataFrame.sparse.from_spmatrix(X_train_processed, columns=all_columns)
X_val_df = pd.DataFrame.sparse.from_spmatrix(X_val_processed, columns=all_columns)

In [None]:
def plot_metrics(y_pred, y_val):
    print(f'Accuracy: {accuracy_score(y_val, y_pred) * 100:.4f}')
    print('\nClassification report')
    print(classification_report(y_val, y_pred))

In [None]:
counter = Counter(y_first_train)
agg_inf = counter['AGGRESSIVE_INFORMATION']
restrained_inf = counter['RESTRAINED_INFORMATION']
cyber_attack_count = counter['CYBER_ATTACK_COORDINATION']

#### Handle imbalanced data

In [None]:
rus = RandomUnderSampler(random_state=42, sampling_strategy={'AGGRESSIVE_INFORMATION': round(agg_inf * 0.3), 'RESTRAINED_INFORMATION': round(restrained_inf * 0.3)})
smote = SMOTE(sampling_strategy={'CYBER_ATTACK_COORDINATION': round(cyber_attack_count * 1.5)}, random_state=42)

X_train_resampled, y_train_resampled = rus.fit_resample(X_train_processed, y_first_train)
print(X_train_resampled.shape)
X_train_resampled, y_train_resampled = smote.fit_resample(X_train_resampled, y_train_resampled)

In [128]:
y_train_resampled.value_counts()

source_category
AGGRESSIVE_INFORMATION                  32064
RESTRAINED_INFORMATION                  24554
RESTRAINED_MILITARY                      8456
AGGRESSIVE_MILITARY                      7617
SAFE_CONTENT                             7355
ENTITIES_PROMOTING_VIOLENCE_AND_HATE     5741
UNRECOGNIZED_REPUBLICS                   5013
SPAM                                     3832
VIOLENCE                                 2413
COORDINATION_OF_ATTACKS                  1725
PERSONAL_INFORMATION                     1433
CYBER_ATTACK_COORDINATION                 927
Name: count, dtype: int64

#### Train Random Forest

In [129]:
from sklearn.ensemble import RandomForestClassifier

rf_clf = RandomForestClassifier(random_state=10, max_depth=175)
rf_clf.fit(X_train_resampled, y_train_resampled)

In [130]:
y_first_pred = rf_clf.predict(X_val_processed)
plot_metrics(y_first_pred, y_first_val)

Accuracy: 59.2535

Classification report
                                      precision    recall  f1-score   support

              AGGRESSIVE_INFORMATION       0.55      0.91      0.69     11876
                 AGGRESSIVE_MILITARY       0.80      0.27      0.40       846
             COORDINATION_OF_ATTACKS       0.93      0.36      0.52       192
           CYBER_ATTACK_COORDINATION       0.96      0.67      0.79        69
ENTITIES_PROMOTING_VIOLENCE_AND_HATE       0.48      0.14      0.21       638
                PERSONAL_INFORMATION       0.84      0.24      0.37       159
              RESTRAINED_INFORMATION       0.74      0.30      0.43      9094
                 RESTRAINED_MILITARY       0.78      0.25      0.38       940
                        SAFE_CONTENT       0.67      0.91      0.77       817
                                SPAM       0.72      0.23      0.35       426
              UNRECOGNIZED_REPUBLICS       0.63      0.33      0.43       557
                      

## Test predict & submission

#### Add preprocessed text

In [82]:
test_full_df = pd.merge(messages_df, test_df, on='source_id', how='inner')
test_full_df['hard_text'] = test_full_df.text.swifter.apply(process)

Pandas Apply:   0%|          | 0/107484 [00:00<?, ?it/s]

In [83]:
test_full_df.to_csv('test_preprocessed.csv')

#### Preprocess other features

In [126]:
categorical_features = ['content_type', 'url_only']
test_text_tkz = vect.transform(test_full_df.hard_text)
test_num = scaler.transform(test_full_df[num_features])
test_cat = encoder.transform(test_full_df[categorical_features])
X_test_processed = sp.hstack((test_text_tkz, test_num, test_cat), format='csr')

In [94]:
def create_submission(df, y_pred, filename='submission'):
    df['source_category'] = y_pred
    category_counts = df.groupby('source_id')['source_category'].value_counts()
    majority_categories = category_counts.groupby(level=0).idxmax().apply(lambda x: x[1]).reset_index(name='source_category')
    print(f'Predicted source categories:\n{majority_categories.source_category.value_counts()}')
    majority_categories.to_csv(f'{filename}.csv',index=False)

### Predict & create submission

In [131]:
y_test_pred = rf_clf.predict(X_test_processed)

In [132]:
create_submission(test_full_df, y_test_pred, filename='sb_equal_rf')

Predicted source categories:
source_category
AGGRESSIVE_INFORMATION                  1257
RESTRAINED_INFORMATION                    66
SAFE_CONTENT                              48
ENTITIES_PROMOTING_VIOLENCE_AND_HATE       4
SPAM                                       3
UNRECOGNIZED_REPUBLICS                     3
AGGRESSIVE_MILITARY                        3
RESTRAINED_MILITARY                        2
Name: count, dtype: int64
