In [None]:
import pandas as pd
import numpy as np
import re
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from nltk.corpus import wordnet
# Download necessary NLTK data
try:
    nltk.data.find('corpora/stopwords')
    nltk.data.find('corpora/wordnet')
    nltk.data.find('taggers/averaged_perceptron_tagger_eng')
    nltk.data.find('corpora/wordnet')
    nltk.data.find('corpora/omw-1.4')
except LookupError:
    nltk.download('stopwords')
    nltk.download('wordnet')
    nltk.download('omw-1.4')
    nltk.download('averaged_perceptron_tagger')
    nltk.download('averaged_perceptron_tagger_eng')

lemmatizer = WordNetLemmatizer()
stop_words = set(stopwords.words('english'))

In [8]:
import pandas as pd
import numpy as np

from pathlib import Path


BASE_PATH = Path('../../data/social-media-extremism-detection-challenge')
ori_train_df = pd.read_csv(BASE_PATH / 'train.csv')
BASE_LLM_PATH = BASE_PATH / 'results'
train_df = pd.read_csv(BASE_LLM_PATH / 'llm_oof_train.csv')
test_df = pd.read_csv(BASE_LLM_PATH / 'llm_test_preds.csv')



In [14]:
def get_wordnet_pos(treebank_tag):
    if treebank_tag.startswith('J'):
        return wordnet.ADJ
    elif treebank_tag.startswith('V'):
        return wordnet.VERB
    elif treebank_tag.startswith('N'):
        return wordnet.NOUN
    elif treebank_tag.startswith('R'):
        return wordnet.ADV
    else:
        return wordnet.NOUN # Default to Noun
def extract_root_words(text):
    # 1. Cleaning
    text = str(text).lower()
    text = re.sub(r'[^\w\s]', '', text) # Remove punctuation
    
    # 2. Tokenize
    tokens = text.split()
    
    # 3. Filter Stopwords & Lemmatize (Merge derivatives)
    # We use a set() to ensure we only count UNIQUE words per comment (Step 1)
    root_words = set()
    for word in tokens:
        if word not in stop_words and len(word) > 2:
            # Step 4 happens here: 'killing' becomes 'kill'
            root = lemmatizer.lemmatize(word, pos='v') # 'v' focuses on verbs (actions)
            root_words.add(root)
            
    return list(root_words)
def extract_root_words_smart(text):
    text = str(text).lower()
    text = re.sub(r'[^\w\s]', '', text) 
    tokens = text.split()
    
    # Tag the words (Is it a noun? A verb?)
    # returns list like [('kill', 'VB'), ('traitors', 'NNS')]
    tagged_tokens = nltk.pos_tag(tokens)
    
    unique_roots = set()
    
    for word, tag in tagged_tokens:
        if word not in stop_words and len(word) > 2:
            # Get the correct POS type (Noun/Verb/Adj)
            wntag = get_wordnet_pos(tag)
            
            # Lemmatize with the SPECIFIC tag
            # 'traitors' + NOUN -> 'traitor'
            # 'killing' + VERB -> 'kill'
            # 'worse' + ADJ -> 'bad'
            root = lemmatizer.lemmatize(word, pos=wntag)
            unique_roots.add(root)
            
    return list(unique_roots)

print("Extracting root words and merging derivatives...")
# Apply to your dataframe
train_df['unique_roots'] = train_df['Original_Message'].apply(extract_root_words)
test_df['unique_roots'] = test_df['Original_Message'].apply(extract_root_words)
print(train_df[['Original_Message', 'unique_roots']].head())

Extracting root words and merging derivatives...
                             Original_Message  \
0       sixth forms should burn to the ground   
1  whatever should burn benders to the ground   
2         factories should burn to the ground   
3   halal factories should burn to the ground   
4               nhs should burn to the ground   

                       unique_roots  
0        [grind, form, sixth, burn]  
1  [benders, whatever, burn, grind]  
2          [grind, burn, factories]  
3   [halal, grind, burn, factories]  
4                [nhs, burn, grind]  


In [None]:
# 1. Explode the list column
# This creates a new row for every single word in the list, preserving the label.
# It makes counting incredibly easy using pandas groupby.
exploded_df = train_df.explode('unique_roots')

# 2. Calculate Frequencies (Step 3)
# We group by Label (target) and Word (unique_roots) and count the rows
word_counts = exploded_df.groupby(['target', 'unique_roots']).size().reset_index(name='doc_count')

# 3. Pivot to compare Extremist vs Safe side-by-side
lexicon = word_counts.pivot(index='unique_roots', columns='target', values='doc_count').fillna(0)
lexicon.columns = ['Safe_Count', 'Extremist_Count']

# 4. Add Total Counts for Context
total_safe_docs = train_df['target'].value_counts()[0]
total_extremist_docs = train_df['target'].value_counts()[1]

print(f"Total Safe Docs: {total_safe_docs}")
print(f"Total Extremist Docs: {total_extremist_docs}")

Total Safe Docs: 1101
Total Extremist Docs: 1149


In [16]:
# Calculate P(Word | Class)
# "What % of extremist comments contain this word?"
lexicon['Safe_Prob'] = lexicon['Safe_Count'] / total_safe_docs
lexicon['Extremist_Prob'] = lexicon['Extremist_Count'] / total_extremist_docs

# Calculate the "Extremism Score" (Ratio)
# We add a tiny epsilon (1e-6) to avoid dividing by zero
lexicon['Extremism_Ratio'] = lexicon['Extremist_Prob'] / (lexicon['Safe_Prob'] + 1e-6)

# Filter for noise (Words that appear in at least 5 extremist docs)
robust_lexicon = lexicon[lexicon['Extremist_Count'] >= 5].copy()

# Sort by the most "Distinctively Extremist" words
top_extremist_words = robust_lexicon.sort_values(by='Extremism_Ratio', ascending=False)

print("--- Top Derived Roots for Extremism ---")
print(top_extremist_words[['Extremist_Count', 'Safe_Count', 'Extremism_Ratio']].head(20))

--- Top Derived Roots for Extremism ---
               Extremist_Count  Safe_Count  Extremism_Ratio
unique_roots                                               
anymore                   25.0         0.0     21758.050479
encourage                 20.0         0.0     17406.440383
purpose                   20.0         0.0     17406.440383
value                     19.0         0.0     16536.118364
adhere                    17.0         0.0     14795.474326
applaud                   14.0         0.0     12184.508268
rapists                   12.0         0.0     10443.864230
computer                   9.0         0.0      7832.898172
serve                      9.0         0.0      7832.898172
allah                      9.0         0.0      7832.898172
asians                     9.0         0.0      7832.898172
ugh                        8.0         0.0      6962.576153
thieve                     8.0         0.0      6962.576153
murderers                  8.0         0.0      6962.576153


In [None]:
# Convert lexicon to a dictionary for fast mapping
ratio_dict = top_extremist_words['Extremism_Ratio'].to_dict()
import json
with open('lexicon_weights.json', 'w') as f:
    json.dump(ratio_dict, f)
    
print("Saved 'lexicon_weights.json'")
def get_lexicon_score(root_list):
    score = 0
    for word in root_list:
        if word in ratio_dict:
            score += ratio_dict[word]
    return score


Saved 'lexicon_weights.json'


In [23]:
lexicon = word_counts.pivot(index='unique_roots', columns='target', values='doc_count').fillna(0)

# Rename columns (0 -> Safe_Count, 1 -> Extremist_Count)
lexicon.columns = ['Safe_Count', 'Extremist_Count']

# 2. CALCULATE TOTALS (for normalization)
# We need to know the total number of docs in each class to calculate probabilities
total_safe_docs = train_df['target'].value_counts()[0]
total_ext_docs = train_df['target'].value_counts()[1]

# 3. CALCULATE LOG-ODDS (With Laplace Smoothing +1)
# Smoothing prevents division by zero and infinite logs

# Probability of word given Extremist
p_w_ext = (lexicon['Extremist_Count'] + 1) / (total_ext_docs + 1)

# Probability of word given Safe
p_w_safe = (lexicon['Safe_Count'] + 1) / (total_safe_docs + 1)

# Log-Odds Ratio
# Positive Value (>0) = Strongly Associated with Extremism
# Negative Value (<0) = Strongly Associated with Safety
# Value near 0 = Neutral / Common Word
lexicon['log_odds'] = np.log(p_w_ext / p_w_safe)

# Convert to dictionary for fast lookup
log_odds_dict = lexicon['log_odds'].to_dict()

print("Top Extremist Words (Highest Log-Odds):")
print(lexicon.sort_values(by='log_odds', ascending=False).head(10))
def get_max_log_odds(root_list):
    if not isinstance(root_list, list) or len(root_list) == 0:
        return 0.0
    
    scores = []
    for word in root_list:
        if word in log_odds_dict:
            # We only care about Positive scores (Words indicating Extremism)
            # If a word is "Safe" (Negative score), we ignore it.
            val = log_odds_dict[word]
            if val > 0:
                scores.append(val)
    
    if not scores:
        return 0.0
    
    # Return the Single Highest Extremism Score found in the sentence
    return max(scores)



Top Extremist Words (Highest Log-Odds):
              Safe_Count  Extremist_Count  log_odds
unique_roots                                       
anymore              0.0             25.0  3.215461
purpose              0.0             20.0  3.001887
encourage            0.0             20.0  3.001887
value                0.0             19.0  2.953097
adhere               0.0             17.0  2.847737
applaud              0.0             14.0  2.665415
rapists              0.0             12.0  2.522314
serve                0.0              9.0  2.259950
computer             0.0              9.0  2.259950
allah                0.0              9.0  2.259950


In [24]:
# create a new dataframe which has only index and lexicon score
Lexicon_Score_train = train_df[['ID']].copy()
Lexicon_Score_train['lexicon_score'] = train_df['unique_roots'].apply(get_lexicon_score)
Lexicon_Score_train['lexicon_score_log'] = train_df['unique_roots'].apply(get_max_log_odds)
Lexicon_Score_test = test_df[['ID']].copy()
Lexicon_Score_test['lexicon_score'] = test_df['unique_roots'].apply(get_lexicon_score)
Lexicon_Score_test['lexicon_score_log'] = test_df['unique_roots'].apply(get_max_log_odds)
#save to csv
Lexicon_Score_train.to_csv('train_with_lexicon_verb.csv', index=False)
Lexicon_Score_test.to_csv('test_with_lexicon_verb.csv', index=False)