In [62]:
import re
import pandas as pd
import string
from langdetect import detect, DetectorFactory
from tqdm.notebook import tqdm

In [63]:
# Load dataset and inspect

df = pd.read_csv('../data/kyrgyz_coal_dataset_cleaned.csv')

print("Dataset shape:", df.shape)
print("Columns:", df.columns.tolist())

df.head()

Dataset shape: (2924, 19)
Columns: ['post_url', 'post_text', 'post_label', 'number_of_likes', 'number_of_shares', 'number_of_comments', 'post_date', 'comment_text', 'comment_date', 'comment_reactions_count', 'comment_mode_types', 'post_target_region', 'post_language', 'comment_language', 'coal_topic/event_tag', 'season', 'comment_sentiment_label', 'sentiment_confidence', 'news_page_name']


Unnamed: 0,post_url,post_text,post_label,number_of_likes,number_of_shares,number_of_comments,post_date,comment_text,comment_date,comment_reactions_count,comment_mode_types,post_target_region,post_language,comment_language,coal_topic/event_tag,season,comment_sentiment_label,sentiment_confidence,news_page_name
0,https://www.facebook.com/AzattykPlus/posts/252...,"–≠–ª–µ–∫—Ç—Ä –∂–µ—Ç–∏—à—Å–∏–∑, –∫”©–º“Ø—Ä –∫—ã–º–±–∞—Ç - ”©–∫–º”©—Ç –∫–µ–ª—Å–∏–Ω",negative,0,0,0,2008-09-04,,,,,countrywide,Kyrgyz,,high price,fall,,,Azattyk
1,https://www.facebook.com/AzattykPlus/posts/283...,–¢“Æ–®–¢“Æ–ö–¢”® –ö”®–ú“Æ–†–î“Æ–ù –¢–û–ù–ù–ê–°–´ 7 –ú–ò“¢ –°–û–ú–ì–û –ß–´–ö–¢–´,negative,0,0,0,2008-09-14,,,,,Osh,Kyrgyz,,high price,fall,,,Azattyk
2,https://www.facebook.com/AzattykPlus/posts/214...,–ö—ã–∑—ã–ª-–ö—ã—è —à–∞–∞—Ä—ã–Ω–¥–∞–≥—ã —Ü–µ–º–µ–Ω—Ç –∑–∞–≤–æ–¥—É–Ω—É–Ω –∂–∞–Ω—ã–Ω–¥–∞ ...,negative,0,1,0,2011-10-11,,,,,Batken,Kyrgyz,,illegal coal mining,fall,,,Azattyk
3,https://www.facebook.com/AliToktakunov/posts/1...,–ö—ã–∑—ã–ª-–ö—ã—è —à–∞–∞—Ä—ã–Ω–¥–∞–≥—ã —Ü–µ–º–µ–Ω—Ç –∑–∞–≤–æ–¥—É–Ω—É–Ω –∂–∞–Ω—ã–Ω–¥–∞ ...,negative,1,0,1,2011-10-11,"–ë–∏–π–ª–∏–∫—Ç–µ–≥–∏–ª–µ—Ä –∫–∞—Ä–∞–±–∞—Å–∞ —É—à—É–Ω–¥–∞–π –±–æ–ª–æ—Ç—Ç–∞, –±–∞—Ä–¥—ã–∫...",2011-10-11,0.0,text,Batken,Kyrgyz,Kyrgyz,illegal coal mining,fall,negative,high,Azattyk
4,https://www.facebook.com/groups/araba.kg/perma...,–ö—ã—Ä–≥—ã–∑—Å—Ç–∞–Ω.–°–æ“£–∫—É –±–∏—Ä –∞–ø—Ç–∞ –∏—á–∏–Ω–¥–µ –ö—ã—Ä–≥—ã–∑—Å—Ç–∞–Ω–¥–∞ ...,negative,0,0,0,2011-08-23,,,,,countrywide,Kyrgyz,,high price,summer,,,araba.kg-avtobazar


## Data preparation

### 1. Cleaning Post and Comment Language

In [64]:
# Ensure consistent and reproducible language detection
DetectorFactory.seed = 0

# Define detection function
def detect_lang(text):
    try:
        return detect(str(text))
    except:
        return "unknown"

# Apply language detection to both post and comment texts
tqdm.pandas()

# Detect language in post_text
df['post_lang_detect'] = df['post_text'].progress_apply(detect_lang)

# Detect language in comment_text
df['comment_lang_detect'] = df['comment_text'].progress_apply(detect_lang)

# View results
print("Post text language distribution:")
print(df['post_lang_detect'].value_counts())

print("\nComment text language distribution:")
print(df['comment_lang_detect'].value_counts())

# Optionally show samples misclassified as Russian
print("\nExample of comments detected as Russian:")
display(df[df['comment_lang_detect'] == 'ru'][['comment_text']].sample(5, random_state=1))

print("\nExample of comments detected as Russian:")
display(df[df['post_lang_detect'] == 'ru'][['post_text']].sample(5, random_state=1))

print("\nExample of comments detected as English:")
display(df[df['comment_lang_detect'] == 'en'][['comment_text']].sample(5, random_state=1))

print("\nExample of comments detected as Unknown:")
display(df[df['comment_lang_detect'] == 'unknown'][['comment_text']].sample(5, random_state=1))

  0%|          | 0/2924 [00:00<?, ?it/s]

  0%|          | 0/2924 [00:00<?, ?it/s]

Post text language distribution:
post_lang_detect
ru    2896
mk      17
bg      11
Name: count, dtype: int64

Comment text language distribution:
comment_lang_detect
ru         2094
tl          599
mk           78
bg           66
en           39
unknown      25
uk           21
tr            2
Name: count, dtype: int64

Example of comments detected as Russian:


Unnamed: 0,comment_text
2428,"–ò—Ç, –∞–≤–∞–ª–∞–π(.—É—Ä–æ).–±–µ—Ä–µ—Ç –∫–µ—Ä–±–µ–Ω –∂—É—Ä–æ –±–µ—Ä–µ—Ç"
2467,"–∫–∞—Ä–∞–ø–∞–π—ã–º, —ç–ª –±–∏—Ä–∏ –±–∏—Ä–∏ –º–µ–Ω–µ–Ω –∫—ã—Ä—ã–ª—ã—à–±–∞—Å–∞ –±–æ–ª–¥—É"
2516,–ö–∞–π—Ä–∞–Ω –ö—ã—Ä–≥—ã–∑ –∫–∞–ª–∫—ã–º..—á–∞–Ω –∂—É—Ç–∫–∞–Ω –≥–∞–∑ –∂—É—Ç–∫–∞–Ω –∫–æ...
924,–≠—á –∫–∞–Ω–¥–∞–π –∫—Ä—ã—à–∞—Å—ã –∂–æ–∫ —ç–ª–µ –∏—à—Ç–µ—Ç–∏–ø –∂–∞—Ç—ã—à–∞—Ç –∂—É–º—É...
2716,–®–∞–π–ª–æ–æ–¥–æ–Ω –∫–∏–π–∏–Ω –∫–æ—Ä–æ–±—É–∑ —Ü–µ–Ω–∞–Ω—ã.—à–∞–π–ª–æ–æ —Ä–µ–∫–ª–∞–º–∞—Å—ã



Example of comments detected as Russian:


Unnamed: 0,post_text
2273,–ö”©–º“Ø—Ä! —á—ã–∫–∫–∞–Ω –ù–∞—Ä—ã–Ω–¥–∞ –æ—Ç—É–Ω —Ç–∞—Ä—Ç—ã—à –ö–∞—Ä–∞-–ö–µ—á–µ –∫–µ...
1411,"–ë—ã–π—ã–ª, ”©–ª–∫”©–¥”© –∫”©–º“Ø—Ä —Ç–∞—Ä—Ç—ã—à—Ç—ã–≥—ã –æ—Ä—É–Ω –∞–ª–±–∞–π–±—ã? –ë..."
1469,–û—à! –æ–±–ª—É—Å—É–Ω–¥–∞ 30 –∂–µ—Ä–≥–µ –∞—Ä–∑–∞–Ω–¥–∞—Ç—ã–ª–≥–∞–Ω –∫”©–º“Ø—Ä —Å–∞—Ç...
204,–ñ—É–º–≥–∞–ª–¥—ã–Ω –ê—Ä–∞–ª –∞–π–º–∞–≥—ã–Ω–¥–∞–≥—ã –ö–æ–∫–æ-–ú–µ—Ä–µ–Ω –¥–∞—Ä—ã—è—Å—ã–Ω...
2660,–ñ–£–ú–ì–ê–õ–î–ê–ì–´. –ú–ò“¢-–ö–£–® –ê–ô–´–õ–´–ù–´–ù –¢–£–†–ì–£–ù–î–ê–†–´ –ê–ô–ú–ê–ö–¢...



Example of comments detected as English:


Unnamed: 0,comment_text
458,thumb down
2393,"thumb, up"
507,yuqoridegilaga insof bersin ollohim ilohimü§≤ü§≤
1642,thumb up
2202,thumb! up



Example of comments detected as Unknown:


Unnamed: 0,comment_text
1614,:(
1606,"üëç,"
1758,:(
306,:(
2289,"üëç,"


In [65]:
# Filter for manually labeled Kyrgyz content
df_ky = df[
    (df['comment_language'] == 'Kyrgyz') |
    (df['post_language'] == 'Kyrgyz')
].copy()

# Kyrgyz posts
posts_df = df_ky[df_ky['post_language'] == 'Kyrgyz'][['post_text', 'post_label']].dropna()
posts_df = posts_df.rename(columns={'post_text': 'text', 'post_label': 'label'})
posts_df['source'] = 'post'

# Kyrgyz comments
comments_df = df_ky[df_ky['comment_language'] == 'Kyrgyz'][['comment_text', 'comment_sentiment_label']].dropna()
comments_df = comments_df.rename(columns={'comment_text': 'text', 'comment_sentiment_label': 'label'})
comments_df['source'] = 'comment'

# Show shapes to verify
print("Posts shape:", posts_df.shape)
print("Comments shape:", comments_df.shape)

# Only combine if there's data
combined_df = pd.concat([posts_df, comments_df], ignore_index=True)

print("Shape after Kyrgyz-only filtering:", combined_df.shape)
combined_df.sample(5) if not combined_df.empty else print("Combined DataFrame is empty.")

Posts shape: (2819, 3)
Comments shape: (2162, 3)
Shape after Kyrgyz-only filtering: (4981, 3)


Unnamed: 0,text,label,source
4245,–ê–ª–ª–∞. —Ä–∞–∞–∑—ã –±–æ–ª—Å—É–Ω —É—à—É–Ω–¥–∞–π –±–∞–ª–¥–∞—Ä –∫–æ–ø –±–æ–ª—Å—É–Ω –∞...,positive,comment
2649,–ö—ã—Ä–≥—ã–∑—Å—Ç–∞–Ω–¥–∞ —ç–Ω –∫—ã–º–±–∞—Ç –∫–µ–º—É—Ä –ë–∞—Ç–∫–µ–Ω –æ–±–ª—É—Å—É–Ω–¥–∞...,negative,post
2405,"–ê—Ç-–ë–∞—à—ã–¥–∞, –∫”©–º“Ø—Ä –∫”©–∑–¥”©–Ω —É—á—Ç—É –£—à—É–ª —Ç–∞–ø—Ç–∞ –ê—Ç-–ë–∞—à...",negative,post
262,–ö–∞—Ä–∞-–ö–µ—á–µ –∫”©–º“Ø—Ä –∫–µ–Ω–∏–Ω–∏–Ω –∫–æ–æ–ø—Å—É–∑–¥—É–∫ –∫—ã–∑–º–∞—Ç—ã–Ω—ã–Ω ...,negative,post
4178,–°—ã—Ä. —ç–º–µ—Å –≥–æ –º–µ–Ω –¥–µ–ª–µ –∞–π—ã–ª–¥–∞ –∂–∞—à–∞–π–º –∞–Ω–¥–∞–π –∫–∞—Ä–∞...,negative,comment


### 2. Removing Emojis, Punctuation and Stop Words

In [68]:
import re

# Step 1: Kyrgyz stopword list (100 words)
kyrgyz_stopwords = [
    "–º–µ–Ω", "—Å–µ–Ω", "–∞–ª", "–±–∏–∑", "—Å–∏–ª–µ—Ä", "–∞–ª–∞—Ä", "”©–∑“Ø", "–º–µ–Ω–∏–Ω", "—Å–µ–Ω–∏–Ω", "–∞–Ω—ã–Ω",
    "–±–∏–∑–¥–∏–Ω", "—Å–∏–ª–µ—Ä–¥–∏–Ω", "–∞–ª–∞—Ä–¥—ã–Ω", "–∫–∏–º", "—ç–º–Ω–µ", "–∫–∞—á–∞–Ω", "–∫–∞–Ω–¥–∞–π", "–∫–∞–Ω—á–∞", "–∫–∞–π–¥–∞", "—ç–º–Ω–µ–≥–µ",
    "–±—É–ª", "—É—à—É–ª", "–æ—à–æ–ª", "–∞–Ω–¥–∞–Ω", "–∞–Ω–¥–∞", "—É—à—É–Ω–¥–∞–π", "–æ—à–æ–Ω–¥–æ–π", "–æ—à–æ–Ω–¥–æ", "—É—à–æ–Ω–¥–æ", "—É—à—É–Ω—á–∞–ª—ã–∫",
    "–¥–∞", "–¥–µ", "–∂–µ", "–∂–∞–Ω–∞", "–¥–∞–≥—ã", "—ç–ª–µ", "—ç–º–∏", "–∞–Ω–∞–Ω", "–±–∏—Ä", "—ç–∫–∏",
    "“Ø—á", "—Ç”©—Ä—Ç", "–±–µ—à", "–æ–Ω", "–∂“Ø–∑", "–º—ã“£", "–∂—ã–ª", "–∫“Ø–Ω", "–∞–π", "—Å–∞–∞—Ç", "”©–∑", "—Ç–∏–≥–∏", 
    "–º–µ–Ω–µ–Ω", "“Ø—á“Ø–Ω", "–º–µ–Ω–¥–µ", "—Å–µ–Ω–¥–µ", "–∞–Ω–¥–∞", "–∞–Ω–¥–∞–≥—ã", "–º–µ–Ω–¥–µ", "—Å–∏–∑", "—Å–∏–∑–¥–µ—Ä", "—Å–∏–∑–¥–∏–Ω",
    "–±–æ–ª—É–ø", "–±–æ–ª–æ—Ç", "–±–æ–ª—Å–æ", "–±–æ–ª–≥–æ–Ω—É", "–±–æ–ª–±–æ–≥–æ–Ω", "—ç–∫–µ–Ω", "—ç–∫–µ–Ω–∏–Ω", "—ç–∫–µ–Ω—Å–∏“£", "—ç–∫–µ–Ω–±–∏–∑", "—ç–∫–µ–Ω—Å–∏“£–µ—Ä",
    "—ç–∫–µ–Ω—Å–∏–∑", "–∂–æ–∫", "–±–∞—Ä", "–∞—Ä", "–±–∏—Ä–æ–∫", "–∞–Ω—Ç–∫–µ–Ω–∏", "–æ—à–æ–Ω–¥—É–∫—Ç–∞–Ω", "—Ç–∞—Ä–∞–±—ã–Ω–∞–Ω", "—Ç–∞—Ä–∞–ø—Ç–∞–Ω",
    "—Ç—É—Ä–≥–∞–Ω", "–∂–∞—Ç–∫–∞–Ω", "–∂”©–Ω“Ø–Ω–¥”©", "–∂”©–Ω“Ø–Ω”©–Ω", "–±–µ—Ä", "–∞–ª–¥—ã", "–∞–ª–¥—ã–Ω–∞", "–∞—Ä–∫–∞—Å—ã–Ω–∞–Ω", "–∏–π–∏–Ω", "–∫–∏–π–∏–Ω", "–∞–π—Ä—ã–º",
    "–∫–∏–π–∏–Ω–∫–∏", "–±–∞—à–∫–∞", "–±–∏—Ä–∏", "—ç—á", "—ç—á –∫–∏–º", "–∂–æ–∫–∫–æ", "–∂–æ–∫—Ç—É", "–∫–∞—Ç–∞—Ä", "–∫—ã–ª–≥–∞–Ω", "—á–µ–π–∏–Ω", "”©–∫“Ø–ª–¥”©—Ä“Ø", 
    "–±“Ø–≥“Ø–Ω–∫“Ø", "–∫–∞—Ä–∞—Ç–∞", "–∞—Ç–∞—Ç", "–∫–∞–π—Å—ã", "–¥–µ–ø", "—Å–∏–∑–≥–µ", "–¥–µ–π—Ç", "–∞", "—É—á—É—Ä–¥–∞", "—ç—Ö", "–∞–π—Ç—ã–ø", "–º"
]

# Step 2: Define cleaning function
def clean_text(text):
    text = str(text).lower()
    text = re.sub(r'[^\w\s]', '', text)  # remove punctuation/emojis
    text = re.sub(r'http\S+|www.\S+', '', text)  # remove URLs
    text = re.sub(r'\s+', ' ', text).strip()  # normalize whitespace

    # Remove stopwords
    words = text.split()
    words = [word for word in words if word not in kyrgyz_stopwords]
    return ' '.join(words)

# Step 3: Apply cleaning
combined_df['text_clean'] = combined_df['text'].apply(clean_text)

# Step 4: Remove very short texts (less than 5 words)
combined_df['word_count'] = combined_df['text_clean'].apply(lambda x: len(x.split()))
filtered_df = combined_df[combined_df['word_count'] >= 5].copy()

# Drop helper column
filtered_df.drop(columns=['word_count'], inplace=True)

# Final shape and preview
print("Final dataset shape after cleaning:", filtered_df.shape)
filtered_df.sample(15)

Final dataset shape after cleaning: (4484, 4)


Unnamed: 0,text,label,source,text_clean
2632,–ë–∏—à–∫–µ–∫: —á–µ–∫—Ç”©”© –º–µ–Ω–µ–Ω —Å–∞—Ç—ã–ª–≥–∞–Ω –∞—Ä–∑–∞–Ω –∫”©–º“Ø—Ä –°—É—É...,negative,post,–±–∏—à–∫–µ–∫ —á–µ–∫—Ç”©”© —Å–∞—Ç—ã–ª–≥–∞–Ω –∞—Ä–∑–∞–Ω –∫”©–º“Ø—Ä —Å—É—É–∫ —Ç“Ø—à–∫”©–Ω...
51,–ñ–æ–≥–æ—Ä–∫—É –ö–µ“£–µ—à –ñ–æ–≥–æ—Ä–∫—É —Å–æ—Ç—Ç—É–Ω –º–∏–ª–¥–µ—Ç–∏–Ω –∞—Ç–∫–∞—Ä—ã–ø ...,negative,post,–∂–æ–≥–æ—Ä–∫—É –∫–µ“£–µ—à –∂–æ–≥–æ—Ä–∫—É —Å–æ—Ç—Ç—É–Ω –º–∏–ª–¥–µ—Ç–∏–Ω –∞—Ç–∫–∞—Ä—ã–ø ...
2270,–ë–∏—à–∫–µ–∫–∫–µ! –∞—Ä–∑–∞–Ω –∫”©–º“Ø—Ä –∂–µ—Ç–∫–∏—Ä–∏–ª“Ø“Ø–¥”© ‚Äú–ö–∞—Ä–∞-–ö–µ—á–µ–¥...,positive,post,–±–∏—à–∫–µ–∫–∫–µ –∞—Ä–∑–∞–Ω –∫”©–º“Ø—Ä –∂–µ—Ç–∫–∏—Ä–∏–ª“Ø“Ø–¥”© –∫–∞—Ä–∞–∫–µ—á–µ–¥–µ–Ω ...
200,–ñ—É–º–≥–∞–ª —Ä–∞–π–æ–Ω—É–Ω–¥–∞ –∞–∑ –∫–∞–º—Å—ã–∑ —É–π-–±—É–ª–µ–ª–µ—Ä–≥–µ –∂–∞–ª–ø—ã ...,positive,post,–∂—É–º–≥–∞–ª —Ä–∞–π–æ–Ω—É–Ω–¥–∞ –∞–∑ –∫–∞–º—Å—ã–∑ —É–π–±—É–ª–µ–ª–µ—Ä–≥–µ –∂–∞–ª–ø—ã 1...
888,–ö–∞—Ä–∞-–ö–µ—á–µ–¥–µ–Ω –∫”©–º“Ø—Ä —Ç–∞—Ä—Ç–∫–∞–Ω –∂–µ–∫–µ –∏—à–∫–µ—Ä –∞–π–¥–æ–æ—á—É–ª...,negative,post,–∫–∞—Ä–∞–∫–µ—á–µ–¥–µ–Ω –∫”©–º“Ø—Ä —Ç–∞—Ä—Ç–∫–∞–Ω –∂–µ–∫–µ –∏—à–∫–µ—Ä –∞–π–¥–æ–æ—á—É–ª–∞...
2213,"–ö”©–º“Ø—Ä, —Ç–∞—à—ã–≥–∞–Ω –∞–π–¥–æ–æ—á—É–ª–∞—Ä –∏—à —Ç–∞—à—Ç–∞–¥—ã –ë–∏—à–∫–µ–∫ –∂—ã...",positive,post,–∫”©–º“Ø—Ä —Ç–∞—à—ã–≥–∞–Ω –∞–π–¥–æ–æ—á—É–ª–∞—Ä –∏—à —Ç–∞—à—Ç–∞–¥—ã –±–∏—à–∫–µ–∫ –∂—ã–ª...
2610,–ù–∞—Ä—ã–Ω–¥—ã–Ω. –ê–∫-–¢–∞–ª–∞–∞ —Ä–∞–π–æ–Ω—É–Ω–¥–∞ –∫”©–º“Ø—Ä–≥”© –±–∞–π–ª–∞–Ω—ã—à—Ç...,negative,post,–Ω–∞—Ä—ã–Ω–¥—ã–Ω –∞–∫—Ç–∞–ª–∞–∞ —Ä–∞–π–æ–Ω—É–Ω–¥–∞ –∫”©–º“Ø—Ä–≥”© –±–∞–π–ª–∞–Ω—ã—à—Ç—É—É...
3222,–ñ–µ–≥–∏—á—Ç–µ—Ä 2–¥—É–π–Ω–æ –∂–∞–∫—à—ã–ª—ã–∫ –∫–æ—Ä–±–æ–π –∫–∞–ª–≥—ã–ª–∞ —É–∫—É–º —Ç...,negative,comment,–∂–µ–≥–∏—á—Ç–µ—Ä 2–¥—É–π–Ω–æ –∂–∞–∫—à—ã–ª—ã–∫ –∫–æ—Ä–±–æ–π –∫–∞–ª–≥—ã–ª–∞ —É–∫—É–º —Ç...
3370,–ñ–∞–∫—à—ã –∏—à –∫—ã–ª—ã–ø –∞—Ç—ã–ø—Å—ã–Ω–∞—Ä –±–∏—Ä–æ–∫ –∫–æ–º—É—Ä–¥—É–Ω –±–∞–∞—Å—ã ...,positive,comment,–∂–∞–∫—à—ã –∏—à –∫—ã–ª—ã–ø –∞—Ç—ã–ø—Å—ã–Ω–∞—Ä –∫–æ–º—É—Ä–¥—É–Ω –±–∞–∞—Å—ã –∫–∞–Ω—á–∞–¥–∞–Ω
1376,–ê–ª–∞–π —Ä–∞–π–æ–Ω—É–Ω—É–Ω –°–æ–≥–æ–Ω–¥—É –∞–π—ã–ª—ã–Ω—ã–Ω —Ç—É—Ä–≥—É–Ω–¥–∞—Ä—ã –¢–∞...,negative,post,–∞–ª–∞–π —Ä–∞–π–æ–Ω—É–Ω—É–Ω —Å–æ–≥–æ–Ω–¥—É –∞–π—ã–ª—ã–Ω—ã–Ω —Ç—É—Ä–≥—É–Ω–¥–∞—Ä—ã —Ç–∞–π...


### 3. Tokenization and Light Stemming

In [83]:
# Step: Tokenization (whitespace-based)
filtered_df['tokens'] = filtered_df['text_clean'].apply(lambda x: x.split())

# Optional: Light suffix stripping for common Kyrgyz endings (pseudo-stemming)
def pseudo_stem(tokens):
    suffixes = [
    "–ª–∞—Ä", "–ª–µ—Ä", "–ª–æ—Ä", "–ª”©—Ä",
    "—Ç–∞—Ä", "—Ç–µ—Ä", "—Ç–æ—Ä", "—Ç”©—Ä",
    "–¥–∞—Ä", "–¥–µ—Ä", "–¥–æ—Ä", "–¥”©—Ä",  # plural
    "–¥—ã–Ω", "–¥–∏–Ω", "–¥—É–Ω", "–¥“Ø–Ω", 
    "—Ç–∞–Ω", "—Ç–µ–Ω", "–¥–∞–Ω", "–¥–µ–Ω",
    "–Ω–∞–Ω", "–Ω–µ–Ω", "—Ç–æ–Ω", "–¥–æ–Ω", # ablative case
    "–Ω—ã–Ω", "–Ω–∏–Ω", "–Ω—É–Ω", "–Ω“Ø–Ω",  # possessive
    "–≥–∞", "–≥–µ", "–∫–∞", "–∫–µ", "–≥–æ", "–Ω–∞", "–≥”©",     # dative case
    "–¥–∞", "–¥–µ", "—Ç–∞", "—Ç–µ", "–¥–æ", "—Ç–æ", "–¥”©",    # locative case
    "–¥—ã", "–¥–∏", "—Ç—É", "—Ç“Ø", "–¥“Ø",    # accusative
    "–Ω—ã", "–Ω–∏", "–Ω—É", "–Ω“Ø", "–¥—É",     # object
    "–ª—É—É", "–ª“Ø“Ø", "–ª—É", "–ª“Ø",    # derivation (e.g., –∂—É–º—É—à+—Ç—É—É)
    "—Å—ã–∑", "—Å“Ø–∑", "—Å—É–∑", "—Å“Ø",   # negative (e.g., –ø–∞–π–¥–∞—Å—ã–∑)
    "—á—ã–ª", "—á–∏–ª", "–∫—ã—á", "–≥“Ø—á",  # agentive, group
    "—á—ã–∫", "—á–µ–∫", "—á“Ø–∫", "–∫“Ø—á“Ø–∫",# diminutives/abstract
    "–º–∞–∫", "–º–µ–∫", "–ø—ã–∑",  "—á—ã", "–±–µ–π", "–≥–∞–Ω",   # verbal noun/infinity
    "–¥—ã–∫", "–¥–∏–∫", "—Ç—É–∫", "—Ç“Ø–∫",  # nominalization
    "—á—ã–ª—ã–∫", "—á–∏–ª–∏–∫", "–¥–∞–≥—ã",           # nominalizer
    "–ª”©—Ä–¥“Ø", "–¥–∞—Ä—ã", "—ã–ø", "–æ–ø", "–ø–µ–π"
               ]

    stemmed = []
    for word in tokens:
        for suf in suffixes:
            if word.endswith(suf) and len(word) > len(suf) + 2:
                word = word[: -len(suf)]
                break
        stemmed.append(word)
    return stemmed

# Apply optional stemmer
filtered_df['tokens_stemmed'] = filtered_df['tokens'].apply(pseudo_stem)

# Preview
filtered_df[['text_clean', 'tokens', 'tokens_stemmed']].sample(5)

Unnamed: 0,text_clean,tokens,tokens_stemmed
3552,–±–æ–ª–±–æ–π—Ç –±–µ–∫–µ—Ä —á—ã–∫–∫–∞–Ω –∫–æ–º—É—Ä–¥—É–Ω –Ω–µ—Å–∏ –∫—ã–º–±–∞—Ç –±–æ–ª—Å—É–Ω,"[–±–æ–ª–±–æ–π—Ç, –±–µ–∫–µ—Ä, —á—ã–∫–∫–∞–Ω, –∫–æ–º—É—Ä–¥—É–Ω, –Ω–µ—Å–∏, –∫—ã–º–±–∞...","[–±–æ–ª–±–æ–π—Ç, –±–µ–∫–µ—Ä, —á—ã–∫–∫–∞–Ω, –∫–æ–º—É—Ä, –Ω–µ—Å–∏, –∫—ã–º–±–∞—Ç, ..."
412,–∞–ª–∞–π —Ä–∞–π–æ–Ω—É–Ω—É–Ω —Å–æ–≥–æ–Ω–¥—É –∞–π—ã–ª—ã–Ω—ã–Ω —Ç—É—Ä–≥—É–Ω–¥–∞—Ä—ã —Ç–∞–π...,"[–∞–ª–∞–π, —Ä–∞–π–æ–Ω—É–Ω—É–Ω, —Å–æ–≥–æ–Ω–¥—É, –∞–π—ã–ª—ã–Ω—ã–Ω, —Ç—É—Ä–≥—É–Ω–¥–∞—Ä...","[–∞–ª–∞–π, —Ä–∞–π–æ–Ω—É, —Å–æ–≥–æ–Ω, –∞–π—ã–ª—ã, —Ç—É—Ä–≥—É–Ω, —Ç–∞–π–≥–∞–∫—Ç–∞—à..."
3052,—É—à—É —É—Å—Ç—É–¥–æ —Ç–æ–π–±–æ—Å —Ç–æ—Ä–¥—É–Ω –∞–π—ã–Ω–∞–Ω –µ–ª–±–∏–∑ –∞–∑–∞–ø —á–µ–∫—Ç–∏,"[—É—à—É, —É—Å—Ç—É–¥–æ, —Ç–æ–π–±–æ—Å, —Ç–æ—Ä–¥—É–Ω, –∞–π—ã–Ω–∞–Ω, –µ–ª–±–∏–∑, –∞...","[—É—à—É, —É—Å—Ç—É, —Ç–æ–π–±–æ—Å, —Ç–æ—Ä, –∞–π—ã, –µ–ª–±–∏–∑, –∞–∑–∞–ø, —á–µ–∫—Ç–∏]"
660,–∫–∞—Ä–∞ –∞–ª—Ç—ã–Ω–¥—ã–Ω ”©–º“Ø—Ä–¥“Ø —É—É—Ä–¥–∞–≥–∞–Ω —Ç“Ø–π—à“Ø–≥“Ø —Ç–∞—à–∫”©–º“Ø—Ä...,"[–∫–∞—Ä–∞, –∞–ª—Ç—ã–Ω–¥—ã–Ω, ”©–º“Ø—Ä–¥“Ø, —É—É—Ä–¥–∞–≥–∞–Ω, —Ç“Ø–π—à“Ø–≥“Ø, —Ç–∞...","[–∫–∞—Ä–∞, –∞–ª—Ç—ã–Ω, ”©–º“Ø—Ä, —É—É—Ä–¥–∞, —Ç“Ø–π—à“Ø–≥“Ø, —Ç–∞—à–∫”©–º“Ø—Ä, ..."
2706,–∂–æ–≥–æ—Ä–∫—É –∫–µ“£–µ—à –∂–æ–≥–æ—Ä–∫—É —Å–æ—Ç—Ç—É–Ω –º–∏–ª–¥–µ—Ç–∏–Ω –∞—Ç–∫–∞—Ä—ã–ø ...,"[–∂–æ–≥–æ—Ä–∫—É, –∫–µ“£–µ—à, –∂–æ–≥–æ—Ä–∫—É, —Å–æ—Ç—Ç—É–Ω, –º–∏–ª–¥–µ—Ç–∏–Ω, –∞—Ç...","[–∂–æ–≥–æ—Ä–∫—É, –∫–µ“£–µ—à, –∂–æ–≥–æ—Ä–∫—É, —Å–æ—Ç—Ç—É–Ω, –º–∏–ª–¥–µ—Ç–∏–Ω, –∞—Ç..."


### 4. Label Encoding

In [85]:
# Get unique labels
print("Unique sentiment labels:", filtered_df['label'].unique())

# Remove rows with unknown label
filtered_df = filtered_df[filtered_df['label'] != 'unknown'].copy()

# Map labels to integers
label_mapping = {
    'negative': 0,
    'neutral': 1,
    'positive': 2
}

# Apply mapping
filtered_df['label_encoded'] = filtered_df['label'].map(label_mapping)

# Verify
filtered_df[['label', 'label_encoded']].sample(15)

Unique sentiment labels: ['negative' 'unknown' 'positive' 'neutral']


Unnamed: 0,label,label_encoded
1359,negative,0
2419,positive,2
3325,negative,0
3984,negative,0
3594,negative,0
1334,negative,0
3681,negative,0
4639,negative,0
1125,negative,0
4519,negative,0


In [86]:
#filtered_df.to_csv("../data/filtered_df_stemmed.csv", index=False)

### 5. Handling Class Imbalance

In [59]:
filtered_df['label_encoded'].value_counts(normalize=True)

label_encoded
0    0.592968
2    0.294118
1    0.112914
Name: proportion, dtype: float64

In [60]:
from sklearn.linear_model import LogisticRegression

model = LogisticRegression(class_weight='balanced')

In [61]:
from sklearn.utils.class_weight import compute_class_weight
import numpy as np

y = filtered_df['label_encoded']

class_weights = compute_class_weight(
    class_weight='balanced',
    classes=np.unique(y),
    y=y
)

print(dict(enumerate(class_weights)))

{0: 0.5621436716077537, 1: 2.9520958083832336, 2: 1.1333333333333333}
