In [1]:
import pandas as pd
import numpy as np
import joblib 
import nltk
import re
import gensim
import gensim.corpora as corpora
from gensim.utils import simple_preprocess
from gensim.models import CoherenceModel
from pprint import pprint

In [2]:
!ls /kaggle/input/news-articles-with-ethnic-words

ethnic_dataset_cleaned.joblib


In [3]:
ethnic_dataset = joblib.load("/kaggle/input/news-articles-with-ethnic-words/ethnic_dataset_cleaned.joblib")

In [5]:
len(ethnic_dataset)

10187

In [6]:
ethnic_tribe_names = [
    "চাকমা", "মারমা", "সাঁওতাল", "ত্রিপুরা", "গারো", "ওঁরাও", "তঞ্চ্যঙ্গা", "ম্রো", 
    "পাংখো", "চাক", "খেয়াং", "খুমি", "লুসাই","কুকি", "রাখাইন", "মণিপুরী",
    "হাজং", "খাসিয়া", "মং", "বর্মন", "পাহাড়ি", "মালপাহাড়ি", "মুন্ডা", "ভূমিজ",
    "কন্দ", "পাঙন", "লাওরা", "মুরং", "বাগদী"
] #"বম","কোচ","ডালু","কোল", "রাজবংশী", "পাত্র", "ভিল", "গণ্ড", "খাসি"

ethnicity_directed_words = [
    "আদিবাসী" , "আদিবাসি" , "উপজাতি", "নৃগোষ্ঠী"
]
ethnic_dictionary = ethnic_tribe_names + ethnicity_directed_words

In [9]:
relevant_articles = []
THRESHOLD = 5
list_count = 0
for article in ethnic_dataset:
    if isinstance(article, str):
        ethnic_word_count = sum([1 for ethnic_word in ethnic_dictionary if ethnic_word in article])
        if ethnic_word_count > THRESHOLD:
            relevant_articles.append(article)
    else:
        list_count +=1 
print(f"Total Ethnic Article: {len(ethnic_dataset)}\nRelevant Ethnic Article: {len(relevant_articles)}\nOthers: {list_count}")

Total Ethnic Article: 10187
Relevant Ethnic Article: 221
Others: 0


### Bangla Topic Modeling
https://github.com/aljubaer/Topic-Modeling-in-Bengali/blob/master/code/data-preprocessor.ipynb

In [18]:
import re
import numpy as np
import pandas as pd
from pprint import pprint
# Gensim
import gensim
import gensim.corpora as corpora
from gensim.utils import simple_preprocess
from gensim.models import CoherenceModel
import nltk
import json
import os

In [19]:
def valid_bengali_letters(char):
    return ord(char) >= 2433 and ord(char) <= 2543 

def get_replacement(char):
    if valid_bengali_letters(char):
        return char
    newlines = [10, 2404, 2405, 2551, 9576]
    if ord(char) in newlines: 
        return ' '
    return ' ';

def get_valid_lines(line):
    copy_line = ''
    for letter in line:
        copy_line += get_replacement(letter)
    return copy_line

def sent_to_words(sentences):
    for sentence in sentences:
        yield(nltk.word_tokenize(get_valid_lines(sentence)))  # deacc=True removes punctuations

In [20]:
!wget https://github.com/aljubaer/Topic-Modeling-in-Bengali/blob/master/code/stop_words.txt

--2024-04-06 07:21:15--  https://github.com/aljubaer/Topic-Modeling-in-Bengali/blob/master/code/stop_words.txt
Resolving github.com (github.com)... 140.82.121.3
Connecting to github.com (github.com)|140.82.121.3|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: unspecified [text/html]
Saving to: 'stop_words.txt'

stop_words.txt          [ <=>                ] 153.78K  --.-KB/s    in 0.03s   

2024-04-06 07:21:15 (4.58 MB/s) - 'stop_words.txt' saved [157470]



In [25]:
stopwords_file = open('stop_words.txt', "r+", encoding = 'utf-8')
all_stopwords = stopwords_file.read()
stopwords_ready = [word.strip() for word in all_stopwords.split()]
def convertToDataFrame(data_json):
    df = pd.DataFrame(data_json)
    print('DataFrame shape' + str(df.shape))
    return df

def remove_stopwords(content):   
    without_stopwords = []
    for word in content:
        if word not in stopwords_ready and len(word) > 5:
            without_stopwords.append(word)
    return without_stopwords
def remove_stopwords_list(data_list):
    data_without_stopwords_list = []
    for content in data_list:
        data_without_stopwords_list.append(remove_stopwords(content))
    return data_without_stopwords_list
def runLda(data_ready, num_topics = 10, iterations = 1000, alpha='auto'):
    # Create Dictionary
    id2word = corpora.Dictionary(data_ready)

    # Create Corpus: Term Document Frequency
    corpus = [id2word.doc2bow(text) for text in data_ready]

    # Build LDA model
    lda_model = gensim.models.ldamodel.LdaModel(corpus=corpus,
                                               id2word=id2word,
                                               num_topics=num_topics, 
                                               random_state=100,
                                               update_every=1,
                                               chunksize=30,
                                               passes=30,
                                               alpha=alpha,
                                               iterations=iterations,
                                               per_word_topics=True)
    return lda_model
def ldaOutputProducer(lda_model):
    x = (lda_model.show_topics(num_topics=20, num_words=40,formatted=False))
    topics_words = [(tp[0], [wd[0] for wd in tp[1]], [wd[1] for wd in tp[1]]) for tp in x]
    output_json_list = []
    for topic,words,conts in topics_words:
        topic_json = {}
        topic_content = {}
        topic_content["words"] = words
        topic_content["conts"] = conts
        #topic_json[str(topic)] = topic_content
        output_json_list.append(topic_content)
    out_df = convertToDataFrame(output_json_list)
    out_df.to_json(r'topic_dist.txt')

In [28]:
data_tokenized_list = list(sent_to_words(relevant_articles))
data_without_stopwords_list = remove_stopwords_list(data_tokenized_list)
#data_stemmed_list = stemming_data_list(data_without_stopwords_list)
lda_model = runLda(data_without_stopwords_list, num_topics = 10, iterations = 1000)
for topic_id, topic_words in lda_model.print_topics():
    print(f"Topic ID: {topic_id}, Words: {topic_words}")
ldaOutputProducer(lda_model)

Topic ID: 0, Words: 0.030*"পার্বত্য" + 0.014*"চট্টগ্রাম" + 0.011*"উপজেলার" + 0.010*"ত্রিপুরা" + 0.009*"প্রাথমিক" + 0.009*"শিশুদের" + 0.008*"বিভিন্ন" + 0.007*"উন্নয়ন" + 0.007*"পাঠ্যপুস্তক" + 0.007*"চেয়ারম্যান"
Topic ID: 1, Words: 0.033*"তঞ্চঙ্গ্যা" + 0.023*"সম্প্রদায়" + 0.015*"উৎসবকে" + 0.011*"তরুণীরা" + 0.011*"পাহাড়ের" + 0.008*"পুরাতন" + 0.007*"নেওয়ার" + 0.006*"লকডাউন" + 0.006*"বান্দরবানে" + 0.005*"সাঙ্গু"
Topic ID: 2, Words: 0.021*"আদিবাসী" + 0.016*"পরিষদের" + 0.016*"আওয়ামী" + 0.015*"সভাপতি" + 0.013*"সম্পাদক" + 0.010*"বক্তব্য" + 0.009*"পার্বত্য" + 0.009*"আদিবাসীদের" + 0.008*"সাধারণ" + 0.008*"চেয়ারম্যান"
Topic ID: 3, Words: 0.018*"বৈসাবি" + 0.016*"সাংগ্রাই" + 0.015*"সম্প্রদায়ের" + 0.014*"পাহাড়ি" + 0.011*"ঐতিহ্যবাহী" + 0.011*"এপ্রিল" + 0.010*"অনুষ্ঠান" + 0.009*"উৎসবের" + 0.009*"অনুষ্ঠিত" + 0.009*"প্রধান"
Topic ID: 4, Words: 0.000*"জন্মদিনকে" + 0.000*"ছোটাছুটি" + 0.000*"নালন্দার" + 0.000*"দলবদ্ধ" + 0.000*"দলগতভাবে" + 0.000*"নৃত্যকলা" + 0.000*"জন্মদিন" + 0.000*"গ্যালারির" + 0.000*"চতুর্থত

{"words":{"0":["\u09aa\u09be\u09b0\u09cd\u09ac\u09a4\u09cd\u09af","\u099a\u099f\u09cd\u099f\u0997\u09cd\u09b0\u09be\u09ae","\u0989\u09aa\u099c\u09c7\u09b2\u09be\u09b0","\u09a4\u09cd\u09b0\u09bf\u09aa\u09c1\u09b0\u09be","\u09aa\u09cd\u09b0\u09be\u09a5\u09ae\u09bf\u0995","\u09b6\u09bf\u09b6\u09c1\u09a6\u09c7\u09b0","\u09ac\u09bf\u09ad\u09bf\u09a8\u09cd\u09a8","\u0989\u09a8\u09cd\u09a8\u09df\u09a8","\u09aa\u09be\u09a0\u09cd\u09af\u09aa\u09c1\u09b8\u09cd\u09a4\u0995","\u099a\u09c7\u09df\u09be\u09b0\u09ae\u09cd\u09af\u09be\u09a8","\u09ae\u09be\u09a4\u09c3\u09ad\u09be\u09b7\u09be\u09df","\u0995\u09be\u09b0\u09cd\u09af\u0995\u09cd\u09b0\u09ae","\u098f\u09b2\u09be\u0995\u09be\u09df","\u09b0\u09be\u0999\u09cd\u0997\u09be\u09ae\u09be\u099f\u09bf","\u09b6\u09bf\u0995\u09cd\u09b7\u09be\u09b0\u09cd\u09a5\u09c0\u09a6\u09c7\u09b0","\u09ac\u09be\u09a8\u09cd\u09a6\u09b0\u09ac\u09be\u09a8","\u09b6\u09bf\u0995\u09cd\u09b7\u09be\u09b0\u09cd\u09a5\u09c0\u09b0\u09be","\u09ae\u09be\u09a7\u09cd\u09af\u09ae\u0

### Inspect the dataset
 - If `চাকমা`, `ত্রিপুরা` is in someone's name
     - `কোচ` is a tribe name. So many of our articles are on sports.
     -  We are removing `"বম","কোচ","ডালু","কোল", "রাজবংশী", "পাত্র", "ভিল", "গণ্ড", "খাসি"` from our list of ethnic words since they may mean many things aside from a tribe name.
 - Removing `মুসলিম` from `পাঙন মুসলিম` since it is picking religious articles.
 - We have this noise everywhere.`\n\n\xa0\n\n` , `\u200c্`
 - There are some arrays. We need to parse them.
### Questions
 - Why are there`রোহিঙ্গা` articles in our ethnic dataset?
 - How do we determine which articles are actually on ethnic people? In many articles, the perpetrators of a crime just happen to be a `চাকমা`. Those articles are not about ethnic people!