In [None]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)


import os
# Walk the Kaggle input mount and print the full path of every file found,
# so the available dataset files are visible in the cell output.
for folder, _subfolders, names in os.walk('/kaggle/input'):
    for name in names:
        full_path = os.path.join(folder, name)
        print(full_path)



In [None]:
# Load every JSON shard of the newspaper dataset into one DataFrame.
directory = '/kaggle/input/bangla-largest-newspaper-dataset/'

# os.listdir returns names in arbitrary order; sorting keeps the row order of
# combined_df (and any positional lookup made later) stable between runs.
files = sorted(file for file in os.listdir(directory) if file.endswith('.json'))
if not files:
    # Fail with a clear message instead of pandas' generic
    # "No objects to concatenate" error further down.
    raise FileNotFoundError(f"No .json files found in {directory!r}")

# One DataFrame per file, read in the sorted order above.
dfs = [pd.read_json(os.path.join(directory, file)) for file in files]

# Concatenate all DataFrames into one; ignore_index=True renumbers the rows
# 0..n-1 instead of repeating each file's own index.
combined_df = pd.concat(dfs, ignore_index=True)
combined_df.shape

In [None]:
# Work under the shorter name `df`; this binds the same DataFrame object, it does not copy it.
df = combined_df

In [None]:
# Show the first rows of the combined frame as a quick look at its columns and values.
df.head()

In [None]:
# Narrow the frame to the two columns used from here on: the category label
# and the article body. `df` is rebound to the narrowed frame.
columns_of_interest = ['category', 'body']
df = df[columns_of_interest]
df

In [None]:
# Count how many rows carry each distinct value of the 'category' column.
df['category'].value_counts()

In [None]:
# Pull the 'body' column out of the frame as a plain Python list, one entry
# per row, then display how many entries there are.
body_column = df["body"]
bangla_news_articles = body_column.tolist()
len(bangla_news_articles)

In [None]:
# Names of ethnic communities (Bangla spellings) used as search terms.
# NOTE(review): some entries are contained in others ("চাক" inside "চাকমা",
# "পাহাড়ি" inside "মালপাহাড়ি"), so any substring-based matching will count
# both entries for a single occurrence of the longer word.
ethnic_tribe_names = [
    "চাকমা", "মারমা", "সাঁওতাল", "ত্রিপুরা",
    "গারো", "ওঁরাও", "তঞ্চ্যঙ্গা", "ম্রো",
    "পাংখো", "চাক", "খেয়াং", "খুমি",
    "লুসাই", "কুকি", "রাখাইন", "মণিপুরী",
    "হাজং", "খাসিয়া", "মং", "বর্মন",
    "পাহাড়ি", "মালপাহাড়ি", "মুন্ডা", "ভূমিজ",
    "কন্দ", "পাঙন", "লাওরা", "মুরং",
    "বাগদী",
]
# Candidate names currently left out of the list above (reason not recorded):
# "বম","কোচ","ডালু","কোল", "রাজবংশী", "পাত্র", "ভিল", "গণ্ড", "খাসি"

# Generic words that refer to ethnic / indigenous communities as a whole;
# the first two are spelling variants of the same word.
ethnicity_directed_words = [
    "আদিবাসী",
    "আদিবাসি",
    "উপজাতি",
    "নৃগোষ্ঠী",
]

# Full search vocabulary: community names first, generic words after.
ethnic_dictionary = [*ethnic_tribe_names, *ethnicity_directed_words]

In [None]:
# Print the first article body to inspect what the raw text looks like.
print(bangla_news_articles[0])

In [None]:
def count_ethnic_terms(text, dictionary):
    """Return how many entries of ``dictionary`` occur as substrings of ``text``.

    Each dictionary entry is counted at most once, no matter how many times it
    appears in the text, so the result is the number of distinct entries found.

    Args:
        text: The article text to search (a ``str``).
        dictionary: Iterable of search terms (``str``).

    Returns:
        int: Number of dictionary entries present in ``text``.
    """
    return sum(1 for ethnic_word in dictionary if ethnic_word in text)


def split_by_ethnicity(articles, dictionary, min_terms=2):
    """Partition article bodies into ethnic, non-ethnic and unusable entries.

    Args:
        articles: Iterable of article bodies. Each body is either a single
            string or a list of strings; list items are classified one by one.
        dictionary: Search terms passed to ``count_ethnic_terms``.
        min_terms: Minimum number of distinct dictionary entries a text must
            contain to be treated as ethnic. The default of 2 matches the
            notebook's original ``> 1`` threshold.

    Returns:
        tuple[list, list, list]: ``(ethnic, nonethnic, suspicious)`` where
        ``suspicious`` holds every value that is not a string (missing bodies,
        numbers, nested containers) and therefore cannot be searched.
    """
    ethnic, nonethnic, suspicious = [], [], []
    for news_article in articles:
        # Treat a plain string as a one-item list so both shapes share one path.
        pieces = news_article if isinstance(news_article, list) else [news_article]
        for article in pieces:
            if not isinstance(article, str):
                # Substring search is only meaningful on text; keep the odd
                # value aside for inspection instead of hiding it behind a
                # catch-all exception handler.
                suspicious.append(article)
            elif count_ethnic_terms(article, dictionary) >= min_terms:
                # The count is taken on the piece itself. Using `in` against
                # the enclosing list would test element equality rather than
                # substring presence and give every piece the same verdict.
                ethnic.append(article)
            else:
                nonethnic.append(article)
    return ethnic, nonethnic, suspicious


ethnic_dataset, nonethnic_dataset, sus_entries = split_by_ethnicity(
    bangla_news_articles, ethnic_dictionary
)

print(f"Ethnic dataset length = {len(ethnic_dataset)}")
print(f"Non-ethnic dataset length = {len(nonethnic_dataset)}")
print(f"Sus dataset length = {len(sus_entries)}")

In [None]:
# Substrings deleted from every ethnic article, applied in exactly this order.
# NOTE(review): '\n' and '\xa0' are removed outright (not replaced by a space),
# so text on either side of a line break is joined without a separator.
to_remove = ['email\xa0protected', '\n\n\n\xa0\n\n\n\n\n', '\u200c্', '\n\n', '\xa0', '\n']


def strip_noise(text, patterns):
    """Return ``text`` with each of ``patterns`` removed, one after another."""
    for pattern in patterns:
        text = text.replace(pattern, '')
    return text


cleaned_ethnic_dataset = [strip_noise(raw_article, to_remove) for raw_article in ethnic_dataset]
len(cleaned_ethnic_dataset)

In [None]:
# Persist the cleaned list of articles to "ethnic_dataset.joblib" in the
# current working directory.
import joblib
joblib.dump(cleaned_ethnic_dataset, "ethnic_dataset.joblib")

In [None]:
# Display a link to the saved file in the cell output so it can be downloaded
# from the notebook interface.
from IPython.display import FileLink
FileLink(r'ethnic_dataset.joblib')

In [None]:
# Number of articles in the cleaned ethnic dataset.
len(cleaned_ethnic_dataset)

## Resources
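1. Bengali fake news classification using BERT (Kaggle notebook): https://www.kaggle.com/code/iamsdt/bengali-fake-news-classification-using-bert
2. BanglaBERT model card (Hugging Face): https://huggingface.co/csebuetnlp/banglabert
3. ML algorithms on emotion data, NAACL SRW 2021 (notebook): https://github.com/omar-sharif03/NAACL-SRW-2021/blob/main/Code%20Snippets/ML_Algorithms_NAACL_(Emotion_Data).ipynb
4. Topic modeling in Bengali, LDA visualizer (notebook): https://github.com/aljubaer/Topic-Modeling-in-Bengali/blob/master/code/lda-visualizer.ipynb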