# Install Necessary Libraries

In [None]:
%%capture
!pip install translators
!pip install git+https://github.com/csebuetnlp/normalizer

# Import Libraries

In [None]:
import re
import pandas as pd
from normalizer import normalize
from tqdm.auto import tqdm
import translators as ts

Using region District of Columbia server backend.



In [None]:
# Register tqdm's pandas integration so that `progress_apply` (used by the
# pre-processing cells below) is available and shows a progress bar.
tqdm.pandas()

# Load the Dataset

In [None]:
# Absolute Google Drive path of the merged dataset CSV. The save step at the
# end of the notebook derives its output folder and file name from this string.
dataset_file_location = '/content/drive/MyDrive/B.Tech. Final Year Project/Offensive Language Detection/Datasets/m_dataset_21_9/merged_dataset_21_9_arnab.csv'
df = pd.read_csv(dataset_file_location)
# Non-null count per column; a lower `text` count means rows with missing text.
df.count()

text    90500
hate    90501
dtype: int64

In [None]:
# Number of rows for each distinct value of the `hate` column.
df['hate'].value_counts()

0    52089
1    38412
Name: hate, dtype: int64

# Data Pre-processing

## Drop rows with missing values in the text column

In [None]:
# Keep only the rows whose `text` is present.
# NOTE: this rebinds `df`, so the "rows were removed" figure printed later is
# measured against this filtered frame, not against the raw file.
df = df.dropna(subset=["text"])

## Load Bengali Numbers

In [None]:
# Read the Bengali digit file and keep every character of it as one list item.
with open('/content/drive/MyDrive/B.Tech. Final Year Project/Offensive Language Detection/Preprocess Files/numbers_bn.txt', 'r', encoding='utf-8') as f:
    num_bn = list(f.read())
print(num_bn)

['০', '১', '২', '৩', '৪', '৫', '৬', '৭', '৮', '৯']


## Load Bengali Punctuations

In [None]:
# Read the punctuation file and keep every character of it as one list item.
with open('/content/drive/MyDrive/B.Tech. Final Year Project/Offensive Language Detection/Preprocess Files/own_punctuations-bn.txt', 'r', encoding='utf-8') as f:
    punctuations = list(f.read())
print('Number of Punctuations: ', len(punctuations))
print(punctuations)

Number of Punctuations:  37
['!', '"', '#', '$', '%', '&', "'", '(', ')', '*', '+', ',', '-', '.', '/', ':', ';', '<', '=', '>', '?', '@', '[', '\\', ']', '^', '_', '`', '{', '|', '}', '~', '।', '‘', '’', '“', '”']


## Remove Censored Comments
Some comments mask abusive words with two or more consecutive asterisks (```*```). Those comments are removed.

In [None]:
# Flag every comment containing a run of two or more asterisks, then keep an
# independent copy of the remaining rows.
is_censored = df['text'].str.contains(r'\*{2,}', regex=True)
df_clean = df[~is_censored].copy()

## Remove Special Words

In [None]:
special_words = ['<br />', ':D', ':P', ':v']

def remove_special_wrods(string):
    """Return `string` with every occurrence of each entry in `special_words`
    replaced by a single space.

    The entries are processed in list order, each one on the result of the
    previous replacement.
    """
    cleaned = string
    for marker in special_words:
        # Splitting on the marker and re-joining with a space is the same as
        # replacing each occurrence of the marker with ' '.
        cleaned = ' '.join(cleaned.split(marker))
    return cleaned

In [None]:
# Replace the special markers (HTML line breaks, text emoticons) in every comment
# with a space, showing a progress bar while doing so.
df_clean['text'] = df_clean['text'].progress_apply(remove_special_wrods)

  0%|          | 0/90392 [00:00<?, ?it/s]

## Remove Punctuations

In [None]:
# Build one deletion table from the loaded punctuation characters, then strip
# those characters from every comment.
punctuation_table = str.maketrans('', '', ''.join(punctuations))
df_clean['text'] = df_clean['text'].progress_apply(lambda text: text.translate(punctuation_table))

  0%|          | 0/90392 [00:00<?, ?it/s]

## Remove Links and Emojis
[Normalizer](https://github.com/csebuetnlp/normalizer) is used.

In [None]:
# Run the csebuetnlp normalizer on every comment, asking it to replace URLs and
# emojis with the empty string.
# NOTE(review): the punctuation cell above has already deleted ':', '/' and '.',
# so URLs may no longer be recognisable at this point — confirm whether this
# step should run before punctuation removal.
df_clean['text'] = df_clean['text'].progress_apply(lambda x: normalize(x, url_replacement='', emoji_replacement=''))

  0%|          | 0/90392 [00:00<?, ?it/s]

## Remove Numbers

### Remove English Numbers

In [None]:
# Drop every character for which `str.isdigit()` is true.
# Note: `isdigit()` is not limited to ASCII 0-9; it is also true for digits of
# other scripts, so this step is broader than "English numbers".
df_clean['text'] = df_clean['text'].progress_apply(
    lambda text: ''.join(ch for ch in text if not ch.isdigit())
)

  0%|          | 0/90392 [00:00<?, ?it/s]

### Remove Bengali Numbers

In [None]:
# Drop every character that appears in the loaded Bengali digit list.
bengali_digits = frozenset(num_bn)
df_clean['text'] = df_clean['text'].progress_apply(
    lambda text: ''.join(ch for ch in text if ch not in bengali_digits)
)

  0%|          | 0/90392 [00:00<?, ?it/s]

## Remove Single-Letter Words

In [None]:
def _keep_multi_letter_words(text):
    """Return `text` re-joined with single spaces, without one-character tokens."""
    kept = [token for token in text.split() if len(token) >= 2]
    return ' '.join(kept)

df_clean['text'] = df_clean['text'].progress_apply(_keep_multi_letter_words)

  0%|          | 0/90392 [00:00<?, ?it/s]

## Translate English words to Bengali

If the code below raises an error such as:<br />
**```KeyError: 0```**<br />

then re-run the cell until it completes without the error.<br />
This can happen due to some problem in the translator.


In [None]:
# Sweep counter for the translation loop in the next cell.
# NOTE: this name shadows Python's built-in `iter()` for the rest of the session.
iter = 0
# Maps sweep number -> number of English words seen in that sweep. The entry for
# key 0 is a sentinel that the first sweep compares its count against.
num_of_eng_words = {0: 0}

In [None]:
# Translate stand-alone English words to Bengali, sweeping the dataset again and
# again until a sweep finds exactly as many English words as the previous one
# (i.e. no further progress is being made).
#
# Uses the globals `iter` (sweep counter) and `num_of_eng_words`
# (sweep number -> English words seen) from the previous cell and rewrites
# `df_clean['text']` row by row.
english_word_re = re.compile(r'^[a-zA-Z]+$')
# word -> replacement text. The same word occurs in many comments, so caching
# avoids issuing one network request per occurrence.
translation_cache = {}

while True:
    iter += 1
    count = 0
    failed_requests = 0
    last_error = None
    print("Iteration:", iter)
    for row in tqdm(df_clean.itertuples(), total=len(df_clean), desc='Translating English Words'):
        tokens = row.text.split()
        row_changed = False
        for position, word in enumerate(tokens):
            if not english_word_re.match(word):
                continue
            count += 1
            if word not in translation_cache:
                try:
                    word_to_bn = ts.translate_text(word, translator='google', from_language='en', to_language='bn')
                except Exception as error:
                    # A failing request must not abort the whole sweep: keep the
                    # word untouched so that the next sweep retries it.
                    failed_requests += 1
                    last_error = error
                    continue
                # A word echoed back unchanged has no translation; drop it.
                translation_cache[word] = '' if word_to_bn == word else word_to_bn
            # Replace this token only. A substring replace on the whole text
            # would also rewrite other tokens that merely contain the word.
            tokens[position] = translation_cache[word]
            row_changed = True
        if row_changed:
            # One write per row; tokens that were dropped leave no double spaces.
            df_clean.loc[row.Index, 'text'] = ' '.join(token for token in tokens if token)
    if failed_requests:
        print(failed_requests, "translation requests failed and will be retried. Last error:", repr(last_error))
    # `.get` keeps a re-run after an interrupted sweep from raising KeyError when
    # the previous sweep never recorded its count.
    if num_of_eng_words.get(iter - 1) == count:
        print(count, " words can not be translated.")
        break
    else:
        num_of_eng_words[iter] = count

# Drop the sentinel entry before reporting.
num_of_eng_words.pop(0, None)
print("Number of English Words found in the Dataset:")
# Separate loop names so the summary does not overwrite `iter` and `count`.
for sweep_number, words_seen in num_of_eng_words.items():
    print(f'Interation {sweep_number}: {words_seen}')

Iteration: 11


Translating English Words:   0%|          | 0/90346 [00:00<?, ?it/s]

KeyError: ignored

In [None]:
# Remove every token that still consists solely of ASCII letters after the
# translation step.
english_word_re = re.compile(r'^[a-zA-Z]+$')

def _drop_english_words(text):
    """Return `text` re-joined with single spaces, without its all-ASCII-letter tokens.

    Filtering token by token leaves mixed tokens alone; a substring replace on
    the whole text would also cut the word out of any longer token containing it.
    """
    return ' '.join(word for word in text.split() if not english_word_re.match(word))

df_clean['text'] = [
    _drop_english_words(text)
    for text in tqdm(df_clean['text'], total=len(df_clean), desc='Removing English Words')
]

Translating English Words:   0%|          | 0/90346 [00:00<?, ?it/s]

## Strip any Whitespace Characters from the ```text``` Column

In [None]:
# Trim leading and trailing whitespace so that comments emptied by the removals
# above become '' and are caught by the blank-comment filter that follows.
df_clean["text"] = df_clean["text"].str.strip()

## Remove Blank Comments

In [None]:
# Keep only the comments that are not empty strings and renumber the index from 0.
is_blank = df_clean['text'].str.len() == 0
df_clean = df_clean[~is_blank].reset_index(drop=True)

# Cleaned Dataset Details

In [None]:
# Non-null count per column of the cleaned frame.
df_clean.count()

text    90345
hate    90345
dtype: int64

In [None]:
# Difference in non-null `text` rows between `df` (after its missing-text rows
# were dropped) and the cleaned frame.
rows_removed = df.count()['text'] - df_clean.count()['text']
print(rows_removed, "rows were removed after pre-processing.")

155 rows were removed after pre-processing.


In [None]:
# Character length of every cleaned comment, computed once for both extremes.
text_lengths = df_clean['text'].str.len()
print("Maximum string length:", text_lengths.max())
print("Minimum string length:", text_lengths.min())

Maximum string length: 3463
Minimum string length: 2


In [None]:
# Whitespace-separated word count of every cleaned comment, computed once.
words_per_comment = df_clean['text'].str.split().str.len()
print("Maximum number of words in a sentence :", words_per_comment.max())
print("Minimum number of words in a sentence :", words_per_comment.min())

Maximum number of words in a sentence : 572
Minimum number of words in a sentence : 1


# Saving the Cleaned Dataset

In [None]:
# Write the cleaned frame next to the source CSV as `cleaned_<source name>.csv`.
# `rpartition` yields an empty folder and separator when the path has no '/',
# so the output then lands in the working directory instead of a bogus folder.
df_folder_name, path_separator, source_file_name = dataset_file_location.rpartition('/')
# Strip only the final extension; splitting on every '.' would cut a file name
# such as `data.v2.csv` down to `data`.
df_clean_name = 'cleaned_' + source_file_name.rsplit('.', 1)[0]
df_clean.to_csv(df_folder_name + path_separator + df_clean_name + '.csv', index=False)