# Amharic Hate Speech Detector using Naive Bayes

## Preprocessing

### Importing Dataset

In [30]:
import re
import math
import pandas
import string
import matplotlib.pyplot as plt
from sklearn.utils import shuffle
from sklearn.model_selection import train_test_split

In [31]:
%matplotlib inline

In [32]:
DATASET_FILEPATH = "datasets/dataset.xls"
raw_data = pandas.read_excel(DATASET_FILEPATH, sheet_name='sheet')

raw_data = shuffle(raw_data)
raw_data.head()

Unnamed: 0,Comment No.,0,Comment,Like,(view source),Timestamp
3389,60.0,0.0,እሱን ተወዉ ይልቅ የአመት እርፍት ልዉጣ በጣት የሚቆጠር ቀን ስለቀረኝ ሌ...,2,view comment,"Sunday, April 19, 2020 9:17am GMT"
1968,,0.0,Dawit Tadesse ይሄ ደም ለማንም ሊሰጥ እንደማይችል አለማወቅህ ነው,7,view comment,"Wednesday, April 22, 2020 10:48am GMT"
272,23.0,0.0,"I really appreciate you Fitse, you are too cri...",0,view comment,"Monday, April 20, 2020 8:16am GMT"
2829,10.0,0.0,ከሰባተኛው ንጉስም አያንስም!! አገር ለማፈራረስ ደሞ!!,4,view comment,"Monday, April 27, 2020 11:15pm GMT"
1628,,0.0,Fetsum Berhane Dire ትክክል እኛ 29 አድርገን ሪከርዱን ሰበ...,0,view comment,"Saturday, April 18, 2020 4:13pm GMT"


### Extracting Data

In [33]:
labels = raw_data.iloc[:, 1]
comments = raw_data.iloc[:, 2]

### Remove Alphabets

In [34]:
comments = [re.sub(r"[A-Za-z]", "", str(comment)) for comment in comments]

### Remove Numbers

In [35]:
comments = [re.sub('[0-9]', "", str(comment)) for comment in comments]

### Remove Punctuations

In [36]:
punctuations = "!\"#$%&'()*+,-./:;<=>?@[\]^_`{|}~፡።፣፥¡"
comments = [str(comment).translate(str.maketrans('', '', punctuations)) for comment in comments]

### Remove Whitespaces

In [37]:
comments = [(" ".join([ word for word in str(comment).split()])) for comment in comments]

### Remove Empty Comments

In [38]:
X = []
Y = []

number_of_comments = len(comments)
for i in range(number_of_comments):
    if comments[i].strip() and not math.isnan(float(labels[i])):
        X.append(comments[i])
        Y.append(labels[i])

### Replace Amharic Characters

In [60]:
character_mapping = {
    "ሐ" : "ሀ", "ሑ" : "ሁ", "ሒ" : "ሂ", "ሓ" : "ሀ", "ሔ" : "ሄ", "ሕ" : "ህ", "ሖ" : "ሆ", "ሃ" : "ሆ",
    "ኀ" : "ሀ", "ኁ" : "ሁ", "ኂ" : "ሂ", "ኃ" : "ሀ", "ኄ" : "ሄ", "ኅ" : "ህ", "ኆ" : "ሆ" ,
    "ሠ" : "ሰ", "ሡ" : "ሱ", "ሢ" : "ሲ", "ሣ" : "ሳ", "ሤ" : "ሴ", "ሥ" : "ስ", "ሦ" : "ሶ" ,
    "ዐ" : "አ", "ዑ" : "ኡ", "ዒ" : "ኢ", "ዓ" : "አ", "ዔ" : "ኤ", "ዕ" : "እ", "ዖ" : "ኦ", "ኣ" : "አ",
    "ጸ" : "ፀ", "ጹ" : "ፁ", "ጺ" : "ፂ", "ጻ" : "ፃ", "ጼ" : "ፄ", "ጽ" : "ፅ", "ጾ" : "ፆ"
}

In [44]:
def replace_characters(comment, mapping):
    for character in comment:
        if character in mapping:
            comment = comment.replace(character, mapping[character])
    return comment

In [46]:
comments = [replace_characters(comment, character_mapping) for comment in comments]

## Bag of Words Encoding

In [47]:
from sklearn.feature_extraction.text import CountVectorizer

In [48]:
count_vectorizer = CountVectorizer(max_features=1000, min_df=5, max_df=0.7)
X = count_vectorizer.fit_transform(X).toarray()

In [49]:
print("Training Dataset Shape: ", X.shape)
print("Training Label Shape: ", len(Y))

Training Dataset Shape:  (2478, 567)
Training Label Shape:  2478


## Split Dataset

In [50]:
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.2)

In [51]:
print("X_train Shape: ", X_train.shape)
print("X_test Shape: ", X_test.shape)

X_train Shape:  (1982, 567)
X_test Shape:  (496, 567)


In [52]:
print("Y_train Shape: ", len(Y_train))
print("Y_test Shape: ", len(Y_test))

Y_train Shape:  1982
Y_test Shape:  496


## Naive Bayes

In [53]:
from sklearn.naive_bayes import GaussianNB

In [54]:
classifier = GaussianNB()
classifier.fit(X_train, Y_train) 

GaussianNB(priors=None, var_smoothing=1e-09)

## Evaluate Model

In [55]:
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score

In [56]:
Y_pred = classifier.predict(X_test)

### Confusion Matrix

In [57]:
print(confusion_matrix(Y_test, Y_pred))

[[329  73  76]
 [ 14   1   3]
 [  0   0   0]]


### Classification Report

In [58]:
print(classification_report(Y_test, Y_pred))

              precision    recall  f1-score   support

         0.0       0.96      0.69      0.80       478
         1.0       0.01      0.06      0.02        18
        11.0       0.00      0.00      0.00         0

    accuracy                           0.67       496
   macro avg       0.32      0.25      0.27       496
weighted avg       0.92      0.67      0.77       496



  _warn_prf(average, modifier, msg_start, len(result))


### Accuracy Score

In [59]:
print(accuracy_score(Y_test, Y_pred))

0.6653225806451613
