# Amharic Hate Speech Detector using Naive Bayes

## Preprocessing

### Importing Dataset

In [1]:
import re
import math
import pandas
import string
import matplotlib.pyplot as plt
from sklearn.utils import shuffle
from sklearn.model_selection import train_test_split

In [2]:
%matplotlib inline

In [3]:
DATASET_FILEPATH = "datasets/dataset.xls"
raw_data = pandas.read_excel(DATASET_FILEPATH, sheet_name='sheet')

raw_data = shuffle(raw_data)
raw_data.head()

Unnamed: 0,Comment No.,0,Comment,Like,(view source),Timestamp
2896,66.0,0.0,ሙድ አለመሆኑ ነው?,0,view comment,"Tuesday, April 28, 2020 11:17am GMT"
1929,74.0,0.0,ዛሬ በትግራይ ጤና ምርምር ተቋም(THRI) የኮሮና ምርመራ የተደረገባቸው ...,0,view comment,"Sunday, April 12, 2020 2:09pm GMT"
420,,0.0,Tigray Ethiopia እኔ ገፅ ላይ የሰው ስም አታጥፋ,2,view comment,"Saturday, April 4, 2020 1:20pm GMT"
2105,47.0,0.0,ታድለናል ሁሉም የኛ!! ኦ እግዚአብሔር እስከ መቼ ነው ከነዚህ ጋር ተጣብ...,0,view comment,"Saturday, April 25, 2020 1:34pm GMT"
2869,42.0,0.0,ንጉስ ሆይ ሺ አመት ንገስልን ፡ best miracle of the era.,0,view comment,"Tuesday, April 28, 2020 3:51am GMT"


### Extracting Data

In [4]:
labels = raw_data.iloc[:, 1]
comments = raw_data.iloc[:, 2]

### Remove Alphabets

In [5]:
comments = [re.sub(r"[A-Za-z]", "", str(comment)) for comment in comments]

### Remove Numbers

In [6]:
comments = [re.sub('[0-9]', "", str(comment)) for comment in comments]

### Remove Punctuations

In [7]:
punctuations = "!\"#$%&'()*+,-./:;<=>?@[\]^_`{|}~፡።፣፥¡"
comments = [str(comment).translate(str.maketrans('', '', punctuations)) for comment in comments]

### Remove Whitespaces

In [8]:
comments = [(" ".join([ word for word in str(comment).split()])) for comment in comments]

### Remove Empty Comments

In [9]:
X = []
Y = []

number_of_comments = len(comments)
for i in range(number_of_comments):
    if comments[i].strip() and not math.isnan(float(labels[i])):
        X.append(comments[i])
        Y.append(labels[i])

## Bag of Words Encoding

In [10]:
from sklearn.feature_extraction.text import CountVectorizer

In [11]:
count_vectorizer = CountVectorizer(max_features=1000, min_df=5, max_df=0.7)
X = count_vectorizer.fit_transform(X).toarray()

In [12]:
print("Training Dataset Shape: ", X.shape)
print("Training Label Shape: ", len(Y))

Training Dataset Shape:  (2488, 576)
Training Label Shape:  2488


## Split Dataset

In [13]:
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.2)

In [14]:
print("X_train Shape: ", X_train.shape)
print("X_test Shape: ", X_test.shape)

X_train Shape:  (1990, 576)
X_test Shape:  (498, 576)


In [15]:
print("Y_train Shape: ", len(Y_train))
print("Y_test Shape: ", len(Y_test))

Y_train Shape:  1990
Y_test Shape:  498


## Naive Bayes

In [18]:
from sklearn.naive_bayes import GaussianNB

In [19]:
classifier = GaussianNB()
classifier.fit(X_train, Y_train) 

GaussianNB(priors=None, var_smoothing=1e-09)

## Evaluate Model

In [20]:
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score

In [21]:
Y_pred = classifier.predict(X_test)

### Confusion Matrix

In [22]:
print(confusion_matrix(Y_test, Y_pred))

[[343  72  61]
 [ 16   4   2]
 [  0   0   0]]


### Classification Report

In [23]:
print(classification_report(Y_test, Y_pred))

              precision    recall  f1-score   support

         0.0       0.96      0.72      0.82       476
         1.0       0.05      0.18      0.08        22
        11.0       0.00      0.00      0.00         0

    accuracy                           0.70       498
   macro avg       0.34      0.30      0.30       498
weighted avg       0.92      0.70      0.79       498



  _warn_prf(average, modifier, msg_start, len(result))


### Accuracy Score

In [24]:
print(accuracy_score(Y_test, Y_pred))

0.6967871485943775
