# Amharic Hate Speech Detector using Random Forest Classifier

## Preprocessing

### Importing Dataset

In [106]:
import re
import pandas
import string
import matplotlib.pyplot as plt
from sklearn.utils import shuffle
from sklearn.model_selection import train_test_split

In [107]:
# IPython magic: render matplotlib figures inline in the notebook output.
%matplotlib inline

In [108]:
DATASET_FILEPATH = "datasets/dataset.xls"
SHUFFLE_SEED = 0  # fixed seed so the shuffled row order is identical on every run

# Load the worksheet named 'sheet' from the Excel workbook into a DataFrame.
raw_data = pandas.read_excel(DATASET_FILEPATH, sheet_name='sheet')

# Randomise the row order. Without random_state a different permutation is
# drawn on every run, so none of the downstream numbers can be reproduced.
# shuffle keeps the original index labels on the rows, so later cells should
# address rows by position (.iloc / iteration), not by label.
raw_data = shuffle(raw_data, random_state=SHUFFLE_SEED)
raw_data.head()

Unnamed: 0,Comment No.,0,Comment,Like,(view source),Timestamp
1216,5.0,0,አሳዬ ጋር ስትዋደዱ እኮ,0,view comment,"Saturday, April 25, 2020 8:23pm GMT"
1979,,0,አሰር ሓየሎም እሳቸውማ ለ ሀገር ሰርተው ለፍተው አልፈዋል.... ያጠፉት...,0,view comment,"Wednesday, April 22, 2020 6:59pm GMT"
1572,,0,Mesay Tgm ante dedeb neh ende man new midefrew...,0,view comment,"Friday, April 10, 2020 4:02am GMT"
2327,206.0,0,መልካም በአል,0,view comment,"Sunday, April 19, 2020 1:42pm GMT"
2754,38.0,0,ኑርልን።እድሜና ጤና ይስጥህ,0,view comment,"Saturday, April 18, 2020 5:08pm GMT"


### Extracting Data

In [109]:
# Select the two columns used below by position: column 1 is bound to
# `labels` and column 2 to `comments`.
labels, comments = (raw_data.iloc[:, position] for position in (1, 2))

### Remove Latin Letters

In [110]:
# Delete every ASCII Latin letter (A-Z, a-z) from each comment; str() lets
# non-string cells pass through the regex as text.
latin_letter = re.compile(r"[A-Za-z]")
comments = [latin_letter.sub("", str(text)) for text in comments]

### Remove Numbers

In [111]:
# Delete the ASCII digits 0-9 from each comment.
ascii_digit = re.compile('[0-9]')
comments = [ascii_digit.sub("", str(text)) for text in comments]

### Remove Punctuation

In [112]:
# ASCII punctuation plus the Ethiopic marks ፡ ። ፣ ፥ and the inverted
# exclamation mark ¡. string.punctuation is exactly the ASCII part of the
# earlier hand-written literal, whose "\]" was an invalid escape sequence.
punctuations = string.punctuation + "፡።፣፥¡"

# Map each mark to a space instead of deleting it: deletion fuses words that
# are separated only by punctuation (e.g. "ኑርልን።እድሜና" would become one token).
# The surplus spaces are collapsed by the whitespace step that follows.
# The table is loop-invariant, so it is built once rather than per comment.
punctuation_to_space = str.maketrans(punctuations, " " * len(punctuations))
comments = [str(comment).translate(punctuation_to_space) for comment in comments]

### Normalize Whitespace

In [113]:
# Collapse every run of whitespace into a single space and trim both ends:
# split() with no argument drops all whitespace, join() re-inserts one space.
comments = [" ".join(str(text).split()) for text in comments]

### Remove Empty Comments

In [126]:
# Keep only the samples whose cleaned comment still contains text, with every
# comment paired to the label from its own row.
X = []
Y = []
number_of_comments = len(comments)

# zip walks both sequences in positional order. Indexing the shuffled label
# Series as labels[i] looks rows up by index label rather than by position,
# which pairs each comment with the label of a different row.
for comment, label in zip(comments, labels):
    # The earlier condition ended in a dangling `and`, which is a syntax
    # error. The second clause here rejects rows whose label is missing.
    # NOTE(review): confirm this is the check that was intended.
    if comment.strip() and pandas.notna(label):
        X.append(comment)
        Y.append(label)

## TF-IDF Encoding

In [130]:
from sklearn.feature_extraction.text import TfidfVectorizer

In [131]:
# TF-IDF features: keep at most 500 terms, skipping terms that occur in fewer
# than 5 comments or in more than 70% of them.
tfidf_vectorizer = TfidfVectorizer(
    max_df=0.7,
    min_df=5,
    max_features=500,
)
# NOTE(review): the vocabulary and IDF weights are learned here from all
# comments, including the ones held out for testing later on.
# The result is converted to a dense array and rebinds X.
X = tfidf_vectorizer.fit_transform(X).toarray()

In [132]:
# Report the feature-matrix shape next to the number of labels.
for caption, value in (
    ("Training Dataset Shape: ", X.shape),
    ("Training Label Shape: ", len(Y)),
):
    print(caption, value)

Training Dataset Shape:  (2577, 601)
Training Label Shape:  2577


## Split Dataset

In [133]:
# Hold out 20% of the samples for evaluation. random_state pins the split so
# the same rows land in train and test on every run; without it each run is
# scored on a different test set and the reported metrics are not repeatable.
X_train, X_test, Y_train, Y_test = train_test_split(
    X, Y, test_size=0.2, random_state=0
)

In [134]:
# Show the shapes of the training and test feature matrices.
for caption, features in (
    ("X_train Shape: ", X_train),
    ("X_test Shape: ", X_test),
):
    print(caption, features.shape)

X_train Shape:  (2061, 601)
X_test Shape:  (516, 601)


In [135]:
# Show how many labels ended up in the training and test partitions.
for caption, split_labels in (
    ("Y_train Shape: ", Y_train),
    ("Y_test Shape: ", Y_test),
):
    print(caption, len(split_labels))

Y_train Shape:  2061
Y_test Shape:  516


## Random Forest Classifier

In [136]:
from sklearn.ensemble import RandomForestClassifier

In [137]:
# Random forest with 1000 trees; random_state=0 fixes the forest's own
# randomness so repeated fits on the same data give the same model.
classifier = RandomForestClassifier(
    random_state=0,
    n_estimators=1000,
)
# Left as the last expression so the fitted estimator is shown as cell output.
classifier.fit(X_train, Y_train)

RandomForestClassifier(bootstrap=True, ccp_alpha=0.0, class_weight=None,
                       criterion='gini', max_depth=None, max_features='auto',
                       max_leaf_nodes=None, max_samples=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, n_estimators=1000,
                       n_jobs=None, oob_score=False, random_state=0, verbose=0,
                       warm_start=False)

## Evaluate Model

In [139]:
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score

In [140]:
# Predict a label for every held-out sample with the fitted forest.
Y_pred = classifier.predict(X=X_test)

### Confusion Matrix

In [None]:
# Confusion matrix of the held-out labels against the predictions.
print(confusion_matrix(y_true=Y_test, y_pred=Y_pred))

### Classification Report

In [None]:
# Text classification report for the held-out labels against the predictions.
print(classification_report(y_true=Y_test, y_pred=Y_pred))

### Accuracy Score

In [None]:
# Accuracy score of the predictions on the held-out set.
print(accuracy_score(y_true=Y_test, y_pred=Y_pred))