# Models Used in this Python Notebook:
    - FastText+SVM
    - FastText+LSTM
    - FastText+Bi-LSTM

In [50]:
import pandas as pd
import numpy as np
import re
import unicodedata
import nltk
import matplotlib.pyplot as plt
import string
import re
from sklearn.model_selection import train_test_split
from sklearn import svm
from sklearn.metrics import classification_report
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import accuracy_score
from sklearn.metrics import precision_score
from sklearn.metrics import recall_score
from sklearn.metrics import f1_score
from sklearn.metrics import roc_auc_score
from gensim.models import Word2Vec
from gensim.models import FastText
from sklearn.feature_extraction.text import CountVectorizer
import sys
from tensorflow.python.keras.preprocessing.text import Tokenizer
from tensorflow.python.keras.preprocessing.sequence import pad_sequences
from keras.models import Sequential
from keras.layers import Dense, Embedding, LSTM, GRU, Bidirectional
from keras.layers.embeddings import Embedding
from keras.initializers import Constant
from collections import Counter

#### Importing Dataset

In [3]:
df=pd.read_csv("Bengali_hate_speech.csv")

#### removing emojis

In [4]:
def emoji(string):
    emoji_pattern = re.compile("["
                           u"\U0001F600-\U0001F64F"  # emoticons
                           u"\U0001F300-\U0001F5FF"  # symbols & pictographs
                           u"\U0001F680-\U0001F6FF"  # transport & map symbols
                           u"\U0001F1E0-\U0001F1FF"  # flags (iOS)
                           u"\U00002702-\U000027B0"
                           u"\U000024C2-\U0001F251"
                           "]+", flags=re.UNICODE)
    return emoji_pattern.sub(r'', string)

In [5]:
df['sentence']=df['sentence'].apply(emoji)

#### removing punctutaions

In [6]:
def punctuation(t1):
    whitespace = re.compile(u"[\s\u0020\u00a0\u1680\u180e\u202f\u205f\u3000\u2000-\u200a]+", re.UNICODE)
    bangla_fullstop = u"\u0964"
    punctSeq   = u"['\"“”‘’]+|[.?!,…]+|[:;]+"
    punc = u"[(),$%^&*+={}\[\]:\"|\'\~`<>/,¦!?½£¶¼©⅐⅑⅒⅓⅔⅕⅖⅗⅘⅙⅚⅛⅜⅝⅞⅟↉¤¿º;-]+"
    t1= whitespace.sub(" ",t1).strip()
    t1 = re.sub(punctSeq, " ", t1)
    t1 = re.sub(bangla_fullstop, " ",t1)
    t1 = re.sub(punc, " ", t1)
    return t1

In [7]:
df['sentence']=df['sentence'].apply(punctuation)

#### replacing bengali numbers into english numbers

In [8]:
def replace_num(one):
    one = one.replace("০","0")
    one=one.replace("১","1")
    one=one.replace("২","2")
    one=one.replace("৩","3")
    one=one.replace("৪","4")
    one=one.replace("৫","5")
    one=one.replace("৬","6")
    one=one.replace("৭","7")
    one=one.replace("৮","8")
    one=one.replace("৯","9")
    return one

In [9]:
df['sentence']=df['sentence'].apply(replace_num)

#### removing non-bengali alphabets & numbers

In [10]:
def non_bengali(a):
    a = "".join(i for i in a if i in [".","।"] or 2432 <= ord(i) <= 2559 or ord(i)== 32)
    a=re.sub(' +', ' ', a)
    return a

In [11]:
df['sentence']=df['sentence'].apply(non_bengali)

#### Splitting The sentences into words

In [12]:
def basic_clean(text):    
    words = text.split()
    return words

In [13]:
values=[]
size=df.shape[0]
for i in range(0,size):
        worddd=basic_clean(df['sentence'][i])
        values.append(worddd)     

In [14]:
values[1]

['পাপন', 'শালা', 'রে', 'রিমান্ডে', 'নেওয়া', 'দরকার']

### Creating FastText Model with Vector Size of 300

In [15]:
model2 = FastText(size=300, window=5, min_count=5,sentences=values,sg=1,workers=8)

In [16]:
docs_vectors = pd.DataFrame()
i=0
for doc in df['sentence']:
#     print(i)
#     i=i+1
    temp = pd.DataFrame()
    for word_a in doc.split():
        try:
            word_vec=model2.wv[word_a]
            temp=temp.append(pd.Series(word_vec),ignore_index=True)  
        except:
            pass
    doc_vector = temp.mean()
    docs_vectors=docs_vectors.append(doc_vector,ignore_index=True)
    
docs_vectors['hate'] = df['hate']
docs_vectors = docs_vectors.dropna()
train_x, test_x, train_y, test_y = train_test_split(docs_vectors.drop('hate', axis = 1),
                                                   docs_vectors['hate'],
                                                   test_size = 0.2,
                                                   random_state = 1)   

### SVM Classifier

In [17]:
clf = svm.SVC(kernel='linear') 
clf.fit(train_x, train_y)

#Predict the response for test dataset
test_pred = clf.predict(test_x)
accuracy=accuracy_score(test_y, test_pred)
print('Accuracy: %f' % accuracy)

Accuracy: 0.808887


### LSTM Model

In [59]:
filename="embedding_word2vec2.txt"
model2.wv.save_word2vec_format(filename,binary=False)

In [60]:
import os

embeddings_index={}
f=open(os.path.join('','embedding_word2vec2.txt'))
for line in f:
    value=line.split()
    word=value[0]
    coefs=np.asarray(value[1:])
    embeddings_index[word]=coefs
f.close()

In [38]:
max_length = max([len(s.split()) for s in df['sentence']])

In [39]:
#added
tokenizer_obj=Tokenizer()
tokenizer_obj.fit_on_texts(df['sentence'])
sequences = tokenizer_obj.texts_to_sequences(df['sentence'])

word_index=tokenizer_obj.word_index
print('Found %s unique tokens.'%len(word_index))

review_pad=pad_sequences(sequences,maxlen=max_length)
sentiment=df['hate'].values
print('shape of review tensor:',review_pad.shape)
print('shape of sentiment tensor:', sentiment.shape)

Found 47441 unique tokens.
shape of review tensor: (30000, 534)
shape of sentiment tensor: (30000,)


In [40]:
# Count all the words using Counter Method
##
words=[]
for val in values:
    words.extend(val)
count_words = Counter(words)

total_words = len(words)
sorted_words = count_words.most_common(total_words)
vocab_to_int = {w:i+1 for i, (w,c) in enumerate(sorted_words)}
reviews_int = []
for val in values:
    r = [vocab_to_int[w] for w in val]
    reviews_int.append(r)
print (reviews_int[0:3])
review_pad=pad_sequences(reviews_int,maxlen=max_length)
sentiment=df['hate'].values

[[3061, 67, 292, 12658], [67, 97, 171, 938, 804, 54], [7013, 338, 301, 167, 172, 68, 363, 15, 50, 7, 75, 32, 124, 76, 13, 1742, 1319, 126, 1942, 14]]


In [41]:
num_words=47445

In [44]:
#changed
num_words=len(word_index)+1
embedding_matrix=np.zeros((num_words, 300))

for word, i in word_index.items():
    if i > num_words:
        continue
    embedding_vector = embeddings_index.get(word)
    if embedding_vector is not None:
        embedding_matrix[i]= embedding_vector

In [45]:
modell=Sequential()
embedding_layer = Embedding(num_words, 300, embeddings_initializer=Constant(embedding_matrix),input_length=max_length,trainable=False)
modell.add(embedding_layer)
modell.add(LSTM(units=100,dropout=0.2,recurrent_dropout=0.2))
modell.add(Dense(1,activation='sigmoid'))

modell.compile(loss='binary_crossentropy', optimizer='adam',metrics=['accuracy'])

### Bi-LSTM Model

In [53]:
modelll=Sequential()
embedding_layer = Embedding(num_words, 300, embeddings_initializer=Constant(embedding_matrix),input_length=max_length,trainable=False)
modelll.add(embedding_layer)
modelll.add(Bidirectional(LSTM(units=64,dropout=0.2,recurrent_dropout=0.2)))
modelll.add(Dense(1,activation='sigmoid'))

modelll.compile(loss='binary_crossentropy', optimizer='adam',metrics=['accuracy'])

In [46]:
VALIDATION_SPLIT=0.2
indices=np.arange(review_pad.shape[0])
np.random.shuffle(indices)
sentiment=sentiment[indices]
num_validation_samples=int(VALIDATION_SPLIT*review_pad.shape[0])
x_trainn_pad=review_pad[:-num_validation_samples]
y_trainn=sentiment[:-num_validation_samples]
x_testt_pad=review_pad[-num_validation_samples:]
y_testt=sentiment[-num_validation_samples:]

In [47]:
modell.fit(x_trainn_pad,y_trainn,batch_size=64,epochs=5)

Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


<tensorflow.python.keras.callbacks.History at 0x7fea563c6820>

In [48]:
test_pred = modell.predict(x_testt_pad)
accuracy=accuracy_score(y_testt, test_pred.round())
print('Accuracy: %f' % accuracy)

Accuracy: 0.672167


In [54]:
modelll.fit(x_trainn_pad,y_trainn,batch_size=64,epochs=5)

Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


<tensorflow.python.keras.callbacks.History at 0x7fea6c0f4160>

In [56]:
test_pred = modelll.predict(x_testt_pad)
accuracy=accuracy_score(y_testt, test_pred.round())
print('Accuracy: %f' % accuracy)

Accuracy: 0.672167


In [57]:
train_x[1]

6141     0.076326
4021     0.065728
5877    -0.044913
10869    0.042293
8857     0.031603
           ...   
17300    0.090350
5192     0.077148
12182   -0.028225
235      0.036286
29804    0.056819
Name: 1, Length: 23943, dtype: float32