# Models Used in this Python Notebook:
    - Word2Vec+SVM
    - Word2Vec+LSTM
    - Word2Vec+Bi-LSTM

In [1]:
import pandas as pd
import numpy as np
import re
import unicodedata
import nltk
import matplotlib.pyplot as plt
import string
import re
from sklearn.model_selection import train_test_split
from sklearn import svm
from sklearn.metrics import classification_report
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import accuracy_score
from sklearn.metrics import precision_score
from sklearn.metrics import recall_score
from sklearn.metrics import f1_score
from sklearn.metrics import roc_auc_score
from gensim.models import Word2Vec
from sklearn.feature_extraction.text import CountVectorizer
import sys
from tensorflow.python.keras.preprocessing.text import Tokenizer
from tensorflow.python.keras.preprocessing.sequence import pad_sequences
from keras.models import Sequential
from keras.layers import Dense, Embedding, LSTM, GRU
from keras.layers.embeddings import Embedding
from keras.initializers import Constant
from collections import Counter

#### Importing Dataset

In [None]:
df=pd.read_csv("Bengali_hate_speech.csv")

#### removing emojis

In [3]:
def emoji(string):
    emoji_pattern = re.compile("["
                           u"\U0001F600-\U0001F64F"  # emoticons
                           u"\U0001F300-\U0001F5FF"  # symbols & pictographs
                           u"\U0001F680-\U0001F6FF"  # transport & map symbols
                           u"\U0001F1E0-\U0001F1FF"  # flags (iOS)
                           u"\U00002702-\U000027B0"
                           u"\U000024C2-\U0001F251"
                           "]+", flags=re.UNICODE)
    return emoji_pattern.sub(r'', string)

In [4]:
df['sentence']=df['sentence'].apply(emoji)

#### removing punctutaions

In [5]:
def punctuation(t1):
    whitespace = re.compile(u"[\s\u0020\u00a0\u1680\u180e\u202f\u205f\u3000\u2000-\u200a]+", re.UNICODE)
    bangla_fullstop = u"\u0964"
    punctSeq   = u"['\"“”‘’]+|[.?!,…]+|[:;]+"
    punc = u"[(),$%^&*+={}\[\]:\"|\'\~`<>/,¦!?½£¶¼©⅐⅑⅒⅓⅔⅕⅖⅗⅘⅙⅚⅛⅜⅝⅞⅟↉¤¿º;-]+"
    t1= whitespace.sub(" ",t1).strip()
    t1 = re.sub(punctSeq, " ", t1)
    t1 = re.sub(bangla_fullstop, " ",t1)
    t1 = re.sub(punc, " ", t1)
    return t1

In [6]:
df['sentence']=df['sentence'].apply(punctuation)

#### replacing bengali numbers into english numbers

In [7]:
def replace_num(one):
    one = one.replace("০","0")
    one=one.replace("১","1")
    one=one.replace("২","2")
    one=one.replace("৩","3")
    one=one.replace("৪","4")
    one=one.replace("৫","5")
    one=one.replace("৬","6")
    one=one.replace("৭","7")
    one=one.replace("৮","8")
    one=one.replace("৯","9")
    return one

In [8]:
df['sentence']=df['sentence'].apply(replace_num)

#### removing non-bengali alphabets & numbers

In [9]:
def non_bengali(a):
    a = "".join(i for i in a if i in [".","।"] or 2432 <= ord(i) <= 2559 or ord(i)== 32)
    a=re.sub(' +', ' ', a)
    return a

In [10]:
df['sentence']=df['sentence'].apply(non_bengali)

In [11]:
df['sentence'][14436]

'আসলে অনেক পাইন হইছে বিসেশ কড়ে আখম ভাই অসাদরণ রাজ সৌইডি পবাশি '

In [12]:
import sys
from indicnlp import common

# The path to the local git repo for Indic NLP library
INDIC_NLP_LIB_HOME=r"indic_nlp_library"

# The path to the local git repo for Indic NLP Resources
INDIC_NLP_RESOURCES=r"indic_nlp_resources"

# Add library to Python path
sys.path.append(r'{}\src'.format(INDIC_NLP_LIB_HOME))

# Set environment variable for resources folder
common.set_resources_path(INDIC_NLP_RESOURCES)

In [13]:
from indicnlp.tokenize import indic_tokenize  

print('Input String: {}'.format(df['sentence'][150]))
print('Tokens: ')
for t in indic_tokenize.trivial_tokenize(df['sentence'][150],lang='bn'): 
    print(t)

Input String: এই চুতিয়ার বাচ্চাটা উলটা পালটা কথা বলে 
Tokens: 
এই
চুতিয়ার
বাচ্চাটা
উলটা
পালটা
কথা
বলে


In [14]:
from indicnlp.normalize.indic_normalize import IndicNormalizerFactory

#### Normalizing Using IndicNLP

In [198]:
input_text=df['sentence'][18]
remove_nuktas=False
factory=IndicNormalizerFactory()
normalizer=factory.get_normalizer("bn")
output_text=normalizer.normalize(input_text)

print(input_text)
print()

print('Before normalization')
print(' '.join([ hex(ord(c)) for c in input_text ] ))
print('Length: {}'.format(len(input_text)))
print()    
print('After normalization')
print(' '.join([ hex(ord(c)) for c in output_text ] ))
print('Length: {}'.format(len(output_text)))  
print(output_text)

পাপনকে জুতার মালা দেয়া জরুর

Before normalization
0x9aa 0x9be 0x9aa 0x9a8 0x995 0x9c7 0x20 0x99c 0x9c1 0x9a4 0x9be 0x9b0 0x20 0x9ae 0x9be 0x9b2 0x9be 0x20 0x9a6 0x9c7 0x9df 0x9be 0x20 0x99c 0x9b0 0x9c1 0x9b0
Length: 27

After normalization
0x9aa 0x9be 0x9aa 0x9a8 0x995 0x9c7 0x20 0x99c 0x9c1 0x9a4 0x9be 0x9b0 0x20 0x9ae 0x9be 0x9b2 0x9be 0x20 0x9a6 0x9c7 0x9af 0x9bc 0x9be 0x20 0x99c 0x9b0 0x9c1 0x9b0
Length: 28
পাপনকে জুতার মালা দেয়া জরুর


In [None]:
def basic_clean(text):    
    words = text.split()
    return words

#### Splitting Sentence into Words

In [42]:
def basic_clean(text):    
    remove_nuktas=False
    factory=IndicNormalizerFactory()
    normalizer=factory.get_normalizer("bn")
    output_text=normalizer.normalize(text)
    words=indic_tokenize.trivial_tokenize(text,lang='bn')
    return words

In [43]:
words = basic_clean(''.join(str(df['sentence'].tolist())))

In [44]:
values=[]

size=df.shape[0]
for i in range(0,size):
        worddd=basic_clean(df['sentence'][i])
        values.append(worddd)     

In [18]:
size

30000

In [19]:
print(words[:10])

['[', "'", 'যত্তসব', 'পাপন', 'শালার', 'ফাজলামী', "'", ',', "'", 'পাপন']


In [17]:
print(values[:20])

[['যত্তসব', 'পাপন', 'শালার', 'ফাজলামী'], ['পাপন', 'শালা', 'রে', 'রিমান্ডে', 'নেওয়া', 'দরকার'], ['জিল্লুর', 'রহমান', 'স্যারের', 'ছেলে', 'এতো', 'বড়', 'জারজ', 'হবে', 'এটা', 'একটা', 'দেশের', 'মানুষ', 'কোনো', 'দিন', 'ও', 'ভাবতে', 'পারেনি', 'ধন্যবাদ', 'তাহসিন', 'ভাই'], ['শালা', 'লুচ্চা', 'দেখতে', 'পাঠার', 'মত', 'দেখা', 'যায়'], ['তুই', 'তো', 'শালা', 'গাজা', 'খাইছচ', 'তুর', 'মার', 'হেডায়', 'খেলবে', 'সাকিব'], ['এটা', 'কুন', 'দরনের', 'কেলা', 'ফাইজলামি', 'তাস্কিন', 'রে', 'চর', 'মারা', 'দরকার'], ['পাপন', 'ভর', 'মাদা', 'চোদ', 'পাপনে', 'পদতেক', 'চাই'], ['দুরো', 'সালার', 'পুদ', 'চুপথাক'], ['কুত্তার', 'বাছচা', 'পাপন'], ['বাল', 'ছাল', 'তর', 'সাউয়া'], ['তোর', 'কপালে', 'জুতা', 'মারি', 'শালার', 'পুত'], ['পাপনে', 'পাগল', 'হয়াছে'], ['দেখেছো', 'মাগি', 'না', 'হিজরা', 'বোঝা', 'যাচ্ছে', 'না', 'দেখে', 'মনে', 'হচ্ছে', 'হিজরা'], ['হালার', 'পু', 'পাপন'], ['শালায়', 'এই', 'কারণেই', 'খেলায়', 'ডাব্বা', 'মারে'], ['ব্ব'], ['পাপন', 'হালারে', 'বেট', 'থেকে', 'জুতার', 'মালা', 'দিয়এই', 'বের', 'করে', 'না', 'কেনে', 'ওর', 'কারনে

In [21]:
print(values[19])

['আমি', 'হলে', 'তো', 'জারসি', 'পড়া', 'লোকটাকে', 'চুদেদিতাম']


#### Creating Word2Vec Model with Vector Size of 300

In [45]:
model = Word2Vec(values, size=300, window=5, min_count=5,workers=6,sg=0)

In [46]:
model.wv.most_similar('খেলা', topn=5)

[('করলে', 0.9925315976142883),
 ('ভয়েস', 0.9813070297241211),
 ('সমালোচনা', 0.9794937372207642),
 ('কারণ', 0.9774117469787598),
 ('অডিশনে', 0.9771517515182495)]

In [20]:
docs_vectors = pd.DataFrame()
i=0
for doc in df['sentence']:
    #print(i)
    #i=i+1
    temp = pd.DataFrame()
    for word_a in doc.split():
        try:
            word_vec=model.wv[word_a]
            temp=temp.append(pd.Series(word_vec),ignore_index=True)  
        except:
            pass
    doc_vector = temp.mean()
    docs_vectors=docs_vectors.append(doc_vector,ignore_index=True)
    
docs_vectors['hate'] = df['hate']
docs_vectors = docs_vectors.dropna()
train_x, test_x, train_y, test_y = train_test_split(docs_vectors.drop('hate', axis = 1),
                                                   docs_vectors['hate'],
                                                   test_size = 0.2,
                                                   random_state = 1)   

### SVM Classifier

In [21]:
clf = svm.SVC(kernel='linear') 
clf.fit(train_x, train_y)

#Predict the response for test dataset
test_pred = clf.predict(test_x)
accuracy=accuracy_score(test_y, test_pred)
print('Accuracy: %f' % accuracy)

Accuracy: 0.768503


In [55]:
filename="embedding_word2vec.txt"
model.wv.save_word2vec_format(filename,binary=False)

In [56]:
import os

embeddings_index={}
f=open(os.path.join('','embedding_word2vec.txt'))
for line in f:
    value=line.split()
    word=value[0]
    coefs=np.asarray(value[1:])
    embeddings_index[word]=coefs
f.close()

In [47]:
max_length = max([len(s.split()) for s in df['sentence']])

In [48]:
max_length

534

In [62]:
tokenizer_obj=Tokenizer()
tokenizer_obj.fit_on_texts(df['sentence'])
sequences = tokenizer_obj.texts_to_sequences(df['sentence'])

word_index=tokenizer_obj.word_index
print('Found %s unique tokens.'%len(word_index))

review_pad=pad_sequences(sequences,maxlen=max_length)
sentiment=df['hate'].values
print('shape of review tensor:',review_pad.shape)
print('shape of sentiment tensor:', sentiment.shape)

Found 47441 unique tokens.
shape of review tensor: (30000, 534)
shape of sentiment tensor: (30000,)


In [45]:
len(word_index)

47441

In [57]:
# Count all the words using Counter Method
##
words=[]
for val in values:
    words.extend(val)
count_words = Counter(words)

total_words = len(words)
sorted_words = count_words.most_common(total_words)
vocab_to_int = {w:i+1 for i, (w,c) in enumerate(sorted_words)}
reviews_int = []
for val in values:
    r = [vocab_to_int[w] for w in val]
    reviews_int.append(r)
print (reviews_int[0:3])
review_pad=pad_sequences(reviews_int,maxlen=max_length)
sentiment=df['hate'].values

[[3062, 67, 292, 12659], [67, 97, 171, 939, 804, 54], [7014, 338, 301, 167, 172, 68, 363, 15, 50, 7, 75, 32, 124, 76, 13, 1743, 1320, 126, 1943, 14]]


In [58]:
##
sorted_words

[('না', 7211),
 ('এই', 4804),
 ('করে', 4288),
 ('আর', 4045),
 ('কি', 3894),
 ('কে', 2847),
 ('একটা', 2254),
 ('আমি', 2159),
 ('কথা', 2092),
 ('আমার', 2071),
 ('জন্য', 1883),
 ('যে', 1873),
 ('ও', 1867),
 ('ভাই', 1822),
 ('হবে', 1791),
 ('তো', 1732),
 ('সব', 1725),
 ('আপনার', 1706),
 ('হয়', 1699),
 ('ভালো', 1610),
 ('তার', 1606),
 ('এর', 1599),
 ('থেকে', 1572),
 ('আছে', 1571),
 ('নাই', 1530),
 ('কোন', 1529),
 ('আপনি', 1497),
 ('অনেক', 1464),
 ('কিছু', 1425),
 ('সাথে', 1414),
 ('তুই', 1398),
 ('মানুষ', 1387),
 ('করা', 1354),
 ('মনে', 1302),
 ('করতে', 1271),
 ('চাই', 1224),
 ('বলে', 1166),
 ('কেন', 1145),
 ('দিয়ে', 1136),
 ('তোর', 1133),
 ('আমাদের', 1132),
 ('বাচ্চা', 1127),
 ('এখন', 1123),
 ('নিয়ে', 1089),
 ('স্যার', 1073),
 ('যদি', 1034),
 ('আমরা', 1000),
 ('কিন্তু', 988),
 ('এক', 986),
 ('এটা', 975),
 ('হয়ে', 955),
 ('এ', 929),
 ('মত', 925),
 ('দরকার', 919),
 ('আল্লাহ', 865),
 ('ভিডিও', 858),
 ('টা', 852),
 ('সে', 848),
 ('তাদের', 847),
 ('সবাই', 838),
 ('তাহলে', 824),
 ('উচিত', 815),


In [229]:
len(word_index)

47441

In [35]:
num_words=47445

In [67]:
num_words=len(word_index)+1
embedding_matrix=np.zeros((num_words, 300))

for word, i in word_index.items():
    if i > num_words:
        continue
    embedding_vector = embeddings_index.get(word)
    if embedding_vector is not None:
        embedding_matrix[i]= embedding_vector

In [60]:
embedding_matrix=np.zeros((len(sorted_words)+1, 300))
for word, i in sorted_words:
    if i > num_words:
        continue
    try:    
        embedding_vector = model.wv[word]
    except:
        pass
    if embedding_vector is not None:
        embedding_matrix[i]= embedding_vector

In [37]:
len(embedding_matrix)

47443

In [49]:
embedding_vector = embeddings_index.get('না')
len(embedding_vector)

NameError: name 'embeddings_index' is not defined

In [38]:
len(embedding_matrix)

47443

### LSTM Model

In [70]:
modell=Sequential()
embedding_layer = Embedding(num_words, 300,input_length=max_length)
modell.add(embedding_layer)
modell.add(LSTM(units=100,dropout=0.2,recurrent_dropout=0.2))
modell.add(Dense(1,activation='sigmoid'))

modell.compile(loss='binary_crossentropy', optimizer='adam',metrics=['accuracy'])

In [68]:
##
modelll=Sequential()
embedding_layer = Embedding(num_words, 300, embeddings_initializer=Constant(embedding_matrix),input_length=max_length,trainable=False)
modelll.add(embedding_layer)
modelll.add(LSTM(units=64,dropout=0.2,recurrent_dropout=0.2))
modelll.add(Dense(1,activation='sigmoid'))

modelll.compile(loss='binary_crossentropy', optimizer='adam',metrics=['accuracy'])

In [71]:
VALIDATION_SPLIT=0.2
indices=np.arange(review_pad.shape[0])
np.random.shuffle(indices)
sentiment=sentiment[indices]
num_validation_samples=int(VALIDATION_SPLIT*review_pad.shape[0])
x_trainn_pad=review_pad[:-num_validation_samples]
y_trainn=sentiment[:-num_validation_samples]
x_testt_pad=review_pad[-num_validation_samples:]
y_testt=sentiment[-num_validation_samples:]

In [72]:
modell.fit(x_trainn_pad,y_trainn,validation_data=(x_testt_pad,y_testt),batch_size=64,epochs=5)

Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5

KeyboardInterrupt: 

In [281]:
test_pred = modelll.predict(x_testt_pad)
accuracy=accuracy_score(y_testt, test_pred.round())
print('Accuracy: %f' % accuracy)

Accuracy: 0.673500


In [62]:
test_pred = modell.predict(x_testt_pad)
accuracy=accuracy_score(y_testt, test_pred.round())
print('Accuracy: %f' % accuracy)

Accuracy: 0.559000


### Bi-LSTM Model

In [74]:
from keras.layers import Dense, Embedding, LSTM, GRU, Bidirectional

In [75]:
modelll=Sequential()
embedding_layer = Embedding(num_words, 300, embeddings_initializer=Constant(embedding_matrix),input_length=max_length,trainable=False)
modelll.add(embedding_layer)
modelll.add(Bidirectional(LSTM(units=64,dropout=0.2,recurrent_dropout=0.2)))
modelll.add(Dense(1,activation='sigmoid'))

modelll.compile(loss='binary_crossentropy', optimizer='adam',metrics=['accuracy'])

In [76]:
modelll.fit(x_trainn_pad,y_trainn,validation_data=(x_testt_pad,y_testt),batch_size=64,epochs=5)

Epoch 1/5
Epoch 2/5
  1/375 [..............................] - ETA: 12:31 - loss: 0.5743 - accuracy: 0.7656

KeyboardInterrupt: 