<a href="https://colab.research.google.com/github/ashalogic/Persian-Sentiment-Analyzer/blob/master/Tutorial.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Tutorial : Persian Sentiment Analysis With LSTM


---

A step-by-step walkthrough, from the raw dataset to a ready-to-use model.

In [0]:
#@title Download and load word embedding model
modelName = "Fasttext" #@param ["Fasttext"]
#@markdown ###Or you can use your own pretrained model
modelURL = "" #@param {type:"string"}
!wget https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.fa.300.bin.gz
!gunzip /content/cc.fa.300.bin.gz
!pip install fasttext

import fasttext 

%time
model = fasttext.load_model("/content/cc.fa.300.bin")

In [0]:
#@title Download and prepare Dataset
# !wget https://raw.githubusercontent.com/minasmz/Sentiment-Analysis-with-LSTM-in-Persian/master/totalReviewWithSuggestion.csv

import pandas
import random
import numpy
import hazm

# One shared normalizer: building a new hazm.Normalizer for every review repeats
# the same construction work once per dataset row.
_normalizer = hazm.Normalizer()

def CleanPersianText(text):
  """Return `text` after passing it through hazm's default Normalizer.

  text: the raw review string.
  Returns the normalized string produced by `Normalizer.normalize`.
  """
  return _normalizer.normalize(text)

import os
import urllib.request

DATASET_PATH = "/content/totalReviewWithSuggestion.csv"
DATASET_URL = "https://raw.githubusercontent.com/minasmz/Sentiment-Analysis-with-LSTM-in-Persian/master/totalReviewWithSuggestion.csv"

# Fetch the CSV only when it is not already on disk, so the cell also works on a
# fresh runtime where nobody has uploaded the file by hand.
if not os.path.exists(DATASET_PATH):
  urllib.request.urlretrieve(DATASET_URL, DATASET_PATH)

csv_dataset = pandas.read_csv(DATASET_PATH)

# Every entry is [normalized review text, suggestion code].
revlist = [[CleanPersianText(text), suggestion]
           for text, suggestion in zip(csv_dataset['Text'], csv_dataset['Suggestion'])]

# Suggestion codes, as labelled by the prints below: 1 positive, 2 "natural", 3 negative.
pos = [review for review in revlist if review[1] == 1]
nat = [review for review in revlist if review[1] == 2]
neg = [review for review in revlist if review[1] == 3]
print("Posetive count {}".format(len(pos)))
print("Negetive count {}".format(len(neg)))
print("Natural  count {}".format(len(nat)))
print()
print("Total    count {}".format(len(revlist)))
print()
# random.choice can return any element, including the first one, and does not
# fail on a one-element list the way randrange(1, len(...)) does.
print("Posetive sample : ","\n",random.choice(pos))
print("Negetive sample : ","\n",random.choice(neg))
print("Natural  sample : ","\n",random.choice(nat))

# Balanced working set: at most 450 positive and 450 negative reviews, shuffled
# in place so the two classes are interleaved.
revlist_shuffle = pos[:450] + neg[:450]
random.shuffle(revlist_shuffle)
print(len(revlist_shuffle))

In [0]:
#@title Prepare Train & Test Data
vector_size = 300 #@param {type:"integer"}
max_no_tokens = 20 #@param {type:"integer"}
import numpy as np
import keras.backend as K

# 90 % of the reviews go to training, 10 % to testing (both rounded down).
train_size = int(0.9*(len(revlist_shuffle)))
test_size = int(0.1*(len(revlist_shuffle)))

# Distinct row numbers of revlist_shuffle in random order. The array is kept
# as-is: wrapping it in set() would throw the sampled order away and make the
# train/test assignment depend on set iteration order instead.
indexes = np.random.choice(len(revlist_shuffle), train_size + test_size, replace=False)

# One (max_no_tokens, vector_size) matrix per review; rows that are never
# written stay zero and therefore act as padding.
x_train = np.zeros((train_size, max_no_tokens, vector_size), dtype=K.floatx())
# Two columns per review, written by the fill cell as a one-hot label.
y_train = np.zeros((train_size, 2), dtype=np.int32)

x_test = np.zeros((test_size, max_no_tokens, vector_size), dtype=K.floatx())
y_test = np.zeros((test_size, 2), dtype=np.int32)

In [0]:
#@title Fill X_Train, X_Test, Y_Train, Y_Test with Dataset
# Build the vocabulary lookup once: `word in model.words` walks the whole word
# list for every token, while a set answers the same question in constant time.
known_words = set(model.words)

for i, index in enumerate(indexes):
  review_text, suggestion = revlist_shuffle[index]
  # Column 0 is set for negative reviews (code 3), column 1 for everything else.
  label = [1.0, 0.0] if suggestion == 3 else [0.0, 1.0]

  # The first train_size samples fill the training arrays, the rest the test arrays.
  if i < train_size:
    x_target, y_target, row = x_train, y_train, i
  else:
    x_target, y_target, row = x_test, y_test, i - train_size

  # Only the first max_no_tokens tokens are used. A token missing from the
  # embedding vocabulary is skipped, which leaves its position as a zero row.
  text_words = hazm.word_tokenize(review_text)
  for t, word in enumerate(text_words[:max_no_tokens]):
    if word in known_words:
      x_target[row, t, :] = model.get_word_vector(word)

  y_target[row, :] = label

x_train.shape,x_test.shape,y_train.shape,y_test.shape

In [0]:
#@title Set batchSize and epochs
batch_size = 500 #@param {type:"integer"}
no_epochs = 100 #@param {type:"integer"}
# Hand the fastText model over to `w2v_model` so the name `model` is free for
# the Keras network built in the next cell. The guard makes the cell safe to
# re-run: once `model` refers to the Keras network, repeating the rebinding
# would replace the embeddings with the network and then delete the network.
if "w2v_model" not in globals():
  w2v_model = model
  del model

In [0]:
#@title Prepare LSTM Model
from keras.models import Sequential
from keras.layers import Conv1D, Dropout, Dense, Flatten, LSTM, MaxPooling1D, Bidirectional
from keras.optimizers import Adam
from keras.callbacks import EarlyStopping, TensorBoard


# From here on `model` is the Keras network.
model = Sequential()

# Convolutional front end over the token axis; the input is one
# (max_no_tokens, vector_size) matrix per review.
model.add(Conv1D(32, kernel_size=3, activation='elu', padding='same',
                 input_shape=(max_no_tokens, vector_size)))
model.add(Conv1D(32, kernel_size=3, activation='elu', padding='same'))
model.add(Conv1D(32, kernel_size=3, activation='relu', padding='same'))
model.add(MaxPooling1D(pool_size=3))

# Recurrent layer reading the pooled sequence in both directions.
model.add(Bidirectional(LSTM(512, dropout=0.2, recurrent_dropout=0.3)))

# Fully connected head, with dropout after every dense layer.
model.add(Dense(512, activation='sigmoid'))
model.add(Dropout(0.2))
model.add(Dense(512, activation='sigmoid'))
model.add(Dropout(0.25))
model.add(Dense(512, activation='sigmoid'))
model.add(Dropout(0.25))

# Two-way softmax matching the two-column one-hot labels.
model.add(Dense(2, activation='softmax'))

# `learning_rate` replaces the deprecated `lr` alias, which current Keras
# optimizers no longer accept. The former `decay=1e-6` argument is dropped for
# the same reason: newer optimizers reject or ignore it, and over the few
# hundred update steps of this notebook it changed the step size negligibly.
model.compile(loss='categorical_crossentropy', optimizer=Adam(learning_rate=0.0001), metrics=['accuracy'])

# Created for optional use; no cell in view passes it to `fit`.
tensorboard = TensorBoard(log_dir='logs/', histogram_freq=0, write_graph=True, write_images=True)

model.summary()

In [0]:
# Train on the prepared tensors; the test split is passed as validation data so
# its loss and accuracy are reported after every epoch.
model.fit(
    x=x_train,
    y=y_train,
    epochs=no_epochs,
    batch_size=batch_size,
    shuffle=True,
    validation_data=(x_test, y_test),
)

In [0]:
# Display the labels Keras attaches to the model's loss/metric values, which
# helps when reading the list of numbers printed by the evaluate cell.
model.metrics_names

In [0]:
# Score the trained network on the test split, 32 samples per batch, with a
# progress bar.
model.evaluate(x_test, y_test, batch_size=32, verbose=1)

In [0]:
# Write the trained network to disk under this name.
# NOTE(review): recent Keras releases only accept file names ending in `.keras`
# or `.h5` — confirm against the installed version before relying on this path.
model.save('twitter-sentiment-fasttext.model')

In [180]:
#@title Lets test our model with complex negative Review !
user_text = "\u0628\u0627\u062A\u0631\u06CC\u0634 \u0632\u0648\u062F \u062E\u0627\u0644\u06CC \u0645\u06CC\u0634\u0647 \u062F\u0648\u0631\u0628\u06CC\u0646 \u0627\u0634 \u06A9\u06CC\u0641\u06CC\u062A \u0627\u0634 \u062F\u0631 \u062D\u062F 13 \u0646\u06CC\u0633\u062A \u0647\u0646\u062F\u0641\u0631\u06CC \u06A9\u0647 \u062F\u0627\u062E\u0644\u0634 \u06AF\u0630\u0627\u0634\u062A\u0646 \u0648\u0627\u0642\u0639\u0627 \u0628\u06CC \u06A9\u06CC\u0641\u06CC\u062A\u0647 \u060C \u0645\u0627\u0644 \u0646\u0648\u06A9\u06CC\u064700 \u0647\u0645\u06CC\u0646\u0637\u0648\u0631 \u0634\u0627\u0631\u0698\u0631 \u0648\u0644\u06CC \u062E\u0628 \u062F\u0627\u062E\u0644\u0634 \u06CC\u0647 \u0642\u0627\u0628 \u0698\u0644\u0647 \u0627\u06CC \u0648 \u06AF\u0644\u0633 \u0645\u0639\u0645\u0648\u0644\u06CC \u0628\u0648\u062F \u0634\u0627\u06CC\u062F \u0627\u06AF\u0631 \u0648\u0636\u0639\u06CC\u062A \u0627\u0642\u062A\u0635\u0627\u062F \u0627\u06CC\u0646\u0637\u0648\u0631\u06CC \u0646\u0628\u0648\u062F \u0646\u0635\u0641 \u0642\u06CC\u0645\u062A \u0645\u06CC\u0634\u062F \u062E\u0631\u06CC\u062F\u0634" #@param {type:"string"}
from IPython.core.display import display, HTML
if not user_text=="":
  text_for_test = _normalizer.normalize(user_text)
  text_for_test_words = _wordtokenizer.tokenize(text_for_test)
  x_text_for_test_words = np.zeros((1,max_no_tokens,vector_size),dtype=K.floatx())
  for t in range(0,len(text_for_test_words)):
    if t >= max_no_tokens:
      break
    if text_for_test_words[t] not in w2v_model.words:
      continue
    
    x_text_for_test_words[0, t, :] = w2v_model.get_word_vector(text_for_test_words[t])
  # print(x_text_for_test_words.shape)
  # print(text_for_test_words)
  result = model.predict(x_text_for_test_words)
  pos_percent = str(int(result[0][1]*100))+" % "
  neg_percent = str(int(result[0][0]*100))+" % "
  display(HTML("<div style='text-align: center'><div style='display:inline-block'><img src='https://www.paralleldots.com/static/images/positive.png'/><h4>{}</h4></div> | <div style='display:inline-block'><img src='https://www.paralleldots.com/static/images/negative.png'/><h4>{}</h4></div></div>".format(pos_percent,neg_percent)))
else:
  print("Please enter your text")

In [0]:
#@title Lets test our model with complex negative Review !
user_text = "\u062E\u06CC\u0644\u06CC \u06AF\u0648\u0634\u06CC\u0647 \u062E\u0648\u0628\u06CC\u0647. \u062A\u0634\u062E\u06CC\u0635 \u0686\u0647\u0631\u0647 \u062F\u0627\u0631\u0647. \u062F\u0627\u062E\u0644 \u062C\u0639\u0628\u0647 \u06A9\u0627\u0648\u0631 \u06AF\u0648\u0634\u06CC \u0648 \u0645\u062D\u0627\u0641\u0638 \u0635\u0641\u062D\u0647 \u062F\u0627\u0631\u0647. \u0645\u0646 \u062F\u06CC\u0631\u0648\u0632 \u0628\u0647 \u062F\u0633\u062A\u0645 \u0631\u0633\u06CC\u062F\u0647 \u0639\u0627\u0644\u06CC\u0647 \u0645\u0631\u0633\u06CC \u0627\u0632 \u062F\u06CC\u062C\u06CC \u06A9\u0627\u0644\u0627" #@param {type:"string"}
from IPython.core.display import display, HTML
if not user_text=="":
  text_for_test = _normalizer.normalize(user_text)
  text_for_test_words = _wordtokenizer.tokenize(text_for_test)
  x_text_for_test_words = np.zeros((1,max_no_tokens,vector_size),dtype=K.floatx())
  for t in range(0,len(text_for_test_words)):
    if t >= max_no_tokens:
      break
    if text_for_test_words[t] not in w2v_model.words:
      continue
    
    x_text_for_test_words[0, t, :] = w2v_model.get_word_vector(text_for_test_words[t])
  # print(x_text_for_test_words.shape)
  # print(text_for_test_words)
  result = model.predict(x_text_for_test_words)
  pos_percent = str(int(result[0][1]*100))+" % "
  neg_percent = str(int(result[0][0]*100))+" % "
  display(HTML("<div style='text-align: center'><div style='display:inline-block'><img src='https://www.paralleldots.com/static/images/positive.png'/><h4>{}</h4></div> | <div style='display:inline-block'><img src='https://www.paralleldots.com/static/images/negative.png'/><h4>{}</h4></div></div>".format(pos_percent,neg_percent)))
else:
  print("Please enter your text")