In [26]:
# Mount Google Drive so files stored there are reachable under /content/drive.
from google.colab import drive
drive.mount('/content/drive')
# Directory prefix for the training data and the saved model. It is joined to
# file names by plain string concatenation (path + "training_data.txt",
# path + 'polarity/'), so a non-empty value must end with a path separator.
path = '' # path to your training data

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [27]:
!pip install fuzzywuzzy
!pip install python-Levenshtein



In [28]:
import csv
import tensorflow as tf
import numpy as np
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from nltk.corpus import stopwords
from keras import Sequential
from keras.layers import Embedding, Bidirectional, Dense, LSTM
import nltk
from fuzzywuzzy import fuzz


In [29]:
# params
# Passed as num_words to the Tokenizer and as the first argument of the Embedding layer.
vocab_size = 50000
# Used as the Embedding output size, the LSTM unit count and the hidden Dense width.
embedding_dim = 64
# maxlen handed to pad_sequences: every sequence is padded/truncated to this length.
max_length = 50
# 'post' = truncate at the end of sequences longer than max_length.
trunc_type = 'post'
# 'post' = pad at the end of sequences shorter than max_length.
padding_type = 'post'
# Token the Tokenizer substitutes for out-of-vocabulary words.
oov_tok = '<OOV>'

In [30]:
# Classification labels.
# The position of each label in this list is the integer class id stored in
# polarity_labels, and the list length sets the size of the model's output layer.
POLARITY = ["POSITIVE", "NEUTRAL", "NEGATIVE"]

In [31]:
sentences = []
polarity_labels = []
# Rows dropped because they had too few columns or no recognisable label.
skipped_rows = 0

# Read and process the training data.
# Each row is tab-separated; column 1 holds the sentence and column 2 the
# polarity label. newline="" is what the csv module expects for files it reads.
with open(path + "training_data.txt", newline="") as data:
    reader = csv.reader(data, delimiter="\t")
    for row in reader:
        # A short/blank row cannot provide both a sentence and a label.
        if len(row) < 3:
            skipped_rows += 1
            continue

        # Fuzzy-match the raw label against every known label and keep only the
        # single best candidate, so a row can never contribute more than one label.
        raw_label = row[2].strip()
        scores = [fuzz.ratio(raw_label, label) for label in POLARITY]
        best_index = max(range(len(POLARITY)), key=scores.__getitem__)

        # Same acceptance threshold as before (> 90). Rows that do not reach it
        # are skipped entirely so sentences and polarity_labels stay aligned.
        if scores[best_index] <= 90:
            skipped_rows += 1
            continue

        sentences.append(row[1])
        polarity_labels.append(best_index)

# Every sentence must have exactly one label, otherwise training pairs are shifted.
assert len(sentences) == len(polarity_labels)
if skipped_rows:
    print("Skipped rows without a usable label:", skipped_rows)


In [32]:
def text_to_seq(data, tokenizer=None):
  """Convert texts into padded integer sequences of length ``max_length``.

  Args:
    data: iterable of strings to convert.
    tokenizer: optional, already fitted Keras ``Tokenizer``. When given it is
      used as-is (it is NOT re-fitted), so several datasets can share one
      vocabulary. When omitted, a new tokenizer is created with ``vocab_size``
      and ``oov_tok`` and fitted on ``data``, which is the original behaviour.

  Returns:
    The result of ``pad_sequences`` applied to the tokenised texts, using
    ``max_length``, ``padding_type`` and ``trunc_type``.
  """
  if tokenizer is None:
    # No shared vocabulary supplied: build one from this data only. Indices
    # produced this way are not comparable with those of another call.
    tokenizer = Tokenizer(num_words=vocab_size, oov_token=oov_tok)
    tokenizer.fit_on_texts(data)
  data_sequences = tokenizer.texts_to_sequences(data)
  padded_sequences = pad_sequences(data_sequences, maxlen=max_length, padding=padding_type, truncating=trunc_type)
  return padded_sequences

In [33]:
# Padded integer sequences for the training sentences.
train_padded = text_to_seq(sentences)
# NOTE(review): `clipe_sentences` is not defined in any cell shown before this
# one — confirm where it comes from, or this cell fails on a fresh kernel.
# NOTE(review): this call fits its own Tokenizer, so the word indices in
# clipe_padded come from a different vocabulary than those in train_padded.
clipe_padded = text_to_seq(clipe_sentences)

In [34]:
def get_label_seq(labels):
  """Encode label strings as integer ids using a Tokenizer fitted on them.

  A default ``Tokenizer`` is fitted on ``labels``, the labels are converted to
  index sequences, and 1 is subtracted from every index (presumably so that the
  ids start at 0 — TODO confirm against the intended loss/targets).

  Args:
    labels: iterable of label strings.

  Returns:
    A numpy array built from the per-label index sequences, minus 1.
  """
  tok = Tokenizer()
  tok.fit_on_texts(labels)
  encoded = tok.texts_to_sequences(labels)
  return np.array(encoded) - 1


In [35]:
def get_model(number_of_labels):
  """Build the (uncompiled) bidirectional-LSTM classifier.

  Layers, in order: an Embedding of ``vocab_size`` x ``embedding_dim``, a
  Bidirectional LSTM with ``embedding_dim`` units, a ReLU Dense layer of
  ``embedding_dim`` units, and a softmax Dense layer with one unit per label.

  Args:
    number_of_labels: number of units in the softmax output layer.

  Returns:
    The assembled ``Sequential`` model; the caller is expected to compile it.
  """
  return Sequential([
    Embedding(vocab_size, embedding_dim),
    Bidirectional(LSTM(embedding_dim)),
    Dense(embedding_dim, activation='relu'),
    Dense(number_of_labels, activation='softmax'),
  ])

### Sentiment Analysis using Bidirectional LSTM 

In [36]:
# One output unit per entry in POLARITY.
polarity_model = get_model(len(POLARITY))
# Print the layer-by-layer architecture and parameter counts.
polarity_model.summary()

Model: "sequential_3"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_3 (Embedding)      (None, None, 64)          3200000   
_________________________________________________________________
bidirectional_3 (Bidirection (None, 128)               66048     
_________________________________________________________________
dense_6 (Dense)              (None, 64)                8256      
_________________________________________________________________
dense_7 (Dense)              (None, 3)                 195       
Total params: 3,274,499
Trainable params: 3,274,499
Non-trainable params: 0
_________________________________________________________________


In [37]:
# Targets are integer class ids (indices into POLARITY), hence the sparse
# categorical cross-entropy loss.
polarity_model.compile(loss='sparse_categorical_crossentropy', optimizer='adam', metrics=['accuracy'])
num_epochs = 4
# NOTE(review): `polarit_history` looks like a typo for `polarity_history`; the
# name is left unchanged in case other cells refer to it.
# NOTE(review): no validation data is passed, so the accuracy logged per epoch
# is measured on the training data only.
polarit_history = polarity_model.fit(train_padded, np.array(polarity_labels), epochs=num_epochs, verbose=2)

Epoch 1/4
80/80 - 6s - loss: 0.9207 - accuracy: 0.4949
Epoch 2/4
80/80 - 6s - loss: 0.7236 - accuracy: 0.6559
Epoch 3/4
80/80 - 6s - loss: 0.3588 - accuracy: 0.8598
Epoch 4/4
80/80 - 6s - loss: 0.1593 - accuracy: 0.9465


In [38]:
# NOTE(review): this evaluates on the same arrays the model was fitted on, so
# the figure printed below is training accuracy, not a held-out estimate.
loss, acc = polarity_model.evaluate(train_padded, np.array(polarity_labels))
# Accuracy expressed as a percentage.
print(acc * 100)

97.10937738418579


In [39]:
# Persist the trained model under the 'polarity/' directory inside `path`
# (string concatenation, so `path` must end with a separator when non-empty).
polarity_model.save(path + 'polarity/')

INFO:tensorflow:Assets written to: /content/drive/My Drive/NLPHW/PS2/polarity/assets
