In [1]:
# Mount Google Drive into the Colab runtime so the files under `path` are reachable.
from google.colab import drive
drive.mount('/content/drive')
# Base directory for every file this notebook reads or writes. Later cells build
# file names with plain string concatenation (path + "name"), so the trailing
# slash is required.
path = '/content/drive/My Drive/NLPHW/PS2/'

Mounted at /content/drive


In [3]:
import csv
import tensorflow as tf
import numpy as np
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from sklearn.metrics import classification_report, accuracy_score, confusion_matrix

In [4]:
# Preprocessing parameters shared by the cells below.
# Tokenizer keeps at most this many words (passed as num_words).
vocab_size = 50000
# Not referenced in the visible cells; presumably the embedding width the saved
# model was trained with -- TODO confirm.
embedding_dim = 64
# Every sequence is padded/truncated to exactly this many tokens (maxlen).
max_length = 50
# 'post' = drop tokens from the end of sequences longer than max_length.
trunc_type = 'post'
# 'post' = append padding after the tokens of shorter sequences.
padding_type = 'post'
# Placeholder token the Tokenizer substitutes for out-of-vocabulary words.
oov_tok = '<OOV>'

In [6]:
# Classification labels.
# classify() indexes this list with the argmax of each model output row, so the
# order here must match the class order the model was trained with -- TODO confirm.

POLARITY = ["POSITIVE", "NEUTRAL", "NEGATIVE"]

In [9]:
# Convert the text into sequences of words and pad the sequences
def text_to_seq(data, tokenizer=None):
  """Turn raw texts into fixed-length integer sequences.

  Args:
    data: iterable of strings to encode.
    tokenizer: optional, already fitted Keras ``Tokenizer``. Pass the tokenizer
      used at training time when encoding new text, so that word indices line
      up with what the model saw. When omitted, a fresh tokenizer (limited to
      ``vocab_size`` words, with ``oov_tok`` as the out-of-vocabulary token) is
      fitted on ``data`` itself, which is the original behaviour.

  Returns:
    The result of ``pad_sequences``: one row per input text, padded/truncated
    to ``max_length`` according to ``padding_type`` and ``trunc_type``.
  """
  if tokenizer is None:
    # No vocabulary supplied: derive it from the texts being encoded.
    tokenizer = Tokenizer(num_words=vocab_size, oov_token=oov_tok)
    tokenizer.fit_on_texts(data)

  data_sequences = tokenizer.texts_to_sequences(data)
  padded_sequences = pad_sequences(
      data_sequences,
      maxlen=max_length,
      padding=padding_type,
      truncating=trunc_type,
  )
  return padded_sequences

In [10]:
sentences = []
polarity_label = []

# Read the labelled data file (note: this is the file named as *training* data).
# Each tab-separated row contributes column 1 (the sentence) and column 2 (its
# label, later passed to statistics() as the actual values); column 0 is unused.
# newline="" is what the csv module requires for files handed to csv.reader, so
# that line endings inside the file are interpreted by the reader itself.
with open(path + "PS2_training_data.txt", newline="") as data:
    reader = csv.reader(data, delimiter="\t")
    for row in reader:
        if not row:
            # csv.reader yields an empty list for a blank line (e.g. a trailing
            # empty line at the end of the file); there is nothing to index.
            continue
        sentences.append(row[1])
        polarity_label.append(row[2].strip())


In [11]:
# Encode the loaded sentences as padded integer sequences, the input that is
# later handed to classify().
padded_sentences = text_to_seq(sentences)

In [12]:
# Restore the previously trained polarity model saved as "polarity" under `path`.
polarity_classifier = tf.keras.models.load_model(path + "polarity")

In [13]:
def classify(sentence):
  """Predict a polarity label for each row of ``sentence``.

  ``sentence`` is passed straight to ``polarity_classifier.predict``. For every
  row of the prediction, the position of the largest value is looked up in
  ``POLARITY``.

  Returns:
    A list of label strings taken from ``POLARITY``, one per prediction row.
  """
  scores = polarity_classifier.predict(sentence)

  labels = []
  for row in scores:
    # Index of the highest score selects the label.
    labels.append(POLARITY[np.argmax(row)])

  return labels

In [14]:
# Classify every sentence and write one tab-separated record per line:
#   <index> TAB <sentence> TAB <predicted polarity>
# Prediction runs before the file is opened so that a failure in classify()
# does not leave a freshly truncated, empty output.txt behind.
polarity = classify(padded_sentences)

with open(path + "output.txt", "w") as output:
    for index, (sentence, label) in enumerate(zip(sentences, polarity)):
        # Terminate each record with a newline; previously only a tab was
        # written, so all records ran together on a single line.
        output.write(f"{index}\t{sentence}\t{label}\n")


### Analyze the performance of the model for each task


In [15]:
def statistics(actual_list, predicted_list):
    """Print evaluation figures comparing predictions with the actual labels.

    Prints, in order: the confusion matrix, scikit-learn's classification
    report, a blank line, and the accuracy score multiplied by 100.

    Args:
        actual_list: the reference labels.
        predicted_list: the predicted labels, aligned with ``actual_list``.
    """
    # Confusion matrix
    print("Confusion Metrics: ")
    matrix = confusion_matrix(actual_list, predicted_list)
    print(matrix)

    # Classification report
    print("Classification Report: ")
    report = classification_report(actual_list, predicted_list)
    print(report)

    print()

    # Accuracy, expressed as a percentage
    accuracy_pct = accuracy_score(actual_list, predicted_list) * 100
    print("Accuracy Score ", accuracy_pct)

In [None]:
# Print the statistics for Task 2: predicted polarity vs. the labels read from
# the data file.
# NOTE(review): polarity_label was loaded from "PS2_training_data.txt", so these
# figures appear to be measured on the training file rather than on held-out
# data -- confirm before quoting them as generalisation performance.
statistics(polarity_label, polarity)

Confusion Metrics: 
[[1274    0    8]
 [  18  176   10]
 [  28    0 1046]]
Classification Report: 
              precision    recall  f1-score   support

    NEGATIVE       0.97      0.99      0.98      1282
     NEUTRAL       1.00      0.86      0.93       204
    POSITIVE       0.98      0.97      0.98      1074

    accuracy                           0.97      2560
   macro avg       0.98      0.94      0.96      2560
weighted avg       0.98      0.97      0.97      2560


Accuracy Score  97.5
