In [1]:
#mount Google Drive so that files stored there are reachable under /content/drive/
from google.colab import drive
drive.mount('/content/drive/')

Drive already mounted at /content/drive/; to attempt to forcibly remount, call drive.mount("/content/drive/", force_remount=True).


In [2]:
%cd drive
%cd MyDrive/
%cd ITCS_5156_Project

/content/drive
/content/drive/MyDrive
/content/drive/MyDrive/ITCS_5156_Project


# Imports

In [13]:
import pandas as pd
import nltk
nltk.download('punkt')
import string
import copy
import numpy as np

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


# Data Loading/Preprocessing

In [4]:
#data load: both files are '|'-separated
#dictionary.txt has no header row, so its two columns are named here
text = pd.read_csv(
    'dictionary.txt',
    sep='|',
    names=('sentence', 'phrase ids'),
)
sentiment = pd.read_csv('sentiment_labels.txt', sep='|')
#preview the first rows of each table
print(text.head(), sentiment.head(), sep='\n')

      sentence  phrase ids
0            !           0
1          ! '       22935
2         ! ''       18235
3       ! Alas      179257
4  ! Brilliant       22936
   phrase ids  sentiment values
0           0           0.50000
1           1           0.50000
2           2           0.44444
3           3           0.50000
4           4           0.42708


In [5]:
#attach each phrase's sentiment value by matching rows on the shared phrase id
df = pd.merge(text, sentiment, on='phrase ids', how='inner')
df.head()

Unnamed: 0,sentence,phrase ids,sentiment values
0,!,0,0.5
1,! ',22935,0.52778
2,! '',18235,0.5
3,! Alas,179257,0.44444
4,! Brilliant,22936,0.86111


In [6]:
#strip every character listed in string.punctuation from each sentence
df['preprocessed sentence'] = df['sentence'].map(
    lambda raw: raw.translate(str.maketrans('', '', string.punctuation))
)
df.head()

Unnamed: 0,sentence,phrase ids,sentiment values,preprocessed sentence
0,!,0,0.5,
1,! ',22935,0.52778,
2,! '',18235,0.5,
3,! Alas,179257,0.44444,Alas
4,! Brilliant,22936,0.86111,Brilliant


In [7]:
#drop blank sentences (or sentences that were previously entirely punctuation)
#compare the stripped text: removing punctuation from a phrase like "! ' !" leaves two
#spaces, which an exact match against '' or ' ' would let through
df = df[df['preprocessed sentence'].str.strip() != '']
df.head()

Unnamed: 0,sentence,phrase ids,sentiment values,preprocessed sentence
3,! Alas,179257,0.44444,Alas
4,! Brilliant,22936,0.86111,Brilliant
5,! Brilliant !,40532,0.93056,Brilliant
6,! Brilliant ! ',22937,1.0,Brilliant
7,! C'mon,60624,0.47222,Cmon


In [8]:
#convert to lower case
#tokens are later counted by their exact string, so differently-cased spellings would otherwise be separate words
df['preprocessed sentence'] = df['preprocessed sentence'].str.lower()

In [9]:
#replace each sentence string with the list of tokens NLTK splits it into
df['preprocessed sentence'] = df['preprocessed sentence'].map(nltk.word_tokenize)
df.head()

Unnamed: 0,sentence,phrase ids,sentiment values,preprocessed sentence
3,! Alas,179257,0.44444,[alas]
4,! Brilliant,22936,0.86111,[brilliant]
5,! Brilliant !,40532,0.93056,[brilliant]
6,! Brilliant ! ',22937,1.0,[brilliant]
7,! C'mon,60624,0.47222,[cmon]


In [10]:
#the raw text and the phrase ids are no longer needed; keep the sentiment value and the tokens
df = df.drop(columns=['sentence', 'phrase ids'])
df.head()

Unnamed: 0,sentiment values,preprocessed sentence
3,0.44444,[alas]
4,0.86111,[brilliant]
5,0.93056,[brilliant]
6,1.0,[brilliant]
7,0.47222,[cmon]


In [11]:
#Based on documentation from the original dataset, divide into 5 data classes
def sentiment_to_class(sentiment):
  """Map a sentiment value to one of five classes, 1 (lowest values) to 5 (highest).

  Each class has an inclusive upper bound: values up to 0.2 give 1, up to 0.4
  give 2, up to 0.6 give 3, up to 0.8 give 4 and up to 1.0 give 5. A value
  that is not <= 1.0 (anything larger, or NaN) matches no class and yields None.
  """
  upper_bounds = (0.2, 0.4, 0.6, 0.8, 1.0)
  #the first bound that the value does not exceed decides the class
  for label, bound in enumerate(upper_bounds, start=1):
    if sentiment <= bound:
      return label
  return None

#label every row with the class that its sentiment value falls into
df['sentiment class'] = df['sentiment values'].map(sentiment_to_class)
df.head()

Unnamed: 0,sentiment values,preprocessed sentence,sentiment class
3,0.44444,[alas],3
4,0.86111,[brilliant],5
5,0.93056,[brilliant],5
6,1.0,[brilliant],5
7,0.47222,[cmon],3


# Topic Words Selection

In [None]:
#Based on equation 1 of "A Topic Matching based CNN for Sentence Classification"
#Select K most important words with respect to each class

def k_important_words(data, k, num_classes=5, smoothing_constant=1):
  """Weigh every word against every class with a smoothed Naive Bayes ratio.

  These weights are the basis for picking the k most important words per class.

  Parameters:
    data: table with a 'preprocessed sentence' column (each entry an iterable
      of word tokens) and a 'sentiment class' column holding labels
      1..num_classes, the numbering produced by sentiment_to_class.
    k: number of words to select for each class.
    num_classes: number of classes.
    smoothing_constant: added to each word count before it is normalised.

  Rows whose label is not one of 1..num_classes are ignored. If a class has no
  word instances at all, or holds every word instance, one of the normalising
  totals is zero and the weight computation divides by zero.
  """
  #stores return value, a (num_classes * k) list of words
  words = []
  #stores word counts by class
  word_dict = {}
  #template list of 0s, one slot per class
  default_list = [0] * num_classes
  #labels are 1-based while the slots of the count lists are 0-based
  class_index = {label: label - 1 for label in range(1, num_classes + 1)}

  #get number of occurrences of each word by class, in a single pass over data
  for sentence, class_val in zip(data['preprocessed sentence'], data['sentiment class']):
    class_num = class_index.get(class_val)
    #skip rows whose label is not one of the expected classes
    if class_num is None:
      continue
    for word in sentence:
      #first sighting of a word: start it with a count of 0 for every class
      if word not in word_dict:
        word_dict[word] = list(default_list)
      word_dict[word][class_num] += 1

  #get total count of word instances associated with each class
  words_per_class = [
    sum(counts[class_num] for counts in word_dict.values())
    for class_num in range(num_classes)
  ]

  #get total number of word instances
  total_word_instances = sum(words_per_class)

  #create a dictionary to store the Naive Bayes weights
  word_nb_vals = {}
  #calculate NB weights by class:
  for word, counts in word_dict.items():
    word_total = sum(counts)
    weights = list(default_list)
    for class_num in range(num_classes):
      #numerator = (number of word w in class c + smoothing) / count of all words in class c
      nb_numerator = (counts[class_num] + smoothing_constant)/words_per_class[class_num]
      #denominator = (number of word w in the other classes + smoothing) / count of all words in the other classes
      nb_denominator = (word_total - counts[class_num] + smoothing_constant)/(total_word_instances - words_per_class[class_num])
      weights[class_num] = nb_numerator/nb_denominator
    #add weights to dictionary with key of the word
    word_nb_vals[word] = weights

  #find top k words for each class
  
