In [198]:
import gensim
import nltk
import pandas as pd

import torch
import torch.nn as nn
import torch.nn.functional as F

import csv
import re
import time

import numpy as np

from nltk.corpus import stopwords 
from nltk.stem import WordNetLemmatizer 
from nltk.tokenize import word_tokenize 
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split

nltk.download('stopwords')
nltk.download('wordnet')

[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/sakshamgandhi/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     /Users/sakshamgandhi/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

## Import trained word2vec model into gensim, to later import into pytorch

In [199]:
start = time.time()
w2v_model = gensim.models.KeyedVectors.load_word2vec_format('./model/GoogleNews-vectors-negative300.bin', binary=True)
end = time.time()

print("Took {} seconds to load word2vec model".format(end - start))

Took 92.15175294876099 seconds to load word2vec model


## Read word2vec vectors into embedding for pytorch 

In [200]:
weights = torch.FloatTensor(w2v_model.vectors)
w2v_embedding = nn.Embedding.from_pretrained(weights)

## Read CSV file and print for sanity check

In [201]:
df = pd.read_csv('data/nlp-getting-started/train.csv')
df.head()

Unnamed: 0,id,keyword,location,text,target
0,1,,,Our Deeds are the Reason of this #earthquake M...,1
1,4,,,Forest fire near La Ronge Sask. Canada,1
2,5,,,All residents asked to 'shelter in place' are ...,1
3,6,,,"13,000 people receive #wildfires evacuation or...",1
4,7,,,Just got sent this photo from Ruby #Alaska as ...,1


### Important Helper Functions

In [202]:
def clean_sentence_helper(sentence, lemmatizer, regex_filter, stop_words):
    ''' 
        Helper function for preprocessing, cleans each sentence.  
        
        Input: 
            - sentence (string)
            - lemmatizer (object)
            - regex_filter (object)
            - stop_words (set)
            
        Returns: 
            - sentence_info_dict (dict)
                Keys: 
                    - clean_word_list (doesn't contain hashtags, hyperlinks, stopwords. Everything lemmatized)
                    - raw_word_list
                    - hashtag_count (for this sentence)
    '''
    sentence_info_dict = {}
    clean_word_list = []
    hashtag_count = 0
    
    sentence_info_dict['raw_word_list'] = sentence.split()
    
    sentence = sentence.lower()
    word_list = sentence.split()

    for word in word_list:
        # Remove hashtags
        if '#' in word:
            hashtag_count +=1
            continue     
        # Remove hyperlinks
        if 'http' in word:
            continue
        # Remove stop words
        if word in stop_words:
            continue
        # Remove non-characters from words
        word = regex_filter.sub('', word)
        if len(word) == 0:
            continue
        # Lemmatize words
        word = lemmatizer.lemmatize(word)
        clean_word_list.append(word)
    
    sentence_info_dict['clean_word_list'] = clean_word_list
    sentence_info_dict['hashtag_count'] = hashtag_count
    
    return sentence_info_dict


def generate_sentence_vector(sentence, w2v_model):
    ''' 
        Given a sentence, return a vector, by averaging word2vec vector for each word
        
        Input: 
            - sentence (string)
            - w2v_model (word2vec model, gensim)
        Return: 
            - sentence_vector
    '''
    sentence_vector = np.zeros((300,), dtype='float32')
    vector_count = 0 # number of vectors summed up 
    sentence_list = sentence.split()

    for word in sentence_list:
        if word in w2v_model.vocab:
            sentence_vector = sentence_vector + w2v_model[word]
            vector_count += 1
    
    # elementwise division of sentence vector (to turn it into an average)
    sentence_vector = sentence_vector / vector_count
    
    return sentence_vector


def prepare_sequence(sentence, word_vec_model):
    """ 
        Prep input for LSTM
        
        Input: 
            - sentence (string)
            - word_vec_model (gensim embedding)
        
        Returns: 
            - tensor of word indexes, representing index in word embedding

"""
    sentence_list = sentence.split()
    idxs = [word_vec_model.vocab[word].index for word in sentence if word in word_vec_model]
    if len(idxs) == 0:
        return None
    else:
        return torch.tensor(idxs, dtype=torch.long)

### Additional Helper Functions (this one might not work, but it's not being used so don't worry. Just for reference)

In [203]:
def check_list_for_nan(input_list):
    '''
        Helper function, check if list contains any nan values
        Input: 
            - input_list (list)
        Return: 
            - array_has_nan (boolean)
    '''
    input_np_array = np.asarray(input_list)
    array_sum = np.sum(input_np_array)
    array_has_nan = np.isnan(array_sum)
    
    return array_has_nan

## Preprocessing

In [204]:
# Extract info into lists
text = list(df['text'])
target = list(df['target'])
location = list(df['location'])

# Count of positive vs negative cases
pos = 0
neg = 0
for label in target:
    if label == 1:
        pos += 1
    else: 
        neg += 1
    
# Prep to gather tweet data
raw_length = 0
filtered_length = 0 #i.e. without hashtags
ht_count = 0
num_tweets_w_hts = 0

clean_text_list = []
vocab = set()

# Initialize lemmatizer, regex_filter, stopwords for preprocessing
lemmatizer = WordNetLemmatizer()
regex_filter = re.compile('[^a-zA-Z]')
stop_words = set(stopwords.words('english')) 

# Gather tweet data and preprocess text
for i, sentence in enumerate(text):
    
    sentence_info_dict = clean_sentence_helper(sentence, lemmatizer, regex_filter, stop_words)
    clean_word_list = sentence_info_dict['clean_word_list']
    hashtag_count = sentence_info_dict['hashtag_count']
    
    current_raw_length = len(clean_word_list)
    
    for word in clean_word_list:
        vocab.add(word)
    
    raw_length += len(sentence_info_dict['raw_word_list'])
    filtered_length += len(clean_word_list)
    ht_count += hashtag_count
    
    # Count how many tweets contain hashtags
    if hashtag_count > 0:
        num_tweets_w_hts += 1
    
    clean_sentence = " ".join(clean_word_list)
    clean_text_list.append(clean_sentence)

num_tweets = len(text)
ave_raw_length = raw_length / num_tweets
ave_filtered_length = filtered_length / num_tweets
ave_ht_count = ht_count / num_tweets  

print("preprocessing complete")


preprocessing complete


## Create output CSV with clean text and lablels

In [14]:
clean_df = pd.DataFrame(columns = ['text', 'label']) 
clean_df['text'] = clean_text_list
clean_df['label'] = target

clean_df.to_csv("output/clean_train_data.csv")
 
clean_df[:10]

Unnamed: 0,text,label
0,deed reason may allah forgive u,1
1,forest fire near la ronge sask canada,1
2,resident asked shelter place notified officer ...,1
3,people receive evacuation order california,1
4,got sent photo ruby smoke pours school,1
5,update california hwy closed direction due lak...,1
6,heavy rain cause flash flooding street manitou...,1
7,im top hill see fire wood,1
8,there emergency evacuation happening building ...,1
9,im afraid tornado coming area,1


### Create output CSV for BERT training

In [196]:
bert_df = pd.DataFrame(columns = ['review', 'sentiment']) 
bert_df['review'] = clean_text_list
bert_df['sentiment'] = target

bert_df.replace(to_replace = 1,value = "positive", inplace=True)
bert_df.replace(to_replace = 0,value = "negative", inplace=True)

bert_df.to_csv("output/bert_train_data.csv", index=False)

bert_df.reset_index(drop=True)
 
bert_df[:10]

Unnamed: 0,review,sentiment
0,deed reason may allah forgive u,positive
1,forest fire near la ronge sask canada,positive
2,resident asked shelter place notified officer ...,positive
3,people receive evacuation order california,positive
4,got sent photo ruby smoke pours school,positive
5,update california hwy closed direction due lak...,positive
6,heavy rain cause flash flooding street manitou...,positive
7,im top hill see fire wood,positive
8,there emergency evacuation happening building ...,positive
9,im afraid tornado coming area,positive


## Print out findings from data

In [205]:
print("num_tweets: {}".format(num_tweets))
print("num_tweets_w_hts: {}".format(num_tweets_w_hts))

print("\n")

print("Label distribution ...")
print("pos: {}".format(pos))
print("neg: {}".format(neg))

print("\n")

print("Fraction of tweets with hashtags: {}".format((num_tweets_w_hts/num_tweets)))
print("ave_raw_length: {}".format(ave_raw_length))
print("ave_filtered_length: {}".format(ave_filtered_length))
print("ave_ht_count: {}".format(ave_ht_count))

print("\n")

print("Vocabulary size (excluding hashtags): {}".format(len(vocab)))

num_tweets: 7613
num_tweets_w_hts: 1761


Label distribution ...
pos: 3271
neg: 4342


Fraction of tweets with hashtags: 0.2313148561670826
ave_raw_length: 14.903585971364771
ave_filtered_length: 8.803625377643504
ave_ht_count: 0.4445028241166426


Vocabulary size (excluding hashtags): 14404


### Generate vector representation for each sentence. Split into training and testing

In [17]:
tweet_vector_list = []
for i, tweet in enumerate(clean_text_list):
    sentence_vector = generate_sentence_vector(tweet, w2v_model)
    tweet_vector_list.append(sentence_vector)
    
x_train, x_test, y_train, y_test = train_test_split(tweet_vector_list, target, test_size=0.25, random_state=0)

x_train = np.nan_to_num(x_train)
x_test = np.nan_to_num(x_test)



### Run Baseline Model and Print Results

In [29]:
# Define Classifier
logistic_regr = LogisticRegression()

# Train Classifier and Test
logistic_regr.fit(x_train, y_train)
predictions = logistic_regr.predict(x_test)
score = logistic_regr.score(x_test, y_test)
print(score)

# Ensure these two numbers match
print(len(predictions))
print(len(y_test))

# from sklearn.naive_bayes import GaussianNB
# # clf = GaussianNB()

# from sklearn.svm import LinearSVC
# clf = LinearSVC(random_state=0, tol=1e-5)
# clf.fit(x_train, y_train)

# predictions = clf.predict(x_test)

correct = 0
predicted_pos = 0
true_pos = 0
true_neg = 0
false_pos = 0
false_neg = 0

# Count true pos, true neg, etc
for i in range(len(predictions)):
    if predictions[i] == y_test[i]:
        correct += 1
    if predictions[i] == 1:
        predicted_pos += 1
        if y_test[i] == 1:
            true_pos += 1
        else:
            false_pos += 1
    else:
        if y_test[i] == 1:
            false_neg += 1
        else:
            true_neg += 1
              
# Calculate Success Metrics
precision = true_pos / (true_pos + false_pos)
recall = true_pos / (true_pos + false_neg)
f1_score = 2 * (precision * recall) / (precision + recall)
            
# Sanity check
if (true_pos + true_neg + false_pos + false_neg) == len(y_test):
    print("sanity check makes sense!")
        
# Print info
print("accuracy: {}".format((correct / len(y_test))))
print("% predicted pos: {}".format((predicted_pos / len(y_test))))
print("precison: {}".format(precision))
print("recall: {}".format(recall))
print("f1_score: {}".format(f1_score))

0.7773109243697479
1904
1904
sanity check makes sense!
accuracy: 0.7773109243697479
% predicted pos: 0.3755252100840336
precison: 0.7608391608391608
recall: 0.6825595984943539
f1_score: 0.7195767195767196


### Define BiLSTM Model

In [206]:
class BiLSTMPOSTagger(nn.Module):
    # NOTE: you may have to modify these function headers to include your 
    # modification, e.g. adding a parameter for embeddings data

    def __init__(self, embedding_dim, hidden_dim, class_set_size, word_vec_embedding):
#     def __init__(self, embedding_dim, hidden_dim, class_set_size):
        super(BiLSTMPOSTagger, self).__init__()
        #############################################################################
        # TODO: Define and initialize anything needed for the forward pass.
        # You are required to create a model with:
        # an embedding layer: that maps words to the embedding space
        # a BiLSTM layer: that takes word embeddings as input and outputs hidden states
        # a Linear layer: maps from hidden state space to tag space
        #############################################################################
        
        self.embedding_layer = nn.Embedding()
#         self.lstm_layer = nn.LSTM(embedding_dim, 
#                                   hidden_dim, 
#                                   num_layers=LSTM_LAYERS,
#                                   bidirectional = True,
#                                   dropout=DROPOUT
#                                   )
#         self.embedding_layer = word_vec_embedding
        self.lstm_layer = nn.LSTM(embedding_dim, 
                                  hidden_dim, 
                                  num_layers=LSTM_LAYERS,
                                  bidirectional = False,
                                  dropout=DROPOUT
                                  )
        
        # Uncomment line below if using biLSTM instead of LSTM
#         self.linear_layer = nn.Linear(hidden_dim * 2, class_set_size)
        self.linear_layer = nn.Linear(hidden_dim, class_set_size)

        #############################################################################
        #                             END OF YOUR CODE                              #
        #############################################################################

    def forward(self, sentence):
        tag_scores = None
        #############################################################################
        # TODO: Implement the forward pass.
        # Given a tokenized index-mapped sentence as the argument, 
        # find the corresponding scores for tags
        # returns:: tag_scores (Tensor)
        #############################################################################
#         print("Printing sentence shape for debugging: {}".format(sentence.shape))
        embeddings = self.embedding_layer(sentence)
#         print("Print embeddings.shape: {}".format(embeddings.shape))
        embeddings = embeddings.view(len(sentence), 1, -1)
#         print("Printing embeddings shape for debugging: {}".format(embeddings.shape))
        # TODO: Check if we need the other 2 values returned by lstm layer
        output, (h, c) = self.lstm_layer(embeddings)
        
#         print("Exploring forward method")
#         print("output.shape: {}".format(output.shape))
#         print("h.shape: {}".format(h.shape))
#         print("c.shape: {}".format(c.shape))
        
        # tag_scores = self.linear_layer(tag_scores)
        class_scores = self.linear_layer(h.view(1, -1))
        
#         print("Print class_scores: {}".format(class_scores))

        # TODO: Check if this is needed
        # tag_scores = F.log_softmax(tag_scores, dim=1)
        class_scores = F.softmax(class_scores, dim=1)

        #############################################################################
        #                             END OF YOUR CODE                              #
        #############################################################################
        return class_scores

### Train and Test Methods

In [209]:
def train(epoch, model, loss_function, optimizer, word_vec_model, training_data, val_data):
    train_loss = 0
    train_examples = 0
    for sentence, cls in training_data:
        #############################################################################
        # TODO: Implement the training loop
        # Hint: you can use the prepare_sequence method for creating index mappings 
        # for sentences. Find the gradient with respect to the loss and update the
        # model parameters using the optimizer.
        #############################################################################
        # Reset every iteration
        optimizer.zero_grad()
        
        # Convert to vectors
        word_vector_tensor = prepare_sequence(sentence, word_vec_model)
        
        # If word vector tensor is empty, skip this iteration. Move on to next sentence
        if word_vector_tensor == None:
            continue
#         true_tags_tensor = prepare_sequence(tags, word_vec_model)
        true_class_tensor = torch.tensor([cls], dtype=torch.int64)

        # Forward prop
        cls_prediction_tensor = model(word_vector_tensor)
#         predicted_cls_tensor = torch.argmax(cls_prediction_tensor, dim=1)

#         print("Printing cls_prediction_tensor: {}".format(cls_prediction_tensor))
#         print("Printing true_class_tensor: {}".format(true_class_tensor))

        # Backward Prop
#         print("cls: {}".format(cls))
#         print("cls_prediction_tensor.shape: {}".format(cls_prediction_tensor.shape))
#         print("true_class_tensor: {}".format(true_class_tensor))
#         print("cls_prediction_tensor: {}".format(cls_prediction_tensor))
#         print("predicted_cls_tensor: {}".format(predicted_cls_tensor))
#         print("predicted_cls_tensor.dtype: {}".format(predicted_cls_tensor.dtype))
        loss = loss_function(cls_prediction_tensor, true_class_tensor)
      
        # Update
        loss.backward()
        optimizer.step()

        train_loss += loss
#         train_examples += len(tags)
        train_examples += 1

        #############################################################################
        #                             END OF YOUR CODE                              #
        #############################################################################
    avg_train_loss = None
    if train_examples > 0:
        avg_train_loss = train_loss / train_examples
    avg_val_loss, val_accuracy = evaluate(
                                    model, 
                                    loss_function, 
                                    optimizer,
                                    word_vec_model,
                                    val_data
                                )
    
    print("Epoch: {}/{}\tAvg Train Loss: {:.4f}\tAvg Val Loss: {:.4f}\t Val Accuracy: {:.0f}".format(epoch, 
                                                                      EPOCHS, 
                                                                      avg_train_loss, 
                                                                      avg_val_loss,
                                                                      val_accuracy))

def evaluate(model, loss_function, optimizer, word_vec_model, val_data):
  # returns:: avg_val_loss (float)
  # returns:: val_accuracy (float)
    val_loss = 0
    correct = 0
    val_examples = 0
    with torch.no_grad():
        for sentence, cls in val_data:
            #############################################################################
            # TODO: Implement the evaluate loop
            # Find the average validation loss along with the validation accuracy.
            # Hint: To find the accuracy, argmax of tag predictions can be used.
            #############################################################################
            
            # Convert to vectors
            word_vector_tensor = prepare_sequence(sentence, word_vec_model)
#             true_tags_tensor = prepare_sequence(tags, tag_to_idx)
            true_class_tensor = torch.tensor([cls], dtype=torch.int64)

            # Forward prop
            cls_prediction_tensor = model(word_vector_tensor)
            predicted_tags_tensor = torch.argmax(cls_prediction_tensor, dim=1)

            # Analysis
            loss = loss_function(cls_prediction_tensor, true_class_tensor)
            comparison_tensor = predicted_tags_tensor == true_class_tensor
            
            print("predicted_tags_tensor: {}".format(predicted_tags_tensor))
            print("true_class_tensor: {}".format(true_class_tensor))
            import time
            time.sleep(10000)

            val_loss += loss
#             val_examples += len(tags)
            val_examples += 1
            correct += torch.sum(comparison_tensor)

            #############################################################################
            #                             END OF YOUR CODE                              #
            #############################################################################
    val_accuracy = 100. * correct / val_examples
    avg_val_loss = val_loss / val_examples
    return avg_val_loss, val_accuracy


In [197]:
print("predicted_tags_tensor: {}".format(predicted_tags_tensor))
print("true_class_tensor: {}".format(true_class_tensor))

NameError: name 'predicted_tags_tensor' is not defined

### Prepare data for BiLSTM training

In [210]:
clean_text_list = clean_df['text'].tolist()
target = clean_df['label'].tolist()
    
x_train, x_test, y_train, y_test = train_test_split(clean_text_list, target, test_size=0.25, random_state=0)

# Return lists, not iterables. Otherwise need to run this cell over and over again
training_data = []
for x, y in zip(x_train, y_train):
    training_data.append((x, y))
    
val_data = []
for x, y in zip(x_test, y_test):
    val_data.append((x, y))

# x_train = np.nan_to_num(x_train)
# x_test = np.nan_to_num(x_test)

### Define Hyperparameters for BiLSTM

In [211]:
EMBEDDING_DIM = w2v_embedding.embedding_dim
HIDDEN_DIM = 15
# LEARNING_RATE = 0.005
LEARNING_RATE = 0.00005
LSTM_LAYERS = 1
DROPOUT = 0
EPOCHS = 30
CLASS_SET_SIZE = 2 # Only 2 classes of possible tweets
MOMENTUM = 0.9

### Initialize NN, optimizer and loss function object. Start training and evluating

In [212]:
#############################################################################
# TODO: Initialize the model, optimizer and the loss function
#############################################################################

model = BiLSTMPOSTagger(EMBEDDING_DIM, HIDDEN_DIM, CLASS_SET_SIZE, w2v_embedding)
# model = BiLSTMPOSTagger(EMBEDDING_DIM, HIDDEN_DIM, CLASS_SET_SIZE)
# optimizer = torch.optim.Adam(model.parameters(), lr=LEARNING_RATE)
optimizer = torch.optim.SGD(model.parameters(), lr=LEARNING_RATE, momentum=MOMENTUM)

# TODO: Check which loss function to use, check if softmax required in forward()
loss_function = nn.NLLLoss()
# loss_function = F.cross_entropy


#############################################################################
#                             END OF YOUR CODE                              #
#############################################################################
for epoch in range(1, EPOCHS + 1): 
    train(epoch, model, loss_function, optimizer, w2v_model, training_data, val_data)

TypeError: __init__() missing 2 required positional arguments: 'num_embeddings' and 'embedding_dim'

### Cells from here onwards are for random testing. Treat as a Scratchpad

In [19]:
car_index = w2v_model.vocab['car'].index
print(car_index)

input = torch.LongTensor([car_index])
print(embedding(input))


# lookup_tensor = torch.tensor([word_to_ix["hello"]], dtype=torch.long)
# hello_embed = embeds(lookup_tensor)
# print(hello_embed)

385
tensor([[ 0.1309,  0.0084,  0.0334, -0.0588,  0.0400, -0.1426,  0.0493, -0.1689,
          0.2090,  0.1196,  0.1807, -0.2500, -0.1040, -0.1074, -0.0188,  0.0520,
         -0.0022,  0.0645,  0.1445, -0.0454,  0.1611, -0.0161, -0.0309,  0.0845,
          0.1621,  0.0447, -0.1553,  0.2539,  0.3398,  0.0076, -0.2559, -0.0173,
         -0.0330,  0.1631, -0.1260, -0.0991,  0.1650,  0.0688, -0.1895,  0.0283,
         -0.0535, -0.0306,  0.1108,  0.2412, -0.2344,  0.1235, -0.0029,  0.1484,
          0.3320,  0.0525, -0.2002,  0.3770,  0.1226,  0.1143, -0.1768,  0.1001,
          0.0030,  0.2676,  0.2012,  0.0371,  0.1108, -0.0981, -0.3125,  0.0352,
          0.0283,  0.2617, -0.0864, -0.0226, -0.0583, -0.0079,  0.1177, -0.0430,
         -0.1729,  0.0439, -0.2305,  0.1641, -0.1147, -0.0603,  0.0120, -0.2471,
          0.3262, -0.0449, -0.1143,  0.2285, -0.0165, -0.1504, -0.1318,  0.1260,
         -0.1748,  0.0221, -0.1016,  0.0082,  0.1079, -0.2461, -0.1094, -0.0938,
         -0.0162, -0.202

In [20]:
print(model['car'])

[ 0.13085938  0.00842285  0.03344727 -0.05883789  0.04003906 -0.14257812
  0.04931641 -0.16894531  0.20898438  0.11962891  0.18066406 -0.25
 -0.10400391 -0.10742188 -0.01879883  0.05200195 -0.00216675  0.06445312
  0.14453125 -0.04541016  0.16113281 -0.01611328 -0.03088379  0.08447266
  0.16210938  0.04467773 -0.15527344  0.25390625  0.33984375  0.00756836
 -0.25585938 -0.01733398 -0.03295898  0.16308594 -0.12597656 -0.09912109
  0.16503906  0.06884766 -0.18945312  0.02832031 -0.0534668  -0.03063965
  0.11083984  0.24121094 -0.234375    0.12353516 -0.00294495  0.1484375
  0.33203125  0.05249023 -0.20019531  0.37695312  0.12255859  0.11425781
 -0.17675781  0.10009766  0.0030365   0.26757812  0.20117188  0.03710938
  0.11083984 -0.09814453 -0.3125      0.03515625  0.02832031  0.26171875
 -0.08642578 -0.02258301 -0.05834961 -0.00787354  0.11767578 -0.04296875
 -0.17285156  0.04394531 -0.23046875  0.1640625  -0.11474609 -0.06030273
  0.01196289 -0.24707031  0.32617188 -0.04492188 -0.114257