# Tweet Analysis - W266 Final Project

In [32]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split

In [33]:
# Install a few python packages using pip
from w266_common import utils
utils.require_package("wget")      # for fetching dataset

In [34]:
# Standard python helper libraries.
import os, sys, re, json, time
import itertools, collections
from importlib import reload
from IPython.display import display

# NumPy and SciPy for matrix ops
import numpy as np
import scipy.sparse

# NLTK for NLP utils
import nltk

# Helper libraries
from w266_common import utils, vocabulary, tf_embed_viz

In [35]:
# Keras libraries
from nltk.corpus import stopwords
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.utils.np_utils import to_categorical
from sklearn.preprocessing import LabelEncoder
from keras import models
from keras import layers
import keras

## Import Data

In [36]:
df = pd.read_csv("tweet_data.csv")
df.head()

Unnamed: 0,Emotion,Content,Original Content
0,disappointed,oh fuck did i wrote fil grinningfacewithsweat ...,b'RT @Davbingodav: @mcrackins Oh fuck.... did ...
1,disappointed,i feel nor am i shamed by it,i feel nor am i shamed by it
2,disappointed,i had been feeling a little bit defeated by th...,i had been feeling a little bit defeated by th...
3,happy,imagine if that reaction guy that called jj kf...,"b""@KSIOlajidebt imagine if that reaction guy t..."
4,disappointed,i wouldnt feel burdened so that i would live m...,i wouldnt feel burdened so that i would live m...


In [37]:
X = df['Original Content'].to_numpy()
y = df.Emotion.to_numpy()

In [38]:
print(X.shape)
print(y.shape)

(916575,)
(916575,)


In [39]:
# Some starting variables
vocab_size = 10000
max_length = 40

In [40]:
# First split the data into train and test
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.10, random_state=42)

# Next split the train data into train and dev data
X_train, X_dev, y_train, y_dev = train_test_split(X_train, y_train, test_size=0.33, random_state=42)

In [41]:
print("Train data shape:  {}".format(X_train.shape))
print("Dev data shape:    {}".format(X_dev.shape))
print("Test data shape:   {}".format(X_test.shape))
print("Train Label shape: {}".format(y_train.shape))
print("Dev label shape:   {}".format(y_dev.shape))
print("Test label shape:  {}".format(y_test.shape))


Train data shape:  (552694,)
Dev data shape:    (272223,)
Test data shape:   (91658,)
Train Label shape: (552694,)
Dev label shape:   (272223,)
Test label shape:  (91658,)


In [42]:
# Tokenizing
tk = Tokenizer(num_words = vocab_size, filters='!"#$%&()*+,-./:;<=>?@[\]^_`{"}~\t\n', lower=True, split = " ")
tk.fit_on_texts(X_train)

X_train_seq = tk.texts_to_sequences(X_train)
X_train_seq_trunc = pad_sequences(X_train_seq, maxlen=max_length)

X_dev_seq = tk.texts_to_sequences(X_dev)
X_dev_seq_trunc = pad_sequences(X_dev_seq, maxlen=max_length)

X_test_seq = tk.texts_to_sequences(X_test)
X_test_seq_trunc = pad_sequences(X_test_seq, maxlen=max_length)

# Encoding output variable
le = LabelEncoder()

y_train_le = le.fit_transform(y_train)
y_train_emb = to_categorical(y_train_le)

y_dev_le = le.transform(y_dev)
y_dev_emb = to_categorical(y_dev_le)

y_test_le = le.transform(y_test)
y_test_emb = to_categorical(y_test_le)

In [43]:
# Use these for training!
X_train_final = X_train_seq_trunc
X_dev_final = X_dev_seq_trunc
X_test_final = X_test_seq_trunc

y_train_final = y_train_emb
y_dev_final = y_dev_emb
y_test_final = y_test_emb


## Single Layer Perceptron

First, word embeddings. Will use default keras embeddings, i guess

In [44]:
emb_model = models.Sequential()
emb_model.add(layers.Embedding(vocab_size, 8, input_length=max_length, embeddings_regularizer='l1'))
emb_model.add(layers.Flatten())
emb_model.add(layers.Dense(3, activation='relu'))
emb_model.compile(optimizer='adam', loss = 'categorical_crossentropy', metrics=['accuracy'])

In [45]:
emb_model.summary()

Model: "sequential_1"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_1 (Embedding)      (None, 40, 8)             80000     
_________________________________________________________________
flatten_1 (Flatten)          (None, 320)               0         
_________________________________________________________________
dense_1 (Dense)              (None, 3)                 963       
Total params: 80,963
Trainable params: 80,963
Non-trainable params: 0
_________________________________________________________________


In [46]:
emb_model.reset_states()
emb_model.fit(X_train_final, y_train_final, epochs=10)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<tensorflow.python.keras.callbacks.History at 0x2361c95c188>

In [47]:
results = emb_model.evaluate(X_dev_final, y_dev_final)
print("test loss, test acc:", results)

test loss, test acc: [nan, 0.32867172360420227]


# Original Text with BERT

In [49]:
import os
import sys
import tensorflow as tf
import io
import re

import pickle
from csv import reader
import matplotlib.pyplot as plt

import matplotlib.pyplot as plt
from matplotlib import colors
from matplotlib.ticker import PercentFormatter

from tensorflow.keras import layers
from tensorflow.keras.backend import sparse_categorical_crossentropy
from tensorflow.keras.layers import Dense, Flatten

from datetime import datetime

In [51]:
tf.get_logger().setLevel("ERROR") 
from transformers import BertTokenizer, TFBertModel

In [52]:
tokenizer = BertTokenizer.from_pretrained('bert-base-cased')

HBox(children=(FloatProgress(value=0.0, description='Downloading', max=213450.0, style=ProgressStyle(descripti…




HBox(children=(FloatProgress(value=0.0, description='Downloading', max=29.0, style=ProgressStyle(description_w…




HBox(children=(FloatProgress(value=0.0, description='Downloading', max=435797.0, style=ProgressStyle(descripti…




In [53]:
def addWord(word, pos, ner):
    """
    Convert a word into a word token and add supplied NER and POS labels. Note that the word can be  
    tokenized to two or more tokens. Correspondingly, we add - for now - custom 'X' tokens to the labels in order to 
    maintain the 1:1 mappings between word tokens and labels.
    
    arguments: word, pos label, ner label
    returns: dictionary with tokens and labels
    """
    # the dataset contains various '"""' combinations which we choose to truncate to '"', etc. 
    if word == '""""':
        word = '"'
    elif word == '``':
        word = '`'
        
    tokens = tokenizer.tokenize(word)
    tokenLength = len(tokens)      # find number of tokens corresponfing to word to later add 'X' tokens to labels
    
    addDict = dict()
    
    addDict['wordToken'] = tokens
    addDict['posToken'] = [pos] + ['posX'] * (tokenLength - 1)
    addDict['nerToken'] = [ner] + ['nerX'] * (tokenLength - 1)
    addDict['tokenLength'] = tokenLength
    
    
    return addDict


In [56]:
"""
Read the file line by line and construct sentences. A sentence end is marked by the word 'sentence' in the next row.
You need to take care of that. Also, you need to cap sentence length using max_length. Sentences which are shorter than 
max_length need to be padded. Also, we choose to end all sentences with a [SEP] token, padded or not. 
"""

with io.open('tweet_data.csv', 'r', encoding='utf-8', errors='ignore') as train:
    text = train.readlines()


# lists for sentences, tokens, labels, etc.  
sentenceList = []
sentenceTokenList = []
posTokenList = []
nerTokenList = []
sentLengthList = []

# lists for BERT input
bertSentenceIDs = []
bertMasks = []
bertSequenceIDs = []

sentence = ''

# always start with [CLS] tokens
sentenceTokens = ['[CLS]']
posTokens = ['[posCLS]']
nerTokens = ['[nerCLS]']

for line in text:
    
    cleanLine = re.sub(r'(?!(([^"]*"){2})*[^"]*$),', '', line)  # deal with '"10,000"' and convert them to '10000' 

    sent, word, pos = cleanLine.split(',')
    
    ner = ner[:-1]   # remove DOS token
    
    # if new sentence starts
    if (sent[:8] == 'Sentence'):            
            
        sentenceLength = min(max_length -1, len(sentenceTokens))
        sentLengthList.append(sentenceLength)
        
                    
        # Create space for at least a final '[SEP]' token
        if sentenceLength >= max_length - 1: 
            sentenceTokens = sentenceTokens[:max_length - 2]
            posTokens = posTokens[:max_length - 2]
            nerTokens = nerTokens[:max_length - 2]

        # add a ['SEP'] token and padding
        
        sentenceTokens += ['[SEP]'] + ['[PAD]'] * (max_length -1 - len(sentenceTokens))
        
        posTokens += ['[posSEP]'] + ['[posPAD]'] * (max_length - 1 - len(posTokens) )
        nerTokens += ['[nerSEP]'] + ['[nerPAD]'] * (max_length - 1 - len(nerTokens) )
            
        sentenceList.append(sentence)

        sentenceTokenList.append(sentenceTokens)

        bertSentenceIDs.append(tokenizer.convert_tokens_to_ids(sentenceTokens))
        bertMasks.append([1] * (sentenceLength + 1) + [0] * (max_length -1 - sentenceLength ))
        bertSequenceIDs.append([0] * (max_length))
                             
        posTokenList.append(posTokens)
        nerTokenList.append(nerTokens)
        
        sentence = ''
        sentenceTokens = ['[CLS]']
        posTokens = ['[posCLS]']
        nerTokens = ['[nerCLS]']
        
        sentence += ' ' + word

    addDict = addWord(word, pos, ner)

    sentenceTokens += addDict['wordToken']
    posTokens += addDict['posToken']
    nerTokens += addDict['nerToken']

# The first two list elements need to be removed. 1st line in file is a-typical, and 2nd line does not end a sentence   
sentLengthList = sentLengthList[2:]
sentenceTokenList = sentenceTokenList[2:]
bertSentenceIDs = bertSentenceIDs[2:]
bertMasks = bertMasks[2:]
bertSequenceIDs = bertSequenceIDs[2:]
posTokenList = posTokenList[2:]
nerTokenList = nerTokenList[2:]

NameError: name 'ner' is not defined