# Tweet Analysis - W266 Final Project

In [1]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split

In [2]:
# Install a few python packages using pip
from w266_common import utils
utils.require_package("wget")      # for fetching dataset

In [3]:
# Standard python helper libraries.
import os, sys, re, json, time
import itertools, collections
from importlib import reload
from IPython.display import display

# NumPy and SciPy for matrix ops
import numpy as np
import scipy.sparse

# NLTK for NLP utils
import nltk

# Helper libraries
from w266_common import utils, vocabulary, tf_embed_viz

In [4]:
# Keras libraries
from nltk.corpus import stopwords
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.utils.np_utils import to_categorical
from sklearn.preprocessing import LabelEncoder
from keras import models
from keras import layers
import keras

## Import Data

In [5]:
df = pd.read_csv("tweet_data.csv")
df.head()

Unnamed: 0,Emotion,Content,Original Content
0,disappointed,oh fuck did i wrote fil grinningfacewithsweat ...,b'RT @Davbingodav: @mcrackins Oh fuck.... did ...
1,disappointed,i feel nor am i shamed by it,i feel nor am i shamed by it
2,disappointed,i had been feeling a little bit defeated by th...,i had been feeling a little bit defeated by th...
3,happy,imagine if that reaction guy that called jj kf...,"b""@KSIOlajidebt imagine if that reaction guy t..."
4,disappointed,i wouldnt feel burdened so that i would live m...,i wouldnt feel burdened so that i would live m...


In [6]:
X = df.Content.to_numpy()
y = df.Emotion.to_numpy()

In [7]:
print(X.shape)
print(y.shape)

(916575,)
(916575,)


In [8]:
# Some starting variables
vocab_size = 10000
max_length = 40

In [9]:
# First split the data into train and test
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.10, random_state=42)

# Next split the train data into train and dev data
X_train, X_dev, y_train, y_dev = train_test_split(X_train, y_train, test_size=0.33, random_state=42)

In [10]:
print("Train data shape:  {}".format(X_train.shape))
print("Dev data shape:    {}".format(X_dev.shape))
print("Test data shape:   {}".format(X_test.shape))
print("Train Label shape: {}".format(y_train.shape))
print("Dev label shape:   {}".format(y_dev.shape))
print("Test label shape:  {}".format(y_test.shape))


Train data shape:  (552694,)
Dev data shape:    (272223,)
Test data shape:   (91658,)
Train Label shape: (552694,)
Dev label shape:   (272223,)
Test label shape:  (91658,)


## Single Layer Perceptron

First, word embeddings. Will use default keras embeddings, i guess

In [11]:
# Tokenizing
tk = Tokenizer(num_words = vocab_size, filters='!"#$%&()*+,-./:;<=>?@[\]^_`{"}~\t\n', lower=True, split = " ")
tk.fit_on_texts(X_train)

X_train_seq = tk.texts_to_sequences(X_train)
X_train_seq_trunc = pad_sequences(X_train_seq, maxlen=max_length)

X_dev_seq = tk.texts_to_sequences(X_dev)
X_dev_seq_trunc = pad_sequences(X_dev_seq, maxlen=max_length)

# Encoding output variable
le = LabelEncoder()

y_train_le = le.fit_transform(y_train)
y_train_emb = to_categorical(y_train_le)

y_dev_le = le.transform(y_dev)
y_dev_emb = to_categorical(y_dev_le)

In [21]:
emb_model = models.Sequential()
emb_model.add(layers.Embedding(vocab_size, 8, input_length=max_length, embeddings_regularizer='l1'))
emb_model.add(layers.Flatten())
emb_model.add(layers.Dense(3, activation='relu'))
emb_model.compile(optimizer='adam', loss = 'categorical_crossentropy', metrics=['accuracy'])

In [14]:
emb_model.summary()

Model: "sequential"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding (Embedding)        (None, 40, 8)             80000     
_________________________________________________________________
flatten (Flatten)            (None, 320)               0         
_________________________________________________________________
dense (Dense)                (None, 3)                 963       
Total params: 80,963
Trainable params: 80,963
Non-trainable params: 0
_________________________________________________________________


In [23]:
emb_model.reset_states()
emb_model.fit(X_train_seq_trunc, y_train_emb, epochs=10)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<tensorflow.python.keras.callbacks.History at 0x157b8da0f88>

In [15]:
results = emb_model.evaluate(X_dev_seq_trunc, y_dev_emb)
print("test loss, test acc:", results)

test loss, test acc: [nan, 0.32867172360420227]
