# Prepatation

Let's first import the necessary standard libraries, load in the data from a csv file, and set some variables.

In [1]:
import os
import datetime
import numpy as np
import pandas as pd

In [2]:
# loading data from csv file

data_path = './data/tweets.csv'

data = pd.read_csv(data_path, usecols=[0,5], encoding='utf-8', names=['sentiments', 'tweets'])
data = data.sample(frac=.10)  # shuffle the tweets

data.sentiments = [0 if x==0 else 1 for x in data.sentiments]  # 
data[:10]



Unnamed: 0,sentiments,tweets
352686,1,@BellsCullen1901 hey i may not be right. dont ...
276608,1,follow jesszlatos here on twitter!!! please?
392841,1,FORGET FUN FUCK FEAR
367683,1,@CzarinaE i agree
138535,0,i hate my sleeping patterns .. i can never go ...
852,0,"@MonaSmith sadly, yes. i think i need councili..."
315240,1,HMV Shinjuku
306734,1,Office til around 6 today. Good day yesterday....
372272,1,Felt one of the longest (yet gentlest) earthqu...
144809,0,"@DianeMorgan00 lovely, but undressable I'm t..."


In [3]:
new_data_path = "./data/tweets_short.csv"

data.to_csv(new_data_path, header=True, index=False, encoding='utf-8')
data = pd.read_csv(new_data_path, encoding='utf-8')
data[:10]

Unnamed: 0,sentiments,tweets
0,1,@BellsCullen1901 hey i may not be right. dont ...
1,1,follow jesszlatos here on twitter!!! please?
2,1,FORGET FUN FUCK FEAR
3,1,@CzarinaE i agree
4,0,i hate my sleeping patterns .. i can never go ...
5,0,"@MonaSmith sadly, yes. i think i need councili..."
6,1,HMV Shinjuku
7,1,Office til around 6 today. Good day yesterday....
8,1,Felt one of the longest (yet gentlest) earthqu...
9,0,"@DianeMorgan00 lovely, but undressable I'm t..."


In [4]:
#setting some variables

vocab_size = 20000
maxlen = 42
batch_size = 32
embedding_dim = 25
epochs = 2

# TensorFlow 2.0 & Keras

In [5]:
import tensorflow as tf
from tensorflow.keras.preprocessing.text import Tokenizer


We'll use Keras' tokenizer for this example. The goal is to turn each tweet into an array of integers, with each integer representing one word of the vocabulary. In addition, we must pad shorter arrays so that they all have uniform length. That makes it easier to process them in batches.

In [6]:
# instatiate keras tokenizer
tokenizer = Tokenizer(vocab_size, oov_token='<00v>')
tokenizer.fit_on_texts(data.tweets)

sequences = tokenizer.texts_to_sequences(data.tweets)

In [7]:
# padding the sequences so that they have unoform dimension

padded = tf.keras.preprocessing.sequence.pad_sequences(sequences, maxlen=maxlen)
padded.shape

(40000, 42)

In [8]:
X_data = padded
y_data = np.array(data.sentiments)

In [None]:
dataset = tf.data.Dataset.from_tensor_slices(dict(data))

for feature_batch in dataset.take(1):
    for key, value in feature_batch.items():
        print("  {!r:20s}: {}".format(key, value))
        

dataset = tf.data.experimental.make_csv_dataset(data_path, batch_size=32, header=False, select_columns=[0,5])

for feature_batch in dataset.take(1):
    for key, value in feature_batch.items():
        print("  {!r:20s}: {}".format(key, value))
        
dataset = dataset.map(lambda *items: tokenizer.texts_to_sequences(items))

The model is a simple two-layer neural network with an additional embedding layer. In Keras, the input dimension need not be determined, the model.compile() method is able to figure that out by itself.

In [20]:
# with Keras Sequential API

model = tf.keras.Sequential([
    tf.keras.layers.Embedding(vocab_size, embedding_dim, input_length=maxlen, mask_zero=True),
    tf.keras.layers.Flatten(),
    tf.keras.layers.Dense(128, activation='relu'),
    tf.keras.layers.Dense(1, activation='sigmoid')])

model.compile(loss='binary_crossentropy', optimizer='Adam', metrics=['accuracy'])
model.summary()

Model: "sequential_6"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_9 (Embedding)      (None, 42, 25)            500000    
_________________________________________________________________
flatten_9 (Flatten)          (None, 1050)              0         
_________________________________________________________________
dense_18 (Dense)             (None, 128)               134528    
_________________________________________________________________
dense_19 (Dense)             (None, 1)                 129       
Total params: 634,657
Trainable params: 634,657
Non-trainable params: 0
_________________________________________________________________


In [None]:
model.fit(x=X_data, y=y_data, batch_size=batch_size, epochs=epochs, verbose=2, validation_split=0.2)

In [25]:
# with Keras' Subclassing API

class Subclass_Model(tf.keras.Model):
    
    def __init__(self, embedding_dim=25):
        
        super(Subclass_Model, self).__init__()
        self.embedding_layer = tf.keras.layers.Embedding(vocab_size, embedding_dim, input_length=maxlen, mask_zero=True)
        self.flatten_layer = tf.keras.layers.Flatten()
        self.fc1_layer =  tf.keras.layers.Dense(128, activation='relu')
        self.fc2_layer =  tf.keras.layers.Dense(1, activation='sigmoid')
        
    def call(self, inputs):
        
        x = self.embedding_layer(inputs)
        x = self.flatten_layer(x)
        x = self.fc1_layer(x)
        return self.fc2_layer(x)
        
model = Subclass_Model()

model.compile(loss='binary_crossentropy', optimizer='Adam', metrics=['accuracy'])

model.build((42,))  # must be called in this case to print a summary
model.summary()

Model: "subclass__model_6"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_14 (Embedding)     multiple                  500000    
_________________________________________________________________
flatten_14 (Flatten)         multiple                  0         
_________________________________________________________________
dense_28 (Dense)             multiple                  3328      
_________________________________________________________________
dense_29 (Dense)             multiple                  129       
Total params: 503,457
Trainable params: 503,457
Non-trainable params: 0
_________________________________________________________________


In [13]:
# with Keras Functional API

inputs = tf.keras.layers.Input(shape=(42,))
x = tf.keras.layers.Embedding(vocab_size, embedding_dim, input_length=maxlen, mask_zero=True)(inputs)
x = tf.keras.layers.Flatten()(x)
x = tf.keras.layers.Dense(128, activation='relu')(x)
outputs = tf.keras.layers.Dense(1, activation='sigmoid')(x)

model = tf.keras.models.Model(inputs=inputs, outputs=outputs)
model.compile(loss='binary_crossentropy', optimizer='Adam', metrics=['accuracy'])
model.summary()

Model: "model"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_1 (InputLayer)         [(None, 42)]              0         
_________________________________________________________________
embedding_2 (Embedding)      (None, 42, 25)            500000    
_________________________________________________________________
flatten_2 (Flatten)          (None, 1050)              0         
_________________________________________________________________
dense_4 (Dense)              (None, 128)               134528    
_________________________________________________________________
dense_5 (Dense)              (None, 1)                 129       
Total params: 634,657
Trainable params: 634,657
Non-trainable params: 0
_________________________________________________________________


In [None]:
model.fit(x=X_data,
          y=y_data,
          batch_size=32,
          epochs=5,
          verbose=2,
          validation_split=0.2)

In [26]:
tf.keras.utils.plot_model(model, to_file='model.png', show_shapes=False, show_layer_names=True, rankdir='TB', expand_nested=False, dpi=96)

Failed to import pydot. You must install pydot and graphviz for `pydotprint` to work.


In [None]:
%%time

# fitting the model

model.fit(x=X_data, y=y_data, batch_size=batch_size, epochs=epochs, verbose=2, validation_split=0.2)

# PyTorch

In [None]:
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim

from torch.utils.data import Dataset, DataLoader

In [None]:
#setting some variables

vocab_size = 20000
maxlen = 42
batch_size = 32
embedding_dim = 25
epochs = 2

In [None]:
X_data = torch.LongTensor(padded)
y_data = torch.FloatTensor(np.array(data.sentiments))

PyTorch does not have a fit method like TensorFlow, which is why you need to build the training loop yourself. This is less of a worry than it sounds.

You also need to create a way to process data in batches, which is best done by creating a custom PyTorch's Dataset and pass it to the DataLoader function.

In [None]:
class MyDataset(Dataset):
    
    def __init__(self, tweets, sentiments):
        self.tweets = tweets
        self.sentiments = sentiments
        
    def __getitem__(self, index):
        
        sample = {"tweets":self.tweets[index,:], "sentiments":self.sentiments[index]}
        
        return sample
        
    def __len__(self):
        return self.tweets.shape[0]
    
    
tweets_dataset = MyDataset(X_data, y_data)

dataloader = DataLoader(tweets_dataset, batch_size=batch_size,
                        shuffle=True)

Creating a neural network in PyTorch is very similar to TensorFlow. The nn.sequential method is helpful if you want to create a rather simple standard architecture. Unlike TensorFlow, you need to specify the input dimensions of each layer.

In [None]:
# with PyTorch nn.Sequential

model = nn.Sequential(
    nn.Embedding(vocab_size, embedding_dim),
    nn.Flatten(),
    nn.Linear(maxlen*embedding_dim, 128),
    nn.ReLU(),
    nn.Linear(128, 1),
    nn.Sigmoid()
)

In [None]:
# with PyTorch Subclassing

class Model(nn.Module):
    
    def __init__(self, embedding_dim=25):
        super(Model, self).__init__()
        self.embedding_layer = nn.Embedding(vocab_size, embedding_dim)
        self.flatten_layer = nn.Flatten()
        self.fc1_layer = nn.Linear(maxlen*embedding_dim, 128)
        self.fc2_layer = nn.Linear(128,1)
        
    
    def forward(self, inputs):
        
        x = self.embedding_layer(inputs)
        x = self.flatten_layer(x)
        x = F.relu(self.fc1_layer(x))
        
        return torch.sigmoid(self.fc2_layer(x))

In [None]:
def fit(model, dataloader, epochs=5):
    
    criterion = nn.BCELoss()
    optimizer = optim.Adam(model.parameters())
        
    for epoch in range(epochs):
    
        epoch_loss = 0
        
        for sample in dataloader:

            X = sample["tweets"]
            y = sample["sentiments"]

            optimizer.zero_grad()

            out = model(X)
            loss = criterion(out, y)

            loss.backward()
            optimizer.step()
            
            epoch_loss += loss.item()
        
        print(epoch_loss)

In [None]:
pytorch_model = Model()

fit(model, dataloader, epochs=5)

In [None]:
# model with CNN tbd

embedding_dim = 64
max_seq_len = 46
vocab_size = 20000
n_filters = 128
kernel_size = 5
batch_size = 32

model = nn.Sequential(
    nn.Embedding(vocab_size, embedding_dim),
    nn.Conv1d(max_seq_len, embedding_dim, kernel_size),
    nn.MaxPool1d(46), 
    nn.Linear(46, 64),
    nn.ReLU(),
    nn.Linear(64, 32),
    nn.ReLU(),
    nn.Linear(32, 1),
    nn.Sigmoid()
)