# TensorFlow 2.0

In [1]:
import tensorflow as tf
from tensorflow.keras.preprocessing.text import Tokenizer
import tensorflow_datasets as tfds

import os
import datetime
import numpy as np
import pandas as pd

In [2]:
# Read the tweet CSV, keeping only column 0 (label) and column 5 (tweet text).

data_path = './data/tweets.csv'

data = pd.read_csv(
    data_path,
    usecols=[0, 5],
    names=['sentiments', 'tweets'],
    encoding='utf-8',
)
# Binarise the labels: 0 stays 0, every other value becomes 1.
data['sentiments'] = [int(label != 0) for label in data['sentiments']]

In [3]:
# Instantiate the Keras tokenizer, fit it on the tweets and encode every
# tweet as a sequence of integer word indices.

vocab_size = 20000

tokenizer = Tokenizer(num_words=vocab_size, oov_token='<00v>')
tokenizer.fit_on_texts(data['tweets'])

sequences = tokenizer.texts_to_sequences(data['tweets'])

In [4]:
# Pad the sequences so that they all share one uniform length.

padded = tf.keras.preprocessing.sequence.pad_sequences(
    sequences,
    padding='pre',  # the default: zeros are inserted in front of each sequence
)
padded.shape

(400000, 46)

In [85]:
# Create disjoint train and test sets.

test_size = 30000
batch_size = 32

# Split on a fixed random permutation of the row indices so that no example
# can land in both sets (shuffling a tf.data pipeline and slicing it afterwards
# would re-draw the split on every pass over the data).
labels = np.asarray(data.sentiments)
shuffled_idx = np.random.RandomState(42).permutation(padded.shape[0])
test_idx, train_idx = shuffled_idx[:test_size], shuffled_idx[test_size:]

# Full shuffled dataset, kept for ad-hoc inspection of individual batches.
dataset = tf.data.Dataset.from_tensor_slices((padded, data.sentiments)).shuffle(padded.shape[0])

train_dataset = (
    tf.data.Dataset.from_tensor_slices((padded[train_idx], labels[train_idx]))
    .shuffle(len(train_idx))
    .batch(batch_size, drop_remainder=True)
    .repeat(5)
)
test_dataset = (
    tf.data.Dataset.from_tensor_slices((padded[test_idx], labels[test_idx]))
    .batch(batch_size, drop_remainder=True)
)

# Iterators for the generator-based training calls. The test iterator repeats
# so that it cannot run dry when validation is performed after every epoch.
train_generator = iter(train_dataset)
test_generator = iter(test_dataset.repeat())

In [86]:
# NOTE(review): train_dataset is already batched, so batching it again adds a
# second batch axis (see the rank-3 element shape in the output below).
train_dataset.repeat(4).batch(32)

<BatchDataset shapes: ((None, 32, 46), (None, 32)), types: (tf.int32, tf.int32)>

In [60]:
# Pull a single batch of 32 (sequence, label) pairs to inspect shapes and dtypes.
next(iter(dataset.batch(32)))

(<tf.Tensor: id=65050, shape=(32, 46), dtype=int32, numpy=
 array([[   0,    0,    0, ...,  193,  572,   50],
        [   0,    0,    0, ...,  108, 1223, 2614],
        [   0,    0,    0, ...,   18,   73,  517],
        ...,
        [   0,    0,    0, ..., 1433,    9, 1584],
        [   0,    0,    0, ...,  134,    1, 8353],
        [   0,    0,    0, ..., 2052, 3120,  247]])>,
 <tf.Tensor: id=65051, shape=(32,), dtype=int32, numpy=
 array([1, 1, 1, 1, 0, 0, 1, 1, 1, 0, 1, 1, 1, 0, 0, 1, 1, 0, 1, 0, 1, 1,
        1, 0, 0, 0, 1, 1, 0, 0, 0, 1])>)

In [5]:
# Baseline: embedding -> flatten -> small dense classifier with one sigmoid output.
embedding_dim = 32
max_seq_len = 46

model = tf.keras.Sequential()
model.add(tf.keras.layers.Embedding(vocab_size, embedding_dim, input_length=max_seq_len))
model.add(tf.keras.layers.Flatten())
model.add(tf.keras.layers.Dense(24, activation='relu'))
model.add(tf.keras.layers.Dense(1, activation='sigmoid'))

model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])

model.summary()

Model: "sequential"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding (Embedding)        (None, 46, 32)            640000    
_________________________________________________________________
flatten (Flatten)            (None, 1472)              0         
_________________________________________________________________
dense (Dense)                (None, 24)                35352     
_________________________________________________________________
dense_1 (Dense)              (None, 1)                 25        
Total params: 675,377
Trainable params: 675,377
Non-trainable params: 0
_________________________________________________________________


In [18]:
# Log every run to its own timestamped TensorBoard directory.
logdir = os.path.join("logs", datetime.datetime.now().strftime("%Y%m%d-%H%M%S"))
tensorboard_callback = tf.keras.callbacks.TensorBoard(logdir, histogram_freq=1)

# Train directly on the in-memory arrays. The callback has to be passed
# explicitly, otherwise nothing is ever written to logdir.
model.fit(x=padded, y=np.array(data.sentiments), batch_size=32, epochs=5, verbose=2,
          callbacks=[tensorboard_callback])

Train on 400000 samples
Epoch 1/5


KeyboardInterrupt: 

In [None]:
# model with global average pooling 1d

embedding_dim = 32
max_seq_len = 46
vocab_size = 20000

model = tf.keras.Sequential([
    tf.keras.layers.Embedding(vocab_size, embedding_dim, input_length=max_seq_len),
    tf.keras.layers.GlobalAveragePooling1D(),
    tf.keras.layers.Dense(16, activation='relu'),
    # Each tweet carries a single 0/1 label, so binary cross-entropy needs
    # exactly one sigmoid unit (two units would not match the label shape).
    tf.keras.layers.Dense(1, activation='sigmoid')])

model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])

model.summary()

In [None]:
# NOTE(review): needs `train_generator` and `test_dataset` to be defined by the
# data-preparation cell, and `tensorboard_callback` by an earlier training cell.
model.fit_generator(
    train_generator,
    steps_per_epoch=100,
    epochs=5,
    verbose=2,
    callbacks=[tensorboard_callback],
    validation_data=test_dataset,
)

In [None]:
# model with CNN

embedding_dim = 32
max_seq_len = 46
vocab_size = 20000
n_filters = 128
kernel_size = 5

model = tf.keras.Sequential([
    tf.keras.layers.Embedding(vocab_size, embedding_dim, input_length=max_seq_len),
    tf.keras.layers.Conv1D(n_filters, kernel_size, activation='relu', padding='same'),
    tf.keras.layers.GlobalAveragePooling1D(),
    tf.keras.layers.Dense(24, activation='relu'),
    # Each tweet carries a single 0/1 label, so binary cross-entropy needs
    # exactly one sigmoid unit (two units would not match the label shape).
    tf.keras.layers.Dense(1, activation='sigmoid')])

model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])

model.summary()

In [None]:
# NOTE(review): needs `train_dataset` and `test_dataset` to be defined by the
# data-preparation cell, and `tensorboard_callback` by an earlier training cell.
model.fit(
    train_dataset,
    epochs=5,
    verbose=2,
    callbacks=[tensorboard_callback],
    validation_data=test_dataset,
)

In [72]:
# Recurrent model: embedding -> bidirectional GRU -> dense head with one sigmoid output.

embedding_dim = 32
max_seq_len = 46
vocab_size = 20000
n_filters = 128
kernel_size = 5

model = tf.keras.Sequential()
model.add(tf.keras.layers.Embedding(vocab_size, embedding_dim, input_length=max_seq_len))
# The GRU width reuses embedding_dim; both directions are concatenated.
model.add(tf.keras.layers.Bidirectional(tf.keras.layers.GRU(embedding_dim)))
model.add(tf.keras.layers.Dense(24, activation='relu'))
model.add(tf.keras.layers.Dense(1, activation='sigmoid'))

model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])

model.summary()

Model: "sequential_9"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_9 (Embedding)      (None, 46, 32)            640000    
_________________________________________________________________
bidirectional_1 (Bidirection (None, 64)                12672     
_________________________________________________________________
dense_18 (Dense)             (None, 24)                1560      
_________________________________________________________________
dense_19 (Dense)             (None, 1)                 25        
Total params: 654,257
Trainable params: 654,257
Non-trainable params: 0
_________________________________________________________________


In [73]:
# Log every run to its own timestamped TensorBoard directory.
logdir = os.path.join("logs", datetime.datetime.now().strftime("%Y%m%d-%H%M%S"))
tensorboard_callback = tf.keras.callbacks.TensorBoard(logdir, histogram_freq=1)

# NOTE(review): needs `train_generator` and `test_generator` to be defined by
# the data-preparation cell.
# The callback has to be passed explicitly, otherwise nothing is written to logdir.
model.fit_generator(train_generator, epochs=5, verbose=2, validation_data=test_generator,
                    steps_per_epoch=100, validation_steps=50,
                    callbacks=[tensorboard_callback])

Epoch 1/5
100/100 - 28s - loss: 0.6814 - accuracy: 0.5794 - val_loss: 0.6100 - val_accuracy: 0.6762
Epoch 2/5
100/100 - 26s - loss: 0.5752 - accuracy: 0.7066 - val_loss: 0.5367 - val_accuracy: 0.7350
Epoch 3/5
100/100 - 30s - loss: 0.5241 - accuracy: 0.7469 - val_loss: 0.5455 - val_accuracy: 0.7337
Epoch 4/5
100/100 - 26s - loss: 0.5254 - accuracy: 0.7362 - val_loss: 0.4959 - val_accuracy: 0.7663
Epoch 5/5
100/100 - 27s - loss: 0.5191 - accuracy: 0.7472 - val_loss: 0.5096 - val_accuracy: 0.7613


<tensorflow.python.keras.callbacks.History at 0x191231ae4c8>

In [74]:
# model with two layers of bidirectional LSTM

# Defined here explicitly so the cell does not depend on whichever earlier cell
# last assigned embedding_dim; 32 matches the 640,000 embedding parameters
# (20000 * 32) reported in the summary.
embedding_dim = 32
embedding_dim1 = 64  # units of the first LSTM (per direction)
embedding_dim2 = 32  # units of the second LSTM (per direction)
max_seq_len = 46
vocab_size = 20000
n_filters = 128
kernel_size = 5

model = tf.keras.Sequential([
    tf.keras.layers.Embedding(vocab_size, embedding_dim, input_length=max_seq_len),
    # return_sequences=True keeps the per-timestep outputs for the next LSTM.
    tf.keras.layers.Bidirectional(tf.keras.layers.LSTM(embedding_dim1, return_sequences=True)),
    tf.keras.layers.Bidirectional(tf.keras.layers.LSTM(embedding_dim2)),
    tf.keras.layers.Dense(24, activation='relu'),
    tf.keras.layers.Dense(1, activation='sigmoid')])

model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])

model.summary()

Model: "sequential_10"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_10 (Embedding)     (None, 46, 32)            640000    
_________________________________________________________________
bidirectional_2 (Bidirection (None, 46, 128)           49664     
_________________________________________________________________
bidirectional_3 (Bidirection (None, 64)                41216     
_________________________________________________________________
dense_20 (Dense)             (None, 24)                1560      
_________________________________________________________________
dense_21 (Dense)             (None, 1)                 25        
Total params: 732,465
Trainable params: 732,465
Non-trainable params: 0
_________________________________________________________________


In [88]:
# Log every run to its own timestamped TensorBoard directory.
logdir = os.path.join("logs", datetime.datetime.now().strftime("%Y%m%d-%H%M%S"))
tensorboard_callback = tf.keras.callbacks.TensorBoard(logdir, histogram_freq=1)

# The callback has to be passed explicitly, otherwise nothing is written to logdir.
model.fit_generator(train_generator, epochs=5, verbose=2, steps_per_epoch=100,
                    callbacks=[tensorboard_callback])

Epoch 1/5
100/100 - 42s - loss: 0.5564 - accuracy: 0.7216
Epoch 2/5
100/100 - 38s - loss: 0.5623 - accuracy: 0.7222
Epoch 3/5
100/100 - 38s - loss: 0.5441 - accuracy: 0.7284
Epoch 4/5
100/100 - 38s - loss: 0.5419 - accuracy: 0.7397
Epoch 5/5
100/100 - 38s - loss: 0.5311 - accuracy: 0.7375


<tensorflow.python.keras.callbacks.History at 0x1918de71588>

# PyTorch

In [5]:
import torch
import torch.nn as nn
# The functional layer API (relu, softmax, ...) lives in torch.nn.functional;
# torch.functional is a different module that does not provide it.
import torch.nn.functional as F
import torch.optim as optim

In [11]:
# Dense baseline in PyTorch: embedding -> flatten -> MLP -> one sigmoid output.
embedding_dim = 64
vocab_size = 20000
max_seq_len = 46  # length of every padded sequence
batch_size = 32
n_epochs = 5

model = nn.Sequential(
    nn.Embedding(vocab_size, embedding_dim),
    nn.Flatten(),
    # Flatten yields max_seq_len * embedding_dim features per example (46 * 64 = 2944).
    nn.Linear(max_seq_len * embedding_dim, 64),
    nn.ReLU(),
    nn.Linear(64, 32),
    nn.ReLU(),
    nn.Linear(32, 1),
    nn.Sigmoid()
)

# The network ends in a sigmoid over a single unit, so the matching loss is
# binary cross-entropy; CrossEntropyLoss expects per-class scores instead.
criterion = nn.BCELoss()
optimizer = optim.Adam(model.parameters())

In [12]:
# Convert the padded sequences and the labels to 64-bit integer tensors.
X = torch.tensor(np.asarray(padded), dtype=torch.long)
y = torch.tensor(np.asarray(data.sentiments), dtype=torch.long)

In [44]:
# Cut the data into consecutive, non-overlapping mini-batches of batch_size rows.
X_batch = X[:batch_size, :]

# Slice [i * batch_size, (i + 1) * batch_size) for every full batch; slicing
# with [:batch_size * i] would instead produce ever-growing prefixes.
n_batches = X.shape[0] // batch_size  # a trailing partial batch is dropped
X_batches = [X[i * batch_size:(i + 1) * batch_size, :] for i in range(n_batches)]
y_batches = [y[i * batch_size:(i + 1) * batch_size] for i in range(n_batches)]

In [58]:
# Element-wise mask of the labels that are equal to 1.
torch.eq(y, 1)

tensor([False, False, False,  ...,  True,  True,  True])

In [28]:
# n_epochs is an int, so it has to be wrapped in range() to be iterated.
for epoch in range(n_epochs):
    for batch, labels in zip(X_batches, y_batches):

        optimizer.zero_grad()
        outputs = model(batch)
        # The model emits one probability per example with shape (batch, 1);
        # drop the trailing axis and cast the integer labels to float so that
        # predictions and targets line up for a binary cross-entropy loss.
        loss = criterion(outputs.squeeze(1), labels.float())
        loss.backward()
        optimizer.step()

12500

In [30]:
# model with CNN

embedding_dim = 64
max_seq_len = 46
vocab_size = 20000
n_filters = 128
kernel_size = 5
batch_size = 32


class _ChannelsFirst(nn.Module):
    """Swap the last two axes: (batch, seq_len, channels) -> (batch, channels, seq_len)."""

    def forward(self, x):
        return x.transpose(1, 2)


model = nn.Sequential(
    nn.Embedding(vocab_size, embedding_dim),
    # nn.Conv1d convolves over the last axis and treats axis 1 as channels, so
    # the embedding axis has to be moved in front of the sequence axis first.
    _ChannelsFirst(),
    nn.Conv1d(embedding_dim, n_filters, kernel_size),
    nn.ReLU(),
    # Global max over the sequence axis, independent of the sequence length.
    nn.AdaptiveMaxPool1d(1),
    nn.Flatten(),  # (batch, n_filters, 1) -> (batch, n_filters)
    nn.Linear(n_filters, 64),
    nn.ReLU(),
    nn.Linear(64, 32),
    nn.ReLU(),
    nn.Linear(32, 1),
    nn.Sigmoid()
)