# Preparation

In [253]:
import os
import datetime
import numpy as np
import pandas as pd

In [252]:
# Load the tweets CSV, keeping only column 0 (label) and column 5 (text).

data_path = './data/tweets.csv'
sample_seed = 42  # fixed seed so the 10% subsample is identical on every run

data = pd.read_csv(data_path, usecols=[0, 5], encoding='utf-8', names=['sentiments', 'tweets'])
data = data.sample(frac=.10, random_state=sample_seed)

# Binarise the labels: 0 stays 0, every other value becomes 1.
data.sentiments = [0 if x == 0 else 1 for x in data.sentiments]
data[:20]

Unnamed: 0,sentiments,tweets
327537,1,@sandieb321 Nice to have some time to yoursel...
9581,0,@The_Tyree I miss the homie Remy!
345433,1,@FirstGentleman Okay we can agree on that...to...
38552,0,Good morning tweeples... I hope it is as brigh...
219419,1,@MajorDodson awwww...how sweet
170732,0,@Freebies4Mom will they have your video availa...
84703,0,School Tmz About to watch Rove ;)
104248,0,lost have to go on toll road
109078,0,@TraceyDukes hate cats
337607,1,@loretto1 i know! were like crazy smart to be ...


In [122]:
# Shared settings for the cells below.

ngram_range = 1
max_features = 20000
maxlen = 42            # sequences are padded/truncated to this many tokens
batch_size = 32
embedding_dims = 50
epochs = 2

# The tokenizer and both dense models read `vocab_size` and `embedding_dim`.
# In the visible notebook these were only assigned in the final CNN cell, so a
# fresh top-to-bottom run raised NameError; define them up front instead.
vocab_size = max_features
embedding_dim = 32     # matches the recorded Keras summary (embedding output 42 x 32)

# TensorFlow 2.0 & Keras

In [244]:
import tensorflow as tf
from tensorflow.keras.preprocessing.text import Tokenizer
import tensorflow_datasets as tfds

In [245]:
# Instantiate the Keras tokenizer and fit it on the tweet texts.

tokenizer = Tokenizer(num_words=vocab_size, oov_token='<00v>')
tokenizer.fit_on_texts(data['tweets'])

# Convert the same texts into token-id sequences with the fitted tokenizer.
sequences = tokenizer.texts_to_sequences(data['tweets'])

In [246]:
# Pad the sequences to `maxlen` so that they all have a uniform dimension.

padded = tf.keras.preprocessing.sequence.pad_sequences(
    sequences,
    maxlen=maxlen,
)
padded.shape

(40000, 42)

In [247]:
# Labels as a NumPy array; features are the padded sequences as they are.
y_data = np.array(data['sentiments'])
X_data = padded

In [248]:
# Configure the Keras model: embedding -> flatten -> dense(24, relu) -> dense(1, sigmoid).

model = tf.keras.Sequential()
model.add(tf.keras.layers.Embedding(vocab_size, embedding_dim, input_length=maxlen))
model.add(tf.keras.layers.Flatten())
model.add(tf.keras.layers.Dense(24, activation='relu'))
model.add(tf.keras.layers.Dense(1, activation='sigmoid'))

model.compile(
    optimizer='adam',
    loss='binary_crossentropy',
    metrics=['accuracy'],
)

model.summary()

Model: "sequential_7"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_7 (Embedding)      (None, 42, 32)            640000    
_________________________________________________________________
flatten_7 (Flatten)          (None, 1344)              0         
_________________________________________________________________
dense_14 (Dense)             (None, 24)                32280     
_________________________________________________________________
dense_15 (Dense)             (None, 1)                 25        
Total params: 672,305
Trainable params: 672,305
Non-trainable params: 0
_________________________________________________________________


In [249]:
%%time

# fitting the model

model.fit(x=X_data, y=y_data, batch_size=batch_size, epochs=epochs, verbose=2, validation_split=0.2)

Train on 32000 samples, validate on 8000 samples
Epoch 1/2
32000/32000 - 13s - loss: 0.5436 - accuracy: 0.7160 - val_loss: 0.4717 - val_accuracy: 0.7754
Epoch 2/2
32000/32000 - 13s - loss: 0.3689 - accuracy: 0.8401 - val_loss: 0.4985 - val_accuracy: 0.7716
Wall time: 26.1 s


<tensorflow.python.keras.callbacks.History at 0x1a7c57c6408>

# PyTorch

In [236]:
import torch
import torch.nn as nn
import torch.functional as F
import torch.optim as optim

from torch.utils.data import Dataset, DataLoader

In [237]:
# Torch copies of the data: int64 token ids and float32 labels.
X_data = torch.tensor(padded, dtype=torch.long)
y_data = torch.tensor(np.array(data['sentiments']), dtype=torch.float)

In [238]:
class MyDataset(Dataset):
    """Dataset pairing each row of ``tweets`` with the matching ``sentiments`` entry."""

    def __init__(self, tweets, sentiments):
        """Keep references to the two parallel containers.

        ``tweets`` is indexed as ``tweets[i, :]`` and must expose ``.shape``;
        ``sentiments`` is indexed as ``sentiments[i]``.
        """
        self.tweets, self.sentiments = tweets, sentiments

    def __len__(self):
        """Number of samples, read from the first dimension of ``tweets``."""
        n_samples = self.tweets.shape[0]
        return n_samples

    def __getitem__(self, index):
        """Return sample ``index`` as a dict with keys "tweets" and "sentiments"."""
        return dict(
            tweets=self.tweets[index, :],
            sentiments=self.sentiments[index],
        )
    
# Wrap the tensors so they can be served sample by sample.
tweets_dataset = MyDataset(tweets=X_data, sentiments=y_data)

In [239]:
# Serve the dataset in shuffled mini-batches of `batch_size` samples.
dataloader = DataLoader(dataset=tweets_dataset, shuffle=True, batch_size=batch_size)

In [242]:
# configuring the model

# The embedding maps each token id to `embedding_dim` values, and Flatten then
# merges the `maxlen` token positions with those values. The first Linear layer
# therefore receives maxlen * embedding_dim features per sample. The previous
# `maxlen * batch_size` only matched while batch_size happened to equal
# embedding_dim.
model = nn.Sequential(
    nn.Embedding(vocab_size, embedding_dim),
    nn.Flatten(),
    nn.Linear(maxlen * embedding_dim, 128),
    nn.ReLU(),
    nn.Linear(128, 1),
    nn.Sigmoid()
)

# Binary cross-entropy on the sigmoid output, optimised with Adam defaults.
criterion = nn.BCELoss()
optimizer = optim.Adam(model.parameters())

In [243]:
%%time

train_loss = 0

for epoch in range(epochs):
    
    for i, sample in enumerate(dataloader):
        
        X = sample["tweets"]
        y = sample["sentiments"]
                
        optimizer.zero_grad()
        
        out = model(X)
        loss = criterion(out, y)
        
        loss.backward()
        optimizer.step()
        
        if i%200 == 0:
            print(loss.item())
        

0.6882041096687317
0.6311270594596863
0.6742776036262512
0.5982884764671326
0.6217235326766968
0.6711956858634949
0.6582696437835693
0.5244784355163574
0.5245709419250488
0.546600341796875
0.42615628242492676
0.5278566479682922
0.5820406079292297
0.6533368825912476
Wall time: 46.6 s


In [30]:
# model with CNN tbd

embedding_dim = 64
max_seq_len = 46   # nominal sequence length; the global pooling below works for any length >= kernel_size
vocab_size = 20000
n_filters = 128
kernel_size = 5
batch_size = 32


class ChannelsFirst(nn.Module):
    """Swap axes 1 and 2: (batch, seq, emb) -> (batch, emb, seq), the layout Conv1d expects."""

    def forward(self, x):
        return x.transpose(1, 2)


# The previous layer sizes could not run a forward pass: the sequence axis was
# fed to Conv1d as channels, MaxPool1d(46) was wider than the conv output, and
# Linear(46, 64) did not match the pooled features.
model = nn.Sequential(
    nn.Embedding(vocab_size, embedding_dim),            # (batch, seq) -> (batch, seq, emb)
    ChannelsFirst(),                                    # -> (batch, emb, seq)
    nn.Conv1d(embedding_dim, n_filters, kernel_size),   # -> (batch, n_filters, seq - kernel_size + 1)
    nn.AdaptiveMaxPool1d(1),                            # max over time -> (batch, n_filters, 1)
    nn.Flatten(),                                       # -> (batch, n_filters)
    nn.Linear(n_filters, 64),
    nn.ReLU(),
    nn.Linear(64, 32),
    nn.ReLU(),
    nn.Linear(32, 1),
    nn.Sigmoid()
)