In [14]:
from google.colab import files
files.upload()

Saving pos.txt to pos.txt


In [0]:
from nltk.tokenize import RegexpTokenizer
from nltk.stem import WordNetLemmatizer
from tqdm import tqdm
import random

In [0]:
random.seed(1000)

In [0]:
lemmatizer = WordNetLemmatizer()
tokenizer = RegexpTokenizer('[a-zA-Z0-9]\w+')

In [0]:
positive = []
negative = []

In [0]:
with open('pos.txt', 'r', buffering = 1000, encoding="ISO-8859–1") as p:
    positive = p.readlines()

In [0]:
with open('neg.txt', 'r', buffering = 1000, encoding="ISO-8859–1") as n:
    negative = n.readlines()

### As, generating labels for the whole 1600000 tweets is taking around 8hrs on Google Collab GPU, I had to reduce the dataset to a max of 400000 tweets.

In [0]:
positive = positive[:200000]
negative = negative[:200000]

In [14]:
print('Shuffling tweets to maintain randomness....')
unclean_tweets = list(positive) + list(negative)
random.shuffle(unclean_tweets)

Shuffling tweets to maintain randomness....


In [62]:
len(unclean_tweets)

400000

In [15]:
print('Generate labels...')
labels = []
with tqdm(total = len(unclean_tweets)) as pb:
    for tweet in unclean_tweets:
        if tweet in positive:
            labels.append(1)
        else:
            labels.append(0)
        pb.update(1)
del positive
del negative

  0%|          | 56/400000 [00:00<12:03, 552.90it/s]

Generate labels...


100%|██████████| 400000/400000 [14:24<00:00, 462.83it/s]


In [16]:
unclean_tweets[:5]

['@GoldyMom oh we are near the sawgrass mills mall area \n',
 "I don't want to apply for jobs!!!  \n",
 "@butterflykate Who've u noticed now Kate ? \n",
 'The furry ones and I were gonna go back 2 the bark park today. But its ElCrapo outside \n',
 "@leenkwan haha edz is too lame d i dunno wat to say. im so tempted to get another bunny! they're too cute! go check ur email babe \n"]

In [17]:
!pip install tweet-preprocessor



In [0]:
# tweet preprocessor to eliminate emoji, url and mentions
import preprocessor as p
p.set_options(p.OPT.URL, p.OPT.EMOJI, p.OPT.MENTION)

###Cleaning tweets
1. Remove emoji
2. Remove URLs
3. Remove mentions

In [19]:
with tqdm(total = len(unclean_tweets)) as pb:
    for i in range(len(unclean_tweets)):
      tweet_unclean = unclean_tweets[i]
      unclean_tweets[i] = p.clean(tweet_unclean)
      pb.update(1)

100%|██████████| 400000/400000 [00:20<00:00, 19766.90it/s]


In [20]:
unclean_tweets[:5]

['oh we are near the sawgrass mills mall area',
 "I don't want to apply for jobs!!!",
 "Who've u noticed now Kate ?",
 'The furry ones and I were gonna go back 2 the bark park today. But its ElCrapo outside',
 "haha edz is too lame d i dunno wat to say. im so tempted to get another bunny! they're too cute! go check ur email babe"]

### We see that there are certain tweets where the HTML encoding has not been converted into text. Eg: &amp, &quot
### So, here I will use BeautifulSoup

In [21]:
from bs4 import BeautifulSoup
sample = BeautifulSoup("lots'olaughs w/Katrina, Jackie, Sandra&amp;Angelo &quot;HOT DAMN!&quot;")
sample.get_text()

'lots\'olaughs w/Katrina, Jackie, Sandra&Angelo "HOT DAMN!"'

In [22]:
with tqdm(total = len(unclean_tweets)) as pb:
    for i in range(len(unclean_tweets)):
      tweet_unclean = unclean_tweets[i]
      unclean_tweets[i] = BeautifulSoup(tweet_unclean).get_text()
      pb.update(1)

  ' Beautiful Soup.' % markup)
100%|██████████| 400000/400000 [00:41<00:00, 9593.77it/s]


In [23]:
unclean_tweets[:5]

['oh we are near the sawgrass mills mall area',
 "I don't want to apply for jobs!!!",
 "Who've u noticed now Kate ?",
 'The furry ones and I were gonna go back 2 the bark park today. But its ElCrapo outside',
 "haha edz is too lame d i dunno wat to say. im so tempted to get another bunny! they're too cute! go check ur email babe"]

### Tokenizing tweets

In [26]:
print('Tokenizing ..')
tweets = [tokenizer.tokenize(tweet.lower()) for tweet in unclean_tweets]

Tokenizing ..


In [27]:
tweets[:2]

[['oh', 'we', 'are', 'near', 'the', 'sawgrass', 'mills', 'mall', 'area'],
 ['don', 'want', 'to', 'apply', 'for', 'jobs']]

### Lemmatizing

In [28]:
import nltk
nltk.download('wordnet')

[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [29]:
tweets = []
with tqdm(total=len(unclean_tweets)) as pb:
    for tweet in unclean_tweets:
        lemmatized = [lemmatizer.lemmatize(word) for word in tweet]
        tweets.append(lemmatized)
        pb.update(1)

100%|██████████| 400000/400000 [02:11<00:00, 3043.21it/s]


### FastText (Gensim) for word embeddings

In [0]:
vector_size = 256
window = 5

In [31]:
!pip install gensim



In [32]:
from gensim.models import FastText

import time

fasttext_model = 'fasttext.model'

print('Generating FastText Vectors ..')

start = time.time()

model = FastText(size=vector_size)
model.build_vocab(tweets)
model.train(tweets, window=window, min_count=1, workers=4, total_examples=model.corpus_count,
           epochs=model.epochs)

print('FastText Created in {} seconds.'.format(time.time() - start))

model.save(fasttext_model)
print('FastText Model saved at {}'.format(fasttext_model))

del model

Generating FastText Vectors ..
FastText Created in 75.7538697719574 seconds.
FastText Model saved at fasttext.model


In [0]:
model = FastText.load(fasttext_model)

In [0]:
x_vectors = model.wv
del model

### Test and train spilt

In [35]:
len(tweets)

400000

In [36]:
import numpy as np
import keras.backend as K

train_size = int(0.9*(len(tweets)))
test_size = int(0.1*(len(tweets)))

max_no_tokens = 15

indexes = set(np.random.choice(len(tweets), train_size + test_size, replace=False))

x_train = np.zeros((train_size, max_no_tokens, vector_size), dtype=K.floatx())
y_train = np.zeros((train_size, 2), dtype=np.int32)

x_test = np.zeros((test_size, max_no_tokens, vector_size), dtype=K.floatx())
y_test = np.zeros((test_size, 2), dtype=np.int32)

Using TensorFlow backend.


In [0]:
for i, index in enumerate(indexes):
    for t, token in enumerate(tweets[index]):
        if t >= max_no_tokens:
            break
      
        if token not in x_vectors:
            continue
    
        if i < train_size:
            x_train[i, t, :] = x_vectors[token]
        else:
            x_test[i - train_size, t, :] = x_vectors[token]

  
    if i < train_size:
        y_train[i, :] = [1.0, 0.0] if labels[index] == 0 else [0.0, 1.0]
    else:
        y_test[i - train_size, :] = [1.0, 0.0] if labels[index] == 0 else [0.0, 1.0]
    
del tweets
del labels

In [38]:
x_train.shape, y_test.shape

((360000, 15, 256), (40000, 2))

### Training the model

In [0]:
batch_size = 500
no_epochs = 100

In [40]:
from keras.models import Sequential
from keras.layers import Conv1D, Dropout, Dense, Flatten, LSTM, MaxPooling1D, Bidirectional
from keras.optimizers import Adam
from keras.callbacks import EarlyStopping, TensorBoard


model = Sequential()

model.add(Conv1D(32, kernel_size=3, activation='elu', padding='same',
                 input_shape=(max_no_tokens, vector_size)))
model.add(Conv1D(32, kernel_size=3, activation='elu', padding='same'))
model.add(Conv1D(32, kernel_size=3, activation='relu', padding='same'))
model.add(MaxPooling1D(pool_size=3))

model.add(Bidirectional(LSTM(512, dropout=0.2, recurrent_dropout=0.3)))

model.add(Dense(512, activation='sigmoid'))
model.add(Dropout(0.2))
model.add(Dense(512, activation='sigmoid'))
model.add(Dropout(0.25))
model.add(Dense(512, activation='sigmoid'))
model.add(Dropout(0.25))

model.add(Dense(2, activation='softmax'))

model.compile(loss='categorical_crossentropy', optimizer=Adam(lr=0.0001, decay=1e-6), metrics=['accuracy'])

tensorboard = TensorBoard(log_dir='logs/', histogram_freq=0, write_graph=True, write_images=True)

model.summary()

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
conv1d_1 (Conv1D)            (None, 15, 32)            24608     
_________________________________________________________________
conv1d_2 (Conv1D)            (None, 15, 32)            3104      
_________________________________________________________________
conv1d_3 (Conv1D)            (None, 15, 32)            3104      
_________________________________________________________________
max_pooling1d_1 (MaxPooling1 (None, 5, 32)             0         
_________________________________________________________________
bidirectional_1 (Bidirection (None, 1024)              2232320   
_________________________________________________________________
dense_1 (Dense)              (None, 512)               524800    
_________________________________________________________________
dropout_1 (Dropout)          (None, 512)               0         
__________

In [41]:
model.fit(x_train, y_train, batch_size=batch_size, shuffle=True, epochs=no_epochs,
         validation_data=(x_test, y_test), callbacks=[tensorboard, EarlyStopping(min_delta=0.0001, patience=3)])

Train on 360000 samples, validate on 40000 samples
Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100
Epoch 13/100
Epoch 14/100
Epoch 15/100
Epoch 16/100
Epoch 17/100
Epoch 18/100
Epoch 19/100
Epoch 20/100
Epoch 21/100
Epoch 22/100
Epoch 23/100
Epoch 24/100
Epoch 25/100
Epoch 26/100
Epoch 27/100
Epoch 28/100
Epoch 29/100
Epoch 30/100
Epoch 31/100
Epoch 32/100
Epoch 33/100
Epoch 34/100
Epoch 35/100
Epoch 36/100
Epoch 37/100
Epoch 38/100
Epoch 39/100
Epoch 40/100
Epoch 41/100
Epoch 42/100


<keras.callbacks.History at 0x7f8500f1be48>

### Model evaluation

In [42]:
model.metrics_names

['loss', 'acc']

In [43]:
model.evaluate(x=x_test, y=y_test, batch_size=32, verbose=1)



[0.5978790123224258, 0.671175]

### Save the model

In [0]:
model.save('twitter-sentiment-fasttext.model')