##### Replace slang words with actual words

In [0]:
from google.colab import files
files.upload()

Saving Untitled.ipynb to Untitled.ipynb
Saving neg.txt to neg.txt
Saving pos.txt to pos.txt
Saving slang.txt to slang.txt


In [0]:
from nltk.tokenize import RegexpTokenizer
from nltk.stem import WordNetLemmatizer
from tqdm import tqdm
import random

In [0]:
random.seed(1000)

In [0]:
lemmatizer = WordNetLemmatizer()
tokenizer = RegexpTokenizer('[a-zA-Z0-9]\w+')

In [0]:
positive = []
negative = []

In [0]:
with open('pos.txt', 'r', buffering = 1000, encoding="ISO-8859–1") as p:
    positive = p.readlines()

In [0]:
with open('neg.txt', 'r', buffering = 1000, encoding="ISO-8859–1") as n:
    negative = n.readlines()

### As, generating labels for the whole 1600000 tweets is taking around 8hrs on Google Collab GPU, I had to reduce the dataset to a max of 400000 tweets.

In [0]:
positive = positive[:200000]
negative = negative[:200000]

In [0]:
print('Shuffling tweets to maintain randomness....')
unclean_tweets = list(positive) + list(negative)
random.shuffle(unclean_tweets)

Shuffling tweets to maintain randomness....


In [0]:
print('Generate labels...')
labels = []
with tqdm(total = len(unclean_tweets)) as pb:
    for tweet in unclean_tweets:
        if tweet in positive:
            labels.append(1)
        else:
            labels.append(0)
        pb.update(1)
del positive
del negative

  0%|          | 47/400000 [00:00<14:21, 463.99it/s]

Generate labels...


100%|██████████| 400000/400000 [13:41<00:00, 486.73it/s]


In [0]:
unclean_tweets[:5]

["I am falling in love with the Backstreet boys all over again!! I finally see what everyone loves about Nick  Damn he's hot\n",
 'who here wants to see a seventh hockey team in Canada? \n',
 "Finished registering for sw. today was fun in design. lots'olaughs w/Katrina, Jackie, Sandra&amp;Angelo &quot;HOT DAMN!&quot; Finishing drawing project \n",
 '@PeachPosh dont do it! \n',
 'neil Young is helpless  ? http://blip.fm/~6svev\n']

In [0]:
!pip install tweet-preprocessor

Collecting tweet-preprocessor
  Downloading https://files.pythonhosted.org/packages/2a/f8/810ec35c31cca89bc4f1a02c14b042b9ec6c19dd21f7ef1876874ef069a6/tweet-preprocessor-0.5.0.tar.gz
Building wheels for collected packages: tweet-preprocessor
  Running setup.py bdist_wheel for tweet-preprocessor ... [?25l- done
[?25h  Stored in directory: /root/.cache/pip/wheels/1b/27/cc/49938e98a2470802ebdefae9d2b3f524768e970c1ebbe2dc4a
Successfully built tweet-preprocessor
Installing collected packages: tweet-preprocessor
Successfully installed tweet-preprocessor-0.5.0


In [0]:
# tweet preprocessor to eliminate emoji, url and mentions
import preprocessor as p
p.set_options(p.OPT.URL, p.OPT.EMOJI, p.OPT.MENTION)

## Cleaning tweets
### 1. Remove emoji
### 2. Remove URLs
### 3. Remove mentions

In [0]:
with tqdm(total = len(unclean_tweets)) as pb:
    for i in range(len(unclean_tweets)):
      tweet_unclean = unclean_tweets[i]
      unclean_tweets[i] = p.clean(tweet_unclean)
      pb.update(1)

100%|██████████| 400000/400000 [00:19<00:00, 20421.56it/s]


In [0]:
unclean_tweets[:5]

["I am falling in love with the Backstreet boys all over again!! I finally see what everyone loves about Nick Damn he's hot",
 'who here wants to see a seventh hockey team in Canada?',
 "Finished registering for sw. today was fun in design. lots'olaughs w/Katrina, Jackie, Sandra&amp;Angelo &quot;HOT DAMN!&quot; Finishing drawing project",
 'dont do it!',
 'neil Young is helpless ?']

### We see that there are certain tweets where the HTML encoding has not been converted into text. Eg: &amp, &quot
### So, here I will use BeautifulSoup

In [0]:
from bs4 import BeautifulSoup
sample = BeautifulSoup("lots'olaughs w/Katrina, Jackie, Sandra&amp;Angelo &quot;HOT DAMN!&quot;")
sample.get_text()

'lots\'olaughs w/Katrina, Jackie, Sandra&Angelo "HOT DAMN!"'

In [0]:
with tqdm(total = len(unclean_tweets)) as pb:
    for i in range(len(unclean_tweets)):
      tweet_unclean = unclean_tweets[i]
      unclean_tweets[i] = BeautifulSoup(tweet_unclean, 'lxml').get_text()
      pb.update(1)

  ' Beautiful Soup.' % markup)
100%|██████████| 400000/400000 [00:40<00:00, 9863.08it/s]


In [0]:
unclean_tweets[:5]

["I am falling in love with the Backstreet boys all over again!! I finally see what everyone loves about Nick Damn he's hot",
 'who here wants to see a seventh hockey team in Canada?',
 'Finished registering for sw. today was fun in design. lots\'olaughs w/Katrina, Jackie, Sandra&Angelo "HOT DAMN!" Finishing drawing project',
 'dont do it!',
 'neil Young is helpless ?']

### Tokenizing tweets

In [0]:
print('Tokenizing ..')
tweets = [tokenizer.tokenize(tweet.lower()) for tweet in unclean_tweets]

Tokenizing ..


In [0]:
tweets[:2]

[['am',
  'falling',
  'in',
  'love',
  'with',
  'the',
  'backstreet',
  'boys',
  'all',
  'over',
  'again',
  'finally',
  'see',
  'what',
  'everyone',
  'loves',
  'about',
  'nick',
  'damn',
  'he',
  'hot'],
 ['who',
  'here',
  'wants',
  'to',
  'see',
  'seventh',
  'hockey',
  'team',
  'in',
  'canada']]

### Lemmatizing

In [0]:
import nltk
nltk.download('wordnet')

[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Unzipping corpora/wordnet.zip.


True

In [0]:
tweets = []
with tqdm(total=len(unclean_tweets)) as pb:
    for tweet in unclean_tweets:
        lemmatized = [lemmatizer.lemmatize(word) for word in tweet]
        tweets.append(lemmatized)
        pb.update(1)


100%|██████████| 400000/400000 [02:10<00:00, 3057.61it/s]


## Word2Vec algorithm for word embeddings

In [0]:
vector_size = 256
window = 5

In [0]:
!pip install gensim

Collecting gensim
[?25l  Downloading https://files.pythonhosted.org/packages/27/a4/d10c0acc8528d838cda5eede0ee9c784caa598dbf40bd0911ff8d067a7eb/gensim-3.6.0-cp36-cp36m-manylinux1_x86_64.whl (23.6MB)
[K    100% |████████████████████████████████| 23.6MB 1.5MB/s 
Collecting smart-open>=1.2.1 (from gensim)
  Downloading https://files.pythonhosted.org/packages/4b/1f/6f27e3682124de63ac97a0a5876da6186de6c19410feab66c1543afab055/smart_open-1.7.1.tar.gz
Collecting boto>=2.32 (from smart-open>=1.2.1->gensim)
[?25l  Downloading https://files.pythonhosted.org/packages/23/10/c0b78c27298029e4454a472a1919bde20cb182dab1662cec7f2ca1dcc523/boto-2.49.0-py2.py3-none-any.whl (1.4MB)
[K    100% |████████████████████████████████| 1.4MB 15.5MB/s 
[?25hCollecting bz2file (from smart-open>=1.2.1->gensim)
  Downloading https://files.pythonhosted.org/packages/61/39/122222b5e85cd41c391b68a99ee296584b2a2d1d233e7ee32b4532384f2d/bz2file-0.98.tar.gz
Collecting boto3 (from smart-open>=1.2.1->gensim)
[?25l  Downlo

In [0]:
from gensim.test.utils import common_texts, get_tmpfile
from gensim.models import Word2Vec

import time

word2vec_model = 'word2vec.model'

print('Generating Word2Vec Vectors ..')

start = time.time()

model = Word2Vec(sentences=tweets, size=vector_size, window=window, negative=20, iter=50, workers=4)

print('Word2Vec Created in {} seconds.'.format(time.time() - start))

model.save(word2vec_model)
print('Word2Vec Model saved at {}'.format(word2vec_model))

#Clear the memory!
del model

Generating Word2Vec Vectors ..
Word2Vec Created in 1022.3702681064606 seconds.
Word2Vec Model saved at word2vec.model


In [0]:
# load the saved model to store the vectors
model = Word2Vec.load('word2vec.model')
x_vectors = model.wv
del model

### Train and test split

In [0]:
!pip install keras



In [0]:
import numpy as np
import keras.backend as kb

train_size = int(0.9*(len(tweets)))
test_size = int(0.1*(len(tweets)))

max_no_tokens = 15

indexes = set(np.random.choice(len(tweets), train_size + test_size, replace=False))

x_train = np.zeros((train_size, max_no_tokens, vector_size), dtype=kb.floatx())
y_train = np.zeros((train_size, 2), dtype=np.int32)

x_test = np.zeros((test_size, max_no_tokens, vector_size), dtype=kb.floatx())
y_test = np.zeros((test_size, 2), dtype=np.int32)

Using TensorFlow backend.


In [0]:
for i, index in enumerate(indexes):
    for t, token in enumerate(tweets[index]):
        if t >= max_no_tokens:
            break
      
        if token not in x_vectors:
            continue
    
        if i < train_size:
            x_train[i, t, :] = x_vectors[token]
        else:
            x_test[i - train_size, t, :] = x_vectors[token]

  
    if i < train_size:
        y_train[i, :] = [1.0, 0.0] if labels[index] == 0 else [0.0, 1.0]
    else:
        y_test[i - train_size, :] = [1.0, 0.0] if labels[index] == 0 else [0.0, 1.0]
    
del tweets
del labels

In [0]:
x_train.shape,y_train.shape, x_test.shape, y_test.shape

((360000, 15, 256), (360000, 2), (40000, 15, 256), (40000, 2))

In [0]:
batch_size = 500
no_epochs = 100

In [0]:
from keras.models import Sequential
from keras.layers import Conv1D, Dropout, Dense, Flatten, LSTM, MaxPooling1D, Bidirectional
from keras.optimizers import Adam
from keras.callbacks import EarlyStopping, TensorBoard


model = Sequential()

model.add(Conv1D(32, kernel_size=3, activation='elu', padding='same',
                 input_shape=(max_no_tokens, vector_size)))
model.add(Conv1D(32, kernel_size=3, activation='elu', padding='same'))
model.add(Conv1D(32, kernel_size=3, activation='relu', padding='same'))
model.add(MaxPooling1D(pool_size=3))

model.add(Bidirectional(LSTM(512, dropout=0.2, recurrent_dropout=0.3)))

model.add(Dense(512, activation='sigmoid'))
model.add(Dropout(0.2))
model.add(Dense(512, activation='sigmoid'))
model.add(Dropout(0.25))
model.add(Dense(512, activation='sigmoid'))
model.add(Dropout(0.25))

model.add(Dense(2, activation='softmax'))

model.compile(loss='categorical_crossentropy', optimizer=Adam(lr=0.0001, decay=1e-6), metrics=['accuracy'])

tensorboard = TensorBoard(log_dir='logs/', histogram_freq=0, write_graph=True, write_images=True)

model.summary()

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
conv1d_1 (Conv1D)            (None, 15, 32)            24608     
_________________________________________________________________
conv1d_2 (Conv1D)            (None, 15, 32)            3104      
_________________________________________________________________
conv1d_3 (Conv1D)            (None, 15, 32)            3104      
_________________________________________________________________
max_pooling1d_1 (MaxPooling1 (None, 5, 32)             0         
_________________________________________________________________
bidirectional_1 (Bidirection (None, 1024)              2232320   
_________________________________________________________________
dense_1 (Dense)              (None, 512)               524800    
_________________________________________________________________
dropout_1 (Dropout)          (None, 512)               0         
__________

In [0]:
model.fit(x_train, y_train, batch_size=batch_size, shuffle=True, epochs=no_epochs,
         validation_data=(x_test, y_test), callbacks=[tensorboard, EarlyStopping(min_delta=0.0001, patience=3)])

Train on 360000 samples, validate on 40000 samples
Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100


<keras.callbacks.History at 0x7fe42630e278>

## Model evaluation

In [0]:
model.metrics_names

['loss', 'acc']

In [0]:
model.evaluate(x=x_test, y=y_test, batch_size=32, verbose=1)



[0.5958784397602082, 0.673075]

In [0]:
model.save('word2vec-twitter-sentiment.model')

In [0]:
files.download('word2vec-twitter-sentiment.model')

----------------------------------------
Exception happened during processing of request from ('::ffff:127.0.0.1', 45998, 0, 0)
Traceback (most recent call last):
  File "/usr/lib/python3.6/socketserver.py", line 317, in _handle_request_noblock
    self.process_request(request, client_address)
  File "/usr/lib/python3.6/socketserver.py", line 348, in process_request
    self.finish_request(request, client_address)
  File "/usr/lib/python3.6/socketserver.py", line 361, in finish_request
    self.RequestHandlerClass(request, client_address, self)
  File "/usr/lib/python3.6/socketserver.py", line 721, in __init__
    self.handle()
  File "/usr/lib/python3.6/http/server.py", line 418, in handle
    self.handle_one_request()
  File "/usr/lib/python3.6/http/server.py", line 406, in handle_one_request
    method()
  File "/usr/lib/python3.6/http/server.py", line 639, in do_GET
    self.copyfile(f, self.wfile)
  File "/usr/lib/python3.6/http/server.py", line 800, in copyfile
    shutil.copyfil

MessageError: ignored

# New Section

In [0]:
files.download('word2vec.model')