In [0]:
from nltk.tokenize import RegexpTokenizer
from nltk.stem import WordNetLemmatizer
from tqdm import tqdm
import random

In [0]:
random.seed(1000)

In [0]:
lemmatizer = WordNetLemmatizer()
tokenizer = RegexpTokenizer('[a-zA-Z0-9]\w+')

In [0]:
positive = []
negative = []

In [0]:
with open('pos.txt', 'r', buffering = 1000, encoding="ISO-8859–1") as p:
    positive = p.readlines()

In [0]:
with open('neg.txt', 'r', buffering = 1000, encoding="ISO-8859–1") as n:
    negative = n.readlines()

### As, generating labels for the whole 1600000 tweets is taking around 8hrs on Google Collab GPU, I had to reduce the dataset to a max of 400000 tweets.

In [0]:
positive = positive[:200000]
negative = negative[:200000]

In [19]:
print('Shuffling tweets to maintain randomness....')
unclean_tweets = list(positive) + list(negative)
random.shuffle(unclean_tweets)

Shuffling tweets to maintain randomness....


In [20]:
len(unclean_tweets)

400000

In [21]:
print('Generate labels...')
labels = []
with tqdm(total = len(unclean_tweets)) as pb:
    for tweet in unclean_tweets:
        if tweet in positive:
            labels.append(1)
        else:
            labels.append(0)
        pb.update(1)
del positive
del negative

  0%|          | 24/400000 [00:00<28:18, 235.46it/s]

Generate labels...


100%|██████████| 400000/400000 [23:05<00:00, 288.69it/s]


In [36]:
unclean_tweets[:5]

["There're some great Twitter apps for Mac. How much I miss my hackintosh installation.",
 '<< Right there Lyrics for Chocolate n Cream',
 'Does not want to go to school tomorrow',
 "Didn't get not one of your bbm's My phone sucks!",
 'yeah i do.']

In [37]:
!pip install tweet-preprocessor



In [0]:
# tweet preprocessor to eliminate emoji, url and mentions
import preprocessor as p
p.set_options(p.OPT.URL, p.OPT.EMOJI, p.OPT.MENTION)

###Cleaning tweets
1. Remove emoji
2. Remove URLs
3. Remove mentions

In [39]:
with tqdm(total = len(unclean_tweets)) as pb:
    for i in range(len(unclean_tweets)):
      tweet_unclean = unclean_tweets[i]
      unclean_tweets[i] = p.clean(tweet_unclean)
      pb.update(1)

100%|██████████| 400000/400000 [00:19<00:00, 20998.01it/s]


In [40]:
unclean_tweets[:5]

["There're some great Twitter apps for Mac. How much I miss my hackintosh installation.",
 '<< Right there Lyrics for Chocolate n Cream',
 'Does not want to go to school tomorrow',
 "Didn't get not one of your bbm's My phone sucks!",
 'yeah i do.']

### We see that there are certain tweets where the HTML encoding has not been converted into text. Eg: &amp, &quot
### So, here I will use BeautifulSoup

In [41]:
from bs4 import BeautifulSoup
sample = BeautifulSoup("lots'olaughs w/Katrina, Jackie, Sandra&amp;Angelo &quot;HOT DAMN!&quot;")
sample.get_text()

'lots\'olaughs w/Katrina, Jackie, Sandra&Angelo "HOT DAMN!"'

In [42]:
with tqdm(total = len(unclean_tweets)) as pb:
    for i in range(len(unclean_tweets)):
      tweet_unclean = unclean_tweets[i]
      unclean_tweets[i] = BeautifulSoup(tweet_unclean).get_text()
      pb.update(1)

  ' Beautiful Soup.' % markup)
100%|██████████| 400000/400000 [00:31<00:00, 12655.81it/s]


In [43]:
unclean_tweets[:5]

["There're some great Twitter apps for Mac. How much I miss my hackintosh installation.",
 '<< Right there Lyrics for Chocolate n Cream',
 'Does not want to go to school tomorrow',
 "Didn't get not one of your bbm's My phone sucks!",
 'yeah i do.']

### Tokenizing tweets

In [44]:
print('Tokenizing ..')
tweets = [tokenizer.tokenize(tweet.lower()) for tweet in unclean_tweets]

Tokenizing ..


In [45]:
tweets[:2]

[['there',
  're',
  'some',
  'great',
  'twitter',
  'apps',
  'for',
  'mac',
  'how',
  'much',
  'miss',
  'my',
  'hackintosh',
  'installation'],
 ['right', 'there', 'lyrics', 'for', 'chocolate', 'cream']]

### Lemmatizing

In [46]:
import nltk
nltk.download('wordnet')

[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [47]:
tweets = []
with tqdm(total=len(unclean_tweets)) as pb:
    for tweet in unclean_tweets:
        lemmatized = [lemmatizer.lemmatize(word) for word in tweet]
        tweets.append(lemmatized)
        pb.update(1)

100%|██████████| 400000/400000 [02:12<00:00, 3011.05it/s]


In [48]:
!pip install tensorflow-hub tensorflow numpy pickle tqdm keras

Collecting pickle
[31m  Could not find a version that satisfies the requirement pickle (from versions: )[0m
[31mNo matching distribution found for pickle[0m


### Universal Sentence Encoder for word embeddings

In [0]:
import tensorflow as tf
import tensorflow_hub as hub
embed = hub.Module("https://tfhub.dev/google/universal-sentence-encoder/1")

tf.logging.set_verbosity(tf.logging.ERROR)

with tf.Session() as sess:
  sess.run([tf.global_variables_initializer(), tf.tables_initializer()])
  tweet_embeddings = sess.run(embed(tweets))

In [0]:
import numpy as np

In [54]:
tweet_embeddings = np.array(tweet_embeddings)
tweet_embeddings.shape

NameError: ignored

In [0]:
tweet_embeddings = np.array([np.reshape(embed, (len(embed), 1)) for embed in tweet_embeddings])

In [0]:
tweet_embeddings.shape

### One-hot labels

In [0]:
from tqdm import tqdm

labels_one_hot = []

with tqdm(total=len(labels)) as pbar:
  for label in labels:
    if label == 0:
      labels_one_hot.append([1., 0.])
    else:
      labels_one_hot.append([0., 1.])
      
    pbar.update(1)

In [0]:
labels_one_hot = np.array(labels_one_hot)

### Pickling all data

In [0]:
import pickle

embeddings_file = "embeddings-{}.pickle".format(len(tweet_embeddings))
labels_file = "labels-{}.pickle".format(len(labels))

pickle.dump(tweet_embeddings, open(embeddings_file, 'wb'))
pickle.dump(labels_one_hot, open(labels_file, 'wb'))

In [0]:
labels_one_hot.shape

### Loading data

In [0]:
import pickle

tweet_embeddings = pickle.load(open('tweets_embeddings.pickle', 'rb'))
labels = pickle.load(open('labels-one_hot.pickle', 'rb'))

### Dataset partition

In [0]:
import numpy as np

In [0]:
from sklearn.model_selection import train_test_split
x_train, x_test, y_train, y_test = train_test_split(tweet_embeddings, labels, test_size=.1)

In [0]:
x_train = np.array(x_train)
y_train = np.array(y_train)
x_test = np.array(x_test)
y_test = np.array(y_test)

x_train.shape, y_train.shape, x_test.shape, y_test.shape

In [0]:
vector_size = 512
batch_size = 500
no_epochs = 10

### Build the model

In [0]:
model.fit(x_train, y_train, batch_size=batch_size, shuffle=True, epochs=no_epochs,
         validation_data=(x_test, y_test), callbacks=[tensorboard, EarlyStopping(min_delta=0.0001, patience=3)])

In [0]:
from keras.models import Sequential
from keras.layers import Conv1D, Dropout, Dense, Flatten, LSTM, MaxPooling1D, Bidirectional
from keras.optimizers import Adam
from keras.callbacks import EarlyStopping, TensorBoard

model = Sequential()

model.add(Conv1D(32, kernel_size=3, activation='elu', padding='same',
                 input_shape=(vector_size, 1)))
model.add(Conv1D(32, kernel_size=3, activation='elu', padding='same'))
model.add(Conv1D(32, kernel_size=3, activation='relu', padding='same'))
model.add(MaxPooling1D(pool_size=3))

model.add(Bidirectional(LSTM(512, dropout=0.2, recurrent_dropout=0.3)))

model.add(Dense(512, activation='sigmoid'))
model.add(Dropout(0.2))
model.add(Dense(512, activation='sigmoid'))
model.add(Dropout(0.25))
model.add(Dense(512, activation='sigmoid'))
model.add(Dropout(0.25))

model.add(Dense(2, activation='softmax'))

model.compile(loss='categorical_crossentropy', optimizer=Adam(lr=0.0001, decay=1e-6), metrics=['accuracy'])

tensorboard = TensorBoard(log_dir='logs/', histogram_freq=0, write_graph=True, write_images=True)

model.summary()

In [0]:
model.fit(np.array(x_train), np.array(y_train), batch_size=batch_size, epochs=no_epochs,
         validation_data=(np.array(x_test), np.array(y_test)), callbacks=[tensorboard, EarlyStopping(min_delta=0.0001, patience=3)])

### Model evaluation

In [0]:
model.metrics_names

In [0]:
model.evaluate(x=x_test, y=y_test, batch_size=500, verbose=1)

### Save the model

In [0]:
model.save('universal-sentence-encoder-400k.model')