### Scraping tweets and saving them as txt files

In [0]:
pip install getoldtweets3



In [0]:
#https://towardsdatascience.com/how-to-scrape-tweets-from-twitter-59287e20f0f1
import GetOldTweets3 as got
def scrape_tweets(username):
  username = username
  count = 2000
  # Creation of query object
  tweetCriteria = got.manager.TweetCriteria().setUsername(username)\
                                          .setMaxTweets(count)
  # Creation of list that contains all tweets
  tweets = got.manager.TweetManager.getTweets(tweetCriteria)
  # Creating list of chosen tweet data
  user_tweets = [[tweet.text] for tweet in tweets]
  return user_tweets

In [0]:
# Use above function to return tweets of Trump and Obama.
# If there is certificate error, try again at another time, maybe several minutes later.
# Your code here:
obama_tweets=scrape_tweets('BarackObama')
trump_tweets=scrape_tweets('realDonaldTrump')

In [0]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [0]:
# Save all tweets to two folders
# Update the code with your own variables and paths.
# The resulted files will be in two folders just like IMDB data.
for i in range(0,len(trump_tweets)):
  path='/content/drive/My Drive/Obama vs Trump Tweets classification/Trump/trump'+str(i)+'.txt'
  with open(path, "w") as output:
    output.write(str(trump_tweets[i]))
for i in range(0,len(obama_tweets)):
  path='/content/drive/My Drive/Obama vs Trump Tweets classification/Obama/obama'+str(i)+'.txt'
  with open(path, "w") as output:
    output.write(str(obama_tweets[i]))

# Loading data and prepare data for modeling

In [0]:
#Load the data you just saved from your drive
import os

as3_dir = '/content/drive/My Drive/Obama vs Trump Tweets classification'

labels = []
texts = []
count = 0
for label_type in ['Obama/', 'Trump/']:
    dir_name = os.path.join(as3_dir, label_type)
    for fname in os.listdir(dir_name):
        if fname[-4:] == '.txt':
            count = count + 1
            print(count)
            f = open(os.path.join(dir_name, fname))
            texts.append(f.read())
            f.close()
            if label_type == 'Obama/':
                labels.append(0)
            else:
                labels.append(1)

In [0]:
# Tokenize, pad and prepare training and validation data
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
import numpy as np

maxlen = 140  # cutting tweets after 140 words
training_samples = 2000  # Training using 2000 samples
validation_samples = 2000  # Validating on 2000 samples
max_words = 5000  # Considering the top 10,000 words in the dataset

# This class allows to vectorize a text corpus, by turning each text into either a sequence of integers
# omits common characters 
tokenizer = Tokenizer(num_words=max_words)

# fit_on_texts means it learns the indices of the words
tokenizer.fit_on_texts(texts)

# the sequences are comprised of those indices
sequences = tokenizer.texts_to_sequences(texts)

word_index = tokenizer.word_index
print('Found %s unique tokens.' % len(word_index))

# in case of low word counts, we need to pad sequences
# so that they are uniform length
data = pad_sequences(sequences, maxlen=maxlen)

# our labels, which were previously stored as a list [],
# are now converted to a numpy array for modeling
labels = np.asarray(labels)
print('Shape of data tensor:', data.shape)
print('Shape of label tensor:', labels.shape)
# Split the data into a training set and a validation set
# But first, shuffle the data, since we started from data
# where sample are ordered (all negative first, then all positive).
indices = np.arange(data.shape[0])
np.random.shuffle(indices)
data = data[indices]
labels = labels[indices]

x_train = data[:training_samples] # from 0 to (2000) training samples
y_train = labels[:training_samples]
x_val = data[training_samples: training_samples + validation_samples]
y_val = labels[training_samples: training_samples + validation_samples]

Using TensorFlow backend.


Found 10172 unique tokens.
Shape of data tensor: (4000, 140)
Shape of label tensor: (4000,)


In [0]:
#Downloaded GloVe word embeddings into my gdrive from https://nlp.stanford.edu/projects/glove/
glove_dir = '/content/drive/My Drive/GLoVE embeddings/'

embeddings_index = {}
f = open(os.path.join(glove_dir, 'glove.6B.100d.txt'))
for line in f:
    values = line.split()
    word = values[0]
    coefs = np.asarray(values[1:], dtype='float32')
    embeddings_index[word] = coefs
f.close()

print('Found %s word vectors.' % len(embeddings_index))

embedding_dim = 100 # this is the dimension of the embeddings file we imported

embedding_matrix = np.zeros((max_words, embedding_dim))
for word, i in word_index.items():
    embedding_vector = embeddings_index.get(word)
    if i < max_words:
        if embedding_vector is not None:
            # Words not found in embedding index will be all-zeros.
            embedding_matrix[i] = embedding_vector

Found 400000 word vectors.


## Fitting Models

In [0]:
import tensorflow as tf
from sklearn.metrics import classification_report

# LSTM model with no pre-trained embeddings 
model1 = tf.keras.models.Sequential()
model1.add(tf.keras.layers.Embedding(max_words, embedding_dim, input_length=maxlen))
model1.add(tf.keras.layers.LSTM(100))
model1.add(tf.keras.layers.Dense(1, activation='sigmoid'))
model1.summary()

model1.compile(optimizer='rmsprop',
              loss='binary_crossentropy',
              metrics=['acc'])
history = model1.fit(x_train, y_train,
                    epochs=5,
                    batch_size=50,
                    validation_data=(x_val, y_val))
#model1.save_weights('predicting_Obama_vs_Trump_tweets_model1.h5')
model1.save('predicting_Obama_vs_Trump_tweets_model1.h5')


Model: "sequential"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding (Embedding)        (None, 140, 100)          1000000   
_________________________________________________________________
lstm (LSTM)                  (None, 100)               80400     
_________________________________________________________________
dense (Dense)                (None, 1)                 101       
Total params: 1,080,501
Trainable params: 1,080,501
Non-trainable params: 0
_________________________________________________________________
Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


In [0]:
#model1=load_model('predicting_Obama_vs_Trump_tweets_model1.h5')

#y_pred = model1.predict(x_val, verbose=0)
y_pred = model1.predict_classes(x_val, verbose=0)
print(classification_report(y_val, y_pred))

Instructions for updating:
Please use instead:* `np.argmax(model.predict(x), axis=-1)`,   if your model does multi-class classification   (e.g. if it uses a `softmax` last-layer activation).* `(model.predict(x) > 0.5).astype("int32")`,   if your model does binary classification   (e.g. if it uses a `sigmoid` last-layer activation).
              precision    recall  f1-score   support

           0       0.96      0.96      0.96       987
           1       0.96      0.97      0.96      1013

    accuracy                           0.96      2000
   macro avg       0.96      0.96      0.96      2000
weighted avg       0.96      0.96      0.96      2000



In [0]:
# Bi-directional LSTM with no pre-trained embeddings
model2 = tf.keras.models.Sequential()
model2.add(tf.keras.layers.Embedding(max_words, embedding_dim, input_length=maxlen))
model2.add(tf.keras.layers.Bidirectional(tf.keras.layers.LSTM(100)))
model2.add(tf.keras.layers.Dense(1, activation='sigmoid'))
model2.summary()

model2.compile(optimizer='rmsprop',
              loss='binary_crossentropy',
              metrics=['acc'])
history = model2.fit(x_train, y_train,
                    epochs=5,
                    batch_size=50,
                    validation_data=(x_val, y_val))
model2.save('predicting_Obama_vs_Trump_tweets_model2.h5')

Model: "sequential_1"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_1 (Embedding)      (None, 140, 100)          1000000   
_________________________________________________________________
bidirectional (Bidirectional (None, 200)               160800    
_________________________________________________________________
dense_1 (Dense)              (None, 1)                 201       
Total params: 1,161,001
Trainable params: 1,161,001
Non-trainable params: 0
_________________________________________________________________
Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


In [0]:
target_names = ['Obama', 'Trump']
y_pred = model2.predict_classes(x_val, verbose=0)
print(classification_report(y_val, y_pred, target_names=target_names))

              precision    recall  f1-score   support

       Obama       0.95      0.96      0.96       987
       Trump       0.96      0.96      0.96      1013

    accuracy                           0.96      2000
   macro avg       0.96      0.96      0.96      2000
weighted avg       0.96      0.96      0.96      2000



In [0]:
# 1D convolution then LSTM with no pre-trained embeddings
model3 = tf.keras.models.Sequential()
model3.add(tf.keras.layers.Embedding(max_words, embedding_dim, input_length=maxlen))
model3.add(tf.keras.layers.Convolution1D(filters=32, kernel_size=3,activation='relu'))
model3.add(tf.keras.layers.MaxPooling1D(pool_size=2))
model3.add(tf.keras.layers.LSTM(100))
model3.add(tf.keras.layers.Dense(1, activation='sigmoid'))
model3.summary()

model3.compile(optimizer='rmsprop',
              loss='binary_crossentropy',
              metrics=['acc'])
history = model3.fit(x_train, y_train,
                    epochs=5,
                    batch_size=50,
                    validation_data=(x_val, y_val))
model3.save('predicting_Obama_vs_Trump_tweets_model3.h5')

Model: "sequential_2"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_2 (Embedding)      (None, 140, 100)          1000000   
_________________________________________________________________
conv1d (Conv1D)              (None, 138, 32)           9632      
_________________________________________________________________
max_pooling1d (MaxPooling1D) (None, 69, 32)            0         
_________________________________________________________________
lstm_2 (LSTM)                (None, 100)               53200     
_________________________________________________________________
dense_2 (Dense)              (None, 1)                 101       
Total params: 1,062,933
Trainable params: 1,062,933
Non-trainable params: 0
_________________________________________________________________
Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


In [0]:
target_names = ['Obama', 'Trump']
y_pred = model3.predict_classes(x_val, verbose=0)
print(classification_report(y_val, y_pred, target_names=target_names))

              precision    recall  f1-score   support

       Obama       0.96      0.98      0.97       987
       Trump       0.98      0.96      0.97      1013

    accuracy                           0.97      2000
   macro avg       0.97      0.97      0.97      2000
weighted avg       0.97      0.97      0.97      2000



In [0]:
# LSTM with pre-trained GloVe 100D embeddings
model4 = tf.keras.models.Sequential()
model4.add(tf.keras.layers.Embedding(max_words, embedding_dim, input_length=maxlen))
model4.add(tf.keras.layers.LSTM(100))
model4.add(tf.keras.layers.Dense(1, activation='sigmoid'))
model4.summary()

model4.layers[0].set_weights([embedding_matrix])
model4.layers[0].trainable = False

model4.compile(optimizer='rmsprop',
              loss='binary_crossentropy',
              metrics=['acc'])
history = model4.fit(x_train, y_train,
                    epochs=10,
                    batch_size=25,
                    validation_data=(x_val, y_val))
model4.save_weights('predicting_Obama_vs_Trump_tweets_model4.h5')

Model: "sequential_3"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_3 (Embedding)      (None, 140, 100)          1000000   
_________________________________________________________________
lstm_3 (LSTM)                (None, 100)               80400     
_________________________________________________________________
dense_3 (Dense)              (None, 1)                 101       
Total params: 1,080,501
Trainable params: 1,080,501
Non-trainable params: 0
_________________________________________________________________
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


In [0]:
target_names = ['Obama', 'Trump']
y_pred = model4.predict_classes(x_val, verbose=0)
print(classification_report(y_val, y_pred, target_names=target_names))

              precision    recall  f1-score   support

       Obama       0.91      0.90      0.91       987
       Trump       0.91      0.92      0.91      1013

    accuracy                           0.91      2000
   macro avg       0.91      0.91      0.91      2000
weighted avg       0.91      0.91      0.91      2000



In [0]:
# Bi-directional LSTM with pre-trained GloVe embeddings
model5 = tf.keras.models.Sequential()
model5.add(tf.keras.layers.Embedding(max_words, embedding_dim, input_length=maxlen, weights=[embedding_matrix], trainable=False))
model5.add(tf.keras.layers.Bidirectional(tf.keras.layers.LSTM(100)))
model5.add(tf.keras.layers.Dense(1, activation='sigmoid'))
model5.summary()

model5.compile(optimizer='rmsprop',
              loss='binary_crossentropy',
              metrics=['acc'])
history = model5.fit(x_train, y_train,
                    epochs=10,
                    batch_size=25,
                    validation_data=(x_val, y_val))
model5.save('predicting_Obama_vs_Trump_tweets_model5.h5')

Model: "sequential_4"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_4 (Embedding)      (None, 140, 100)          1000000   
_________________________________________________________________
bidirectional_1 (Bidirection (None, 200)               160800    
_________________________________________________________________
dense_4 (Dense)              (None, 1)                 201       
Total params: 1,161,001
Trainable params: 161,001
Non-trainable params: 1,000,000
_________________________________________________________________
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


In [0]:
target_names = ['Obama', 'Trump']
y_pred = model5.predict_classes(x_val, verbose=0)
print(classification_report(y_val, y_pred, target_names=target_names))

              precision    recall  f1-score   support

       Obama       0.91      0.90      0.91       987
       Trump       0.91      0.91      0.91      1013

    accuracy                           0.91      2000
   macro avg       0.91      0.91      0.91      2000
weighted avg       0.91      0.91      0.91      2000



In [0]:
# 1D convolution then LSTM with pre-trained GloVe embeddings
model6 = tf.keras.models.Sequential()
model6.add(tf.keras.layers.Embedding(max_words, embedding_dim, input_length=maxlen,weights=[embedding_matrix],trainable=False))
model6.add(tf.keras.layers.Convolution1D(filters=32, kernel_size=3,activation='relu'))
model6.add(tf.keras.layers.MaxPooling1D(pool_size=2))
model6.add(tf.keras.layers.LSTM(100))
model6.add(tf.keras.layers.Dense(1, activation='sigmoid'))
model6.summary()

model6.compile(optimizer='rmsprop',
              loss='binary_crossentropy',
              metrics=['acc'])
history = model6.fit(x_train, y_train,
                    epochs=10,
                    batch_size=25,
                    validation_data=(x_val, y_val))
model6.save('predicting_Obama_vs_Trump_tweets_model6.h5')

Model: "sequential_5"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_5 (Embedding)      (None, 140, 100)          1000000   
_________________________________________________________________
conv1d_1 (Conv1D)            (None, 138, 32)           9632      
_________________________________________________________________
max_pooling1d_1 (MaxPooling1 (None, 69, 32)            0         
_________________________________________________________________
lstm_5 (LSTM)                (None, 100)               53200     
_________________________________________________________________
dense_5 (Dense)              (None, 1)                 101       
Total params: 1,062,933
Trainable params: 62,933
Non-trainable params: 1,000,000
_________________________________________________________________
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 

In [0]:
target_names = ['Obama', 'Trump']
y_pred = model6.predict_classes(x_val, verbose=0)
print(classification_report(y_val, y_pred, target_names=target_names))

              precision    recall  f1-score   support

       Obama       0.92      0.92      0.92       987
       Trump       0.92      0.92      0.92      1013

    accuracy                           0.92      2000
   macro avg       0.92      0.92      0.92      2000
weighted avg       0.92      0.92      0.92      2000



## Insight:
Performance of the models with new embedding beign trained on the go rather than using GloVe embeddings is better. This tells us that the new embeddings being learned by the models are optimised to identify if the words would be used by Trump or Obama. i.e. two words would have close embeddings if they are more likely to be used by Obama but not Trump or viceversa. But GloVe embeddings would be similar for the words that are more interchangeable in a general context.

The fact that using GloVe embeddings is having lower accuracy shows that both Obama & Trump uses different words to convey similar message. If both of them were using the same words the models with new embeddings being learnt would have shown similar performance as the ones using GloVe embeddings. Having similar vector for different words that would convey similar message in GloVe embedding makes it difficult to differtiate between Obama & Trump tweets.