## Get the data
csv headings: id, created_at, source, original_text, clean_text, favorite_count, retweet_count, hashtags, trend <br>
hashtags format: strings with comma separated hashtags

In [3]:
import numpy as np
import random

In [4]:
#File paths
US_tweets_file = '../Data/USTweets.csv'
UK_tweets_file = '../Data/UKTweets.csv'
CAN_tweets_file = '../Data/CANTweets.csv'
IR_tweets_file = '../Data/IRTweets.csv'
AUS_tweets_file = '../Data/AUSTweets.csv'

In [5]:
import csv

tweets_and_hashtags = [] #to shuffle
tweets = []
hashtags = [] #list of lists of hashtags e.g. hashtags[0] = ["hashtag1", "hashtag2"]
hashtags_strings = [] #list of hashtags string e.g. hashtags[0] = ["hashtag1, hashtag2"]


In [6]:
def read_file(file_name):
    with open(file_name) as data_file:
        data = csv.reader(data_file)
        for row in data:
            new_row = []
            new_row.append(row[4])
            new_row.append(row[7].split(", "))
            new_row.append(row[7])
            tweets_and_hashtags.append(new_row)

In [7]:
read_file(UK_tweets_file)
read_file(IR_tweets_file)

In [8]:
random.shuffle(tweets_and_hashtags)
for row in tweets_and_hashtags:
    tweets.append(row[0])
    hashtags.append(row[1])
    hashtags_strings.append(row[2])

del tweets_and_hashtags

## Initialize the tokenizers
Will use a specialized tokenizer for the hashtags because we need to encode all the hashtags. It also does not matter if the encoding of the tweets match the encoding of the hashtags.

In [9]:
from keras.preprocessing.text import Tokenizer

tweets_tokenizer = Tokenizer(oov_token="<OOV>")
tweets_tokenizer.fit_on_texts(tweets)
tweets_word_index = tweets_tokenizer.word_index
tweets_index_word = tweets_tokenizer.index_word

In [10]:
hashtags_tokenizer = Tokenizer(oov_token="<OOV>")
hashtags_tokenizer.fit_on_texts(hashtags_strings)
hashtags_word_index = hashtags_tokenizer.word_index
hashtags_index_word = hashtags_tokenizer.index_word

In [11]:
print(f'There are {len(tweets)} tweets, ')
print(f'the tweets contain {len(tweets_tokenizer.word_index)} different words.')
print(f'There are {len(hashtags_tokenizer.word_index)} different hashtags')
print('Here are the tokenized hashtags')
print(hashtags_word_index)

There are 160231 tweets, 
the tweets contain 63792 different words.
There are 29784 different hashtags
Here are the tokenized hashtags


## Prepare the pre-trained embeddings

In [12]:
import pickle

num_tokens = len(tweets_word_index) + 1
embedding_dim = 300
hits = 0
misses = 0

embedding_matrix = np.zeros((num_tokens, embedding_dim))
with open('../NLP/Embeddings/embeddings_index_object.pkl', 'rb') as embeddings_file:
    embeddings_index = pickle.load(embeddings_file)
    for word, i in tweets_word_index.items():
        embedding_vector = embeddings_index.get(word)
        if embedding_vector is not None:
            # Words not found in embedding index will be all-zeros.
            # This includes the representation for "padding" and "OOV"
            if embedding_vector.shape == (300,):
                embedding_matrix[i] = embedding_vector
            hits += 1
        else:
            misses += 1

In [13]:
print(f'hits:{hits}, misses: {misses}')

hits:45274, misses: 18518


## Create the sequences and pad them and one multi-hot encode the hashtags
Will use a binary vector to encode the hashtags to the model can categorize the tweets. e.g. hashtags[0] = [tag1, tag2], and tag1 has encoding of 1 and tag2 has encoding 2, then the binary vector wil be [0 1 1 0 0 ... no_of_different_hashtags]

In [14]:
sequence_length = 20

from keras.utils import pad_sequences
tweets_sequences = tweets_tokenizer.texts_to_sequences(tweets)
hashtags_sequences = hashtags_tokenizer.texts_to_sequences(hashtags)
tweets_sequences_padded = pad_sequences(tweets_sequences, padding="post", maxlen=sequence_length)

In [None]:
del tweets
del hashtags

In [15]:
from keras.utils import to_categorical
import tensorflow as tf

no_of_different_hashtags = len(hashtags_word_index) + 1
no_of_hashtags = len(hashtags_sequences)

encoded_hashtags = np.zeros((no_of_hashtags, no_of_different_hashtags))

for i, hashtags_indices in enumerate(hashtags_sequences):
    encoded_hashtags[i][hashtags_indices] = 1

MemoryError: Unable to allocate 35.6 GiB for an array with shape (160231, 29785) and data type float64

## Split the data

In [16]:
training_split = 0.8
training_tweets_count = int(0.8 * len(tweets_sequences_padded))

In [18]:
train_data = tweets_sequences_padded[0:training_tweets_count]
train_labels = encoded_hashtags[0:training_tweets_count]
test_data = tweets_sequences_padded[training_tweets_count:]
test_labels = encoded_hashtags[training_tweets_count:]

print(f'we have {len(train_data)} tweets for training and {len(test_data)} for testing')

we have 128184 tweets for training and 32047 for testing


In [None]:
del tweets_sequences
del tweets_sequences_padded
del hashtags_strings
del hashtags_sequences
del encoded_hashtags

## Build the model

In [None]:
#learning rate callback
def lr_schedule(epoch):
    lr = 0.001
    if epoch > 8:
        lr = 0.0005
    return lr

lr_scheduler = tf.keras.callbacks.LearningRateScheduler(lr_schedule)

In [None]:
#hyperparameters
embedding_dimensions = 300
lstm_units = 128
dropout_value = 0.2
conv_filters = 64
conv_kernel_size = 5
dense_layers = 10000

In [None]:
from keras import initializers

no_of_tweets_words = len(tweets_word_index) + 1

hashtag_recommender_model = tf.keras.Sequential([
    tf.keras.layers.Embedding(no_of_tweets_words,
                            embedding_dimensions,
                            input_length=sequence_length,
                            embeddings_initializer=initializers.Constant(embedding_matrix),
                            trainable=True),
    tf.keras.layers.Conv1D(conv_filters, conv_kernel_size, activation='relu'),
    tf.keras.layers.Dropout(dropout_value),
    # tf.keras.layers.Bidirectional(tf.keras.layers.LSTM(100, return_sequences=True)),
    # tf.keras.layers.Dropout(dropout_value),
    tf.keras.layers.Bidirectional(tf.keras.layers.LSTM(lstm_units)),
    tf.keras.layers.Dropout(dropout_value),
    # tf.keras.layers.Dense(dense_layers, activation='relu'),
    # tf.keras.layers.Dropout(dropout_value),
    tf.keras.layers.Dense(no_of_different_hashtags, activation='softmax')
])

hashtag_recommender_model.compile(
    loss='categorical_crossentropy',
    optimizer='adam',
    metrics = ['accuracy'],
)

hashtag_recommender_model.summary()

In [None]:
del embedding_matrix

## Train the model

In [None]:
epochs = 12
train_data_size = len(train_data)
hashtag_recommender_model.fit(train_data, train_labels, epochs=epochs, validation_data=(test_data, test_labels), callbacks=[lr_scheduler])


## Get hashtags!!

In [None]:
def predict(tweet, tweet_tokenizer, hashtag_tokenizer, pad_length, model):
    tweet_sequence = tweet_tokenizer.texts_to_sequences([tweet])[0]
    padded_tweet_sequence = pad_sequences([tweet_sequence], maxlen=pad_length, padding='post')
    prediction = (model.predict(padded_tweet_sequence))
    hashtag_indices = np.argsort(prediction, axis=-1)[0][-3:]
    return [hashtag_tokenizer.index_word[hashtag_index] for hashtag_index in hashtag_indices]


In [None]:
print(predict("I can't believe it", tweets_tokenizer, hashtags_tokenizer, sequence_length, hashtag_recommender_model))