# Trends-based recommendation
In this notebook we will classify the tweets into trends and these trends will help us decide which hashtags to recommend 

In [2]:
from files_reader import *
import tensorflow as tf
import nltk

## Get the data

In [3]:
tweets_and_trends = []
tweets = []
trends = []

# tweets_and_trends += (FilesReader.read_file(UK_tweets_file))
# tweets_and_trends += (FilesReader.read_file(US_tweets_file))
# tweets_and_trends += (FilesReader.read_file(AUS_tweets_file))
# tweets_and_trends += (FilesReader.read_file(IR_tweets_file))
# tweets_and_trends += (FilesReader.read_file(CAN_tweets_file))
tweets_and_trends += (FilesReader.read_file(new_US_file))
tweets_and_trends += (FilesReader.read_file(new_UK_file))
tweets_and_trends += (FilesReader.read_file(new_AUS_file))
tweets_and_trends += (FilesReader.read_file(new_CAN_file))
tweets_and_trends += (FilesReader.read_file(new_IR_file))
# tweets_and_trends += (FilesReader.read_file(new_SINGA_file))   
# tweets_and_trends += (FilesReader.read_file(new_SA_file))

random.shuffle(tweets_and_trends)

tweets, trends = FilesReader.split_tweets_and_trends(tweets_and_trends)
print(f"We have {len(tweets)} tweets.")


We have 407753 tweets.


## Stemming
i.e turning words like playing, plays played to play

In [4]:
from nltk.stem import SnowballStemmer


def stem_tweet(tweet: str) -> str:
    """
    Stems a tweet.
    :param tweet: The tweet to stem.
    :return: The stemmed tweet.
    """
    ps = SnowballStemmer("english")
    new_tweet = ""
    for word in tweet.split(' '):
        new_tweet += ps.stem(word) + ' '
    return new_tweet

## Lemmatization 

In [5]:
from nltk.stem import WordNetLemmatizer
nltk.download('wordnet')
nltk.download('omw-1.4')

def lemmatize_tweet(tweet: str) -> str:
    """
    Lemmatizes a tweet.
    :param tweet: The tweet to lemmatize.
    :return: The lemmatized tweet.
    """
    lemmatizer = WordNetLemmatizer()
    new_tweet = ""
    for word in tweet.split(' '):
        new_tweet += lemmatizer.lemmatize(word) + ' '
    return new_tweet

[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\user\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package omw-1.4 to
[nltk_data]     C:\Users\user\AppData\Roaming\nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!


In [6]:
print(f"before: {tweets[:3]}")
for i, tweet in enumerate(tweets):
    processed_tweet = stem_tweet(tweet)
    tweets[i] = processed_tweet
    

print(f"after: {tweets[:3]}")

before: ['humiliation home', 'first history see face', 'roasted ad menu tonight']
after: ['humili home ', 'first histori see face ', 'roast ad menu tonight ']


## Tokenize the text

In [7]:
from keras.preprocessing.text import Tokenizer

tweets_tokenizer = Tokenizer(oov_token="<OOV>")
tweets_tokenizer.fit_on_texts(tweets)
tweets_word_index = tweets_tokenizer.word_index
tweets_index_word = tweets_tokenizer.index_word

print(f"We have {len(tweets_word_index)} different words")
print(tweets_word_index)

We have 88048 different words


## Create the padded sequences

In [8]:
from keras.utils import pad_sequences
sequence_length = 15

tweets_sequences = tweets_tokenizer.texts_to_sequences(tweets)
tweets_sequences_padded = pad_sequences(tweets_sequences, padding="post", maxlen=sequence_length)

## Map the trends to numbers

In [9]:
trends_map = {}

counter = 0

for trend in trends:
    if not (trend in trends_map):
        trends_map[trend] = counter
        counter += 1

no_of_trends = len(trends_map)
inv_trends_map = {v: k for k, v in trends_map.items()}
print(f"We have {no_of_trends} different trends")
print(trends_map)

We have 993 different trends
{'ARSBRI': 0, 'NHLPlayoffs': 1, 'LakeShow': 2, 'Anzac Day': 3, 'Iceland': 4, 'SAFC': 5, 'Eurovision2023': 6, 'rufc': 7, 'Zelda': 8, 'International Nurses Day': 9, 'BHAEVE': 10, 'thunderstorm': 11, 'GAAGO': 12, 'Latham': 13, 'NRLBulldogsWarriors': 14, 'NRLRoostersDragons': 15, 'Budget2023': 16, 'Dennis': 17, 'imran_Khan': 18, 'MilanInter': 19, 'ImACeleb': 20, 'Snapchat': 21, 'Howard Webb': 22, 'MentalHealthAwarenessMonth': 23, 'Flames': 24, 'lambie': 25, 'ThisMorning': 26, 'JediSurvivor': 27, 'Speers': 28, 'Pakistan': 29, 'RHOA': 30, 'MayDay': 31, 'IND2023': 32, 'DubNation': 33, 'TearsOfTheKingdom': 34, 'Paul Burrell': 35, 'LIVTOT': 36, 'Ishbia': 37, 'LEILIV': 38, 'afldeestigers': 39, 'PAKvNZ': 40, 'EuropeDay': 41, 'Gnonto': 42, 'LabourDay': 43, 'aloneaustralia': 44, 'AFLCrowsPies': 45, 'ATLvsBOS': 46, 'LEGOMastersAU': 47, "Mother's Day": 48, 'GoAvsGo': 49, 'MasterChefAU': 50, '911onFOX': 51, 'FLAvsBOS': 52, 'AFLTigersCats': 53, 'AFLPiesDons': 54, 'bbcpm': 5

## Create the trends sequences

In [10]:
trends_sequences = [trends_map[trend] for trend in trends]
print(trends_sequences)

[0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 2, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 6, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51, 32, 52, 53, 54, 55, 50, 56, 57, 58, 59, 60, 17, 4, 38, 31, 61, 62, 38, 8, 63, 64, 50, 65, 66, 67, 68, 69, 70, 71, 72, 73, 74, 20, 75, 69, 45, 76, 20, 77, 78, 15, 79, 31, 80, 69, 81, 82, 66, 83, 84, 85, 15, 86, 87, 88, 19, 89, 90, 91, 92, 18, 93, 94, 52, 95, 32, 66, 34, 96, 97, 98, 99, 100, 101, 6, 18, 102, 103, 30, 104, 78, 105, 86, 106, 107, 108, 75, 8, 85, 109, 110, 111, 42, 58, 112, 113, 114, 88, 107, 39, 115, 116, 85, 75, 117, 48, 118, 111, 26, 119, 120, 76, 1, 83, 121, 8, 122, 47, 123, 34, 111, 6, 124, 49, 125, 2, 126, 5, 58, 127, 128, 62, 118, 129, 87, 0, 130, 85, 131, 98, 132, 133, 134, 84, 135, 136, 111, 122, 4, 137, 41, 47, 41, 119, 69, 7, 105, 138, 137, 47, 104, 36, 139, 50, 138, 66, 50, 56, 87, 46, 6, 140, 39, 2, 141, 118, 68, 142, 143, 50, 104, 50, 144, 20, 41, 15, 145, 75

## Encode the trends

In [11]:
from keras.utils import to_categorical
import tensorflow as tf

encoded_trends = to_categorical(trends_sequences)
print(encoded_trends.shape)

(407753, 993)


## Prepare the pre-trained embeddings

In [12]:
from Embeddings.embeddings_matrix import get_embeddings_matrix

embeddings_index_path = "./Embeddings/embeddings_index_object.pkl"
embeddings_matrix, hits, misses = get_embeddings_matrix(tweets_word_index, embeddings_index_path)

print(f"Hits: {hits}, Misses: {misses}")

Hits: 41439, Misses: 46609


## Split the data

In [13]:
training_split = 0.8
training_tweets_count = int(0.8 * len(tweets_sequences_padded))

In [14]:
train_data = tweets_sequences_padded[0:training_tweets_count]
train_labels = encoded_trends[0:training_tweets_count]
test_data = tweets_sequences_padded[training_tweets_count:]
test_labels = encoded_trends[training_tweets_count:]

print(f'we have {len(train_data)} tweets for training and {len(test_data)} for testing')

we have 326202 tweets for training and 81551 for testing


## Tune the hyper-parameters

In [15]:
# import keras_tuner as kt
# import keras
# from keras import initializers
# import tensorflow as tf

# no_of_tweets_words = len(tweets_word_index) + 1
# embedding_dimensions = 300

# def model_builder(hp):
#     model = keras.Sequential()

#     hp_conv_filters = hp.Int('conv_filters', min_value=32, max_value=256, step=16)
#     hp_kernel_size = hp.Int('conv_kernel_size', min_value=2, max_value=5, step=1)
#     hp_dropout = hp.Float('dropout', min_value=0.0, max_value=0.6, step=0.1)
#     hp_lstm = hp.Int('lstm_units', min_value=32, max_value=256, step=16)

#     model.add(keras.layers.Embedding(
#         no_of_tweets_words,
#         embedding_dimensions,
#         input_length=sequence_length,
#         embeddings_initializer=initializers.Constant(embeddings_matrix),
#         trainable=True
#     ))
#     model.add(keras.layers.Conv1D(hp_conv_filters, hp_kernel_size, padding='same'))
#     model.add(keras.layers.Dropout(hp_dropout))
#     model.add(keras.layers.Bidirectional(keras.layers.LSTM(hp_lstm)))
#     model.add(keras.layers.Dropout(hp_dropout))
#     model.add(keras.layers.Dense(no_of_trends))

#     # Tune the learning rate for the optimizer
#     # Choose an optimal value from 0.01, 0.001, or 0.0001
#     hp_learning_rate = hp.Choice('learning_rate', values=[1e-2, 1e-3, 1e-4])

#     model.compile(optimizer=keras.optimizers.Adam(learning_rate=hp_learning_rate),
#                   loss=keras.losses.CategoricalCrossentropy(
#                       from_logits=True),
#                   metrics=['accuracy'])

#     return model


In [16]:
# tuner = kt.Hyperband(model_builder,
#                      objective='val_accuracy',
#                      max_epochs=6,
#                      factor=3,
#                      directory='parameters_tuning',
#                      project_name='trends_classifier')

In [17]:
# tuner.search(train_data, train_labels, epochs=50, validation_data=(test_data, test_labels))

In [18]:
# best_hps=tuner.get_best_hyperparameters(num_trials=1)[0]

# print(f"""
# The hyperparameter search is complete. Here are the optimal configurations:
#     conv_filters: {best_hps.get('conv_filters')}
#     conv_kernel_size: {best_hps.get('conv_kernel_size')}
#     lstm_units: {best_hps.get('lstm_units')}
#     dropout: {best_hps.get('dropout')}
#     learning_rate: {best_hps.get('learning_rate')}
# """)

In [19]:
# trends_classifier = tuner.hypermodel.build(best_hps)
# history = trends_classifier.fit(train_data, train_labels, epochs=20, validation_data=(test_data, test_labels))

# val_acc_per_epoch = history.history['val_accuracy']
# best_epoch = val_acc_per_epoch.index(max(val_acc_per_epoch)) + 1
# print('Best epoch: %d' % (best_epoch,))

In [20]:
# trends_classifier = tuner.hypermodel.build(best_hps)
# trends_classifier.fit(train_data, train_labels, epochs=best_epoch, validation_data=(test_data, test_labels))

## Build the model
After tuning the hyper-parameters, here are the optimal configurations: <br>
    conv_filters: 224 <br>
    conv_kernel_size: 2<br>
    lstm_units: 144<br>
    dropout: 0.1<br>
    learning_rate: 0.001<br>

In [21]:
#hyperparameters
embedding_dimensions = 300
lstm_units = 144
dropout_value = 0.1
conv_filters = 224
conv_kernel_size = 2

In [22]:
from keras import initializers
import tensorflow as tf

no_of_tweets_words = len(tweets_word_index) + 1

trends_classifier = tf.keras.Sequential([
    tf.keras.layers.Embedding(
        no_of_tweets_words,
        embedding_dimensions,
        input_length=sequence_length,
        embeddings_initializer=initializers.Constant(embeddings_matrix),
        trainable=True
    ),
    tf.keras.layers.Conv1D(conv_filters, conv_kernel_size),
    tf.keras.layers.AveragePooling1D(),
    tf.keras.layers.Dropout(dropout_value),
    tf.keras.layers.Bidirectional(tf.keras.layers.LSTM(lstm_units)),
    tf.keras.layers.Dropout(dropout_value),
    tf.keras.layers.Dense(no_of_trends, activation='softmax')
])

trends_classifier.compile(
    loss="categorical_crossentropy",
    optimizer="adam",
    metrics=["accuracy"]
)

trends_classifier.summary()

# trends_classifier = tf.keras.models.load_model("trends_classifier")

Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding (Embedding)       (None, 15, 300)           26414700  
                                                                 
 conv1d (Conv1D)             (None, 14, 224)           134624    
                                                                 
 average_pooling1d (AverageP  (None, 7, 224)           0         
 ooling1D)                                                       
                                                                 
 dropout (Dropout)           (None, 7, 224)            0         
                                                                 
 bidirectional (Bidirectiona  (None, 288)              425088    
 l)                                                              
                                                                 
 dropout_1 (Dropout)         (None, 288)               0

In [24]:
epochs = 4
trends_classifier.fit(train_data, train_labels, epochs=epochs, validation_data=(test_data, test_labels))

InternalError: Failed copying input tensor from /job:localhost/replica:0/task:0/device:CPU:0 to /job:localhost/replica:0/task:0/device:GPU:0 in order to run _EagerConst: Dst tensor is not initialized.

## Save the model

In [None]:
import pickle

trends_classifier.save("./trends_classifier/trends_classifier_model.h5")
with open('./trends_classifier/inv_trends_map.pkl', 'wb') as output:
    pickle.dump(inv_trends_map, output)
with open('./trends_classifier/tweet_tokenizer.pkl', 'wb') as output:
    pickle.dump(tweets_tokenizer, output)


In [None]:
import numpy as np

def predict(tweet, tweet_tokenizer, trends_map, inv_trends_map, pad_length, model):
    tweet_sequence = tweet_tokenizer.texts_to_sequences([tweet])[0]
    padded_tweet_sequence = pad_sequences([tweet_sequence], maxlen=pad_length, padding='post')
    prediction = (model.predict(padded_tweet_sequence))
    trends_indices = np.argsort(prediction, axis=-1)[0][-3:]
    return [inv_trends_map[trend_index] for trend_index in trends_indices]

In [None]:
tweet = "happy may day."

print(predict(tweet, tweets_tokenizer, trends_map, inv_trends_map, sequence_length, trends_classifier))