In [None]:
"""
To-Do list:

1. Tokenize the words
2. Each emoji category should have 7500 texts
3. Use GloVe to make embeddings
4. Encode emoji labels
"""

'\n1. Tokenize the words\n3. each category should have 7500 words\n2. use glove to make embeddings\n4. encode emoji labels\n'

In [None]:
import pandas as pd
import numpy as np

import tensorflow as tf
from tensorflow.keras.preprocessing.text import Tokenizer
from nltk.tokenize import word_tokenize
import re

# Lemmatizer class plus the NLTK data packages fetched at runtime.
from nltk.stem import WordNetLemmatizer
import nltk
nltk.download('punkt')  # tokenizer data (unzipped under tokenizers/); presumably what word_tokenize needs
nltk.download('wordnet')  # WordNet data; presumably what WordNetLemmatizer needs

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package wordnet to /root/nltk_data...


True

In [None]:
# Load the preprocessed tweets; later cells read its 'processed_text' and 'emojis' columns.
# NOTE(review): absolute path looks like a Colab Google Drive mount; adjust when running elsewhere.
df = pd.read_csv('/content/drive/MyDrive/emojien/processed_tweets.csv')

In [None]:
# Discard rows containing missing values (dropna defaults: any column, row-wise).
# Rebinding `df` means the raw frame is no longer reachable after this cell.
df = df.dropna()

In [None]:
# Single shared lemmatizer instance, used by lemmatize_and_normalize_text below.
lemmatizer = WordNetLemmatizer()

In [None]:
def normalize_stretched_word(word):
    """Collapse every run of a repeated character down to exactly two.

    ``"soooo"`` becomes ``"soo"``, while runs that are already two characters
    long (``"hello"``) are unchanged. The rule applies to any character except
    a newline, so digits and punctuation are collapsed as well.
    """
    repeated_run = re.compile(r'(.)\1+')
    return repeated_run.sub(r'\1\1', word)

def lemmatize_and_normalize_text(text):
    """Lower-case, tokenize, de-stretch and lemmatize ``text``.

    Each token produced by ``word_tokenize`` is first passed through
    ``normalize_stretched_word`` and then through the module-level
    ``lemmatizer``. The processed tokens are returned joined by single spaces.
    """
    processed_tokens = []
    for token in word_tokenize(text.lower()):
        collapsed = normalize_stretched_word(token)
        processed_tokens.append(lemmatizer.lemmatize(collapsed))
    return ' '.join(processed_tokens)

In [None]:
# Replace 'processed_text' with its normalized, lemmatized form.
# The column is overwritten, so re-running this cell processes already-processed text again.
df['processed_text'] = df['processed_text'].apply(lemmatize_and_normalize_text)

In [None]:
# Build the vocabulary from the cleaned tweets.
# The Tokenizer is created with its defaults (no vocabulary cap or OOV token is passed).
tokenizer = Tokenizer()
tokenizer.fit_on_texts(df['processed_text'])

In [None]:
# Word -> integer index mapping learned by the tokenizer.
word_index = tokenizer.word_index
# One more than the number of distinct words; presumably the extra slot is index 0
# (reserved/padding) -- confirm against the Keras Tokenizer documentation.
vocab_size = len(word_index) + 1

In [None]:
vocab_size  # 142180 on the captured run; the previously noted 147913 is presumably from an earlier run

142180

In [None]:
# Loading pre-trained glove word embeddings

def load_glove_embeddings(file_path, embedding_dim=None):
    """Read a GloVe text file into a ``{token: vector}`` dictionary.

    Every line is expected to look like ``<token> <v1> ... <vN>`` with single
    spaces as separators. Some tokens themselves contain spaces (for example
    ``". . ."``), so the vector is taken from the *last* ``embedding_dim``
    fields and everything before it is treated as the token. Assuming the
    first whitespace-separated field is the whole token makes such lines fail
    float conversion and silently drops them.

    Args:
        file_path: Path to the UTF-8 encoded GloVe ``.txt`` file.
        embedding_dim: Number of vector components per line. When ``None``
            (the default) it is inferred from the first non-blank line as
            "field count minus one", which assumes that line's token contains
            no spaces and that the file has no header line.

    Returns:
        dict mapping each token (str) to a 1-D ``float32`` numpy array. If a
        token occurs more than once, the last occurrence wins.
    """
    embeddings_index = {}
    with open(file_path, 'r', encoding='utf-8') as f:
        for line in f:
            # Split on the literal space only: tokens may contain other
            # Unicode whitespace that a bare split() would break apart.
            fields = line.rstrip().split(' ')
            if len(fields) < 2:
                continue  # blank line or a token without any vector

            if embedding_dim is None:
                embedding_dim = len(fields) - 1

            if len(fields) < embedding_dim + 1:
                print(f"Skipping line due to error: expected {embedding_dim} values, got {len(fields) - 1}")
                continue

            word = ' '.join(fields[:-embedding_dim])
            try:
                coefs = np.asarray(fields[-embedding_dim:], dtype='float32')
            except ValueError as e:
                # Best effort: report and skip lines whose vector is malformed.
                print(f"Skipping line due to error: {e}")
                continue
            embeddings_index[word] = coefs
    return embeddings_index

# Location of the GloVe vectors (the file name indicates the 840B-token, 300-dimensional release).
glove_file_path = '/content/drive/MyDrive/glove/glove_embeddings/glove.840B.300d.txt'  # Replace with your actual path to your GloVe file
# Parse the file into a dict mapping each token to its float32 vector.
embeddings_index = load_glove_embeddings(glove_file_path)

Skipping line due to error: could not convert string to float: '.'
Skipping line due to error: could not convert string to float: 'name@domain.com'
Skipping line due to error: could not convert string to float: '.'
Skipping line due to error: could not convert string to float: 'name@domain.com'
Skipping line due to error: could not convert string to float: '.'
Skipping line due to error: could not convert string to float: '.'
Skipping line due to error: could not convert string to float: 'name@domain.com'
Skipping line due to error: could not convert string to float: 'name@domain.com'
Skipping line due to error: could not convert string to float: 'name@domain.com'
Skipping line due to error: could not convert string to float: 'name@domain.com'
Skipping line due to error: could not convert string to float: 'name@domain.com'
Skipping line due to error: could not convert string to float: 'Killerseats.com'
Skipping line due to error: could not convert string to float: 'name@domain.com'
Ski

In [None]:
# Vector width; must match the GloVe file loaded above (glove.840B.300d -> 300).
embedding_dim = 300
# One row per tokenizer index, plus one extra row for index 0.
embedding_matrix = np.zeros((len(word_index) + 1, embedding_dim))

# Copy the pre-trained vector into the row of every word GloVe knows about.
for word, i in word_index.items():
    embedding_vector = embeddings_index.get(word)
    if embedding_vector is None:
        # No pre-trained vector: this word's row stays all-zero.
        continue
    embedding_matrix[i] = embedding_vector

In [None]:
# Convert each processed tweet into its list of tokenizer word indices.
df['sequences'] = tokenizer.texts_to_sequences(df['processed_text'])

In [None]:
# Rows of the embedding matrix that are entirely zero, i.e. indices with no GloVe vector.
# NOTE(review): any row the fill loop never writes (presumably row 0) is counted too.
empty_rows = ~embedding_matrix.any(axis=1)
empty_rows.sum()

71647

In [None]:
# Length of each row's 'emojis' value.
# NOTE(review): if the column holds strings (as read from CSV) this is a character count, not an emoji count -- confirm.
df['emoji_len'] = df['emojis'].map(len)

# Keep only rows whose 'emojis' length is below 10 (lengths of 10 or more are dropped).
df = df.loc[df['emoji_len'] < 10]

In [None]:
# Distinct values of the 'emojis' column; the balancing cell below iterates over them as class labels.
# NOTE(review): explode() only splits list-like cells; if 'emojis' holds plain strings it is a no-op -- confirm.
emojis = df.explode('emojis')['emojis'].unique()

In [None]:
# Empty placeholder frame; the class-balancing cell below populates `new_df`.
new_df = pd.DataFrame()

In [None]:
from sklearn.utils import resample

SAMPLES_PER_EMOJI = 7500  # target number of rows for every emoji class
RANDOM_STATE = 42  # fixed seed so the balanced dataset is reproducible

# Balance the classes: every emoji ends up with exactly SAMPLES_PER_EMOJI rows.
# Larger classes are down-sampled without replacement and smaller ones are
# over-sampled with replacement (so they contain duplicated rows).
# Frames are collected in a list and concatenated once. That avoids the
# quadratic cost of concat-in-a-loop and makes the cell idempotent: re-running
# it rebuilds `new_df` instead of appending another full copy.
# NOTE(review): rows are shuffled within each emoji only; the final frame is
# still grouped by emoji, so shuffle globally before any train/test split.
balanced_frames = []
for emoji in emojis:
    emoji_data = df[df['emojis'] == emoji]
    emoji_count = len(emoji_data)

    if emoji_count > SAMPLES_PER_EMOJI:
        emoji_df = emoji_data.sample(n=SAMPLES_PER_EMOJI, random_state=RANDOM_STATE)

    elif emoji_count < SAMPLES_PER_EMOJI:
        emoji_df = resample(emoji_data, replace=True, n_samples=SAMPLES_PER_EMOJI, random_state=RANDOM_STATE)

    else:
        emoji_df = emoji_data

    emoji_df = emoji_df.sample(frac=1, random_state=RANDOM_STATE)
    balanced_frames.append(emoji_df)

# pd.concat raises on an empty list, so fall back to an empty frame.
new_df = pd.concat(balanced_frames, ignore_index=True) if balanced_frames else pd.DataFrame()

In [None]:
from sklearn.preprocessing import OneHotEncoder

# Learn the emoji categories from the balanced frame, then encode every row
# as a dense (non-sparse) one-hot vector.
emoji_column = new_df[['emojis']]
one_hot_encoder = OneHotEncoder(sparse_output=False)
one_hot_encoded = one_hot_encoder.fit(emoji_column).transform(emoji_column)

In [None]:
# Categories learned by the encoder for its single input column.
categories = one_hot_encoder.categories_[0]
# Lookup table from category position (one-hot column index) to emoji.
index_to_emoji = dict(enumerate(categories))

In [None]:
# Store each row's one-hot vector as a plain Python list in a new 'emoji_labels' column.
new_df['emoji_labels'] = one_hot_encoded.tolist()

In [None]:
# Keep only the model inputs and targets. `.copy()` makes `data` an independent
# frame, so adding columns to it later neither triggers SettingWithCopyWarning
# nor depends on whether pandas returned a view of `new_df`.
data = new_df[['sequences', 'emoji_labels']].copy()

In [None]:
# Length of the longest token sequence in the full (pre-balancing) frame `df`;
# the padding cell below hard-codes the value displayed here.
df['sequences'].apply(len).max()

47

In [None]:
from tensorflow.keras.preprocessing.sequence import pad_sequences

# Fixed model input length; 47 is the longest sequence length reported by the previous cell.
max_sequence_length = 47

# Pad at the front ('pre') so every sequence has max_sequence_length entries,
# then store each padded row as a plain Python list.
padded_array = pad_sequences(data['sequences'], maxlen=max_sequence_length, padding='pre')
data['padded_sequences'] = padded_array.tolist()

In [None]:
import pickle

# Persist everything the training stage needs in one pickle file: the balanced
# data frame, the fitted tokenizer, the index -> emoji lookup and the GloVe
# embedding matrix.
output_path = '/content/drive/MyDrive/emojien/Data/emoji_data.pkl'
with open(output_path, 'wb') as f:
    artifacts = {
        'data': data,
        'tokenizer': tokenizer,
        'emoji_dict': index_to_emoji,
        'embedding_matrix': embedding_matrix,
    }
    pickle.dump(artifacts, f)