### 0. Prepare
- Below is how I train my model in Google Colab; the files downloaded with `!gdown` are stored in my Google Drive.

In [22]:
!pip install tensorflow
!pip install transformers
!pip install datasets



In [23]:
import tensorflow as tf
import pandas as pd
import json
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from datasets import Dataset
from transformers import TFBertForSequenceClassification, BertTokenizer
from sklearn.preprocessing import LabelEncoder
from transformers import TFBertForSequenceClassification, BertConfig
import numpy as np

### 1. Loading the Data

In [24]:
# Report whether TensorFlow can see at least one GPU device.
gpu_devices = tf.config.list_physical_devices('GPU')
print("GPU is available." if gpu_devices else "GPU is not available.")

GPU is available.


In [25]:
# Fetch the three input files from Google Drive by file id; gdown saves them
# into the current working directory (/content on Colab).
!gdown '1RbfoLt8E9RJo9LuF_S_SHDsVkPfEnXRn'  # data_identification.csv
!gdown '173oCOWXLSqPEs3igvGmXah1tiy_tPPyv'  # emotion.csv
!gdown '1C6MLuyW75UVFGuuDNT3cwuwp7Kw3MmTJ'  # tweets_DM.json

Downloading...
From: https://drive.google.com/uc?id=1RbfoLt8E9RJo9LuF_S_SHDsVkPfEnXRn
To: /content/data_identification.csv
100% 27.6M/27.6M [00:00<00:00, 69.8MB/s]
Downloading...
From: https://drive.google.com/uc?id=173oCOWXLSqPEs3igvGmXah1tiy_tPPyv
To: /content/emotion.csv
100% 23.3M/23.3M [00:00<00:00, 37.9MB/s]
Downloading...
From (original): https://drive.google.com/uc?id=1C6MLuyW75UVFGuuDNT3cwuwp7Kw3MmTJ
From (redirected): https://drive.google.com/uc?id=1C6MLuyW75UVFGuuDNT3cwuwp7Kw3MmTJ&confirm=t&uuid=76a4b1d4-a241-43f6-a61c-83b46f2d11d7
To: /content/tweets_DM.json
100% 531M/531M [00:05<00:00, 97.3MB/s]


In [26]:
# Paths to the files fetched by the `!gdown` cell (Colab's working directory).
data_identification_file = '/content/data_identification.csv'
emotion_file = '/content/emotion.csv'
tweets_DM_file = '/content/tweets_DM.json'

# Load CSV files
data_identification_df = pd.read_csv(data_identification_file)
emotion_df = pd.read_csv(emotion_file)

# Load the tweets. The file holds one JSON object per line, so it is parsed
# line by line instead of with a single json.load().
# - encoding is explicit because the tweets contain emoji / non-ASCII text and
#   the platform default encoding is not guaranteed to be UTF-8.
# - blank lines (e.g. a trailing newline at EOF) are skipped instead of
#   crashing json.loads().
tweets_data = []
with open(tweets_DM_file, 'r', encoding='utf-8') as f:
    for line in f:
        if not line.strip():
            continue
        # Every field we need lives under _source.tweet; resolve it once.
        tweet = json.loads(line)['_source']['tweet']
        tweets_data.append({
            'tweet_id': tweet['tweet_id'],
            'text': tweet['text'],
            'hashtags': tweet['hashtags'],
        })

# Convert the list of dictionaries to a DataFrame
tweets_df = pd.DataFrame(tweets_data)


In [27]:
# Inspect the parsed tweets: one row per tweet with its id, text and hashtag list.
tweets_df

Unnamed: 0,tweet_id,text,hashtags
0,0x376b20,"People who post ""add me on #Snapchat"" must be ...",[Snapchat]
1,0x2d5350,"@brianklaas As we see, Trump is dangerous to #...","[freepress, TrumpLegacy, CNN]"
2,0x28b412,"Confident of your obedience, I write to you, k...",[bibleverse]
3,0x1cd5b0,Now ISSA is stalking Tasha 😂😂😂 <LH>,[]
4,0x2de201,"""Trust is not the same as faith. A friend is s...",[]
...,...,...,...
1867530,0x316b80,When you buy the last 2 tickets remaining for ...,"[mixedfeeling, butimTHATperson]"
1867531,0x29d0cb,I swear all this hard work gone pay off one da...,[]
1867532,0x2a6a4f,@Parcel2Go no card left when I wasn't in so I ...,[]
1867533,0x24faed,"Ah, corporate life, where you can date <LH> us...",[]


In [28]:
# Inspect the tweet_id -> split assignment; the 'identification' column is
# filtered on the values 'train' and 'test' in the merge/split step.
data_identification_df

Unnamed: 0,tweet_id,identification
0,0x28cc61,test
1,0x29e452,train
2,0x2b3819,train
3,0x2db41f,test
4,0x2a2acc,train
...,...,...
1867530,0x227e25,train
1867531,0x293813,train
1867532,0x1e1a7e,train
1867533,0x2156a5,train


In [29]:
# Inspect the emotion labels, keyed by tweet_id (joined onto the training rows later).
emotion_df

Unnamed: 0,tweet_id,emotion
0,0x3140b1,sadness
1,0x368b73,disgust
2,0x296183,anticipation
3,0x2bd6e1,joy
4,0x2ee1dd,anticipation
...,...,...
1455558,0x38dba0,joy
1455559,0x300ea2,joy
1455560,0x360b99,fear
1455561,0x22eecf,joy


### 2. Merging and Splitting the Data

In [30]:
# Attach the train/test flag to every tweet; the inner join keeps only
# tweet_ids present in both frames.
merged_df = tweets_df.merge(data_identification_df, on='tweet_id', how='inner')

# Boolean masks selecting each partition by its 'identification' flag.
is_train = merged_df['identification'] == 'train'
is_test = merged_df['identification'] == 'test'

# Labelled partition: join the emotion labels on 'tweet_id', then discard the
# flag column, which is constant within the partition.
training_df = (
    merged_df[is_train]
    .merge(emotion_df, on='tweet_id', how='inner')
    .drop(columns=['identification'])
)

# Unlabelled partition to predict on; it keeps merged_df's original row index.
ans_df = merged_df[is_test].drop(columns=['identification'])


In [31]:
# Keep a reproducible 30% random subset of the labelled rows and renumber them
# from 0. NOTE: this rebinds `training_df`, so re-running the cell samples 30%
# of the already-reduced frame again.
sampled_rows = training_df.sample(frac=0.3, random_state=42)
training_df = sampled_rows.reset_index(drop=True)

### 3. Tokenization and Encoding

In [32]:
import joblib
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder

# Hold out 20% of the sampled rows for validation (fixed seed for reproducibility).
train_df, test_df = train_test_split(training_df, test_size=0.2, random_state=42)

# For each partition: model inputs are every column except 'emotion',
# targets are the 'emotion' column itself.
X_train, y_train = train_df.drop(columns=['emotion']), train_df['emotion']
X_test, y_test = test_df.drop(columns=['emotion']), test_df['emotion']

# Fit the emotion -> integer mapping on the training targets, then reuse that
# same fitted mapping for the held-out targets.
label_encoder = LabelEncoder()
train_labels = label_encoder.fit_transform(y_train)
test_labels = label_encoder.transform(y_test)

# Persist the fitted encoder so predictions can be decoded in a later session.
joblib.dump(label_encoder, 'label_encoder.joblib')
print("LabelEncoder has been saved to 'label_encoder.joblib'.")

LabelEncoder has been saved to 'label_encoder.joblib'.


In [33]:
# Load BERT tokenizer
# 'bert-base-uncased' is the same checkpoint name the classifier is loaded from
# in the training cell, so tokenizer and model refer to the same checkpoint.
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

def tokenize_function(examples, max_length=128):
    """Tokenize a batch of examples with the notebook-level `tokenizer`.

    Intended for `Dataset.map(tokenize_function, batched=True)`.

    Args:
        examples: Mapping with a 'text' entry holding the text(s) to tokenize.
        max_length: Fixed length every sequence is truncated/padded to.
            Defaults to 128, the value previously hard-coded here.

    Returns:
        Whatever `tokenizer(...)` returns for the given texts.
    """
    return tokenizer(
        examples['text'],
        truncation=True,        # cut sequences longer than max_length
        padding='max_length',   # pad shorter ones so every row has equal length
        max_length=max_length,
    )

In [34]:
# Build each Hugging Face Dataset in one pipeline:
#   pandas frame -> Dataset -> tokenized in batches -> integer labels attached
# as a "labels" column.
train_dataset = (
    Dataset.from_pandas(X_train)
    .map(tokenize_function, batched=True)
    .add_column("labels", train_labels)
)
test_dataset = (
    Dataset.from_pandas(X_test)
    .map(tokenize_function, batched=True)
    .add_column("labels", test_labels)
)

Map:   0%|          | 0/349335 [00:00<?, ? examples/s]

Map:   0%|          | 0/87334 [00:00<?, ? examples/s]

### 4. TensorFlow Dataset Conversion

In [35]:
def dataset_to_tf(dataset, with_labels=True, batch_size=32, shuffle=True, shuffle_buffer_size=1000):
    """Convert a tokenized Hugging Face dataset into a batched `tf.data.Dataset`.

    Args:
        dataset: Tokenized dataset exposing 'input_ids' and 'attention_mask'
            columns (optionally 'token_type_ids' and 'labels').
        with_labels: If True and a 'labels' column exists, yield
            `(inputs, labels)` pairs; otherwise yield `inputs` only.
        batch_size: Number of examples per batch.
        shuffle: Whether to shuffle labelled data. Unlabelled data is never
            shuffled, so outputs stay aligned with the input row order.
        shuffle_buffer_size: Number of *examples* held in the shuffle buffer.

    Returns:
        A `tf.data.Dataset` that is batched and prefetched.
    """
    # Prepare inputs
    inputs = {
        'input_ids': dataset['input_ids'],
        'attention_mask': dataset['attention_mask'],
    }

    # Include token_type_ids if available
    if 'token_type_ids' in dataset.column_names:
        inputs['token_type_ids'] = dataset['token_type_ids']

    if with_labels and 'labels' in dataset.column_names:
        tf_dataset = tf.data.Dataset.from_tensor_slices((inputs, dataset['labels']))
        if shuffle:
            # Shuffle BEFORE batching so individual examples are mixed.
            # Shuffling after .batch() only reorders whole batches, leaving
            # every batch with the same members in every epoch.
            tf_dataset = tf_dataset.shuffle(buffer_size=shuffle_buffer_size)
    else:
        tf_dataset = tf.data.Dataset.from_tensor_slices(inputs)

    return tf_dataset.batch(batch_size).prefetch(tf.data.AUTOTUNE)

In [36]:
# Run both labelled splits through the same converter with identical settings.
train_dataset_tf, test_dataset_tf = (
    dataset_to_tf(split, with_labels=True, batch_size=32)
    for split in (train_dataset, test_dataset)
)

### 5. Model Training

In [37]:
# The classifier head needs one output per distinct label id in the training set.
num_labels = len(set(train_dataset['labels']))

# Pre-trained BERT encoder with a freshly initialised classification head.
model = TFBertForSequenceClassification.from_pretrained(
    'bert-base-uncased',
    num_labels=num_labels,
)

# Training setup: the loss takes integer class ids and raw logits
# (from_logits=True), so no softmax is applied before it.
optimizer = tf.keras.optimizers.Adam(learning_rate=2e-5)
loss = tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True)
metrics = ['accuracy']
model.compile(optimizer=optimizer, loss=loss, metrics=metrics)

# Fine-tune for 3 epochs, evaluating on the held-out split after each epoch.
history = model.fit(
    train_dataset_tf,
    validation_data=test_dataset_tf,
    epochs=3,
)

# Report the metrics recorded for the final epoch.
final_train_accuracy = history.history['accuracy'][-1]
final_val_accuracy = history.history['val_accuracy'][-1]
print("Training Accuracy: ", final_train_accuracy)
print("Validation Accuracy: ", final_val_accuracy)

# Write the fine-tuned model to disk so it can be reloaded later.
model.save_pretrained("./emotion_classification_model")


All PyTorch model weights were used when initializing TFBertForSequenceClassification.

Some weights or buffers of the TF 2.0 model TFBertForSequenceClassification were not initialized from the PyTorch model and are newly initialized: ['classifier.weight', 'classifier.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch 1/3
Epoch 2/3
Epoch 3/3
Training Accuracy:  0.7248343229293823
Validation Accuracy:  0.6280372142791748


### 6. Predict and Add Predictions to `ans_df`

### 6.2 Convert ans_df

In [41]:
# Wrap the unlabelled frame as a Hugging Face Dataset and tokenize it in
# batches with the same tokenize_function used for the training data.
ans_dataset = Dataset.from_pandas(ans_df).map(tokenize_function, batched=True)



Map:   0%|          | 0/411972 [00:00<?, ? examples/s]

In [42]:
# Convert ans_dataset to TensorFlow dataset without labels
# with_labels=False takes dataset_to_tf's branch that never shuffles, so the
# prediction order follows ans_dataset's row order. That matters because the
# predictions are later assigned to ans_df positionally.
ans_dataset_tf = dataset_to_tf(ans_dataset, with_labels=False)

### 6.3 Predict

In [43]:
# Forward pass over every unlabelled tweet; the result exposes the raw
# per-class scores as `.logits`.
predictions = model.predict(ans_dataset_tf)
logits = predictions.logits

# Pick the highest-scoring class index per row and decode it back to the
# original emotion name with the fitted encoder.
predicted_labels = np.argmax(logits, axis=-1)
predicted_emotions = label_encoder.inverse_transform(predicted_labels)

# Store the decoded predictions as a new 'emotion' column. Assignment is
# positional, so this relies on the prediction order matching ans_df's rows.
ans_df['emotion'] = predicted_emotions

print(ans_df.head())

    tweet_id                                               text  \
2   0x28b412  Confident of your obedience, I write to you, k...   
4   0x2de201  "Trust is not the same as faith. A friend is s...   
9   0x218443  When do you have enough ? When are you satisfi...   
30  0x2939d5  God woke you up, now chase the day #GodsPlan #...   
33  0x26289a  In these tough times, who do YOU turn to as yo...   

                             hashtags       emotion  
2                        [bibleverse]  anticipation  
4                                  []  anticipation  
9   [materialism, money, possessions]           joy  
30               [GodsPlan, GodsWork]           joy  
33                                 []         trust  


### 7. Save in CSV File

In [44]:
# Keep only the tweet id and its predicted emotion, exposing the id under the
# column name 'id'.
ans_df_subset = ans_df[['tweet_id', 'emotion']].rename(columns={'tweet_id': 'id'})

# Write the result without the DataFrame index column.
ans_df_subset.to_csv('predictions.csv', index=False)

In [45]:
# Sanity check: (rows, columns) of the frame that was written to predictions.csv.
ans_df_subset.shape

(411972, 2)

### 6.1 Reloading a Model That Has Already Been Trained

In [None]:
`!gdown '1ZO-Hs5DD0QeBYhKk8Ff7AQMPspCKSZit' # tf_model.h5
!gdown '1nTCZv2g3bOugbD6wWxLxeJtPRZ9BSQwk'  # config.json

In [None]:
import pickle

import joblib

# The encoder was persisted with `joblib.dump(label_encoder, 'label_encoder.joblib')`,
# so reload it with joblib from that same path. The previous code read
# 'label_encoder.pkl' via pickle, a file this notebook never writes.
# NOTE: joblib/pickle files can execute arbitrary code on load; only load
# files you created yourself.
label_encoder = joblib.load('label_encoder.joblib')
print("LabelEncoder has been loaded from 'label_encoder.joblib'.")

In [None]:
# Load the configuration saved alongside the trained weights.
config = BertConfig.from_json_file('config.json')

# Build the model with that configuration. The weights loaded here are
# replaced by the fine-tuned ones in the next step.
model = TFBertForSequenceClassification.from_pretrained('bert-base-uncased', config=config)

# Load the weights from the .h5 file (TensorFlow model)
model.load_weights('tf_model.h5')  # Ensure the path is correct

# Verify the model architecture. summary() prints the table itself and returns
# None, so wrapping it in print() would emit an extra "None" line.
model.summary()
