## Importing Modules

In [1]:
import os
import pathlib
import numpy as np
import pandas as pd
import nltk

import torch
from torch.optim import AdamW
from torch.utils.data import Dataset, DataLoader, random_split, RandomSampler, SequentialSampler

from transformers import (
    AutoTokenizer,
    AutoModelForCausalLM,
    DataCollatorForLanguageModeling,
    Trainer,
    TrainingArguments,
)
from transformers import GPT2LMHeadModel, GPT2Tokenizer, pipeline, GPT2Config, TextDataset
from tqdm.auto import tqdm
import random
import datetime
import time
import statistics
from nltk.translate.bleu_score import sentence_bleu
from transformers import TrainingArguments, Trainer, set_seed, EvalPrediction, DataCollatorWithPadding
from datasets import load_dataset
from datasets import Dataset
from transformers import AutoModel, AutoTokenizer, TFAutoModel, AutoModelForSequenceClassification
from peft import LoraConfig, PeftModelForSequenceClassification, TaskType, AutoPeftModelForSequenceClassification
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, precision_recall_fscore_support

if torch.cuda.is_available():
    print("GPU is available!")
else:
    print("GPU is not available.")

  from .autonotebook import tqdm as notebook_tqdm



GPU is available!


In [2]:
MAIN_PATH = str(pathlib.Path().resolve())
DATASET_PATH = MAIN_PATH + '\\datasets'
MODEL_PATH = MAIN_PATH + '\\models'

In [3]:
models = os.listdir(MODEL_PATH)
models

['bert-base-cased',
 'bert-base-multilingual-cased',
 'bert-base-uncased',
 'bert-large-cased',
 'bert-large-uncased',
 'flan-t5-base',
 'flan-t5-large',
 'flan-t5-small',
 'gpt2',
 'gpt2-large',
 'gpt2-medium']

In [4]:
model_path = MODEL_PATH + '\\' + models[8]
model_path

'D:\\Python\\LLM_Environment\\models\\gpt2'

In [5]:
torch.cuda.empty_cache()

In [6]:
tokenizer = AutoTokenizer.from_pretrained(model_path)

## Import Dataset

In [7]:
filenames = os.listdir(DATASET_PATH)
filenames

['cached_lm_GPT2Tokenizer_128_Shakespeare_Dataset.txt',
 'Customer.csv',
 'Html.csv',
 'Recipes.csv',
 'Recipes_1000.csv',
 'Shakespeare_Dataset.txt',
 'Taylor_Swift_Lyrics.csv',
 'Twitter.csv']

In [8]:
file_path = DATASET_PATH + '\\' + filenames[7]
file_path

'D:\\Python\\LLM_Environment\\datasets\\Twitter.csv'

In [9]:
df = pd.read_csv(file_path)
df.head()

Unnamed: 0.1,Unnamed: 0,tweet,emoji
0,0,bet you'll get hungry,heart_eyes
1,1,starbucks employee confuses boyfriend by sayin...,yum
2,2,when your starbucks store makes you an iced mo...,sob
3,3,"being told ""girl your romper looks fierce!"" at...",blush
4,4,"i got a starbucks drink at school today, shit ...",sob


In [10]:
# Encode the emoji labels into numerical format
unique_emojis = df['emoji'].unique()
emoji2id = {emoji: id for id, emoji in enumerate(unique_emojis)}
id2emoji = {id: emoji for emoji, id in emoji2id.items()}

# Add a new column for the encoded labels
df['label'] = df['emoji'].map(emoji2id)

# Split the dataset into training and validation sets
train_df, val_df = train_test_split(df, test_size=0.1, stratify=df['label'])

# Convert the dataframes into Hugging Face datasets
train_dataset = Dataset.from_pandas(train_df)
val_dataset = Dataset.from_pandas(val_df)

In [11]:
tokenizer.pad_token = tokenizer.eos_token

# Tokenize and convert
def tokenize_and_encode(examples):
    tokenized_inputs = tokenizer(examples['tweet'], padding="max_length", truncation=True, max_length=512)
    tokenized_inputs['labels'] = examples['label']
    return tokenized_inputs

train_dataset = train_dataset.map(tokenize_and_encode, batched=True)
val_dataset = val_dataset.map(tokenize_and_encode, batched=True)

Map: 100%|██████████| 202797/202797 [00:33<00:00, 6119.71 examples/s]
Map: 100%|██████████| 22534/22534 [00:03<00:00, 5775.59 examples/s]


In [12]:
train_dataset

Dataset({
    features: ['Unnamed: 0', 'tweet', 'emoji', 'label', '__index_level_0__', 'input_ids', 'attention_mask', 'labels'],
    num_rows: 202797
})

In [13]:
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

In [14]:
# Compute metrics function
def compute_metrics(p: EvalPrediction):
    preds = np.argmax(p.predictions, axis=1)
    precision, recall, f1, _ = precision_recall_fscore_support(p.label_ids, preds, average='weighted')
    return {"accuracy": accuracy_score(p.label_ids, preds), "f1": f1, "precision": precision, "recall": recall}

In [15]:
model = AutoModelForSequenceClassification.from_pretrained(model_path, num_labels=len(unique_emojis))
model.config.pad_token_id = tokenizer.pad_token_id

Some weights of GPT2ForSequenceClassification were not initialized from the model checkpoint at D:\Python\LLM_Environment\models\gpt2 and are newly initialized: ['score.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [16]:
save_path = './model'
# Define the training arguments
training_args = TrainingArguments(
    output_dir=save_path,
    evaluation_strategy="epoch",
    learning_rate=2e-5,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    num_train_epochs=1,
    weight_decay=0.01,
    logging_dir='./logs',
    save_strategy="epoch",
    load_best_model_at_end=True,
    logging_steps=100,
    warmup_ratio=0.1,
)



In [17]:
# Initialize the Trainer with compute_metrics
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset.select(range(10000)),
    eval_dataset=val_dataset.select(range(1000)),
    compute_metrics=compute_metrics,
    tokenizer=tokenizer,
    data_collator=data_collator,
)

In [18]:
trainer.train()

  attn_output = torch.nn.functional.scaled_dot_product_attention(
  8%|▊         | 100/1250 [00:44<08:24,  2.28it/s]

{'loss': 6.7605, 'grad_norm': 34.516395568847656, 'learning_rate': 1.6000000000000003e-05, 'epoch': 0.08}


 16%|█▌        | 200/1250 [01:28<07:49,  2.24it/s]

{'loss': 2.3396, 'grad_norm': 11.90102767944336, 'learning_rate': 1.866666666666667e-05, 'epoch': 0.16}


 24%|██▍       | 300/1250 [02:13<07:08,  2.22it/s]

{'loss': 2.2224, 'grad_norm': 8.523725509643555, 'learning_rate': 1.688888888888889e-05, 'epoch': 0.24}


 32%|███▏      | 400/1250 [02:59<06:32,  2.17it/s]

{'loss': 2.1284, 'grad_norm': 7.627568244934082, 'learning_rate': 1.5111111111111112e-05, 'epoch': 0.32}


 40%|████      | 500/1250 [03:46<05:56,  2.11it/s]

{'loss': 2.1081, 'grad_norm': 10.487789154052734, 'learning_rate': 1.3333333333333333e-05, 'epoch': 0.4}


 48%|████▊     | 600/1250 [04:34<05:09,  2.10it/s]

{'loss': 2.128, 'grad_norm': 8.383912086486816, 'learning_rate': 1.1555555555555556e-05, 'epoch': 0.48}


 56%|█████▌    | 700/1250 [05:23<04:30,  2.03it/s]

{'loss': 2.0693, 'grad_norm': 7.677370548248291, 'learning_rate': 9.777777777777779e-06, 'epoch': 0.56}


 64%|██████▍   | 800/1250 [06:12<03:38,  2.06it/s]

{'loss': 2.0753, 'grad_norm': 8.724266052246094, 'learning_rate': 8.000000000000001e-06, 'epoch': 0.64}


 72%|███████▏  | 900/1250 [07:01<02:51,  2.04it/s]

{'loss': 2.0191, 'grad_norm': 10.59865951538086, 'learning_rate': 6.222222222222223e-06, 'epoch': 0.72}


 80%|████████  | 1000/1250 [07:50<02:02,  2.04it/s]

{'loss': 1.9927, 'grad_norm': 10.749558448791504, 'learning_rate': 4.444444444444444e-06, 'epoch': 0.8}


 88%|████████▊ | 1100/1250 [08:39<01:14,  2.03it/s]

{'loss': 1.9915, 'grad_norm': 12.196487426757812, 'learning_rate': 2.666666666666667e-06, 'epoch': 0.88}


 96%|█████████▌| 1200/1250 [09:28<00:24,  2.02it/s]

{'loss': 2.0317, 'grad_norm': 8.614356994628906, 'learning_rate': 8.88888888888889e-07, 'epoch': 0.96}


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))

100%|██████████| 1250/1250 [10:14<00:00,  2.00it/s]

{'eval_loss': 2.004523277282715, 'eval_accuracy': 0.301, 'eval_f1': 0.2562244446522303, 'eval_precision': 0.34590116264367254, 'eval_recall': 0.301, 'eval_runtime': 19.6195, 'eval_samples_per_second': 50.97, 'eval_steps_per_second': 6.371, 'epoch': 1.0}


100%|██████████| 1250/1250 [10:16<00:00,  2.03it/s]

{'train_runtime': 617.0898, 'train_samples_per_second': 16.205, 'train_steps_per_second': 2.026, 'train_loss': 2.468174346923828, 'epoch': 1.0}





TrainOutput(global_step=1250, training_loss=2.468174346923828, metrics={'train_runtime': 617.0898, 'train_samples_per_second': 16.205, 'train_steps_per_second': 2.026, 'total_flos': 2613156249600000.0, 'train_loss': 2.468174346923828, 'epoch': 1.0})

In [19]:
model

GPT2ForSequenceClassification(
  (transformer): GPT2Model(
    (wte): Embedding(50257, 768)
    (wpe): Embedding(1024, 768)
    (drop): Dropout(p=0.1, inplace=False)
    (h): ModuleList(
      (0-11): 12 x GPT2Block(
        (ln_1): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
        (attn): GPT2SdpaAttention(
          (c_attn): Conv1D(nf=2304, nx=768)
          (c_proj): Conv1D(nf=768, nx=768)
          (attn_dropout): Dropout(p=0.1, inplace=False)
          (resid_dropout): Dropout(p=0.1, inplace=False)
        )
        (ln_2): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
        (mlp): GPT2MLP(
          (c_fc): Conv1D(nf=3072, nx=768)
          (c_proj): Conv1D(nf=768, nx=3072)
          (act): NewGELUActivation()
          (dropout): Dropout(p=0.1, inplace=False)
        )
      )
    )
    (ln_f): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
  )
  (score): Linear(in_features=768, out_features=10, bias=False)
)

In [20]:
# PEFT model configuration
peft_config = LoraConfig(
    task_type=TaskType.SEQ_CLS,
    inference_mode=False,
    r=4,
    lora_alpha=16,
    lora_dropout=0.1
)

In [21]:
model = AutoModelForSequenceClassification.from_pretrained(model_path, num_labels=len(unique_emojis))
model.config.pad_token_id = tokenizer.pad_token_id

Some weights of GPT2ForSequenceClassification were not initialized from the model checkpoint at D:\Python\LLM_Environment\models\gpt2 and are newly initialized: ['score.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [22]:
peft_model = PeftModelForSequenceClassification(model, peft_config)

# Print
peft_model.print_trainable_parameters()

trainable params: 155,136 || all params: 124,602,624 || trainable%: 0.1245




In [23]:
# Compute metrics function
def compute_metrics(p: EvalPrediction):
    preds = np.argmax(p.predictions, axis=1)
    precision, recall, f1, _ = precision_recall_fscore_support(p.label_ids, preds, average='weighted')
    return {"accuracy": accuracy_score(p.label_ids, preds), "f1": f1, "precision": precision, "recall": recall}

In [24]:
peft_path = './peft_model'
# Define the training arguments
training_args = TrainingArguments(
    output_dir=peft_path,
    evaluation_strategy="epoch",
    learning_rate=2e-5,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    num_train_epochs=1,
    weight_decay=0.01,
    logging_dir='./logs/peft_model',
    save_strategy="epoch",
    load_best_model_at_end=True,
    logging_steps=100,
    warmup_ratio=0.1,
)



In [25]:
# Initialize the Trainer with compute_metrics
trainer = Trainer(
    model=peft_model,
    args=training_args,
    train_dataset=train_dataset.select(range(10000)),
    eval_dataset=val_dataset.select(range(1000)),
    compute_metrics=compute_metrics,
    tokenizer=tokenizer,
    data_collator=data_collator,
)


In [26]:
# Start training
trainer.train()

  8%|▊         | 100/1250 [00:38<07:30,  2.55it/s]

{'loss': 9.7796, 'grad_norm': 197.31263732910156, 'learning_rate': 1.6000000000000003e-05, 'epoch': 0.08}


 16%|█▌        | 200/1250 [01:17<06:50,  2.56it/s]

{'loss': 8.3495, 'grad_norm': 146.02621459960938, 'learning_rate': 1.866666666666667e-05, 'epoch': 0.16}


 24%|██▍       | 300/1250 [01:56<06:15,  2.53it/s]

{'loss': 6.3327, 'grad_norm': 138.65345764160156, 'learning_rate': 1.688888888888889e-05, 'epoch': 0.24}


 32%|███▏      | 400/1250 [02:35<05:27,  2.59it/s]

{'loss': 3.8677, 'grad_norm': 37.408973693847656, 'learning_rate': 1.5111111111111112e-05, 'epoch': 0.32}


 40%|████      | 500/1250 [03:14<04:52,  2.56it/s]

{'loss': 2.9151, 'grad_norm': 70.88046264648438, 'learning_rate': 1.3333333333333333e-05, 'epoch': 0.4}


 48%|████▊     | 600/1250 [03:54<04:18,  2.52it/s]

{'loss': 2.6592, 'grad_norm': 23.853919982910156, 'learning_rate': 1.1555555555555556e-05, 'epoch': 0.48}


 56%|█████▌    | 700/1250 [04:33<03:35,  2.55it/s]

{'loss': 2.6112, 'grad_norm': 25.212936401367188, 'learning_rate': 9.777777777777779e-06, 'epoch': 0.56}


 64%|██████▍   | 800/1250 [05:12<02:56,  2.56it/s]

{'loss': 2.5161, 'grad_norm': 19.571550369262695, 'learning_rate': 8.000000000000001e-06, 'epoch': 0.64}


 72%|███████▏  | 900/1250 [05:52<02:17,  2.55it/s]

{'loss': 2.5713, 'grad_norm': 27.05467987060547, 'learning_rate': 6.222222222222223e-06, 'epoch': 0.72}


 80%|████████  | 1000/1250 [06:31<01:38,  2.55it/s]

{'loss': 2.479, 'grad_norm': 37.3812370300293, 'learning_rate': 4.444444444444444e-06, 'epoch': 0.8}


 88%|████████▊ | 1100/1250 [07:10<00:59,  2.51it/s]

{'loss': 2.4262, 'grad_norm': 20.682153701782227, 'learning_rate': 2.666666666666667e-06, 'epoch': 0.88}


 96%|█████████▌| 1200/1250 [07:50<00:19,  2.50it/s]

{'loss': 2.4537, 'grad_norm': 19.646015167236328, 'learning_rate': 8.88888888888889e-07, 'epoch': 0.96}


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
                                                   
100%|██████████| 1250/1250 [08:31<00:00,  2.44it/s]

{'eval_loss': 2.6149892807006836, 'eval_accuracy': 0.15, 'eval_f1': 0.1148174003077631, 'eval_precision': 0.1205855721157312, 'eval_recall': 0.15, 'eval_runtime': 21.0982, 'eval_samples_per_second': 47.397, 'eval_steps_per_second': 5.925, 'epoch': 1.0}
{'train_runtime': 511.5661, 'train_samples_per_second': 19.548, 'train_steps_per_second': 2.443, 'train_loss': 4.01282431640625, 'epoch': 1.0}





TrainOutput(global_step=1250, training_loss=4.01282431640625, metrics={'train_runtime': 511.5661, 'train_samples_per_second': 19.548, 'train_steps_per_second': 2.443, 'total_flos': 2617922027520000.0, 'train_loss': 4.01282431640625, 'epoch': 1.0})

In [27]:
# Evaluate
evaluation_results = trainer.evaluate()
print("Evaluation Results:", evaluation_results)

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
100%|██████████| 125/125 [00:20<00:00,  6.03it/s]

Evaluation Results: {'eval_loss': 2.6149892807006836, 'eval_accuracy': 0.15, 'eval_f1': 0.1148174003077631, 'eval_precision': 0.1205855721157312, 'eval_recall': 0.15, 'eval_runtime': 20.8651, 'eval_samples_per_second': 47.927, 'eval_steps_per_second': 5.991, 'epoch': 1.0}





In [28]:
#peft_model.save_pretrained(peft_path)

In [30]:
inference_model = AutoPeftModelForSequenceClassification.from_pretrained(
    peft_path + '/checkpoint-1250',
    num_labels=len(unique_emojis)
)
inference_model.config.pad_token_id = inference_model.config.eos_token_id

Some weights of GPT2ForSequenceClassification were not initialized from the model checkpoint at D:\Python\LLM_Environment\models\gpt2 and are newly initialized: ['score.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [31]:
trainer = Trainer(
    model=inference_model,
    args=training_args,
    eval_dataset=val_dataset,
    compute_metrics=compute_metrics,
    tokenizer=tokenizer,
    data_collator=data_collator,
)

In [32]:
# Evaluate the model
evaluation_results = trainer.evaluate()
print("Evaluation Results:", evaluation_results)

100%|██████████| 2817/2817 [07:01<00:00,  6.68it/s]

Evaluation Results: {'eval_loss': 2.542323589324951, 'eval_model_preparation_time': 0.002, 'eval_accuracy': 0.1676133842194018, 'eval_f1': 0.12954224179975385, 'eval_precision': 0.1325410241609454, 'eval_recall': 0.1676133842194018, 'eval_runtime': 421.8097, 'eval_samples_per_second': 53.422, 'eval_steps_per_second': 6.678}





In [33]:
def predict(sentence: str) -> str:
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    inference_model.to(device)

    # Prepare the input text
    inputs = tokenizer(sentence, return_tensors="pt").to(device)

    # Get predictions
    with torch.no_grad():
        outputs = inference_model(**inputs)
        logits = outputs.logits

    probabilities = torch.nn.functional.softmax(logits, dim=1)
    predicted_class_id = probabilities.argmax().item()
    predicted_label = id2emoji[predicted_class_id]

    return predicted_label

In [34]:
# Example usage
sentence = "I'm sad and i wanna cry"
predicted_label = predict(sentence)
print(f"Sentence: '{sentence}'\nPredicted label: {predicted_label}")

Sentence: 'I'm sad and i wanna cry'
Predicted label: sob


In [35]:
sentence = "That was delicious!!"
predicted_label = predict(sentence)
print(f"Sentence: '{sentence}'\nPredicted label: {predicted_label}")

Sentence: 'That was delicious!!'
Predicted label: flushed


In [36]:
sentence = "it was yummy"
predicted_label = predict(sentence)
print(f"Sentence: '{sentence}'\nPredicted label: {predicted_label}")

Sentence: 'it was yummy'
Predicted label: sob


In [37]:
sentence = "I love you!"
predicted_label = predict(sentence)
print(f"Sentence: '{sentence}'\nPredicted label: {predicted_label}")

Sentence: 'I love you!'
Predicted label: yum


In [38]:
indices_for_review = [0, 1, 2, 3, 4]

for idx in indices_for_review:
    item = val_dataset[idx]


    print(item['tweet'][:100])
    actual_label_id = item['label']
    actual_label = id2emoji[actual_label_id]
    print(f'label:  {actual_label}')

    # Tokenize the text
    inputs = tokenizer(item['tweet'], return_tensors="pt").to(inference_model.device)

    with torch.no_grad():
        logits = inference_model(**inputs).logits

    predictions = torch.argmax(logits, dim=1).item()
    predicted_label = id2emoji[predictions]
    print(f'prediction: {predicted_label}\n')

i bet you know the feeling. this skirt just makes me happy! also, it's a pretty cool th
label:  relaxed
prediction: smirk

way ahead of you
label:  heart_eyes
prediction: yum

ight bet pooh
label:  yum
prediction: heart_eyes

working 30 hours next week is a bet
label:  heart_eyes
prediction: smirk

i thought i'd try this. bet i won't get more than... hmm.. 30?.. let's see
label:  wink
prediction: grin

