# Declaring Model to be used

In [1]:
from transformers import AutoModelForSequenceClassification,AutoTokenizer
from transformers import pipeline

model_id = "cardiffnlp/twitter-roberta-base-sentiment-latest"
model = AutoModelForSequenceClassification.from_pretrained(
    model_id,
    num_labels=3, # number of moods i have or possible outcomes
)

Some weights of the model checkpoint at cardiffnlp/twitter-roberta-base-sentiment-latest were not used when initializing RobertaForSequenceClassification: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
- This IS expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


# Importing Datasets 

In [2]:
from datasets import load_dataset
data_files={
    "train":"sentiment_analysis_train.jsonl",
    "validation":"sentiment_analysis_validation.jsonl",
    "test":"sentiment_analysis_test.jsonl"
}


dataset = load_dataset("json", data_files=data_files)


Generating train split: 0 examples [00:00, ? examples/s]

Generating validation split: 0 examples [00:00, ? examples/s]

Generating test split: 0 examples [00:00, ? examples/s]

In [3]:
dataset["test"] = dataset["test"].filter(
    lambda example: isinstance(example["text"], str) and example["text"].strip() != ""
)


Filter:   0%|          | 0/5206 [00:00<?, ? examples/s]

# Tokenizes the Dataset to be fineTuned on by Model

In [None]:
tokenizer = AutoTokenizer.from_pretrained(model_id)

def tokenize_function(examples):
    # Initialize a list to store tokenized results
    tokenized_results = []
    
    for text in examples['text']:
        try:
            
            tokenized_results.append(tokenizer(text, padding='max_length', truncation=True,max_length=32))
        except Exception as e:
            # Print the problematic text and continue with the next one
            print(f"Error tokenizing: {text} - Error: {e}")
            tokenized_results.append(None)  # Append None for problematic cases

    # Return the tokenized results
    return {"input_ids": [result['input_ids'] if result is not None else [] for result in tokenized_results],
            "attention_mask": [result['attention_mask'] if result is not None else [] for result in tokenized_results]}


tokenized_datasets = dataset.map(tokenize_function, batched=True)

# Filter out any examples that were returned as empty dictionaries 
tokenized_datasets = tokenized_datasets.filter(lambda example: example != {})

tokenized_datasets = tokenized_datasets.rename_columns({"label": "labels"})  # Ensure the label column is named "labels"


Map:   0%|          | 0/5205 [00:00<?, ? examples/s]

Filter:   0%|          | 0/5205 [00:00<?, ? examples/s]

# Setting up Trainer model with Respective Arguments

- change num_train_epochs to get better tuned model but also more storage required , higher epochs =  better
- train_batch and eval_batch needs to be changed at the same proportionality
- Incase storage issue with results as it stores the checkpoints, do the following
    - save_total_limit=1,         # keep only the last checkpoint 
    - save_strategy="epoch",      # save only at the end of each epoch
    - save_strategy="no",         # not recommended unless you are debugging
    - You can also just delete the checkpoints manually after model creation 


In [None]:
from transformers import Trainer, TrainingArguments

# Set training arguments
training_args = TrainingArguments(
    output_dir="./results",          # output directory for model checkpoints
    # evaluation_strategy="epoch",     # evaluation strategy to use
    learning_rate=2e-5,              # learning rate
    per_device_train_batch_size=4,   # batch size per device during training
    per_device_eval_batch_size=8,   # batch size for evaluation
    num_train_epochs=1,              # number of training epochs
    weight_decay=0.01,               # strength of weight decay
    logging_dir="./logs",            
    
)

# Define the Trainer
trainer = Trainer(
    model=model,                         
    args=training_args,                  
    train_dataset=tokenized_datasets["train"],  
    eval_dataset=tokenized_datasets["validation"],  
)


In [39]:
trainer.train()

Step,Training Loss
500,0.3764
1000,0.3346
1500,0.3327
2000,0.7315
2500,0.6331
3000,0.6377
3500,0.6611
4000,0.6021
4500,0.5744
5000,0.5792


TrainOutput(global_step=7808, training_loss=0.5596007225943394, metrics={'train_runtime': 766.1117, 'train_samples_per_second': 40.767, 'train_steps_per_second': 10.192, 'total_flos': 513597391405056.0, 'train_loss': 0.5596007225943394, 'epoch': 1.0})

In [6]:
trainer.evaluate(tokenized_datasets["validation"])

{'eval_loss': 0.7458823323249817,
 'eval_model_preparation_time': 0.0029,
 'eval_runtime': 10.6627,
 'eval_samples_per_second': 488.149,
 'eval_steps_per_second': 61.054}

In [7]:
trainer.evaluate(tokenized_datasets["test"])

{'eval_loss': 0.7410308718681335,
 'eval_model_preparation_time': 0.0029,
 'eval_runtime': 10.6851,
 'eval_samples_per_second': 487.129,
 'eval_steps_per_second': 60.926}

# Saving Model for later use via pipelines

In [48]:
trainer.save_model("my_sentiment_model")
tokenizer.save_pretrained("my_sentiment_model")


('my_sentiment_model/tokenizer_config.json',
 'my_sentiment_model/special_tokens_map.json',
 'my_sentiment_model/vocab.json',
 'my_sentiment_model/merges.txt',
 'my_sentiment_model/added_tokens.json',
 'my_sentiment_model/tokenizer.json')

# Using created Model via Pipeline

In [8]:
from transformers import pipeline

pipe = pipeline("text-classification", model="my_sentiment_model", tokenizer=tokenizer)
prompt=input("Enter text to check sentiment of :")
print(prompt)
print(pipe(prompt))


Device set to use cuda:0


i wonder how my test will go
[{'label': 'neutral', 'score': 0.9947810769081116}]
