In [19]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from transformers import BertTokenizerFast, BertForSequenceClassification, Trainer, TrainingArguments
from datasets import Dataset
import torch

In [20]:
data = pd.read_csv("/kaggle/input/twitter-sentiment-analysis/twitter_cleaned_data.csv")  # Replace with your actual data path



In [21]:
data.dropna(inplace=True)

In [22]:
data = data.rename(columns={"category": "label"})
data['label'] = data['label'].astype(int)



In [23]:
data['label'] = data['label'].replace({-1: 0, 0: 1, 1: 2})

print("Unique labels in dataset:", data['label'].unique())



Unique labels in dataset: [0 1 2]


In [24]:
data = data.sample(n=30000, random_state=42)

In [25]:
train_df, test_df = train_test_split(data, test_size=0.2, random_state=42, stratify=data['label'])

tokenizer = BertTokenizerFast.from_pretrained('bert-base-uncased')

def tokenize_function(examples):
    return tokenizer(examples['cleaned_data'], padding="max_length", truncation=True)





In [26]:
train_dataset = Dataset.from_pandas(train_df)
test_dataset = Dataset.from_pandas(test_df)

train_dataset = train_dataset.map(tokenize_function, batched=True)
test_dataset = test_dataset.map(tokenize_function, batched=True)

train_dataset.set_format(type='torch', columns=['input_ids', 'attention_mask', 'label'])
test_dataset.set_format(type='torch', columns=['input_ids', 'attention_mask', 'label'])

print("Training dataset format:", train_dataset)
print("Testing dataset format:", test_dataset)



Map:   0%|          | 0/24000 [00:00<?, ? examples/s]

Map:   0%|          | 0/6000 [00:00<?, ? examples/s]

Training dataset format: Dataset({
    features: ['label', 'cleaned_data', '__index_level_0__', 'input_ids', 'token_type_ids', 'attention_mask'],
    num_rows: 24000
})
Testing dataset format: Dataset({
    features: ['label', 'cleaned_data', '__index_level_0__', 'input_ids', 'token_type_ids', 'attention_mask'],
    num_rows: 6000
})


In [27]:
num_labels = len(train_df['label'].unique())

model = BertForSequenceClassification.from_pretrained('bert-base-uncased', num_labels=num_labels)

training_args = TrainingArguments(
    output_dir='./results',          # output directory
    num_train_epochs=3,              # total number of training epochs
    per_device_train_batch_size=16,  # batch size per device during training
    per_device_eval_batch_size=64,   # batch size for evaluation
    warmup_steps=500,                 # number of warmup steps for learning rate scheduler
    weight_decay=0.01,                # strength of weight decay
    logging_dir='./logs',            # directory for storing logs
    logging_steps=10,
    evaluation_strategy="epoch",     # evaluation strategy to adopt during training
    run_name='sentiment_analysis_run'  # Specify a unique run name
)

# import os
# os.environ["CUDA_LAUNCH_BLOCKING"] = "1"



Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [28]:
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=test_dataset,
    compute_metrics=lambda p: {"accuracy": accuracy_score(p.label_ids, p.predictions.argmax(-1))}
)

trainer.train()



  with torch.cuda.device(device), torch.cuda.stream(stream), autocast(enabled=autocast_enabled):


Epoch,Training Loss,Validation Loss,Accuracy
1,0.568,0.482739,0.832333
2,0.4577,0.40169,0.862667
3,0.2612,0.432241,0.861


  with torch.cuda.device(device), torch.cuda.stream(stream), autocast(enabled=autocast_enabled):
  with torch.cuda.device(device), torch.cuda.stream(stream), autocast(enabled=autocast_enabled):
  with torch.cuda.device(device), torch.cuda.stream(stream), autocast(enabled=autocast_enabled):
  with torch.cuda.device(device), torch.cuda.stream(stream), autocast(enabled=autocast_enabled):
  with torch.cuda.device(device), torch.cuda.stream(stream), autocast(enabled=autocast_enabled):
  with torch.cuda.device(device), torch.cuda.stream(stream), autocast(enabled=autocast_enabled):
  with torch.cuda.device(device), torch.cuda.stream(stream), autocast(enabled=autocast_enabled):


TrainOutput(global_step=2250, training_loss=0.4616586784786648, metrics={'train_runtime': 3584.719, 'train_samples_per_second': 20.085, 'train_steps_per_second': 0.628, 'total_flos': 1.8944166076416e+16, 'train_loss': 0.4616586784786648, 'epoch': 3.0})

In [29]:
trainer.evaluate()

  with torch.cuda.device(device), torch.cuda.stream(stream), autocast(enabled=autocast_enabled):


{'eval_loss': 0.4322410523891449,
 'eval_accuracy': 0.861,
 'eval_runtime': 87.5455,
 'eval_samples_per_second': 68.536,
 'eval_steps_per_second': 0.537,
 'epoch': 3.0}

In [30]:
import pickle

with open('sentiment_model.pkl', 'wb') as f:
    pickle.dump(model, f)


In [31]:
trainer.save_model("./checkpoint")  # Save the checkpoint in a directory


In [32]:
trainer = Trainer(
    model=model.from_pretrained("./checkpoint"),  # Load from the saved checkpoint
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=test_dataset,
    compute_metrics=lambda p: {"accuracy": accuracy_score(p.label_ids, p.predictions.argmax(-1))}
)