In [14]:
%pip install optuna datasets scikit-learn matplotlib pandas torch transformers imbalanced-learn

Note: you may need to restart the kernel to use updated packages.


You should consider upgrading via the 'c:\Users\ekagr\AppData\Local\Programs\Python\Python310\python.exe -m pip install --upgrade pip' command.


In [29]:
import optuna
import pandas as pd
import torch
from transformers import DistilBertTokenizer, DistilBertForSequenceClassification, Trainer, TrainingArguments
from datasets import Dataset
from sklearn.model_selection import train_test_split, StratifiedKFold
from sklearn.metrics import accuracy_score, precision_recall_fscore_support, f1_score, precision_score, recall_score
from imblearn.over_sampling import RandomOverSampler
import re
import numpy as np
import random

In [16]:
# Load the dataset
dataset = pd.read_csv('twitter_training.csv')

# Rename columns to match required format
dataset.columns = ['ID', 'Feature', 'Sentiment', 'Text']
dataset = dataset[['Text', 'Sentiment']]

# Drop rows with missing values
dataset = dataset.dropna(subset=['Text', 'Sentiment'])

# Map labels to integers
label_mapping = {'Positive': 2, 'Neutral': 1, 'Negative': 0, 'Irrelevant': 3}
dataset['Sentiment'] = dataset['Sentiment'].map(label_mapping)

# Clean text data
def clean_text(text):
    text = re.sub(r"http\S+", "", text)  # Remove URLs
    text = re.sub(r"@\w+", "", text)    # Remove mentions
    text = re.sub(r"#\w+", "", text)    # Remove hashtags
    text = re.sub(r"\d+", "", text)     # Remove numbers
    text = re.sub(r"[^\w\s]", "", text) # Remove punctuation
    text = text.lower().strip()         # Convert to lowercase and strip whitespace
    return text

dataset['Text'] = dataset['Text'].apply(clean_text)

In [17]:
dataset.head()


Unnamed: 0,Text,Sentiment
0,i am coming to the borders and i will kill you...,2
1,im getting on borderlands and i will kill you all,2
2,im coming on borderlands and i will murder you...,2
3,im getting on borderlands and i will murder y...,2
4,im getting into borderlands and i can murder y...,2


In [18]:
# 2 - Positive
# 1 - Neutral
# 0 - Negative
#-1 - Irrelevant
print(dataset['Sentiment'].value_counts())


Sentiment
0    22358
2    20654
1    18108
3    12875
Name: count, dtype: int64


In [19]:
tokenizer = DistilBertTokenizer.from_pretrained('distilbert-base-uncased')
tokenized_data = tokenizer(list(dataset['Text']), padding=True, truncation=True, max_length=128, return_tensors="pt")

# Convert labels to tensor
labels = torch.tensor(dataset['Sentiment'].tolist())

loading file vocab.txt from cache at C:\Users\ekagr\.cache\huggingface\hub\models--distilbert-base-uncased\snapshots\12040accade4e8a0f71eabdb258fecc2e7e948be\vocab.txt
loading file added_tokens.json from cache at None
loading file special_tokens_map.json from cache at None
loading file tokenizer_config.json from cache at C:\Users\ekagr\.cache\huggingface\hub\models--distilbert-base-uncased\snapshots\12040accade4e8a0f71eabdb258fecc2e7e948be\tokenizer_config.json
loading file tokenizer.json from cache at C:\Users\ekagr\.cache\huggingface\hub\models--distilbert-base-uncased\snapshots\12040accade4e8a0f71eabdb258fecc2e7e948be\tokenizer.json
loading file chat_template.jinja from cache at None
loading configuration file config.json from cache at C:\Users\ekagr\.cache\huggingface\hub\models--distilbert-base-uncased\snapshots\12040accade4e8a0f71eabdb258fecc2e7e948be\config.json
Model config DistilBertConfig {
  "_name_or_path": "distilbert-base-uncased",
  "activation": "gelu",
  "architectures

In [None]:
# Convert tokenized data to numpy arrays for oversampling
input_ids = tokenized_data['input_ids'].numpy()
labels_np = labels.numpy()

# Apply Random Oversampling
ros = RandomOverSampler(random_state=42)
input_ids_resampled, labels_resampled = ros.fit_resample(input_ids, labels_np)




In [None]:
# Convert back to tensors
input_ids_resampled = torch.tensor(input_ids_resampled)
labels_resampled = torch.tensor(labels_resampled)

# Define Stratified K-Fold Cross-Validation
skf = StratifiedKFold(n_splits=5)

Splitting Dataset


train_texts, test_texts, train_labels, test_labels = train_test_split(
    tokenized_data['input_ids'], labels, test_size=0.2, random_state=42
)

In [None]:
# Define metrics
def compute_metrics(eval_pred):
    logits, labels = eval_pred
    predictions = np.argmax(logits, axis=-1)
    return {
        'accuracy': accuracy_score(labels, predictions),
        'f1': f1_score(labels, predictions, average='weighted'),
        'precision': precision_score(labels, predictions, average='weighted'),
        'recall': recall_score(labels, predictions, average='weighted')
    }


In [None]:

# Define the objective function for Bayesian Optimization
def objective(trial):
    # Define the hyperparameter space
    learning_rate = trial.suggest_loguniform('learning_rate', 1e-5, 1e-4)
    num_train_epochs = trial.suggest_int('num_train_epochs', 2, 4)
    per_device_train_batch_size = trial.suggest_categorical('per_device_train_batch_size', [8, 16])
    weight_decay = trial.suggest_loguniform('weight_decay', 0.01, 0.1)

    # Define the training arguments
    training_args = TrainingArguments(
        output_dir='./results',
        evaluation_strategy="steps",
        save_strategy="steps",
        save_total_limit=3,
        save_steps=500,
        eval_steps=500,
        logging_dir='./logs',
        logging_steps=100,
        learning_rate=learning_rate,
        per_device_train_batch_size=per_device_train_batch_size,
        per_device_eval_batch_size=8,
        num_train_epochs=num_train_epochs,
        weight_decay=weight_decay,
        gradient_accumulation_steps=2,
        no_cuda=True,
        fp16=False,
        load_best_model_at_end=True,
        metric_for_best_model="eval_loss",
        greater_is_better=False,
        push_to_hub=False,
        report_to=["tensorboard"],
        log_level="info",
    )

    # Cross-Validation
    eval_losses = []
    for train_index, val_index in skf.split(input_ids_resampled, labels_resampled):
        train_texts, val_texts = input_ids_resampled[train_index], input_ids_resampled[val_index]
        train_labels, val_labels = labels_resampled[train_index], labels_resampled[val_index]

        # Convert to Dataset
        train_dataset = Dataset.from_dict({"input_ids": train_texts, "labels": train_labels})
        val_dataset = Dataset.from_dict({"input_ids": val_texts, "labels": val_labels})

        # Define the model
        model = DistilBertForSequenceClassification.from_pretrained("distilbert-base-uncased", num_labels=4)

        # Define the Trainer
        trainer = Trainer(
            model=model,
            args=training_args,
            train_dataset=train_dataset,
            eval_dataset=val_dataset,
            compute_metrics=compute_metrics
        )

        # Train the model
        trainer.train()
        eval_results = trainer.evaluate()
        eval_losses.append(eval_results['eval_loss'])

    return np.mean(eval_losses)

Running Bayesian Optimization for hyperparameter training


In [None]:
# Create a study and optimize the objective function
study = optuna.create_study(direction='minimize')
study.optimize(objective, n_trials=20)

# Get the best hyperparameters
best_hyperparams = study.best_params
print(best_hyperparams)

Training final model with Best Hyperparameters

In [26]:

# Train the final model with the best hyperparameters
training_args = TrainingArguments(
    output_dir='./results',
    evaluation_strategy="steps",
    save_strategy="steps",
    save_total_limit=3,
    save_steps=500,
    eval_steps=500,
    logging_dir='./logs',
    logging_steps=100,
    learning_rate=best_hyperparams['learning_rate'],
    per_device_train_batch_size=best_hyperparams['per_device_train_batch_size'],
    per_device_eval_batch_size=8,
    num_train_epochs=best_hyperparams['num_train_epochs'],
    weight_decay=best_hyperparams['weight_decay'],
    gradient_accumulation_steps=2,
    no_cuda=True,
    fp16=False,
    load_best_model_at_end=True,
    metric_for_best_model="eval_loss",
    greater_is_better=False,
    push_to_hub=False,
    report_to=["tensorboard"],
    log_level="info",
)

model = DistilBertForSequenceClassification.from_pretrained("distilbert-base-uncased", num_labels=4)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=eval_dataset,
    compute_metrics=compute_metrics
)

NameError: name 'best_hyperparams' is not defined

In [None]:
trainer.train()
trainer.evaluate()

from transformers import DistilBertForSequenceClassification, Trainer, TrainingArguments
import torch

# Calculate class weights
class_counts = dataset['Sentiment'].value_counts().to_dict()
total_samples = len(dataset)
class_weights = {cls: total_samples/count for cls, count in class_counts.items()}
weights = torch.tensor([class_weights[label] for label in dataset['Sentiment']])

# Define model with class weights
model = DistilBertForSequenceClassification.from_pretrained("distilbert-base-uncased", num_labels=4)
model.config.class_weights = weights

# Convert the datasets to the appropriate format
train_dataset = Dataset.from_dict({"input_ids": train_texts, "labels": train_labels})
eval_dataset = Dataset.from_dict({"input_ids": test_texts, "labels": test_labels})

# Sample a subset for evaluation
eval_dataset_list = list(eval_dataset)
sampled_eval_dataset = random.sample(eval_dataset_list, 4500)
eval_dataset = Dataset.from_list(sampled_eval_dataset)

training_args = TrainingArguments(
    output_dir='./results',
    evaluation_strategy="steps",
    save_strategy="steps",
    save_total_limit=3,
    save_steps=500,
    eval_steps=500,
    logging_dir='./logs',
    logging_steps=100,
    learning_rate=2e-5,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    num_train_epochs=3,
    weight_decay=0.01,
    gradient_accumulation_steps=2,
    no_cuda=True,
    fp16=False,
    load_best_model_at_end=True,
    metric_for_best_model="eval_loss",
    greater_is_better=False,
    push_to_hub=False,
    report_to=["tensorboard"],
    log_level="info",
)

# Define Trainer with custom loss function
def compute_loss(model, inputs, return_outputs=False):
    labels = inputs.pop("labels")
    outputs = model(**inputs)
    logits = outputs.logits
    loss_fct = torch.nn.CrossEntropyLoss(weight=model.config.class_weights)
    loss = loss_fct(logits.view(-1, model.config.num_labels), labels.view(-1))
    return (loss, outputs) if return_outputs else loss

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=eval_dataset,
    compute_metrics=compute_metrics,
    compute_loss=compute_loss
)

def objective(trial):
    # Define the hyperparameter space
    learning_rate = trial.suggest_loguniform('learning_rate', 1e-5, 1e-4)
    num_train_epochs = trial.suggest_int('num_train_epochs', 2, 4)
    per_device_train_batch_size = trial.suggest_categorical('per_device_train_batch_size', [8, 16])
    weight_decay = trial.suggest_loguniform('weight_decay', 0.01, 0.1)

    # Define the training arguments
    training_args = TrainingArguments(
        output_dir='./results',
        evaluation_strategy="steps",
        save_strategy="steps",
        save_total_limit=3,
        save_steps=500,
        eval_steps=500,
        logging_dir='./logs',
        logging_steps=100,
        learning_rate=learning_rate,
        per_device_train_batch_size=per_device_train_batch_size,
        per_device_eval_batch_size=8,
        num_train_epochs=num_train_epochs,
        weight_decay=weight_decay,
        gradient_accumulation_steps=2,
        no_cuda=True,
        fp16=False,
        load_best_model_at_end=True,
        metric_for_best_model="eval_loss",
        greater_is_better=False,
        push_to_hub=False,
        report_to=["tensorboard"],
        log_level="info",
    )

    # Define the model
    model = DistilBertForSequenceClassification.from_pretrained("distilbert-base-uncased", num_labels=4)

    # Define the Trainer
    trainer = Trainer(
        model=model,
        args=training_args,
        train_dataset=train_dataset,
        eval_dataset=eval_dataset,
        compute_metrics=compute_metrics
    )

    # Train the model
    trainer.train()
    eval_results = trainer.evaluate()
    return eval_results['eval_loss']


In [None]:
# Evaluate on the full evaluation dataset
finaleval = Dataset.from_list(eval_dataset_list)
trainer.evaluate(eval_dataset=finaleval)



***** Running Evaluation *****
  Num examples = 14799
  Batch size = 16

[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[

{'eval_loss': 1.3920329809188843,
 'eval_model_preparation_time': 0.002,
 'eval_accuracy': 0.22886681532535982,
 'eval_f1': 0.14547834090792128,
 'eval_precision': 0.15042983753245157,
 'eval_recall': 0.22886681532535982,
 'eval_runtime': 1227.6996,
 'eval_samples_per_second': 12.054,
 'eval_steps_per_second': 0.753}

In [None]:
model.save_pretrained("./sentiment_model")
tokenizer.save_pretrained("./sentiment_model")