In [2]:
import pandas as pd
import torch
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from transformers import (
    AutoTokenizer,
    AutoModelForSequenceClassification,
    Trainer,
    TrainingArguments,
    DataCollatorWithPadding
)
from torch.utils.data import Dataset
import joblib  # For saving the label encoder

# Step 1: Load and clean the dataset
df = pd.read_csv('./DATA/airline_tweets.csv')
# Keep only the two columns used below. Rows missing either value are dropped
# (a missing text cannot be split/tokenized, a missing sentiment is not a
# label), and .copy() makes the column assignments below write to an
# independent frame instead of a selection of the frame read from disk.
df = df[['text', 'airline_sentiment']].dropna().copy()

# Optional: Clean text by removing first word (if needed)
# str.partition(' ') splits at the first single space; column 2 is everything
# after it, so a text that contains no space becomes ''.
df['text'] = df['text'].str.partition(' ')[2]

# Step 2: Encode labels
# The fitted encoder is kept in `label_encoder` so the integer ids stored in
# df['label'] can be mapped back to the original sentiment strings later.
label_encoder = LabelEncoder()
df['label'] = label_encoder.fit_transform(df['airline_sentiment'])

# Step 3: Tokenizer and model name
model_name = "bert-base-uncased"
tokenizer = AutoTokenizer.from_pretrained(model_name)

# Step 4: Create custom Dataset class
class SentimentDataset(Dataset):
    """Map-style dataset that pairs pre-tokenized texts with their labels.

    All texts are tokenized once in the constructor; ``__getitem__`` only
    converts the stored values of a single row into tensors.
    """

    def __init__(self, texts, labels, tokenizer):
        """Tokenize ``texts`` and store ``labels`` alongside the encodings.

        Args:
            texts: Sequence of input strings.
            labels: Sequence of labels, one per text, in the same order.
            tokenizer: Callable invoked as
                ``tokenizer(texts, truncation=True, padding=True)``. Its
                result must expose ``.items()`` yielding
                ``(field_name, per_example_sequence)`` pairs.

        Raises:
            ValueError: If ``texts`` and ``labels`` have different lengths.
        """
        # Fail fast: a mismatch would otherwise only show up later as an
        # IndexError in __getitem__ or as rows that are silently never read.
        if len(texts) != len(labels):
            raise ValueError(
                f"texts and labels must have the same length, "
                f"got {len(texts)} texts and {len(labels)} labels"
            )
        self.encodings = tokenizer(texts, truncation=True, padding=True)
        self.labels = labels

    def __getitem__(self, idx):
        """Return the encoded fields and the label of row ``idx`` as tensors.

        The result holds one tensor per field produced by the tokenizer plus
        the row's label under the key ``'labels'``.
        """
        item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
        item['labels'] = torch.tensor(self.labels[idx])
        return item

    def __len__(self):
        """Return the number of examples, i.e. the number of stored labels."""
        return len(self.labels)

# Step 5: Split the data
# 80/20 train/validation split; random_state pins the shuffle so the split is
# the same on every run.
train_texts, val_texts, train_labels, val_labels = train_test_split(
    df['text'].tolist(), df['label'].tolist(), test_size=0.2, random_state=42
)

train_dataset = SentimentDataset(train_texts, train_labels, tokenizer)
val_dataset = SentimentDataset(val_texts, val_labels, tokenizer)

# Step 6: Data collator
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

# Step 7: Load model
# Size the classification head from the fitted label encoder instead of a
# hard-coded 3, and store the id <-> sentiment-name mapping in the model config
# so a saved checkpoint carries readable label names.
id2label = {idx: str(name) for idx, name in enumerate(label_encoder.classes_)}
label2id = {name: idx for idx, name in id2label.items()}
model = AutoModelForSequenceClassification.from_pretrained(
    model_name,
    num_labels=len(id2label),
    id2label=id2label,
    label2id=label2id,
)

# Step 8: Training arguments
# NOTE(review): an earlier variant of these arguments also requested per-epoch
# evaluation and checkpointing (evaluation_strategy="epoch",
# save_strategy="epoch", load_best_model_at_end=True,
# metric_for_best_model="accuracy") and was disabled. Recent transformers
# releases spell the first keyword `eval_strategy` -- confirm against the
# installed version before re-enabling. As configured here, no evaluation is
# scheduled during training.
training_args = TrainingArguments(
    output_dir="./results",
    num_train_epochs=10,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    logging_dir="./logs",
    logging_steps=10
)


# Step 9: Compute metrics
def compute_metrics(eval_pred):
    """Score predictions with accuracy and weighted precision/recall/F1.

    Args:
        eval_pred: A pair ``(logits, labels)``.
            Assumes logits is laid out as (n_examples, n_classes) -- TODO confirm;
            the predicted class is the argmax along dim 1.

    Returns:
        dict with the keys ``"accuracy"``, ``"precision"``, ``"recall"`` and
        ``"f1"``; the last three use ``average='weighted'``.
    """
    from sklearn.metrics import accuracy_score, precision_recall_fscore_support

    logits, labels = eval_pred
    predicted = torch.tensor(logits).argmax(dim=1)

    # Returns (precision, recall, f1, support); support is not reported.
    prf_support = precision_recall_fscore_support(labels, predicted, average='weighted')
    metrics = {"accuracy": accuracy_score(labels, predicted)}
    for metric_name, metric_value in zip(("precision", "recall", "f1"), prf_support):
        metrics[metric_name] = metric_value
    return metrics

# Step 10: Train
if __name__ == "__main__":
    trainer = Trainer(
        model=model,
        args=training_args,
        train_dataset=train_dataset,
        eval_dataset=val_dataset,
        tokenizer=tokenizer,
        data_collator=data_collator,
        compute_metrics=compute_metrics
    )

    trainer.train()

    # training_args schedules no evaluation during training, so without this
    # explicit call val_dataset and compute_metrics would never be used and
    # the run would end with no validation metrics at all.
    eval_metrics = trainer.evaluate()
    print(eval_metrics)

    # Save model, tokenizer, and label encoder
    # All three go into the same directory; the label encoder is needed to map
    # predicted class ids back to the original sentiment strings.
    save_path = "saved_model_label"
    model.save_pretrained(save_path)
    tokenizer.save_pretrained(save_path)
    joblib.dump(label_encoder, f"{save_path}/label_encoder.pkl")


  from .autonotebook import tqdm as notebook_tqdm
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  trainer = Trainer(


Step,Training Loss
10,0.9538
20,0.8049
30,0.6712
40,0.5401
50,0.5948
60,0.583
70,0.4315
80,0.6421
90,0.6349
100,0.5257


In [None]:
# Diagnostic: print the version string of the transformers package that this
# kernel imports.
import transformers
print(transformers.__version__)


In [None]:
# Diagnostic: print the name of the module in which the imported
# TrainingArguments class is defined.
from transformers import TrainingArguments
print(TrainingArguments.__module__)


In [None]:
# Diagnostic: print the file path the transformers package was loaded from,
# which shows which installation this kernel is using.
import transformers
print(transformers.__file__)


In [None]:
# Diagnostic: print the imported transformers version.
# NOTE(review): same statements as an earlier version-check cell in this
# notebook -- presumably a re-check after changing the environment; confirm
# whether both cells are still needed.
import transformers
print(transformers.__version__)


In [None]:
import transformers
from transformers import TrainingArguments
import sys
# Upgrade transformers with pip, run through the interpreter that backs this
# kernel (sys.executable) so the package lands in the kernel's environment.
# NOTE(review): the upgrade is not pinned to a version, and transformers is
# already imported above, so the running kernel presumably keeps the
# previously loaded version until it is restarted -- confirm.
!{sys.executable} -m pip install --upgrade transformers

In [None]:
import sys
import subprocess

# Interpreter that backs the running kernel.
python_path = sys.executable
print("Using Python from:", python_path)

# Ask a child interpreter (same executable) which transformers version it
# imports; the returned CompletedProcess stays the cell's last expression.
subprocess.run(
    args=[python_path, "-c", "import transformers; print(transformers.__version__)"],
)


In [None]:
# Diagnostic: print the path of the Python interpreter running this kernel.
import sys
print(sys.executable)


In [None]:
import subprocess
import sys

# Path of the interpreter this kernel is running on
python_path = sys.executable

# Launch that same interpreter as a child process and have it print the
# transformers version it imports
subprocess.run(
    [python_path, "-c", "import transformers; print(transformers.__version__)"]
)


In [None]:
# Diagnostic summary: interpreter path and transformers version, both read
# from inside the running kernel (no subprocess involved).
import sys
import transformers

# Get the Python executable path (for the active environment)
print("Using Python from:", sys.executable)

# Get the transformers version
print("Transformers version:", transformers.__version__)
