In [None]:
# !pip install transformers datasets trl torch pandas scikit-learn accelerate


[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m24.3.1[0m[39;49m -> [0m[32;49m25.3[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpip install --upgrade pip[0m


## Data Preprocessing

In [None]:
import pandas as pd
import re
from sklearn.model_selection import train_test_split

# Load the dataset
df = pd.read_csv('./data/Tweets.csv')

# 1. Select relevant columns
df = df[['text', 'airline_sentiment']]

In [19]:
df.head()

Unnamed: 0,text,airline_sentiment,labels
0,What said.,neutral,1
1,plus you've added commercials to the experienc...,positive,2
2,I didn't today... Must mean I need to take ano...,neutral,1
3,"it's really aggressive to blast obnoxious ""ent...",negative,0
4,and it's a really big bad thing about it,negative,0


In [None]:
# 2. Map sentiment labels to integers
# We'll use: 0 = negative, 1 = neutral, 2 = positive
label_map = {'negative': 0, 'neutral': 1, 'positive': 2}
df['labels'] = df['airline_sentiment'].map(label_map)

In [None]:
# 3. Clean the text
def clean_text(text):
    text = re.sub(r'@\w+', '', text)  # Remove @mentions
    text = re.sub(r'http\S+', '', text) # Remove URLs
    text = text.strip() # Remove leading/trailing whitespace
    return text

df['text'] = df['text'].apply(clean_text)

# Drop rows where sentiment mapping might have failed (if any)
df = df.dropna(subset=['labels'])
df['labels'] = df['labels'].astype(int)

In [None]:
# 4. Split the data into training and testing sets (80% train, 20% test)
train_df, test_df = train_test_split(
    df,
    test_size=0.2,
    random_state=42,
    stratify=df['labels']  # Important for imbalanced datasets!
)

print(f"Original dataset shape: {df.shape}")
print(f"Training dataset shape: {train_df.shape}")
print(f"Testing dataset shape: {test_df.shape}")

print("\nTraining set sentiment distribution:")
print(train_df['airline_sentiment'].value_counts(normalize=True))

print("\nTesting set sentiment distribution:")
print(test_df['airline_sentiment'].value_counts(normalize=True))

In [None]:
# 5. Save the processed data to files for the next step
train_df.to_csv('./data/train_processed.csv', index=False)
test_df.to_csv('./data/test_processed.csv', index=False)

print("\n'train_processed.csv' and 'test_processed.csv' have been created in the './data/' directory.")

Original dataset shape: (14640, 3)
Training dataset shape: (11712, 3)
Testing dataset shape: (2928, 3)

Training set sentiment distribution:
airline_sentiment
negative    0.626964
neutral     0.211663
positive    0.161373
Name: proportion, dtype: float64

Testing set sentiment distribution:
airline_sentiment
negative    0.626708
neutral     0.211749
positive    0.161544
Name: proportion, dtype: float64

'train_processed.csv' and 'test_processed.csv' have been created.


## Create the Baseline Model

In [None]:
import pandas as pd
import numpy as np
import datasets
import torch  
from datasets import Dataset
from sklearn.metrics import accuracy_score, f1_score
from transformers import (
    AutoTokenizer,
    AutoModelForSequenceClassification,
    TrainingArguments,
    Trainer
)

# Auto-detect GPU (CUDA or Apple Silicon 'mps') ---
if torch.cuda.is_available():
    device = torch.device("cuda")
    print("GPU: Using CUDA (NVIDIA GPU)")
elif torch.backends.mps.is_available():
    device = torch.device("mps")
    print("GPU: Using MPS (Apple Silicon GPU)")
else:
    device = torch.device("cpu")
    print("GPU: Using CPU")



# --- 1. Load Preprocessed Data ---
try:
    train_df = pd.read_csv('./data/train_processed.csv')
    test_df = pd.read_csv('./data/test_processed.csv')
except FileNotFoundError:
    print("Error: 'train_processed.csv' or 'test_processed.csv' not found.")
    print("Please ensure Step 2 ran successfully.")
    exit()

train_dataset = Dataset.from_pandas(train_df)
test_dataset = Dataset.from_pandas(test_df)

print(f"Loaded train dataset with {len(train_dataset)} examples.")
print(f"Loaded test dataset with {len(test_dataset)} examples.")

# --- 2. Load Tokenizer and Tokenize Data ---
MODEL_CHECKPOINT = "distilbert-base-uncased"
tokenizer = AutoTokenizer.from_pretrained(MODEL_CHECKPOINT)

def tokenize_function(examples):
    return tokenizer(examples['text'], padding="max_length", truncation=True)

print("\nTokenizing datasets...")
tokenized_train_dataset = train_dataset.map(tokenize_function, batched=True)
tokenized_test_dataset = test_dataset.map(tokenize_function, batched=True)
tokenized_train_dataset = tokenized_train_dataset.remove_columns(['text', 'airline_sentiment'])
tokenized_test_dataset = tokenized_test_dataset.remove_columns(['text', 'airline_sentiment'])
print("Tokenization complete.")

# --- 3. Load Pre-trained Model ---
label_map = {0: 'negative', 1: 'neutral', 2: 'positive'}
id2label = {id: label for id, label in label_map.items()}
label2id = {label: id for id, label in label_map.items()}

print(f"\nLoading model: {MODEL_CHECKPOINT}")
model = AutoModelForSequenceClassification.from_pretrained(
    MODEL_CHECKPOINT,
    num_labels=3,
    id2label=id2label,
    label2id=label2id
).to(device)  

# --- 4. Define Evaluation Metrics ---
def compute_metrics(eval_pred):
    logits, labels = eval_pred
    # Logits might be on GPU, move to CPU for numpy
    predictions = np.argmax(logits.cpu(), axis=-1)
    labels = labels.cpu()
    
    acc = accuracy_score(labels, predictions)
    f1 = f1_score(labels, predictions, average='weighted')
    return {'accuracy': acc, 'f1': f1}

# --- 5. Set Up the Trainer ---
print("Configuring Trainer...")

# Note: We fixed 'eval_strategy' in our previous step
training_args = TrainingArguments(
    output_dir="./results-baseline",
    num_train_epochs=3,
    per_device_train_batch_size=16,      # Batch size per device
    per_device_eval_batch_size=64,
    warmup_steps=500,
    weight_decay=0.01,
    logging_dir='./logs-baseline',
    logging_steps=100,
    eval_strategy="epoch",
    save_strategy="epoch",
    load_best_model_at_end=True,
    metric_for_best_model="f1",
    
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_train_dataset,
    eval_dataset=tokenized_test_dataset,
    compute_metrics=compute_metrics,
    tokenizer=tokenizer,
)

# --- 6. Run Training & Evaluation ---
print("\nStarting baseline model training...")
trainer.train()

print("\nTraining complete. Evaluating model on the test set...")
eval_results = trainer.evaluate()

print("\n--- Baseline Model Evaluation Results ---")
print(f"Accuracy: {eval_results['eval_accuracy']:.4f}")
print(f"F1-score (Weighted): {eval_results['eval_f1']:.4f}")
print(eval_results)

# Save your final model and tokenizer
print("\nSaving final model to './final-baseline-model'")
trainer.save_model("./final-baseline-model")
tokenizer.save_pretrained("./final-baseline-model")

Loaded train dataset with 11712 examples.
Loaded test dataset with 2928 examples.

Tokenizing datasets...


Map: 100%|██████████| 11712/11712 [00:00<00:00, 21357.26 examples/s]
Map: 100%|██████████| 2928/2928 [00:00<00:00, 21830.85 examples/s]
Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  trainer = Trainer(


Tokenization complete.

Loading model: distilbert-base-uncased
Configuring Trainer...

Starting baseline model training...




Epoch,Training Loss,Validation Loss,Accuracy,F1
1,0.4597,0.463838,0.820697,0.818803
2,0.3369,0.460359,0.839822,0.836332
3,0.1626,0.602882,0.840505,0.839446





Training complete. Evaluating model on the test set...





--- Baseline Model Evaluation Results ---
Accuracy: 0.8405
F1-score (Weighted): 0.8394
{'eval_loss': 0.602882444858551, 'eval_accuracy': 0.8405054644808743, 'eval_f1': 0.8394463938652994, 'eval_runtime': 35.2944, 'eval_samples_per_second': 82.959, 'eval_steps_per_second': 1.303, 'epoch': 3.0}

Saving final model to './final-baseline-model'


('./final-baseline-model/tokenizer_config.json',
 './final-baseline-model/special_tokens_map.json',
 './final-baseline-model/vocab.txt',
 './final-baseline-model/added_tokens.json',
 './final-baseline-model/tokenizer.json')