In [None]:
# Install required packages (uncomment the line below if they are not already installed).
# %pip install -q transformers datasets torch

# Mount Google Drive at /content/drive so files stored there can be read
# through the local filesystem.
# NOTE(review): google.colab is presumably only importable inside a Colab
# runtime — confirm before running this notebook elsewhere.
from google.colab import drive
drive.mount('/content/drive')

import os
import json
import pandas as pd
from torch import cuda
from datasets import Dataset, DatasetDict
from transformers import (AutoTokenizer, AutoModelForSequenceClassification,
                          Trainer, TrainingArguments, set_seed)

# Fix the random seed so repeated runs are reproducible.
set_seed(239)

# Use the GPU when torch reports one as available; otherwise fall back to the CPU.
if cuda.is_available():
    device = 'cuda'
else:
    device = 'cpu'
print("Using device:", device)


In [None]:
import pandas as pd
from datasets import Dataset, DatasetDict

# 1. Data Extraction: read the CSV stored on Google Drive into a DataFrame.
csv_file_path = "/content/drive/MyDrive/news_bias.csv"  # Update with your CSV file path
df = pd.read_csv(csv_file_path)

# Show the first rows, then the share of each raw label before any relabelling.
print("Data preview:", df.head(), sep="\n")
print("Original label distribution:", df['label'].value_counts(normalize=True), sep="\n")

# 1.1. Sanitize the dataset (example: strip and lowercase)
def sanitize_text(text):
    """Normalize one raw text value: trim surrounding whitespace and lowercase it.

    Args:
        text: The value to clean. Normally a ``str``; missing values
            (``None`` / NaN, e.g. an empty CSV cell) and other non-string
            scalars are tolerated instead of raising ``AttributeError``.

    Returns:
        The stripped, lowercased string. Missing values become ``""`` and
        any other non-string value is converted with ``str()`` first.
    """
    if isinstance(text, str):
        return text.strip().lower()
    # pd.read_csv yields float NaN for empty cells, which has no .strip();
    # map any missing marker to an empty string rather than crashing.
    if pd.isna(text):
        return ""
    return str(text).strip().lower()

# Clean every entry of the text column with sanitize_text.
df['text'] = df['text'].apply(sanitize_text)

# 1.2. Collapse the labels to two classes: 0 and 2 both become 0.
# NOTE(review): every value other than 0 or 2 (not only label 1) is mapped
# to 1 — confirm the CSV contains no labels outside {0, 1, 2}.
df['label'] = df['label'].apply(lambda raw_label: 1 if raw_label not in (0, 2) else 0)

print("Modified label distribution after merging:", df['label'].value_counts(normalize=True), sep="\n")

# 2. Wrap the DataFrame in a Hugging Face Dataset.
dataset = Dataset.from_pandas(df)

# 3. Hold out 30% of the rows, then halve that holdout into validation and
#    test (70/15/15 overall). Both splits use seed 239.
split_dataset = dataset.train_test_split(test_size=0.3, seed=239)
val_test = split_dataset['test'].train_test_split(test_size=0.5, seed=239)

dataset_dict = DatasetDict()
dataset_dict['train'] = split_dataset['train']
dataset_dict['validation'] = val_test['train']
dataset_dict['test'] = val_test['test']

print("Split sizes:")
for heading, split_name in (("Train size:", 'train'),
                            ("Validation size:", 'validation'),
                            ("Test size:", 'test')):
    print(heading, len(dataset_dict[split_name]))

# 4. Balance the training split to a 50:50 class ratio by undersampling.
# The sampling is done in pandas, so pull the training split into a DataFrame.
train_df = dataset_dict['train'].to_pandas()

# Partition the rows by class label.
label_column = train_df['label']
df_class0 = train_df.loc[label_column == 0]
df_class1 = train_df.loc[label_column == 1]

# Both classes are cut down to the size of the smaller one.
min_count = min(map(len, (df_class0, df_class1)))
df_class0_under = df_class0.sample(n=min_count, random_state=239)
df_class1_under = df_class1.sample(n=min_count, random_state=239)

# Stack the two samples, shuffle all rows, and renumber the index from zero.
stacked_samples = pd.concat([df_class0_under, df_class1_under])
balanced_train_df = stacked_samples.sample(frac=1, random_state=239).reset_index(drop=True)

print("Balanced training label distribution:", balanced_train_df['label'].value_counts(normalize=True), sep="\n")

# Turn the balanced frame back into a Dataset and install it as the training split.
balanced_train_dataset = Dataset.from_pandas(balanced_train_df)
dataset_dict['train'] = balanced_train_dataset

print("Final split sizes after balancing training set:")
for heading, split_name in (("Train size:", 'train'),
                            ("Validation size:", 'validation'),
                            ("Test size:", 'test')):
    print(heading, len(dataset_dict[split_name]))


In [None]:
import numpy as np
from sklearn.metrics import accuracy_score

def compute_metrics(eval_pred):
    """Score a batch of evaluation predictions by accuracy.

    Args:
        eval_pred: A pair that unpacks into ``(logits, labels)``.

    Returns:
        A dict with the single key ``"accuracy"``. The predicted class for
        each example is the arg-max of ``logits`` along its last axis.
    """
    scores, gold = eval_pred
    predicted = np.argmax(scores, axis=-1)
    return {"accuracy": accuracy_score(gold, predicted)}

# Build the Trainer used for evaluation.
# NOTE(review): model, training_args, tokenized_datasets and tokenizer are not
# defined in any cell visible here — presumably created by cells that must run
# first; confirm the notebook survives Restart & Run All.
trainer = Trainer(
    model=model,
    args=training_args,
    compute_metrics=compute_metrics,  # accuracy metric function
    tokenizer=tokenizer,
    train_dataset=tokenized_datasets["train"],
    eval_dataset=tokenized_datasets["validation"],
)

# Evaluate on the test split, passed explicitly (the validation split is the
# configured eval_dataset).
test_results = trainer.evaluate(tokenized_datasets["test"])
print("Test evaluation results:", test_results, sep="\n")


In [None]:
# Export the model and tokenizer as shown before:
from transformers import AutoModelForSequenceClassification, AutoTokenizer

import shutil

# NOTE(review): the checkpoint step number is hard-coded — confirm it matches
# the checkpoint directory produced by the training run you want to export.
checkpoint_path = "results/checkpoint-755"
export_path = "exported_model"

# Reload the model and tokenizer from the checkpoint directory and write both
# into the export directory.
model = AutoModelForSequenceClassification.from_pretrained(checkpoint_path)
tokenizer = AutoTokenizer.from_pretrained(checkpoint_path)

model.save_pretrained(export_path)
tokenizer.save_pretrained(export_path)

print("Model and tokenizer have been exported to", export_path)

# Compress the exported model directory into "<export_path>.zip".
# The archive name is derived from export_path so the two cannot drift apart,
# and make_archive rewrites the file from scratch, so a re-run never keeps
# stale entries from an earlier export (a shell `zip -r` would update the
# existing archive in place instead).
archive_path = shutil.make_archive(export_path, "zip", base_dir=export_path)

# For Google Colab, use the files module to download the ZIP file
from google.colab import files
files.download(archive_path)
