In [None]:
import torch
from transformers import AutoTokenizer, AutoModelForSequenceClassification
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader
from sklearn.model_selection import train_test_split
import pandas as pd
import os

!gdown --id 1Wi0u9NSUPHo9SQVhvDBMv-i4HysEUcQb

# Set up Google Drive path for saving and loading model checkpoints
from google.colab import drive
drive.mount('/content/drive')
model_save_path = '/content/drive/MyDrive/model_res_larger_model'
os.makedirs(model_save_path, exist_ok=True)

# model_save_path = 'res_model'


# Check for GPU availability and set the device
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"Using device: {device}")

# Load your dataset
data = pd.read_csv('train_data_larger_sample.csv')

# Split dataset with stratification and shuffling
train_data, val_data = train_test_split(
    data, test_size=0.2, stratify=data['overall'], shuffle=True, random_state=42
)

# Load the pre-trained tokenizer and model
tokenizer = AutoTokenizer.from_pretrained('ashok2216/gpt2-amazon-sentiment-classifier-V1.0')
model = AutoModelForSequenceClassification.from_pretrained('ashok2216/gpt2-amazon-sentiment-classifier-V1.0')

# Configure model for 5 classes and single-label classification
model.config.num_labels = 5
model.config.problem_type = "single_label_classification"

# Replace classifier with a new 5-class output layer
in_features = model.classifier.in_features
model.classifier = torch.nn.Linear(in_features, 5)

# Freeze all layers except the classifier
for param in model.parameters():
    param.requires_grad = False
for param in model.classifier.parameters():
    param.requires_grad = True

# Move model to the device (GPU if available)
model.to(device)

# Custom dataset class
class AmazonDataset(Dataset):
    """Map-style dataset that yields one tokenized review and its label.

    Args:
        data: DataFrame with a 'reviewText' column (the text to tokenize) and
            an 'overall' column (the rating; one is subtracted to form the
            zero-based class label).
        tokenizer: Callable invoked as
            ``tokenizer(text, return_tensors="pt", padding="max_length",
            truncation=True, max_length=...)`` that returns a mapping of
            tensors.
        max_length: Length each sequence is padded / truncated to.
    """

    def __init__(self, data, tokenizer, max_length=512):
        self.data = data
        self.tokenizer = tokenizer
        self.max_length = max_length

    def __len__(self):
        """Return the number of rows in the wrapped DataFrame."""
        return len(self.data)

    def __getitem__(self, idx):
        """Return the tokenizer outputs for row ``idx`` plus a 'labels' tensor.

        Raises:
            ValueError: If the row's 'overall' value cannot be converted to
                an integer (for example, it is NaN).
        """
        row = self.data.iloc[idx]

        # A missing review is read by pandas as NaN (a float), which the
        # tokenizer cannot process; treat it as an empty string instead.
        review = row['reviewText']
        review = "" if pd.isna(review) else str(review)

        # Force an integer label: a float 'overall' column would otherwise
        # produce a float tensor, which CrossEntropyLoss does not accept as
        # class indices.
        label = int(row['overall']) - 1

        # Tokenize the text
        inputs = self.tokenizer(
            review,
            return_tensors="pt",
            padding="max_length",
            truncation=True,
            max_length=self.max_length
        )

        # Drop only the leading batch axis added by return_tensors="pt";
        # a bare squeeze() would also remove any other size-1 axis.
        item = {key: val.squeeze(0) for key, val in inputs.items()}
        item['labels'] = torch.tensor(label, dtype=torch.long)
        return item

# Create Datasets and DataLoaders
train_dataset = AmazonDataset(train_data, tokenizer)
val_dataset = AmazonDataset(val_data, tokenizer)
train_loader = DataLoader(train_dataset, batch_size=128, shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=128)

# Loss function and optimizer (only the classifier head's parameters are optimized)
loss_fn = nn.CrossEntropyLoss()
optimizer = torch.optim.AdamW(model.classifier.parameters(), lr=1e-3)
# The `verbose` argument is deprecated in recent PyTorch releases, so it is
# not passed; read optimizer.param_groups[0]['lr'] to inspect the current rate.
scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau(optimizer, mode='min', factor=0.5, patience=20)

# Training loop with batch count
num_epochs = 2
start_epoch = 0

# Load checkpoint if exists
# NOTE(review): the training loop in this cell writes files named
# checkpoint_epoch*.pth, so this path only matches a checkpoint that was
# copied/renamed by hand in Drive -- confirm that is the intended workflow.
checkpoint_path = os.path.join(model_save_path, "Copy of checkpoint.pth")
if os.path.exists(checkpoint_path):
    # map_location lets a checkpoint saved on a GPU runtime be resumed on a
    # CPU-only runtime (and vice versa) instead of failing while deserializing.
    checkpoint = torch.load(checkpoint_path, map_location=device)
    model.load_state_dict(checkpoint['model_state_dict'])
    optimizer.load_state_dict(checkpoint['optimizer_state_dict'])
    # The stored value is the last finished (zero-based) epoch; continue after it
    start_epoch = checkpoint['epoch'] + 1
    print(f"Checkpoint loaded. Resuming training from epoch {start_epoch + 1}")
else:
    print("No checkpoint found. Starting training from scratch.")

# Function to calculate accuracy
def calculate_accuracy(logits, labels):
    """Return the fraction of rows whose highest-scoring class equals the label.

    Args:
        logits: Tensor of per-class scores; the class axis is dimension 1.
        labels: Tensor of target class indices, one per row of ``logits``.

    Returns:
        The mean of the per-row match indicators as a Python float.
    """
    predicted = logits.argmax(dim=1)
    hits = predicted.eq(labels).float()
    return hits.mean().item()

# Training and validation loop
for epoch in range(start_epoch, num_epochs):
    print(f"Epoch {epoch + 1}/{num_epochs}")

    # ---- Training pass ----
    model.train()
    train_loss, train_acc, train_seen = 0.0, 0.0, 0

    for batch_idx, batch in enumerate(train_loader):
        inputs = {key: val.to(device) for key, val in batch.items() if key != 'labels'}
        labels = batch['labels'].to(device)

        optimizer.zero_grad()
        outputs = model(**inputs)
        logits = outputs.logits

        # Calculate loss and accuracy
        loss = loss_fn(logits, labels)
        acc = calculate_accuracy(logits, labels)

        loss.backward()
        optimizer.step()

        # Weight each batch by its size so a shorter final batch does not
        # skew the epoch averages.
        n_samples = labels.size(0)
        train_loss += loss.item() * n_samples
        train_acc += acc * n_samples
        train_seen += n_samples

        print(f"Train Epoch [{epoch + 1}], Batch [{batch_idx + 1}/{len(train_loader)}], Loss: {loss.item()}, Accuracy: {acc}")

    # ---- Validation pass ----
    model.eval()
    val_loss, val_acc, val_seen = 0.0, 0.0, 0
    with torch.no_grad():
        for batch in val_loader:
            inputs = {key: val.to(device) for key, val in batch.items() if key != 'labels'}
            labels = batch['labels'].to(device)

            outputs = model(**inputs)
            logits = outputs.logits

            # Calculate validation loss and accuracy
            loss = loss_fn(logits, labels)
            acc = calculate_accuracy(logits, labels)

            n_samples = labels.size(0)
            val_loss += loss.item() * n_samples
            val_acc += acc * n_samples
            val_seen += n_samples

    # Per-sample averages for the epoch; max(..., 1) avoids a division by
    # zero when a loader is empty.
    train_loss /= max(train_seen, 1)
    train_acc /= max(train_seen, 1)
    val_loss /= max(val_seen, 1)
    val_acc /= max(val_seen, 1)

    print(f"Epoch [{epoch + 1}], Train Loss: {train_loss}, Train Accuracy: {train_acc}")
    print(f"Epoch [{epoch + 1}], Validation Loss: {val_loss}, Validation Accuracy: {val_acc}")

    # Reduce the learning rate when the held-out loss stops improving;
    # training loss can keep falling while the model overfits, so it is a
    # poor plateau signal.
    scheduler.step(val_loss)
    print(f"Epoch [{epoch + 1}], Learning Rate: {optimizer.param_groups[0]['lr']}")

    # Save one checkpoint per epoch. The one-based epoch number goes into the
    # filename so a later epoch does not overwrite an earlier one; with
    # num_epochs = 2 the final file is still "checkpoint_epoch2.pth".
    checkpoint_filename = f"checkpoint_epoch{epoch + 1}.pth"
    checkpoint_path = os.path.join(model_save_path, checkpoint_filename)
    torch.save({
        'epoch': epoch,
        'model_state_dict': model.state_dict(),
        'optimizer_state_dict': optimizer.state_dict(),
    }, checkpoint_path)
    print(f"Checkpoint saved at {checkpoint_filename}")

In [None]:
import torch
from transformers import AutoTokenizer, AutoModelForSequenceClassification
from torch.utils.data import Dataset, DataLoader
import pandas as pd
import os

import pandas as pd

!gdown --id 1-030IdL3oQhqC_JSkZUPGQwkWux74bV4

test = pd.read_csv('test_data_final.csv')

# Set up Google Drive path for saving and loading model checkpoints
from google.colab import drive
drive.mount('/content/drive')
model_save_path = '/content/drive/MyDrive'
checkpoint_path = os.path.join(model_save_path, "Copy of checkpoint_epoch2.pth")

# Check for GPU availability and set the device
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"Using device: {device}")

# Load the pre-trained tokenizer and model
tokenizer = AutoTokenizer.from_pretrained('ashok2216/gpt2-amazon-sentiment-classifier-V1.0')
model = AutoModelForSequenceClassification.from_pretrained('ashok2216/gpt2-amazon-sentiment-classifier-V1.0')

# Configure model for 5 classes and single-label classification
model.config.num_labels = 5
model.config.problem_type = "single_label_classification"  # Ensure correct problem type

# Replace classifier with a new 5-class output layer
in_features = model.classifier.in_features
model.classifier = torch.nn.Linear(in_features, 5)

# Load the saved model checkpoint
if os.path.exists(checkpoint_path):
    checkpoint = torch.load(checkpoint_path)
    model.load_state_dict(checkpoint['model_state_dict'])
    print("Checkpoint loaded. Model is ready for prediction.")
else:
    print("No checkpoint found. Ensure the model was trained and saved.")

# Move model to the device (GPU if available)
model.to(device)
model.eval()  # Set model to evaluation mode

# Custom dataset class for prediction
class AmazonTestDataset(Dataset):
    """Map-style dataset that yields one tokenized review (no label).

    Args:
        data: DataFrame with a 'reviewText' column holding the text to tokenize.
        tokenizer: Callable invoked as
            ``tokenizer(text, return_tensors="pt", padding="max_length",
            truncation=True, max_length=...)`` that returns a mapping of
            tensors.
        max_length: Length each sequence is padded / truncated to.
    """

    def __init__(self, data, tokenizer, max_length=512):
        self.data = data
        self.tokenizer = tokenizer
        self.max_length = max_length

    def __len__(self):
        """Return the number of rows in the wrapped DataFrame."""
        return len(self.data)

    def __getitem__(self, idx):
        """Return the tokenizer outputs for row ``idx`` with the batch axis removed."""
        review = self.data.iloc[idx]['reviewText']
        # A missing review is read by pandas as NaN (a float), which the
        # tokenizer cannot process; treat it as an empty string so every
        # test row still receives a prediction.
        review = "" if pd.isna(review) else str(review)

        # Tokenize the text
        inputs = self.tokenizer(
            review,
            return_tensors="pt",
            padding="max_length",
            truncation=True,
            max_length=self.max_length
        )

        # Drop only the leading batch axis added by return_tensors="pt";
        # a bare squeeze() would also remove any other size-1 axis.
        item = {key: val.squeeze(0) for key, val in inputs.items()}
        return item

# Keep only the 'reviewText' column of the test DataFrame. The explicit copy
# makes this an independent frame, so adding a column to `test` later does
# not trigger pandas' SettingWithCopyWarning.
test = test[['reviewText']].copy()

# Create the test dataset and dataloader
test_dataset = AmazonTestDataset(test, tokenizer)
test_dataloader = DataLoader(test_dataset, batch_size=1)  # Batch size 1 for prediction

# Prediction function
def predict(model, dataloader):
    """Return one-based predicted class labels for every sample in ``dataloader``.

    Args:
        model: Module called as ``model(**batch)`` whose result exposes a
            ``.logits`` tensor with the class axis at dimension 1.
        dataloader: Iterable of batches, each a mapping of keyword name to
            tensor. Any batch size is accepted.

    Returns:
        A flat list of Python ints (argmax index + 1), in iteration order.
    """
    # Run on whatever device the model already lives on rather than relying
    # on a global; a parameter-less model falls back to the CPU.
    first_param = next(model.parameters(), None)
    target_device = first_param.device if first_param is not None else torch.device("cpu")

    predictions = []
    with torch.no_grad():  # Disable gradient computation for faster inference
        for batch in dataloader:
            # Move inputs to the model's device
            inputs = {key: val.to(target_device) for key, val in batch.items()}

            # Forward pass to get logits
            logits = model(**inputs).logits

            # Highest-logit class per row, shifted to one-based labels.
            # tolist() handles a whole batch, unlike .item(), which only
            # works when the batch holds exactly one sample.
            predicted_classes = torch.argmax(logits, dim=1) + 1
            predictions.extend(predicted_classes.tolist())

    return predictions

# Score every row served by the test dataloader
predictions = predict(model=model, dataloader=test_dataloader)

# Store the predicted ratings next to the review text and print the frame
test['PredictedRating'] = predictions
print(test)

In [None]:
# Build a one-column table ('predicted') from the list of predicted ratings
submission_df = pd.DataFrame({'predicted': predictions})

# Write it to the working directory as CSV, without the index column
submission_df.to_csv('q2_submission.csv', index=False)

In [None]:
# Destination for the submission file inside the mounted Google Drive
save_path = '/content/drive/MyDrive/q2_submission.csv'

# Write the same table there as CSV, again without the index column
submission_df.to_csv(path_or_buf=save_path, index=False)

print(f"File saved to {save_path}")