# Training two models for creating an ensemble model


In [2]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

/kaggle/input/redmi-6-pro/redmi6.csv


In [3]:
import pandas as pd

# Read the CSV file
df = pd.read_csv('/kaggle/input/redmi-6-pro/redmi6.csv', encoding='cp1252')

# If your data is from a text file like shown earlier, you can use:


In [4]:
import wandb
from kaggle_secrets import UserSecretsClient
user_secrets = UserSecretsClient() 

personal_key_for_api = user_secrets.get_secret("wandb-key")

! wandb login $personal_key_for_api

[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc


In [6]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
import torch
from transformers import (
    RobertaTokenizer,
    RobertaForSequenceClassification,
    Trainer,
    TrainingArguments,
    DataCollatorWithPadding
)
from datasets import Dataset
from sklearn.metrics import accuracy_score, precision_recall_fscore_support

class SentimentFineTuner:
    def __init__(self, model_name="roberta-base"):
        self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
        print(f"Using device: {self.device}")
        
        # Initialize tokenizer and model
        self.tokenizer = RobertaTokenizer.from_pretrained(model_name)
        self.model = RobertaForSequenceClassification.from_pretrained(
            model_name,
            num_labels=3  # For negative, neutral, positive
        ).to(self.device)
        
    def prepare_data(self, csv_file):
        """Load and prepare data for training"""
        # Try different encodings to load the CSV
        try:
            df = pd.read_csv(csv_file, encoding='utf-8')
        except UnicodeDecodeError:
            try:
                df = pd.read_csv(csv_file, encoding='cp1252')
            except UnicodeDecodeError:
                df = pd.read_csv(csv_file, encoding='latin-1')
        
        # Convert ratings to labels (0: negative, 1: neutral, 2: positive)
        df['label'] = df['Rating'].apply(
            lambda x: 2 if float(x.split()[0]) >= 4 
            else 1 if float(x.split()[0]) >= 3 
            else 0
        )
        
        # Split data into train and validation sets
        train_df, val_df = train_test_split(df, test_size=0.2, random_state=42)
        
        # Convert to datasets
        train_dataset = Dataset.from_pandas(train_df)
        val_dataset = Dataset.from_pandas(val_df)
        
        # Tokenize datasets
        train_dataset = train_dataset.map(
            self._tokenize_function,
            batched=True,
            remove_columns=train_dataset.column_names
        )
        val_dataset = val_dataset.map(
            self._tokenize_function,
            batched=True,
            remove_columns=val_dataset.column_names
        )
        
        return train_dataset, val_dataset
    
    def _tokenize_function(self, examples):
        """Tokenize the texts and include labels in the output"""
        tokenized_output = self.tokenizer(
            examples['Comments'],
            padding='max_length',
            truncation=True,
            max_length=512
        )
        tokenized_output['label'] = examples['label']  # Add the label here
        return tokenized_output
    
    def _compute_metrics(self, eval_pred):
        """Compute evaluation metrics"""
        predictions, labels = eval_pred
        predictions = np.argmax(predictions, axis=1)
        
        precision, recall, f1, _ = precision_recall_fscore_support(
            labels, 
            predictions, 
            average='weighted'
        )
        accuracy = accuracy_score(labels, predictions)
        
        return {
            'accuracy': accuracy,
            'f1': f1,
            'precision': precision,
            'recall': recall
        }
    
    def train(self, train_dataset, val_dataset, output_dir="./roberta_sentiment"):
        """Fine-tune the model"""
        # Define training arguments
        training_args = TrainingArguments(
            output_dir=output_dir,
            num_train_epochs=3,
            per_device_train_batch_size=8,
            per_device_eval_batch_size=8,
            warmup_steps=500,
            weight_decay=0.01,
            logging_dir='./logs',
            logging_steps=10,
            evaluation_strategy="epoch",
            save_strategy="epoch",
            load_best_model_at_end=True,
            metric_for_best_model="f1",
            learning_rate=2e-5,
        )
        
        # Initialize trainer
        trainer = Trainer(
            model=self.model,
            args=training_args,
            train_dataset=train_dataset,
            eval_dataset=val_dataset,
            compute_metrics=self._compute_metrics,
            data_collator=DataCollatorWithPadding(self.tokenizer),
        )
        
        # Train the model
        print("Starting training...")
        trainer.train()
        
        # Save the model
        print(f"Saving model to {output_dir}")
        trainer.save_model()
        self.tokenizer.save_pretrained(output_dir)
        
        # Evaluate the model
        print("Evaluating model...")
        eval_results = trainer.evaluate()
        print(f"Evaluation results: {eval_results}")
        
        return trainer, eval_results

# Example usage

# Initialize fine-tuner
fine_tuner = SentimentFineTuner()

# Prepare data
train_dataset, val_dataset = fine_tuner.prepare_data('/kaggle/input/redmi-6-pro/redmi6.csv')

# Train model
trainer, eval_results = fine_tuner.train(
    train_dataset, 
    val_dataset,
    output_dir='./roberta_smartphone_sentiment'
)


Using device: cuda


Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at roberta-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Map:   0%|          | 0/224 [00:00<?, ? examples/s]

Map:   0%|          | 0/56 [00:00<?, ? examples/s]



Starting training...


Epoch,Training Loss,Validation Loss,Accuracy,F1,Precision,Recall
1,1.1091,1.096528,0.410714,0.460867,0.702756,0.410714
2,1.0684,1.047843,0.785714,0.691429,0.617347,0.785714
3,0.9776,0.955018,0.785714,0.691429,0.617347,0.785714


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


Saving model to ./roberta_smartphone_sentiment
Evaluating model...


Evaluation results: {'eval_loss': 1.0478427410125732, 'eval_accuracy': 0.7857142857142857, 'eval_f1': 0.6914285714285714, 'eval_precision': 0.6173469387755102, 'eval_recall': 0.7857142857142857, 'eval_runtime': 0.8668, 'eval_samples_per_second': 64.603, 'eval_steps_per_second': 8.075, 'epoch': 3.0}


  _warn_prf(average, modifier, msg_start, len(result))


In [None]:
from datasets import load_dataset
ds = load_dataset("Yelp/yelp_review_full")


In [None]:
def reduce_dataset(dataset, fraction=0.1):
    """
    Reduce the dataset size by taking a fraction of the original data
    """
    for split in dataset.keys():
        # Calculate the new size
        new_size = int(len(dataset[split]) * fraction)
        # Select the first new_size examples
        dataset[split] = dataset[split].select(range(new_size))
    return dataset


In [None]:
ds = reduce_dataset(ds, fraction=0.1)

In [None]:
ds["train"][0]

In [None]:
def transform_ratings_to_sentiment(dataset):
    """
    Transform star ratings into sentiment categories and encode them

    Args:
        dataset: A Hugging Face dataset with a 'label' column containing star ratings

    Returns:
        The dataset with transformed and encoded sentiment labels
    """
    def map_stars_to_sentiment(example):
        rating = example['label']

        # Handle different possible formats of star ratings
        if isinstance(rating, str):
            rating = rating.lower().strip()
            if '1 star' in rating or '2 stars' in rating:
                sentiment = 'negative'
                encoded_sentiment = 0
            elif '3 stars' in rating:
                sentiment = 'neutral'
                encoded_sentiment = 1
            elif '4 stars' in rating or '5 stars' in rating:
                sentiment = 'positive'
                encoded_sentiment = 2
            else:
                sentiment = 'unknown'
                encoded_sentiment = -1
        elif isinstance(rating, (int, float)):
            if rating <= 2:
                sentiment = 'negative'
                encoded_sentiment = 0
            elif rating == 3:
                sentiment = 'neutral'
                encoded_sentiment = 1
            elif rating >= 4:
                sentiment = 'positive'
                encoded_sentiment = 2
            else:
                sentiment = 'unknown'
                encoded_sentiment = -1
        else:
            sentiment = 'unknown'
            encoded_sentiment = -1

        example['sentiment'] = sentiment
        example['encoded_sentiment'] = encoded_sentiment
        return example

    # Apply the transformation to the dataset
    transformed_dataset = dataset.map(map_stars_to_sentiment)

    # Create a mapping dictionary for reference
    sentiment_encoding = {
        'negative': 0,
        'neutral': 1,
        'positive': 2,
        'unknown': -1
    }

    print("Sentiment encoding mapping:")
    for sentiment, code in sentiment_encoding.items():
        print(f"{sentiment}: {code}")

    return transformed_dataset

In [None]:
tr_ds = transform_ratings_to_sentiment(ds)

In [None]:
tr_ds = tr_ds.remove_columns("label")

In [None]:
tr_ds = tr_ds.remove_columns("sentiment")

In [None]:
tr_ds = tr_ds.rename_column("encoded_sentiment", "labels")

In [None]:
from transformers import AutoTokenizer, AutoModelForSequenceClassification, get_scheduler, AdamW, DataCollatorWithPadding
from datasets import load_dataset, Dataset, DatasetDict
from torch.utils.data import DataLoader
import torch
from tqdm.auto import tqdm


In [None]:
checkpoint1 = "roberta-base"

tokenizer1 = AutoTokenizer.from_pretrained(checkpoint1)
def tokenize_function1(examples):
    return tokenizer1(
        examples['text'],
        padding='max_length',
        truncation=True,
        max_length=512,
        return_tensors=None  # This ensures we get a dictionary of lists
    )

tokenized_dataset1 = tr_ds.map(tokenize_function1, batched=True)


In [None]:
tokenized_dataset1 =tokenized_dataset1.remove_columns(["text"])
tokenized_dataset1.set_format("torch")
tokenized_dataset1["train"].column_names


In [None]:
data_collator1 = DataCollatorWithPadding(tokenizer=tokenizer1)

train_dataloader1 = DataLoader(
    tokenized_dataset1["train"], shuffle=True, batch_size=8, collate_fn=data_collator1
)

test_dataloader1 = DataLoader(
    tokenized_dataset1["test"], batch_size =8, collate_fn=data_collator1
)

In [None]:
for batch in train_dataloader1:
  break
{k: v.shape for k, v in batch.items()}

In [None]:
model1 = AutoModelForSequenceClassification.from_pretrained(checkpoint1, num_labels=3)

In [None]:
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
model1.to(device)
device


In [None]:
optimizer = AdamW(model1.parameters(), lr=5e-5)
num_epochs = 1
num_training_steps = num_epochs * len(train_dataloader1)
lr_scheduler = get_scheduler(
    "linear",
    optimizer = optimizer,
    num_warmup_steps=0,
    num_training_steps = num_training_steps,

)
print(num_training_steps)

In [None]:
progress_bar = tqdm(range(num_training_steps))

model1.train()
for epoch in range(num_epochs):
  for batch in train_dataloader1:
    batch = {k: v.to(device) for k, v in batch.items()}
    outputs = model1(**batch)
    loss = outputs.loss
    loss.backward()

    optimizer.step()
    lr_scheduler.step()
    optimizer.zero_grad()
    progress_bar.update(1)

In [None]:
model1.save_pretrained("./saved_model1")

In [None]:
!pip install evaluate
import evaluate
metric = evaluate.load("accuracy")
model1.eval()
for batch in test_dataloader1:
    batch = {k: v.to(device) for k, v in batch.items()}
    with torch.no_grad():
        outputs = model1(**batch)

    logits = outputs.logits
    predictions = torch.argmax(logits, dim=-1)
    metric.add_batch(predictions=predictions, references=batch["labels"])

metric.compute()

