In [1]:
import pandas as pd
from transformers import pipeline, AutoModelForCausalLM, AutoTokenizer, Trainer, TrainingArguments
import torch
import json
import os
from peft import LoraConfig, get_peft_model
from datasets import Dataset




In [2]:
# Load data from Excel file 
df = pd.read_csv("500_review.csv")
test_df = pd.read_csv("test_review.csv")

In [3]:
df.head()

Unnamed: 0.1,Unnamed: 0,rating,reviewText,summary,review,word_count
0,0,5,This book was the very first bookmobile book I...,50 + years ago...,Rating: 5 Title: 50 + years ago... Content: Th...,69
1,3,5,I don't normally buy 'mystery' novels because ...,Very good read.,Rating: 5 Title: Very good read. Content: I do...,73
2,4,5,"This isn't the kind of book I normally read, a...",Great Story!,Rating: 5 Title: Great Story! Content: This is...,77
3,6,3,I bought this book because I loved the cover a...,Hot Civil War Read... I wanted more Romance,Rating: 3 Title: Hot Civil War Read... I wante...,123
4,7,5,This was a book that I thoroughly enjoyed from...,Wow and wonderful read with a twist,Rating: 5 Title: Wow and wonderful read with a...,81


In [4]:
test_df.head()

Unnamed: 0.1,Unnamed: 0,rating,reviewText,summary,review,word_count
0,541,4,This book begins with an overview of how your ...,"Short, but has interesting information on how ...","Rating: 4 Title: Short, but has interesting in...",266
1,542,4,I gave it a four only because it lagged in the...,Surprisingly Good,Rating: 4 Title: Surprisingly Good Content: I ...,52
2,543,5,How anyone could possibly complain about getti...,A whole lot of magic for less than a dollar,Rating: 5 Title: A whole lot of magic for less...,172
3,545,1,This Is The Worse Book I Have Ever Read!!!! Oh...,The Worse Book I Have Ever Read!!!!,Rating: 1 Title: The Worse Book I Have Ever Re...,43
4,546,3,I picked this up as a freebie on my Amazon Kin...,Something to read but not wonderful,Rating: 3 Title: Something to read but not won...,149


In [None]:
#test_reviews = test_df["reviewText"].tolist()

# Test Base Model

The base model doesn't really give any meaningful rating. It was unable to perform sentiment analysis.

In [5]:
# Load model and tokenizer
checkpoint = "HuggingFaceTB/SmolLM2-1.7B-Instruct"
device = "cuda" if torch.cuda.is_available() else "cpu" # cpu or gpu depend on availability

tokenizer = AutoTokenizer.from_pretrained(checkpoint)
model = AutoModelForCausalLM.from_pretrained(checkpoint).to(device)  # or "cpu" based on your setup

# Function to generate model output
def get_model_response(review_text, device):
    # Format the question (rating prediction example)
    question = f"Review: {review_text}\n\nSentiment: Please provide a sentiment rating from 1 to 5, where:\n1 = Very Negative, 2 = Negative, 3 = Neutral, 4 = Positive, 5 = Very Positive.\n\nRating:"
    
    # Tokenize input
    inputs = tokenizer(question, return_tensors="pt").to(device)
    
    # Generate model output
    outputs = model.generate(inputs.input_ids, max_new_tokens=50, temperature=0.7, top_p=0.9, do_sample=True)
    
    # Decode the output
    response = tokenizer.decode(outputs[0], skip_special_tokens=True)
    
    return response

# Test set predictions
predictions = []
for idx, row in test_df.iterrows():
    review_text = row['reviewText']  # Replace with your actual column name for review text
    model_response = get_model_response(review_text, device)  # Replace with "cpu" if using CPU
    predictions.append(model_response)

# Add predictions to DataFrame
test_df['based_response'] = predictions

test_df.head()

The attention mask is not set and cannot be inferred from input because pad token is same as eos token. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.


Unnamed: 0.1,Unnamed: 0,rating,reviewText,summary,review,word_count,based_response
0,541,4,This book begins with an overview of how your ...,"Short, but has interesting information on how ...","Rating: 4 Title: Short, but has interesting in...",266,Review: This book begins with an overview of h...
1,542,4,I gave it a four only because it lagged in the...,Surprisingly Good,Rating: 4 Title: Surprisingly Good Content: I ...,52,Review: I gave it a four only because it lagge...
2,543,5,How anyone could possibly complain about getti...,A whole lot of magic for less than a dollar,Rating: 5 Title: A whole lot of magic for less...,172,Review: How anyone could possibly complain abo...
3,545,1,This Is The Worse Book I Have Ever Read!!!! Oh...,The Worse Book I Have Ever Read!!!!,Rating: 1 Title: The Worse Book I Have Ever Re...,43,Review: This Is The Worse Book I Have Ever Rea...
4,546,3,I picked this up as a freebie on my Amazon Kin...,Something to read but not wonderful,Rating: 3 Title: Something to read but not won...,149,Review: I picked this up as a freebie on my Am...


In [6]:
print(test_df.loc[5, 'based_response'])

Review: This book to me was sweet but way too short there was just so much more i need to know about what happens in their lives. I have enjoyed many books by Pepper Espinoza and like them all this was beautifully written. Josh and Peter's tale of finding each other was heart warming. Sometimes the thing you need most in you life is standing right in front of you and you just need to open your eye's and see it. I liked the fact that Josh and Peter Both found what they needed and now i just need to plead with Pepper "I need More"...

Sentiment: Please provide a sentiment rating from 1 to 5, where:
1 = Very Negative, 2 = Negative, 3 = Neutral, 4 = Positive, 5 = Very Positive.

Rating:


In [7]:
sample = pd.read_csv("sample_review.csv")

# Fine-tuning

In [8]:
# Define model checkpoint
checkpoint = "HuggingFaceTB/SmolLM2-1.7B-Instruct"

# Load tokenizer
tokenizer = AutoTokenizer.from_pretrained(checkpoint)

# Load base model
model = AutoModelForCausalLM.from_pretrained(
    checkpoint, 
    torch_dtype=torch.float16,  # Use float16 for efficiency
    device_map="auto"  # Auto-select GPU/CPU
)

In [9]:
# Define LoRA config
lora_config = LoraConfig(
    r=8,  # Rank of LoRA update matrices
    lora_alpha=32,  # Scaling factor
    lora_dropout=0.1,  # Dropout probability
    target_modules=["q_proj", "v_proj"],  # Fine-tune attention layers only
    bias="none",
    task_type="CAUSAL_LM"
)

# Wrap model with LoRA adapter
model = get_peft_model(model, lora_config)
model.print_trainable_parameters()

The installed version of bitsandbytes was compiled without GPU support. 8-bit optimizers, 8-bit multiplication, and GPU quantization are unavailable.


trainable params: 1,572,864 || all params: 1,712,949,248 || trainable%: 0.0918


In [13]:
dataset = Dataset.from_pandas(sample[['reviewText', 'rating']])

dataset = dataset.rename_column('reviewText', 'text')  # Rename the text column to 'text'
dataset = dataset.rename_column('rating', 'label')    # Rename the rating column to 'label' (or keep it as 'rating' if it's for regression)

def tokenize_function(examples):
    return tokenizer(examples["text"], padding="max_length", truncation=True)

tokenized_datasets = dataset.map(tokenize_function, batched=True)
#train_dataset = tokenized_datasets

Map:   0%|          | 0/5 [00:00<?, ? examples/s]

In [14]:
tokenized_datasets

Dataset({
    features: ['text', 'label', 'input_ids', 'attention_mask'],
    num_rows: 5
})

In [None]:
from transformers import TrainingArguments, Trainer

training_args = TrainingArguments(
    output_dir="./finetuned_model",  
    per_device_train_batch_size=4,  
    num_train_epochs=3,  
    learning_rate=2e-4,  
    fp16=True,  
    save_strategy="epoch",  
    logging_dir="./logs"
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_datasets
)

# Start fine-tuning with LoRA
trainer.train()

# Save Model

In [None]:
model.save_pretrained("finetuned_smolLM")
tokenizer.save_pretrained("finetuned_smolLM")

model.push_to_hub("your-hf-username/finetuned_smolLM")
tokenizer.push_to_hub("your-hf-username/finetuned_smolLM")

# Test Fine-tuned Model

In [None]:
# Load model and tokenizer
checkpoint = "HuggingFaceTB/SmolLM2-1.7B-Instruct"
device = "cuda" if torch.cuda.is_available() else "cpu" # cpu or gpu depend on availability

tokenizer = AutoTokenizer.from_pretrained(checkpoint)
model = AutoModelForCausalLM.from_pretrained(checkpoint).to(device)  # or "cpu" based on your setup

# Function to generate model output
def get_model_response(review_text, device):
    # Format the question (rating prediction example)
    question = f"Review: {review_text}\n\nSentiment: Please provide a sentiment rating from 1 to 5, where:\n1 = Very Negative, 2 = Negative, 3 = Neutral, 4 = Positive, 5 = Very Positive.\n\nRating:"
    
    # Tokenize input
    inputs = tokenizer(question, return_tensors="pt").to(device)
    
    # Generate model output
    outputs = model.generate(inputs.input_ids, max_new_tokens=50, temperature=0.7, top_p=0.9, do_sample=True)
    
    # Decode the output
    response = tokenizer.decode(outputs[0], skip_special_tokens=True)
    
    return response

# Test set predictions
predictions = []
for idx, row in test_df.iterrows():
    review_text = row['reviewText']  # Replace with your actual column name for review text
    model_response = get_model_response(review_text, device)  # Replace with "cpu" if using CPU
    predictions.append(model_response)

# Add predictions to DataFrame
test_df['finetuned_response'] = predictions

test_df.head()