### Walkthrough

To show the inner working of the code, we use a small subset of the data.

### Import Required Packages

In [11]:
import pandas as pd
import os
import spacy
import re
from transformers import pipeline
from huggingface_hub import InferenceClient

### Load Sample

In [12]:
data_path = '../data/clean/cleaned_combined_reviews.csv'
df = pd.read_csv(data_path)

df2 = df.iloc[1500:1550].copy()
df2.shape
df2.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 50 entries, 1500 to 1549
Data columns (total 3 columns):
 #   Column       Non-Null Count  Dtype 
---  ------       --------------  ----- 
 0   user_name    50 non-null     object
 1   review_text  50 non-null     object
 2   rating       50 non-null     int64 
dtypes: int64(1), object(2)
memory usage: 1.3+ KB


### Label data

In [13]:
client = InferenceClient(
    provider="nscale",
    api_key=os.environ["HF_TOKEN"],
)

FEW_SHOT_EXAMPLES = """
You are a system that classifies Google location reviews into one of four categories:
- Ad: Promotional or advertisement content.
- Rant: Angry or exaggerated complaints, often with excessive punctuation or all-caps.
- Irrelevant: Not related to the location being reviewed.
- Valid: A genuine and relevant review about the location.

Examples:
Review: "Best pizza in town! Fresh ingredients and great service."
Label: Valid

Review: "BUY ONE GET ONE FREE! Come to my shop now, limited offer!"
Label: Ad

Review: "THIS PLACE IS THE WORST!!! NEVER COMING BACK. HORRIBLE SERVICE!!!!!"
Label: Rant

Review: "I think the government is doing a terrible job with taxes."
Label: Irrelevant
"""

def create_batch_prompt(reviews):
    reviews_list = "\n".join(
        [f"Review {i+1}: {r}" for i, r in enumerate(reviews)]
    )
    return f"""{FEW_SHOT_EXAMPLES}
    Now classify the following reviews:
    {reviews_list}

    Output format:
    Review 1: <Label>
    Review 2: <Label>
    ...
    """

def classify_batch(reviews):
    prompt = create_batch_prompt(reviews)
    response = client.chat.completions.create(
        model="Qwen/Qwen3-4B-Instruct-2507",
        messages=[{"role": "user", "content": prompt}],
        temperature=0
    )
    return response.choices[0].message["content"]

def label_reviews(df, text_col="review_text", batch_size=100, max_reviews=1000):
    labeled = []
    reviews = df[text_col].astype(str).tolist()[:max_reviews]
    
    for i in range(0, len(reviews), batch_size):
        batch = reviews[i:i+batch_size]
        output = classify_batch(batch)

        # Parse output: expects "Review 1: Valid" style
        for line in output.splitlines():
            if line.strip() and line.startswith("Review"):
                try:
                    idx, label = line.split(":", 1)
                    labeled.append(label.strip())
                except:
                    labeled.append("Unknown")
        
        print(f"Processed {i+len(batch)} / {len(reviews)}")
    
    df = df.iloc[:max_reviews].copy()
    df["label"] = labeled
    return df

In [14]:
df2 = label_reviews(df2)
print(df2.info())
print(df2.head())

Processed 50 / 50
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 50 entries, 1500 to 1549
Data columns (total 4 columns):
 #   Column       Non-Null Count  Dtype 
---  ------       --------------  ----- 
 0   user_name    50 non-null     object
 1   review_text  50 non-null     object
 2   rating       50 non-null     int64 
 3   label        50 non-null     object
dtypes: int64(1), object(3)
memory usage: 1.7+ KB
None
              user_name                                        review_text  \
1500            Craig H  I’ve had my Sanctuary 2 sauna and red light fo...   
1501   Dr Harold Patino  What can I say! Julie was fantastic and the pr...   
1502   Lauren Kilbourne  Thank you so much Julie for your support in pu...   
1503  Natural Baby Mama  Julie has been amazing throughout the entire b...   
1504     Holly McGreevy  Working with Julie was an absolute pleasure! J...   

      rating  label  
1500       5  Valid  
1501       5  Valid  
1502       5  Valid  
1503       5  Val

### Feature Extraction

In [15]:
def clean_text(text):
    text = text.lower()  # Convert to lowercase
    text = re.sub(r'\d+', '', text)  # Remove digits
    text = re.sub(r'[^\w\s]', '', text)  # Remove punctuation
    return text

nlp = spacy.load("en_core_web_sm")

def tokenize(text):
    doc = nlp(text)
    return [token.text for token in doc if not token.is_stop and not token.is_punct]

def all_caps_ratio(text):
    words = text.split()
    caps_words = [word for word in words if word.isupper()]
    return len(caps_words) / len(words) if len(words) > 0 else 0

sentiment_analyzer = pipeline("sentiment-analysis")

def calculate_relevancy_score(text):
    # Can adjust this score based on sentiment, length, and keyword presence
    # For example, the higher the sentiment score and review length, the more relevant it is
    sentiment_score = sentiment_analyzer(text)[0]['score']
    review_length = len(text.split())
    caps_ratio = all_caps_ratio(text)
    
    # Simple heuristic: higher sentiment, longer review, lower caps_ratio = more trustworthy
    relevancy_score = (sentiment_score * 40) + (review_length * 0.5) - (caps_ratio * 20)
    return min(max(int(relevancy_score), 0), 100)  # Ensure the score is between 0 and 100
import torch
import pandas as pd
from transformers import AutoModelForSequenceClassification, AutoTokenizer
from torch.utils.data import Dataset, DataLoader

# -----------------------
# Load local model + tokenizer
# -----------------------

model_path = "../models/trained_model"

model = AutoModelForSequenceClassification.from_pretrained(
    pretrained_model_name_or_path=model_path,
    local_files_only=True,
    trust_remote_code=False
)

tokenizer = AutoTokenizer.from_pretrained(
    pretrained_model_name_or_path=model_path,
    local_files_only=True,
    trust_remote_code=False
)

# -----------------------
# Define labels (must match training)
# -----------------------
LABELS = [
    'admiration','amusement','anger','annoyance','approval','caring',
    'confusion','curiosity','desire','disappointment','disapproval','disgust',
    'embarrassment','excitement','fear','gratitude','grief','joy','love',
    'nervousness','optimism','pride','realization','relief','remorse',
    'sadness','surprise','neutral'
]

# -----------------------
# Dataset class for inference
# -----------------------
class InferenceDataset(Dataset):
    def __init__(self, dataframe, tokenizer, max_length=128):
        self.dataframe = dataframe
        self.tokenizer = tokenizer
        self.max_length = max_length

    def __len__(self):
        return len(self.dataframe)

    def __getitem__(self, idx):
        text = str(self.dataframe.iloc[idx]['review_text'])
        encoding = self.tokenizer(
            text,
            truncation=True,
            padding='max_length',
            max_length=self.max_length,
            return_tensors='pt'
        )
        return {
            'input_ids': encoding['input_ids'].squeeze(0),
            'attention_mask': encoding['attention_mask'].squeeze(0),
        }

# -----------------------
# Inference function
# -----------------------
def get_predictions_for_dataframe(
    dataframe, model, tokenizer, batch_size=16, device=None,
    multi_label=True, threshold=0.5
):
    """
    Run inference on dataframe["text"] using model.
    
    multi_label=True  -> sigmoid + threshold (multi-label classification)
    multi_label=False -> softmax + argmax (single-label classification)
    """
    if device is None:
        device = 'cuda' if torch.cuda.is_available() else 'cpu'
    model.to(device)

    dataset = InferenceDataset(dataframe, tokenizer)
    loader = DataLoader(dataset, batch_size=batch_size)

    model.eval()
    predictions = []

    with torch.no_grad():
        for batch in loader:
            input_ids = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)

            outputs = model(input_ids=input_ids, attention_mask=attention_mask)
            logits = outputs.logits

            if multi_label:
                # multi-label: sigmoid + threshold
                probs = torch.sigmoid(logits)
                batch_predictions = (probs > threshold).int().cpu().tolist()
            else:
                # single-label: softmax + argmax
                probs = torch.softmax(logits, dim=-1)
                batch_predictions = torch.argmax(probs, dim=-1).cpu().tolist()

            predictions.extend(batch_predictions)

    if multi_label:
        # One column per label (0/1)
        predictions_df = pd.DataFrame(predictions, columns=LABELS)

        # Add human-readable labels
        readable_labels = []
        for row in predictions_df.values:
            active = [LABELS[i] for i, val in enumerate(row) if val == 1]
            readable_labels.append(", ".join(active) if active else "none")
        predictions_df["predicted_labels"] = readable_labels

        result_df = pd.concat([dataframe.reset_index(drop=True), predictions_df], axis=1)
    else:
        # Single-label: just map index -> label
        readable_labels = [LABELS[idx] for idx in predictions]
        result_df = dataframe.copy().reset_index(drop=True)
        result_df["predicted_label"] = readable_labels

    return result_df["predicted_labels"]

def engineer_features(df):
    df['cleaned_review_text'] = df['review_text'].apply(clean_text)
    df['tokenized_review'] = df['cleaned_review_text'].apply(tokenize)
    df['review_length'] = df['tokenized_review'].apply(len)
    df['all_caps_ratio'] = df['review_text'].apply(all_caps_ratio)
    df['sentiment'] = df['cleaned_review_text'].apply(lambda x: sentiment_analyzer(x)[0]['label'])
    df['relevancy_score'] = df['review_text'].apply(calculate_relevancy_score)
    emotions = get_predictions_for_dataframe(df, model, tokenizer, multi_label=True)
    df["emotions"] = emotions
    return df



No model was supplied, defaulted to distilbert/distilbert-base-uncased-finetuned-sst-2-english and revision 714eb0f (https://huggingface.co/distilbert/distilbert-base-uncased-finetuned-sst-2-english).
Using a pipeline without specifying a model name and revision in production is not recommended.
Device set to use mps:0


In [16]:
df2 = engineer_features(df2)
df2.info()
df2.head()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 50 entries, 1500 to 1549
Data columns (total 11 columns):
 #   Column               Non-Null Count  Dtype  
---  ------               --------------  -----  
 0   user_name            50 non-null     object 
 1   review_text          50 non-null     object 
 2   rating               50 non-null     int64  
 3   label                50 non-null     object 
 4   cleaned_review_text  50 non-null     object 
 5   tokenized_review     50 non-null     object 
 6   review_length        50 non-null     int64  
 7   all_caps_ratio       50 non-null     float64
 8   sentiment            50 non-null     object 
 9   relevancy_score      50 non-null     int64  
 10  emotions             0 non-null      object 
dtypes: float64(1), int64(3), object(7)
memory usage: 4.4+ KB


Unnamed: 0,user_name,review_text,rating,label,cleaned_review_text,tokenized_review,review_length,all_caps_ratio,sentiment,relevancy_score,emotions
1500,Craig H,I’ve had my Sanctuary 2 sauna and red light fo...,5,Valid,ive had my sanctuary sauna and red light for ...,"[ve, sanctuary, , sauna, red, light, month, l...",17,0.0,POSITIVE,55,
1501,Dr Harold Patino,What can I say! Julie was fantastic and the pr...,5,Valid,what can i say julie was fantastic and the pro...,"[julie, fantastic, product, , sactuary, , fa...",34,0.029851,POSITIVE,72,
1502,Lauren Kilbourne,Thank you so much Julie for your support in pu...,5,Valid,thank you so much julie for your support in pu...,"[thank, julie, support, purchasing, clearlight...",56,0.034188,POSITIVE,97,
1503,Natural Baby Mama,Julie has been amazing throughout the entire b...,5,Valid,julie has been amazing throughout the entire b...,"[julie, amazing, entire, buying, process, , w...",26,0.0,POSITIVE,65,
1504,Holly McGreevy,Working with Julie was an absolute pleasure! J...,5,Valid,working with julie was an absolute pleasure ju...,"[working, julie, absolute, pleasure, julie, ho...",34,0.080645,POSITIVE,69,


In [17]:
df2.to_csv("../data/walk_through/walk_through.csv", index=False)