# In [1]:
import os
import pandas as pd
import ollama
from tqdm import tqdm

# In [1]:
# Read the "baseline" sheet of the experiment workbook into a DataFrame.
file_name = "SubjectAppsDataset_exp03.xlsx"
file_path = file_name  # relative path: resolved against the working directory
df = pd.read_excel(file_path, sheet_name="baseline")

# Number of rows in the loaded sheet (first entry of DataFrame.shape).
record_count = df.shape[0]
print(f"Number of records in the DataFrame: {record_count}")

# Number of reviews sent to the model in a single prompt.
BATCH_SIZE = 16

# Labels the prompt asks the model to choose from, and the placeholder used
# when no usable answer can be recovered for a review.
_VALID_LABELS = ("malay", "english", "mix")
_UNKNOWN_LABEL = "unknown"

# Characters stripped from both ends of a reply line before it is compared with
# the valid labels: list numbering ("1.", "2)"), bullets, quotes and whitespace.
_LINE_NOISE = " \t0123456789.):-*•`'\""


def _clean_review(value):
    """Return *value* as stripped text; missing values (None/NaN) become ''."""
    if isinstance(value, str):
        return value.strip()
    if pd.isna(value):
        return ""
    return str(value).strip()


def _extract_labels(raw_reply):
    """Return the valid labels found in *raw_reply*, one per matching line."""
    labels = []
    for line in raw_reply.lower().splitlines():
        token = line.strip(_LINE_NOISE)
        # Lines that are not exactly a label (preamble, explanations, blanks)
        # are ignored instead of being passed through as classifications.
        if token in _VALID_LABELS:
            labels.append(token)
    return labels


# Function to classify a batch of reviews
def classify_language_batch(texts):
    """Classify each review in *texts* as ``malay``, ``english`` or ``mix``.

    The reviews are numbered and sent to the ``llama3.1`` model through
    ``ollama.chat`` in one prompt; the reply is parsed line by line.

    Args:
        texts: Iterable of reviews. Strings are stripped, missing values
            (None/NaN) are sent as empty text, and any other value is
            converted with ``str()``.

    Returns:
        A list of lower-case labels with exactly one entry per review, in
        input order. If the model does not return one valid label per review,
        every review of the batch is re-submitted on its own; a review that
        still yields no single valid label is reported as ``"unknown"``.
        An empty input returns ``[]`` without calling the model.
    """
    texts = list(texts)
    if not texts:
        return []

    reviews_formatted = "\n\n".join(
        f"{number}. {_clean_review(text)}"
        for number, text in enumerate(texts, start=1)
    )
    prompt = f"""
You are a language classification assistant. Classify the following user reviews into one of three categories:

- malay: if the review is entirely in the Malay language.
- english: if the review is entirely in English.
- mix: if the review contains both Malay and English.

Respond with only a list of the classifications in order, one per line.

Reviews:
{reviews_formatted}
"""
    response = ollama.chat(model='llama3.1', messages=[{'role': 'user', 'content': prompt}])
    labels = _extract_labels(response['message']['content'])

    if len(labels) == len(texts):
        return labels
    if len(texts) == 1:
        return [_UNKNOWN_LABEL]
    # The label count does not match the review count, so positions cannot be
    # trusted; classify one review at a time to keep labels aligned with input.
    return [classify_language_batch([text])[0] for text in texts]

# Apply in batches
results = []
# Empty cells are loaded by pandas as NaN (a float, not a string); convert
# every cell to text so the classifier can strip and format each review.
texts = ["" if pd.isna(value) else str(value) for value in df['Normalization']]

for i in tqdm(range(0, len(texts), BATCH_SIZE), desc="Classifying Reviews"):
    batch = texts[i:i + BATCH_SIZE]
    batch_results = classify_language_batch(batch)
    if len(batch_results) != len(batch):
        # A reply with the wrong number of labels cannot be matched to rows.
        # Flag the whole batch instead of shifting every later label or
        # failing on the column assignment after all batches have run.
        tqdm.write(
            f"Rows {i}-{i + len(batch) - 1}: expected {len(batch)} labels, "
            f"got {len(batch_results)}; marking batch as 'unknown'."
        )
        batch_results = ["unknown"] * len(batch)
    results.extend(batch_results)

# Assign to DataFrame (results now holds exactly one label per row)
df['llama'] = results

# --- Save to Excel ---
def save_to_excel(df, path, sheet_name):
    """Write *df* to sheet *sheet_name* of the Excel workbook at *path*.

    When the workbook already exists it is opened in append mode, and a sheet
    already named *sheet_name* is replaced. When *path* does not exist yet,
    a new workbook is created instead of failing. The DataFrame index is not
    written.

    Args:
        df: DataFrame to write.
        path: Workbook location (string or path-like); other objects are
            handed to pandas in append mode, as before.
        sheet_name: Name of the sheet to create or replace.
    """
    is_filesystem_path = isinstance(path, (str, os.PathLike))
    if is_filesystem_path and not os.path.exists(path):
        # Append mode requires an existing file, and pandas only accepts
        # if_sheet_exists together with mode='a'.
        writer_options = {'mode': 'w'}
    else:
        writer_options = {'mode': 'a', 'if_sheet_exists': 'replace'}

    with pd.ExcelWriter(path, engine='openpyxl', **writer_options) as writer:
        df.to_excel(writer, sheet_name=sheet_name, index=False)

# Write the DataFrame back to the "baseline" sheet of the source workbook.
sheet_name = "baseline"
save_to_excel(df, path=file_path, sheet_name=sheet_name)

# --- Final Status ---
print(f"\n✅ File saved to '{file_path}' in sheet '{sheet_name}'.")