In [1]:
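# Essential packages (uncomment and run once in a fresh environment)
# pip install transformers scikit-learn matplotlib seaborn wordcloud tqdm
# pip install tensorflow pandas numpy  # also imported by this notebook
# pip install torch torchaudio torchvision  # torch is needed because the model is loaded with from_pt=True
# pip install charset-normalizer
# !pip install --upgrade jupyter ipywidgets
# !jupyter lab build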

In [2]:
import os
import re
import json
import math
from pathlib import Path
from collections import defaultdict
import numpy as np
import pandas as pd
from tqdm import tqdm
import matplotlib.pyplot as plt
import seaborn as sns
import tensorflow as tf
from sklearn.preprocessing import MinMaxScaler
from transformers import AutoTokenizer, TFAutoModelForSequenceClassification



In [3]:
# --- Model / inference settings ---
# Hugging Face checkpoint name passed to from_pretrained().
transformer_model = "distilbert-base-uncased-finetuned-sst-2-english"
# Default number of texts sent through the model per batch.
batch_size = 32

# --- Thresholds used to flag a review as positive ---
# Compared against the numeric rating column.
rating_positive_threshold = 4.0
# Compared against the 0-1 text sentiment score.
sentiment_pos_threshold = 0.6

# --- Blend weights ---
# Weights of the normalised rating and the text sentiment in the satisfaction score.
weight_rating = 0.6
weight_text = 0.4
# Weight of average satisfaction in the per-company composite scores.
alpha_composite = 0.7

# --- Not referenced in the visible cells ---
# NOTE(review): presumably intended for a later LDA topic-modelling step; confirm or remove.
lda_n_topics = 4
lda_max_features = 2000
min_reviews_lda = 10
random_state = 42

In [4]:
# Directory that is searched for the review CSV files.
input_dir = Path(".")

# Directory that receives the generated CSV reports; created up front if missing.
out_dir = Path("Output")
out_dir.mkdir(parents=True, exist_ok=True)

# File names (relative to input_dir) that the pipeline tries to load.
csv_files = [
    "ACER_REVIEWS_FIXED.csv",
    "ASUS_REVIEWS_FIXED.csv",
    "DELL_REVIEWS_FIXED.csv",
    "LENOVO_REVIEWS_FIXED.csv",
]

In [5]:
def find_csv(input_dir: Path, filenames=None):
    """Return the paths of the expected CSV files that exist in ``input_dir``.

    Parameters
    ----------
    input_dir : Path or str
        Directory to look in.
    filenames : iterable of str, optional
        File names to look for. When omitted, the module-level ``csv_files``
        list is used, which keeps existing one-argument calls working.

    Returns
    -------
    list of Path
        ``input_dir / name`` for every name that is a regular file, in the
        order the names were given. Missing names are silently skipped.
    """
    names = csv_files if filenames is None else filenames
    base = Path(input_dir)
    # is_file() rather than exists(): a directory that happens to carry one of
    # the expected names cannot be read as a CSV, so it must not be returned.
    return [base / name for name in names if (base / name).is_file()]

In [6]:
def infer_company_name(name: str):
    """Guess the laptop brand from a file name or other label.

    The match is a case-insensitive substring search. Brands are tried in the
    fixed order acer, asus, dell, lenovo and the first hit is returned; an
    empty string is returned when none of them occurs in ``name``.
    """
    lowered = name.lower()
    known_brands = ("acer", "asus", "dell", "lenovo")
    return next((brand for brand in known_brands if brand in lowered), "")

In [7]:
def load_concat(files):
    """Read several review CSV files and stack them into one DataFrame.

    Each file is read as UTF-8 first; if that fails with a decoding error it
    is re-read as Latin-1. Every frame gets a ``__source_file`` column holding
    the file's base name, and a ``company`` column (inferred from the file
    name via ``infer_company_name``) when the file does not provide one.

    Parameters
    ----------
    files : iterable of Path or str
        CSV files to load.

    Returns
    -------
    pandas.DataFrame
        All rows of all files with a fresh 0..n-1 index.

    Raises
    ------
    ValueError
        If ``files`` is empty.
    """
    # Accept plain strings as well as Path objects; .name is needed below.
    paths = [Path(f) for f in files]
    if not paths:
        raise ValueError("load_concat() received no CSV files to read")

    frames = []
    for path in paths:
        try:
            frame = pd.read_csv(path, encoding="utf-8", low_memory=False)
        except UnicodeDecodeError:
            # Only a decoding failure justifies the fallback; any other error
            # (missing file, malformed CSV) would fail again under Latin-1 and
            # should surface with its original traceback.
            frame = pd.read_csv(path, encoding="latin1", low_memory=False)
        frame["__source_file"] = path.name
        if "company" not in frame.columns:
            frame["company"] = infer_company_name(path.name)
        frames.append(frame)
    return pd.concat(frames, ignore_index=True, sort=False)

In [8]:
def autodetect(df: pd.DataFrame):
    """Locate the review-text and rating columns of ``df``.

    Column names are compared case-insensitively against ``"text"`` and
    ``"rating"``. Returns a ``(text_col, rating_col)`` tuple holding the
    original column names, with ``None`` for any column that is absent.
    """
    # Map lowercase name -> original name; a later column overrides an
    # earlier one that differs only by case.
    by_lowercase = {}
    for column in df.columns:
        by_lowercase[column.lower()] = column
    return by_lowercase.get("text"), by_lowercase.get("rating")

In [9]:
def clean_text(s):
    """Normalise a raw review into lowercase alphanumeric words.

    Missing values become ``""``. Otherwise the value is stringified and
    lowercased, ``http...`` tokens and every character other than ``a-z``,
    ``0-9`` or whitespace are replaced by a space, and runs of whitespace are
    collapsed to a single space with the ends trimmed.
    """
    if pd.isna(s):
        return ""
    text = str(s).lower()
    # Order matters: URLs must go before punctuation is stripped, otherwise
    # their remains would survive as ordinary words.
    for pattern in (r"http\S+", r"[^a-z0-9\s]"):
        text = re.sub(pattern, " ", text)
    return re.sub(r"\s+", " ", text).strip()

In [10]:
def wilson_lower_bound(pos, n, z=1.96):
    """Lower end of the Wilson score interval for ``pos`` successes in ``n`` trials.

    ``z`` is the normal quantile of the desired confidence level (the default
    1.96 corresponds to roughly 95 %). Returns ``0.0`` when ``n`` is zero.
    """
    if n == 0:
        return 0.0
    z_sq = z * z
    p_hat = pos / n
    centre = p_hat + z_sq / (2 * n)
    spread = z * math.sqrt((p_hat * (1 - p_hat) + z_sq / (4 * n)) / n)
    return (centre - spread) / (1 + z_sq / n)

In [11]:
# Load the tokenizer and the TensorFlow classification head for the configured
# checkpoint; from_pt=True converts the checkpoint's PyTorch weights.
tokenizer = AutoTokenizer.from_pretrained(transformer_model)
tf_model = TFAutoModelForSequenceClassification.from_pretrained(
    transformer_model,
    from_pt=True,
)
# Class index -> label name, falling back to a binary map if the config has none.
label_map = getattr(tf_model.config, "id2label", {0: "NEGATIVE", 1: "POSITIVE"})

TensorFlow and JAX classes are deprecated and will be removed in Transformers v5. We recommend migrating to PyTorch classes or pinning your version of Transformers.
All PyTorch model weights were used when initializing TFDistilBertForSequenceClassification.

All the weights of TFDistilBertForSequenceClassification were initialized from the PyTorch model.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFDistilBertForSequenceClassification for predictions without further training.


In [12]:
def sentiment_analysis(texts, batch_size=batch_size):
    """Classify ``texts`` with the module-level tokenizer and model.

    The texts are processed in slices of ``batch_size``; each slice is
    tokenised (padded, truncated to 256 tokens) and scored, and the logits
    are turned into probabilities with a softmax.

    Returns
    -------
    (labels, scores, signed) : tuple of three lists, one entry per text
        labels -- name of the winning class (via ``label_map``, or the class
                  index as a string when it is not in the map)
        scores -- probability of the winning class
        signed -- ``+score`` when the label starts with "pos"
                  (case-insensitive), ``-score`` otherwise
    """
    labels, scores, signed = [], [], []
    for start in range(0, len(texts), batch_size):
        chunk = texts[start:start + batch_size]
        encoded = tokenizer(
            chunk,
            return_tensors="tf",
            padding=True,
            truncation=True,
            max_length=256,
        )
        logits = tf_model(encoded).logits.numpy()

        # Softmax with the row maximum subtracted first so exp() cannot overflow.
        shifted = np.exp(logits - logits.max(axis=1, keepdims=True))
        probs = shifted / shifted.sum(axis=1, keepdims=True)

        winners = probs.argmax(axis=1)
        confidences = probs[np.arange(len(winners)), winners]
        for class_id, confidence in zip(winners, confidences):
            class_id = int(class_id)
            confidence = float(confidence)
            label = label_map.get(class_id, str(class_id))
            is_positive = str(label).lower().startswith("pos")
            labels.append(label)
            scores.append(confidence)
            signed.append(confidence if is_positive else -confidence)
    return labels, scores, signed

In [13]:
def _prepare_reviews(df, text_col, rating_col):
    """Add normalised company, review-text and rating columns to ``df`` and return it."""
    if "company" not in df.columns:
        df["company"] = df["__source_file"].apply(infer_company_name)
    # Fill missing values BEFORE casting to str: astype(str) can turn NaN into
    # the literal "nan", which the blank-company fallback below would not catch.
    df["company"] = df["company"].fillna("").astype(str).str.lower()
    blank = df["company"] == ""
    df.loc[blank, "company"] = df.loc[blank, "__source_file"].apply(infer_company_name)

    df["review_text"] = df[text_col].fillna("").astype(str) if text_col else ""
    df["clean_text"] = df["review_text"].apply(clean_text)

    if rating_col:
        df["rating_1_5"] = pd.to_numeric(df[rating_col], errors="coerce")
    else:
        df["rating_1_5"] = np.nan
    # Linear map of a 1-5 rating onto 0-1; NaN ratings stay NaN.
    df["rating_01"] = (df["rating_1_5"] - 1) / 4
    return df


def _add_satisfaction(df):
    """Add the blended ``satisfaction`` score and the positivity flags to ``df``."""
    rating01 = df["rating_01"]
    text01 = df["transformer_01"]
    # Weighted blend where both signals exist (NaN otherwise), then fall back
    # to whichever single signal is available.
    blended = weight_rating * rating01 + weight_text * text01
    df["satisfaction"] = blended.fillna(rating01).fillna(text01)

    # Comparisons against NaN are False, so missing values count as "not positive".
    df["is_positive_rating"] = (df["rating_1_5"] >= rating_positive_threshold).astype(int)
    df["is_positive_text"] = (text01 >= sentiment_pos_threshold).astype(int)
    df["is_positive_review"] = (
        (df["is_positive_rating"] == 1) | (df["is_positive_text"] == 1)
    ).astype(int)
    return df


def _summarise_companies(df):
    """Aggregate the review-level frame into one scored row per company."""
    summary = df.groupby("company").agg(
        reviews_count=("review_text", "count"),
        positive_reviews=("is_positive_review", "sum"),
        avg_rating=("rating_1_5", "mean"),
        avg_transformer_signed=("transformer_signed", "mean"),
        avg_satisfaction=("satisfaction", "mean"),
    ).reset_index()

    summary["wilson_lower"] = [
        wilson_lower_bound(int(positives), int(total))
        for positives, total in zip(summary["positive_reviews"], summary["reviews_count"])
    ]

    # fit_transform refits the scaler on every call, so one instance can be reused.
    scaler = MinMaxScaler()
    summary["reviews_count_scaled"] = scaler.fit_transform(summary[["reviews_count"]]).flatten()
    summary["avg_satisfaction"] = summary["avg_satisfaction"].fillna(summary["avg_satisfaction"].median())
    summary["avg_satisfaction_scaled"] = scaler.fit_transform(summary[["avg_satisfaction"]]).flatten()

    satisfaction_part = alpha_composite * summary["avg_satisfaction_scaled"]
    summary["composite_scaled"] = satisfaction_part + (1 - alpha_composite) * summary["reviews_count_scaled"]
    summary["composite_wilson"] = satisfaction_part + (1 - alpha_composite) * summary["wilson_lower"]
    return summary


def main():
    """Run the full pipeline: load reviews, score sentiment, write the reports.

    Writes ``company_summary.csv`` (one row per company) and
    ``reviews_full.csv`` (one row per review) into ``out_dir``.

    Raises
    ------
    FileNotFoundError
        If none of the expected CSV files exists in ``input_dir``.
    """
    files = find_csv(input_dir)
    if not files:
        raise FileNotFoundError(
            f"None of the expected CSV files were found in {input_dir.resolve()}: {csv_files}"
        )
    df = load_concat(files)
    text_col, rating_col = autodetect(df)
    df = _prepare_reviews(df, text_col, rating_col)

    labels, scores, signed = sentiment_analysis(df["clean_text"].tolist(), batch_size=batch_size)
    df["transformer_label"] = labels
    df["transformer_score"] = scores
    df["transformer_signed"] = signed
    # Rescale the signed confidence from [-1, 1] to [0, 1].
    # NOTE(review): with a two-class model the winning probability is >= 0.5,
    # so this value lands in [0, 0.25] or [0.75, 1] and never in between --
    # confirm that is intended for sentiment_pos_threshold.
    df["transformer_01"] = (df["transformer_signed"] + 1) / 2.0

    df = _add_satisfaction(df)
    summary = _summarise_companies(df)

    out_dir.mkdir(exist_ok=True, parents=True)
    summary.to_csv(out_dir / "company_summary.csv", index=False)
    df.to_csv(out_dir / "reviews_full.csv", index=False)
# Entry point: run the pipeline only when this code executes as the main
# module, not when it is imported.
if __name__ == "__main__":
    main()

TensorFlow and JAX classes are deprecated and will be removed in Transformers v5. We recommend migrating to PyTorch classes or pinning your version of Transformers.
