In [None]:
from pathlib import Path

def find_repo_root(start: "Path | None" = None) -> Path:
    """Return the nearest directory at or above ``start`` that contains ``.git``.

    Args:
        start: Directory to begin the upward search from. When omitted, the
            current working directory *at call time* is used.

    Returns:
        The first directory (``start`` itself or one of its parents) holding a
        ``.git`` entry, or ``start`` unchanged when no such directory exists.
    """
    # Resolve the default per call: a `Path.cwd()` default in the signature is
    # evaluated only once, when the function is defined, and would go stale
    # after any later change of working directory.
    start = Path.cwd() if start is None else Path(start)
    for candidate in (start, *start.parents):
        if (candidate / ".git").exists():
            return candidate
    return start

# Project layout, anchored at the repository root found by find_repo_root().
REPO = find_repo_root()
DATA = REPO / "data"
RAW = DATA / "raw"
INTERM = DATA / "interim"
CLEAN = DATA / "clean"
MODELS = REPO / "models"

# Create every working directory up front; existing ones are left untouched.
for directory in (RAW, INTERM, CLEAN, MODELS):
    directory.mkdir(parents=True, exist_ok=True)

# Echo the resolved locations so a wrong root is visible immediately.
print("Repo root:", REPO)
print("Interim dir:", INTERM)


In [None]:
# 1) Imports
import os, pandas as pd
from transformers import pipeline

# 2) Load clean data
# Resolve the file through CLEAN (derived from the repo root) instead of a
# "../data/..." string, so the load does not depend on the kernel's cwd.
clean_path = CLEAN / "trustpilot_allianz_clean.csv"
# Rows without review text cannot be scored, so drop them right away.
df = pd.read_csv(clean_path).dropna(subset=["text"])

# 3) Auto-label 1–5 stars (CPU)
clf = pipeline(
    "sentiment-analysis",
    model="nlptown/bert-base-multilingual-uncased-sentiment",
    device=-1,  # -1 keeps inference on the CPU
)

def predict_stars(t: str):
    """Predict a star rating for one review text using the module-level ``clf``.

    Args:
        t: Review text. Non-string or blank input is not scored.

    Returns:
        The first digit found in the predicted label as an ``int``, or ``None``
        when the input is not scorable, the classifier returns nothing, or the
        label contains no digit.
    """
    if not isinstance(t, str) or not t.strip():
        return None
    # Let the tokenizer truncate to the 512-token limit. Slicing the string to
    # 512 *characters* discards text that would still fit within the limit and
    # does not by itself guarantee the token limit is respected.
    out = clf(t, truncation=True, max_length=512)
    if isinstance(out, list):
        if not out:
            return None
        out = out[0]
    lbl = out.get("label", "")
    # The rating is read from the label text (presumably of the form "4 stars").
    return next((int(ch) for ch in lbl if ch.isdigit()), None)

# Score every review: cast the text column to str, then predict one rating per row.
review_texts = df["text"].astype(str)
df["stars"] = review_texts.apply(predict_stars)

# 4) Map stars -> sentiment
def stars_to_sentiment(s):
    """Map a star rating to a coarse sentiment label.

    Args:
        s: Star rating, or a missing value (``None``, NaN, ``pd.NA``).

    Returns:
        ``"negative"`` for ratings of 2 or less, ``"neutral"`` for a rating of 3
        or a missing rating, and ``"positive"`` otherwise.
    """
    # A missing rating may arrive as NaN rather than None (e.g. once it has
    # been stored in a numeric pandas column). NaN fails both comparisons
    # below, so without this check it would be labelled "positive".
    if pd.isna(s):
        return "neutral"
    if s <= 2:
        return "negative"
    if s == 3:
        return "neutral"
    return "positive"

# Derive the coarse sentiment label from the predicted star rating.
df["sentiment"] = df["stars"].apply(stars_to_sentiment)

# 5) Save scored
# Write through INTERM (repo-root based) rather than a "../data/interim"
# string, so the file lands in the same place regardless of the kernel's cwd.
INTERM.mkdir(parents=True, exist_ok=True)
df.to_csv(INTERM / "trustpilot_scored.csv", index=False)

# Cell output: class balance of the derived labels, missing values included.
df["sentiment"].value_counts(dropna=False)


In [None]:
%pip install -q scikit-learn joblib

from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import Pipeline
from sklearn.metrics import classification_report
import joblib

# Keep only rows that have both a review text and a sentiment label.
data = df.dropna(subset=["text", "sentiment"])

# Stratify only when the rarest class has at least two rows; otherwise fall
# back to an unstratified split.
if data["sentiment"].value_counts().min() >= 2:
    strat = data["sentiment"]
else:
    strat = None

X_train, X_test, y_train, y_test = train_test_split(
    data["text"],
    data["sentiment"],
    test_size=0.25,
    random_state=42,
    stratify=strat,
)

# TF-IDF over unigrams and bigrams (vocabulary capped at 20,000 terms) feeding
# a class-weight-balanced logistic regression.
tfidf_step = TfidfVectorizer(ngram_range=(1, 2), min_df=1, max_features=20000)
logreg_step = LogisticRegression(max_iter=1000, class_weight="balanced")
pipe = Pipeline([("tfidf", tfidf_step), ("clf", logreg_step)])

# Fit on the training split, then report per-class metrics on the held-out split.
pipe.fit(X_train, y_train)
pred = pipe.predict(X_test)
report = classification_report(y_test, pred)
print(report)

# Persist the fitted pipeline under MODELS (repo-root based) rather than a
# "../models" string, so the artifact location does not depend on the cwd.
MODELS.mkdir(parents=True, exist_ok=True)
joblib.dump(pipe, str(MODELS / "tfidf_logreg.joblib"))


In [4]:
# Persist the scored reviews.
scored_csv = INTERM / "trustpilot_scored.csv"
df.to_csv(scored_csv, index=False)

# Parse dates as UTC (unparseable values become NaT), convert to Zurich time
# and bucket each review into a "YYYY-MM" month.
parsed_dates = pd.to_datetime(df["date"], utc=True, errors="coerce")
df_out = df.assign(date=parsed_dates)
df_out["month"] = df_out["date"].dt.tz_convert("Europe/Zurich").dt.strftime("%Y-%m")

# Count reviews per (month, sentiment) pair and save the table.
agg = (
    df_out
    .groupby(["month","sentiment"])
    .size()
    .reset_index(name="count")
)
agg.to_csv(INTERM / "sentiment_by_month.csv", index=False)

In [5]:
# Monthly sentiment counts: parse dates as UTC (unparseable values become NaT),
# convert to Zurich time and bucket each review into a "YYYY-MM" month.
df_out = df.assign(date=pd.to_datetime(df["date"], utc=True, errors="coerce"))
df_out["month"] = df_out["date"].dt.tz_convert("Europe/Zurich").dt.strftime("%Y-%m")
agg = df_out.groupby(["month","sentiment"]).size().reset_index(name="count")
# Write through INTERM instead of a "../data/interim" string so the output
# location does not depend on the kernel's working directory.
agg.to_csv(INTERM / "sentiment_by_month.csv", index=False)

In [7]:
# Distribution of predicted star ratings, ordered by rating; missing values included.
stars_counts = df["stars"].value_counts(dropna=False).sort_index()
print(stars_counts)

# Distribution of the derived sentiment labels; missing values included.
sentiment_counts = df["sentiment"].value_counts(dropna=False)
print(sentiment_counts)

# Per-class metrics of the fitted pipeline on the held-out split.
print(classification_report(y_test, pred))

stars
1    33
2     1
5     6
Name: count, dtype: int64
sentiment
negative    34
positive     6
Name: count, dtype: int64
              precision    recall  f1-score   support

    negative       1.00      1.00      1.00         9
    positive       1.00      1.00      1.00         1

    accuracy                           1.00        10
   macro avg       1.00      1.00      1.00        10
weighted avg       1.00      1.00      1.00        10

