# 1\) Sentiment Analysis

## 1.1\) TF-IDF + Logistic Regression

In [7]:
import pandas as pd, re, string, os
from tqdm.auto import tqdm
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import Pipeline
from sklearn.metrics import accuracy_score, classification_report

# ---------------- 1. Load datasets ----------------
# The training file is comma-separated; the test file is tab-separated,
# which is read_table's default delimiter.
train_df = pd.read_csv(filepath_or_buffer="train_sets/train.csv")
test_df = pd.read_table("test_sets/sentiment-topic-test.tsv")

# ---------------- 2. Clean text ----------------
# Patterns and the translation table are built once, so applying clean_text
# row by row does not rebuild them on every call.
_URL_RE = re.compile(r'https?://\S+|www\.\S+')
_DIGITS_RE = re.compile(r'\d+')
_WHITESPACE_RE = re.compile(r'\s+')
# Deletes every ASCII punctuation character except the apostrophe.
_PUNCT_TABLE = str.maketrans('', '', string.punctuation.replace("'", ""))


def clean_text(text: str) -> str:
    """Normalise a raw sentence before vectorisation.

    Steps, in order: lowercase, replace URLs with a space, replace digit
    runs with a space, delete ASCII punctuation (apostrophes are kept),
    collapse every whitespace run to a single space, and trim the ends.

    Args:
        text: The raw sentence. Non-string values (for example ``None`` or
            a float NaN standing in for a missing cell) are treated as
            empty text instead of raising ``AttributeError``.

    Returns:
        The cleaned, lowercased sentence; ``""`` when nothing is left.
    """
    if not isinstance(text, str):
        return ""

    text = text.lower()
    text = _URL_RE.sub(' ', text)      # URLs
    text = _DIGITS_RE.sub(' ', text)   # digits
    text = text.translate(_PUNCT_TABLE)  # keep apostrophes
    # \s+ (rather than \s{2,}) also turns a lone tab/newline into a space.
    return _WHITESPACE_RE.sub(' ', text).strip()

# Add a "clean_sentence" column to each frame, showing one progress bar
# per frame (train first, then test).
for _frame, _bar_label in (
    (train_df, "🧹 Cleaning train"),
    (test_df, "🧹 Cleaning test"),
):
    # Re-register tqdm with pandas so progress_apply uses this frame's label.
    tqdm.pandas(desc=_bar_label)
    _frame["clean_sentence"] = _frame["sentence"].progress_apply(clean_text)

# ---------------- 3. Build + train pipeline ----------------
# Feature extractor: unigram TF-IDF. lowercase=False because the input
# column has already been lowercased by clean_text.
_vectorizer = TfidfVectorizer(
    ngram_range=(1, 1),
    stop_words="english",
    lowercase=False,
    min_df=5,
    max_df=0.9,
    max_features=20000,  # keep RAM low
)

# Classifier: class-weighted logistic regression.
# NOTE(review): recent scikit-learn releases reportedly deprecate the
# liblinear solver when there are more than two classes -- confirm against
# the installed version before upgrading.
_classifier = LogisticRegression(
    solver="liblinear",  # memory-friendly
    class_weight="balanced",
    max_iter=400,
)

model = Pipeline(steps=[("tfidf", _vectorizer), ("clf", _classifier)])

# Fit the whole pipeline on the cleaned training sentences and their labels.
print("\n⏳ Fitting Logistic Regression…")
model.fit(X=train_df["clean_sentence"], y=train_df["sentiment"])

# ---------------- 4. Predict + evaluate ----------------
# Predict on the cleaned test sentences, then compare with the "sentiment"
# column: overall accuracy first, per-class metrics after.
preds = model.predict(X=test_df["clean_sentence"])
acc = accuracy_score(y_true=test_df["sentiment"], y_pred=preds)
print(f"\n✅ Test accuracy: {acc:.3%}\n")
print(
    classification_report(
        y_true=test_df["sentiment"],
        y_pred=preds,
        digits=3,
    )
)


🧹 Cleaning train:   0%|          | 0/49524 [00:00<?, ?it/s]

🧹 Cleaning test:   0%|          | 0/18 [00:00<?, ?it/s]


⏳ Fitting Logistic Regression…

✅ Test accuracy: 66.667%

              precision    recall  f1-score   support

    negative      0.556     0.833     0.667         6
     neutral      1.000     0.667     0.800         6
    positive      0.600     0.500     0.545         6

    accuracy                          0.667        18
   macro avg      0.719     0.667     0.671        18
weighted avg      0.719     0.667     0.671        18

