<a href="https://colab.research.google.com/github/aradeyal/machine_learning/blob/main/%D7%90%D7%AA%D7%92%D7%A8_2_%D7%96%D7%94%D7%95%D7%99_%D7%91%D7%A7%D7%95%D7%A8%D7%AA_%D7%9C%D7%92%D7%99%D7%98%D7%99%D7%9E%D7%99%D7%AA_%D7%90%D7%95_%D7%9E%D7%96%D7%95%D7%99%D7%A4%D7%AA_%D7%91%D7%A1%D7%99%D7%95%D7%A2_perplexity%E2%80%8E.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:


# ---- Step 1: Upload data (TXT or CSV) ----
from google.colab import files
print("Upload reviews.txt (one review per line) OR a CSV with a 'Review' column:")
uploaded = files.upload()

import io, re, math, random
import pandas as pd
from pathlib import Path
from collections import Counter
from typing import List, Tuple

# ---- Load corpus from uploaded file(s) ----
def load_corpus_from_upload(uploaded_dict) -> List[str]:
    # Prefer TXT if provided
    for fname in uploaded_dict.keys():
        if fname.lower().endswith(".txt"):
            lines = io.BytesIO(uploaded_dict[fname]).read().decode("utf-8", errors="ignore").splitlines()
            texts = [ln.strip() for ln in lines if ln.strip()]
            if texts:
                return texts
    # Otherwise, try CSV
    for fname in uploaded_dict.keys():
        if fname.lower().endswith(".csv"):
            df = pd.read_csv(io.BytesIO(uploaded_dict[fname]))
            col = "Review" if "Review" in df.columns else None
            if not col:
                cand = [c for c in df.columns if str(c).lower() in {"review", "text", "comment", "content"}]
                if cand:
                    col = cand[0]
            if not col:
                raise ValueError("No Review/Text/Comment/Content column found in the CSV.")
            texts = df[col].astype(str).fillna("").tolist()
            texts = [t.strip() for t in texts if t.strip()]
            if texts:
                return texts
    raise FileNotFoundError("No valid reviews.txt or CSV was uploaded.")

corpus_raw = load_corpus_from_upload(uploaded)
print(f"Loaded {len(corpus_raw)} reviews.")

# ---- Step 2: Tokenization (supports English + Hebrew letters) ----
def tokenize(s: str) -> List[str]:
    s = s.lower()
    s = re.sub(r"[^a-z0-9\u0590-\u05FF]+", " ", s)  # keep English letters/digits + Hebrew letters
    tokens = s.split()
    return tokens

tokenized_corpus = [tokenize(t) for t in corpus_raw if t]
tokenized_corpus = [toks for toks in tokenized_corpus if toks]
print(f"Usable reviews after tokenization: {len(tokenized_corpus)}")

# ---- Step 3: Trigram LM with Add-alpha smoothing ----
BOS = "<s>"
EOS = "</s>"
UNK = "<unk>"

def build_vocab(tokenized: List[List[str]], min_count: int = 2) -> set:
    cnt = Counter(token for doc in tokenized for token in doc)
    vocab = {w for w, c in cnt.items() if c >= min_count}
    vocab |= {BOS, EOS, UNK}
    return vocab

def replace_oov(tokenized: List[List[str]], vocab: set) -> List[List[str]]:
    out = []
    for doc in tokenized:
        out.append([w if w in vocab else UNK for w in doc])
    return out

class TrigramLM:
    def __init__(self, alpha: float = 0.5):
        self.alpha = alpha
        self.vocab = set()
        self.uni = Counter()
        self.bi  = Counter()
        self.tri = Counter()
        self.V = 0

    def fit(self, tokenized_docs: List[List[str]]):
        for doc in tokenized_docs:
            seq = [BOS, BOS] + doc + [EOS]
            for i in range(len(seq)):
                self.uni[(seq[i],)] += 1
                if i >= 1:
                    self.bi[(seq[i-1], seq[i])] += 1
                if i >= 2:
                    self.tri[(seq[i-2], seq[i-1], seq[i])] += 1
        self.vocab = {w for (w,) in self.uni.keys()}
        self.V = len(self.vocab)

    def logprob(self, doc_tokens: List[str]) -> float:
        seq = [BOS, BOS] + doc_tokens + [EOS]
        lp = 0.0
        for i in range(2, len(seq)):
            tri = (seq[i-2], seq[i-1], seq[i])
            bi_prefix = (seq[i-2], seq[i-1])
            num = self.tri.get(tri, 0) + self.alpha
            den = self.bi.get(bi_prefix, 0) + self.alpha * self.V
            p = num / den
            lp += math.log(p)
        return lp

    def perplexity(self, doc_tokens: List[str]) -> float:
        M = len(doc_tokens) + 1  # number of trigram transitions over tokens (+EOS)
        if M <= 0:
            return float("inf")
        return math.exp(- self.logprob(doc_tokens) / M)

# ---- Step 4: Train/Validation split and automatic threshold ----
random.seed(42)
idxs = list(range(len(tokenized_corpus)))
random.shuffle(idxs)
cut = max(1, int(0.8 * len(idxs)))
train_idx, valid_idx = idxs[:cut], idxs[cut:]

train_docs = [tokenized_corpus[i] for i in train_idx]
valid_docs = [tokenized_corpus[i] for i in valid_idx]

vocab = build_vocab(train_docs, min_count=2)
train_docs = replace_oov(train_docs, vocab)
valid_docs = replace_oov(valid_docs, vocab)

lm = TrigramLM(alpha=0.5)
lm.fit(train_docs)

# Compute perplexity distribution on validation and set threshold at 95th percentile
import numpy as np
valid_pp = [lm.perplexity(doc) for doc in valid_docs] or [lm.perplexity(doc) for doc in train_docs[-min(200, len(train_docs)):]]
threshold = float(np.percentile(valid_pp, 95))
print(f"Perplexity threshold (95th percentile): {threshold:.2f}")

# ---- Step 5: Classifier function ----
def classify_review(text: str, threshold_value: float = None) -> Tuple[float, str]:
    toks = tokenize(text)
    toks = [w if w in vocab else UNK for w in toks] if toks else [UNK]
    pp = lm.perplexity(toks)
    thr = threshold if threshold_value is None else threshold_value
    label = "legit" if pp <= thr else "fake"
    return pp, label

# ---- Step 6: Quick demo ----
examples = [
    "The room was clean and the staff were great. Excellent location.",
    "Buy cheap watches!!! click here http://spam.example quick money now",
]
for txt in examples:
    pp, lab = classify_review(txt)
    print(f"\nReview: {txt}\nPerplexity: {pp:.2f} -> {lab}")

print("\nPaste a review and press Enter:")
user_text = input().strip()
if user_text:
    pp, lab = classify_review(user_text)
    print(f"Perplexity: {pp:.2f} -> {lab}")
else:
    print("No review entered.")


Upload reviews.txt (one review per line) OR a CSV with a 'Review' column:


Saving tripadvisor_hotel_reviews.csv to tripadvisor_hotel_reviews.csv
Loaded 20491 reviews.
Usable reviews after tokenization: 20491
Perplexity threshold (95th percentile): 19494.71

Review: The room was clean and the staff were great. Excellent location.
Perplexity: 16960.41 -> legit

Review: Buy cheap watches!!! click here http://spam.example quick money now
Perplexity: 23056.28 -> fake

Paste a review and press Enter:
cozy stay rainy city, husband spent 7 nights monaco early january 2008. business trip chance come ride.we booked monte carlo suite proved comfortable longish stay, room 905 located street building, street noise not problem view interesting rooms building look dank alley midsection large office building, suite comfortable plenty room spread, bathroom attractive squeaky clean small comparison generous proportions sitting sleeping areas, lots comfortable seating options good lighting plenty storage clothing luggage, hotel staff friendly efficient, housekeeping staff did g