<a href="https://colab.research.google.com/github/aradeyal/machine_learning/blob/main/%D7%90%D7%AA%D7%92%D7%A8_%D7%9E%D7%A6%D7%99%D7%90%D7%AA_%D7%91%D7%99%D7%A7%D7%95%D7%A8%D7%95%D7%AA_%D7%93%D7%95%D7%9E%D7%95%D7%AA%E2%80%8E.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [11]:
from google.colab import files
print("בחר/י reviews.txt (או tripadvisor_hotel_reviews.csv עם עמודת 'Review'):")
uploaded = files.upload()

import os, io, pandas as pd
from pathlib import Path

for fname in uploaded.keys():
    if fname.lower().endswith(".csv"):
        df = pd.read_csv(io.BytesIO(uploaded[fname]))
        col = "Review" if "Review" in df.columns else None
        if not col:
            cand = [c for c in df.columns if str(c).lower() in {"review", "text", "comment", "content"}]
            if cand:
                col = cand[0]
        if not col:
            raise ValueError("לא נמצאה עמודת Review/Text/Comment/Content בקובץ ה-CSV.")
        df[col].astype(str).fillna("").to_csv("reviews.txt", index=False, header=False)
        print("✅ נוצר reviews.txt מתוך ה-CSV.")

REV_PATH = Path("reviews.txt")
if not REV_PATH.exists():
    raise FileNotFoundError("לא נמצא reviews.txt. העלה reviews.txt או CSV עם עמודת 'Review'.")


from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

def load_reviews_txt(path: Path) -> list[str]:
    lines = path.read_text(encoding="utf-8", errors="ignore").splitlines()
    texts = [ln.strip() for ln in lines if ln.strip()]
    if not texts:
        raise ValueError("reviews.txt ריק או לא תקין.")
    return texts

def top_k_similar(corpus: list[str], query: str, k: int = 5):
    vec = TfidfVectorizer(ngram_range=(1,2), min_df=1)
    X = vec.fit_transform(corpus)
    qv = vec.transform([query])
    sims = cosine_similarity(qv, X).ravel()
    k = min(k, len(corpus))
    idx = sims.argsort()[::-1][:k]
    return idx, sims[idx]

corpus = load_reviews_txt(REV_PATH)

print("הדביק/י ביקורת חדשה ולחץ/י Enter:")
query = input().strip()
if not query:
    raise SystemExit("שגיאה: לא הוזנה ביקורת חדשה.")

idx, scores = top_k_similar(corpus, query, k=5)

import pandas as pd
df = pd.DataFrame({
    "rank": range(1, len(idx)+1),
    "line_id": [f"line_{i}" for i in idx],
    "similarity": [float(f"{s:.4f}") for s in scores],
    "review": [corpus[i] for i in idx]
})

print("\nTop 5 similar reviews:")
for r, i, s in zip(df["rank"], idx, scores):
    txt = corpus[i]
    snippet = txt if len(txt) <= 300 else txt[:300] + " ..."
    print(f"{r}. [line_{i}] score={s:.4f}  {snippet}")

df


בחר/י reviews.txt (או tripadvisor_hotel_reviews.csv עם עמודת 'Review'):


Saving tripadvisor_hotel_reviews.csv to tripadvisor_hotel_reviews (4).csv
✅ נוצר reviews.txt מתוך ה-CSV.
הדביק/י ביקורת חדשה ולחץ/י Enter:
hated inn terrible, room-service horrible staff un-welcoming, decor recently updated lacks complete look, managment staff horrible,  

Top 5 similar reviews:
1. [line_65] score=1.0000  "hated inn terrible, room-service horrible staff un-welcoming, decor recently updated lacks complete look, managment staff horrible,  "
2. [line_10707] score=0.0740  "not good hotel theb staff un-helpfull make remarks laugh, rooms not booked, bathroom smelt sewer, hotel desperate need decorating updating,  "
3. [line_19929] score=0.0722  "horrible stay bad staff rooms horrible staff unwiling assist, hotel did not booking despite confirmed e-mail, manager wanted book 2 star hotel refusing got think worst room place.breakfast no choice.would stay hotel,  "
4. [line_3490] score=0.0569  "no major complaints stayed hotel summer, positives outweigh negatives, negatives lobb

Unnamed: 0,rank,line_id,similarity,review
0,1,line_65,1.0,"""hated inn terrible, room-service horrible sta..."
1,2,line_10707,0.074,"""not good hotel theb staff un-helpfull make re..."
2,3,line_19929,0.0722,"""horrible stay bad staff rooms horrible staff ..."
3,4,line_3490,0.0569,"""no major complaints stayed hotel summer, posi..."
4,5,line_3486,0.0541,"""horrible service horrible casino horrible hot..."
