In [11]:
import pandas as pd

# Relative path to the cleaned reviews CSV.
data_path = "../data/clean_data.csv"

# Load the reviews into `df`, the frame the following cells build on.
# NOTE(review): the preview shows an "Unnamed: 0" column, which looks like a
# row index written out by an earlier export — confirm, and consider reading
# the file with index_col=0.
df = pd.read_csv(filepath_or_buffer=data_path)

# Preview the first five rows.
df.head(n=5)

Unnamed: 0,business_name,author_name,text,rating,rating_category
0,Haci'nin Yeri - Yigit Lokantasi,Gulsum Akar,We went to Marmaris with my wife for a holiday...,5,taste
1,Haci'nin Yeri - Yigit Lokantasi,Oguzhan Cetin,During my holiday in Marmaris we ate here to f...,4,menu
2,Haci'nin Yeri - Yigit Lokantasi,Yasin Kuyu,Prices are very affordable. The menu in the ph...,3,outdoor_atmosphere
3,Haci'nin Yeri - Yigit Lokantasi,Orhan Kapu,Turkey's cheapest artisan restaurant and its f...,5,indoor_atmosphere
4,Haci'nin Yeri - Yigit Lokantasi,Ozgur Sati,I don't know what you will look for in terms o...,3,menu


In [None]:
# Add a `review_length` column holding the number of characters in each review.
# `.str.len()` is the vectorized string accessor; unlike `.apply(len)` it yields
# NaN for a missing review instead of raising TypeError.
df["review_length"] = df["text"].str.len()
df.head()

### Using a smaller subset of the data (`df2`) to try feature engineering first

In [None]:
import spacy

# Work on an independent 30-row sample; .copy() keeps the columns added below
# from touching `df`.
df2 = df.head(30).copy()
nlp = spacy.load("en_core_web_sm")

# Tokenize every review. nlp.pipe streams the texts through the pipeline in
# batches, which is spaCy's recommended way to process many texts instead of
# calling nlp(text) once per row; the resulting tokens are the same.
tokens_list = [
    [token.text for token in doc]
    for doc in nlp.pipe(df2["text"])
]

df2["tokens"] = tokens_list
df2

### We could also train this model for our own needs, but it is unclear how easy that would be

In [None]:
from sentence_transformers import SentenceTransformer, util

# Sentence-embedding model used for both the reviews and the reference words.
model = SentenceTransformer("all-MiniLM-L6-v2")

# Embed every review, then the two reference words, as tensors.
review_texts = df2["text"].tolist()
text_embeddings = model.encode(review_texts, convert_to_tensor=True)
reference_embeddings = model.encode(["restaurant", "food"], convert_to_tensor=True)

# Cosine similarity of every review against every reference word.
similarity_matrix = util.cos_sim(text_embeddings, reference_embeddings)

print(similarity_matrix.shape)

# One list per review: [similarity_to_restaurant, similarity_to_food].
similarity_scores = [row.tolist() for row in similarity_matrix]

df2["similarity_scores"] = similarity_scores
df2

In [None]:
from transformers import pipeline


# Load the pre-trained sentiment-analysis pipeline.
# NOTE(review): no model is named here, so the library chooses its default
# checkpoint for the task; consider passing model=... to pin it.
sentiment_analyzer = pipeline("sentiment-analysis")

# Classify all reviews in one call (one result dict per review).
# truncation=True clips reviews that exceed the model's maximum input length,
# which would otherwise raise an error inside the model.
results = sentiment_analyzer(df2["text"].tolist(), truncation=True)

labels = [result["label"] for result in results]
scores = [result["score"] for result in results]

# Echo each review next to its predicted label and confidence.
for text, label, score in zip(df2["text"], labels, scores):
    print(f"Text: {text}")
    print(f" → Sentiment: {label} (confidence: {score:.4f})\n")

df2["sentiment"] = labels
df2["confidence score"] = scores
df2