In [1]:
import pandas as pd

# Read the cleaned reviews CSV (path is relative to the notebook) into a
# DataFrame, then preview its first five rows.
data_path = "../data/clean_data.csv"
df = pd.read_csv(filepath_or_buffer=data_path)

df.head(n=5)

Unnamed: 0,business_name,author_name,text,rating,rating_category
0,Haci'nin Yeri - Yigit Lokantasi,Gulsum Akar,We went to Marmaris with my wife for a holiday...,5,taste
1,Haci'nin Yeri - Yigit Lokantasi,Oguzhan Cetin,During my holiday in Marmaris we ate here to f...,4,menu
2,Haci'nin Yeri - Yigit Lokantasi,Yasin Kuyu,Prices are very affordable. The menu in the ph...,3,outdoor_atmosphere
3,Haci'nin Yeri - Yigit Lokantasi,Orhan Kapu,Turkey's cheapest artisan restaurant and its f...,5,indoor_atmosphere
4,Haci'nin Yeri - Yigit Lokantasi,Ozgur Sati,I don't know what you will look for in terms o...,3,menu


In [12]:
# Character count of each review.
# The vectorized `.str.len()` accessor replaces `.apply(len)`: it gives the
# same counts for string values, but a missing review (NaN) yields NaN instead
# of raising `TypeError: object of type 'float' has no len()`.
# Note: if any review is missing, the column dtype becomes float.
df["review_length"] = df["text"].str.len()
df.head()


Unnamed: 0,business_name,author_name,text,rating,rating_category,review_length
0,Haci'nin Yeri - Yigit Lokantasi,Gulsum Akar,We went to Marmaris with my wife for a holiday...,5,taste,680
1,Haci'nin Yeri - Yigit Lokantasi,Oguzhan Cetin,During my holiday in Marmaris we ate here to f...,4,menu,914
2,Haci'nin Yeri - Yigit Lokantasi,Yasin Kuyu,Prices are very affordable. The menu in the ph...,3,outdoor_atmosphere,173
3,Haci'nin Yeri - Yigit Lokantasi,Orhan Kapu,Turkey's cheapest artisan restaurant and its f...,5,indoor_atmosphere,63
4,Haci'nin Yeri - Yigit Lokantasi,Ozgur Sati,I don't know what you will look for in terms o...,3,menu,161


### Using a smaller subset of the data to try feature engineering first

In [13]:
import spacy

# Work on the first 10 reviews only; `.copy()` gives an independent frame so
# adding columns below does not touch `df`.
df2 = df.head(10).copy()
nlp = spacy.load("en_core_web_sm")

# Tokenize every review. `nlp.pipe` streams the texts through the pipeline in
# batches (instead of one `nlp(text)` call per row) and yields one Doc per
# input text, in input order, so the resulting list lines up with df2's rows.
tokens_list = [[token.text for token in doc] for doc in nlp.pipe(df2["text"])]

df2["tokens"] = tokens_list

df2

Unnamed: 0,business_name,author_name,text,rating,rating_category,review_length,tokens
0,Haci'nin Yeri - Yigit Lokantasi,Gulsum Akar,We went to Marmaris with my wife for a holiday...,5,taste,680,"[We, went, to, Marmaris, with, my, wife, for, ..."
1,Haci'nin Yeri - Yigit Lokantasi,Oguzhan Cetin,During my holiday in Marmaris we ate here to f...,4,menu,914,"[During, my, holiday, in, Marmaris, we, ate, h..."
2,Haci'nin Yeri - Yigit Lokantasi,Yasin Kuyu,Prices are very affordable. The menu in the ph...,3,outdoor_atmosphere,173,"[Prices, are, very, affordable, ., The, menu, ..."
3,Haci'nin Yeri - Yigit Lokantasi,Orhan Kapu,Turkey's cheapest artisan restaurant and its f...,5,indoor_atmosphere,63,"[Turkey, 's, cheapest, artisan, restaurant, an..."
4,Haci'nin Yeri - Yigit Lokantasi,Ozgur Sati,I don't know what you will look for in terms o...,3,menu,161,"[I, do, n't, know, what, you, will, look, for,..."
5,Haci'nin Yeri - Yigit Lokantasi,Arda Karaca,Generally good.,4,indoor_atmosphere,15,"[Generally, good, .]"
6,Haci'nin Yeri - Yigit Lokantasi,İrem Eren,What you see is 125 TL in total. It's a pretty...,5,taste,245,"[What, you, see, is, 125, TL, in, total, ., It..."
7,Haci'nin Yeri - Yigit Lokantasi,Nadia Salim,Delicious food at rock bottom prices. Friendly...,5,taste,70,"[Delicious, food, at, rock, bottom, prices, .,..."
8,Haci'nin Yeri - Yigit Lokantasi,Mehmet Eser,"Every time I go, I still experience the amazem...",5,outdoor_atmosphere,157,"[Every, time, I, go, ,, I, still, experience, ..."
9,Haci'nin Yeri - Yigit Lokantasi,Celal Ozer,The most f/p of all businesses I've seen.,5,indoor_atmosphere,41,"[The, most, f, /, p, of, all, businesses, I, '..."


In [14]:
from sentence_transformers import SentenceTransformer, util

# 1. Load the sentence-embedding model
model = SentenceTransformer("all-MiniLM-L6-v2")

# 2. Encode the review texts and the reference words
reference_words = ["restaurant", "food"]
text_embeddings = model.encode(df2["text"].tolist(), convert_to_tensor=True)
reference_embeddings = model.encode(reference_words, convert_to_tensor=True)

# 3. Pairwise cosine similarities: one row per review, one column per
#    reference word (columns follow the order of `reference_words`)
similarity_matrix = util.cos_sim(text_embeddings, reference_embeddings)

# 4. Store each review's scores as a list:
#    [similarity_to_restaurant, similarity_to_food]
#    `tolist()` on the whole matrix already returns one inner list per row,
#    so no per-row index loop is needed.
similarity_scores = similarity_matrix.tolist()

df2["similarity_scores"] = similarity_scores

In [15]:
# Display the sample frame to inspect the columns added so far
df2

Unnamed: 0,business_name,author_name,text,rating,rating_category,review_length,tokens,similarity_scores
0,Haci'nin Yeri - Yigit Lokantasi,Gulsum Akar,We went to Marmaris with my wife for a holiday...,5,taste,680,"[We, went, to, Marmaris, with, my, wife, for, ...","[0.5075353980064392, 0.37266629934310913]"
1,Haci'nin Yeri - Yigit Lokantasi,Oguzhan Cetin,During my holiday in Marmaris we ate here to f...,4,menu,914,"[During, my, holiday, in, Marmaris, we, ate, h...","[0.4083253741264343, 0.30511540174484253]"
2,Haci'nin Yeri - Yigit Lokantasi,Yasin Kuyu,Prices are very affordable. The menu in the ph...,3,outdoor_atmosphere,173,"[Prices, are, very, affordable, ., The, menu, ...","[0.33979272842407227, 0.28572386503219604]"
3,Haci'nin Yeri - Yigit Lokantasi,Orhan Kapu,Turkey's cheapest artisan restaurant and its f...,5,indoor_atmosphere,63,"[Turkey, 's, cheapest, artisan, restaurant, an...","[0.477489173412323, 0.2990363538265228]"
4,Haci'nin Yeri - Yigit Lokantasi,Ozgur Sati,I don't know what you will look for in terms o...,3,menu,161,"[I, do, n't, know, what, you, will, look, for,...","[0.4761326313018799, 0.22589965164661407]"
5,Haci'nin Yeri - Yigit Lokantasi,Arda Karaca,Generally good.,4,indoor_atmosphere,15,"[Generally, good, .]","[0.1795356571674347, 0.23901411890983582]"
6,Haci'nin Yeri - Yigit Lokantasi,İrem Eren,What you see is 125 TL in total. It's a pretty...,5,taste,245,"[What, you, see, is, 125, TL, in, total, ., It...","[0.41842225193977356, 0.15947109460830688]"
7,Haci'nin Yeri - Yigit Lokantasi,Nadia Salim,Delicious food at rock bottom prices. Friendly...,5,taste,70,"[Delicious, food, at, rock, bottom, prices, .,...","[0.43825483322143555, 0.36904144287109375]"
8,Haci'nin Yeri - Yigit Lokantasi,Mehmet Eser,"Every time I go, I still experience the amazem...",5,outdoor_atmosphere,157,"[Every, time, I, go, ,, I, still, experience, ...","[0.0833774134516716, 0.07767816632986069]"
9,Haci'nin Yeri - Yigit Lokantasi,Celal Ozer,The most f/p of all businesses I've seen.,5,indoor_atmosphere,41,"[The, most, f, /, p, of, all, businesses, I, '...","[0.29386070370674133, 0.13103997707366943]"
