In [11]:
import pandas as pd
import openpyxl
from gensim.models import Word2Vec
from sklearn.metrics.pairwise import cosine_similarity
import numpy as np
import re
from sklearn.feature_extraction.text import ENGLISH_STOP_WORDS

file_path =  r'C:\Users\Multi 03\Desktop\code\reviewdata.xlsx'
review_data = pd.read_excel(file_path)

def preprocess_text(text):
    # Remove special characters and lowercase
    text = re.sub(r'[^a-zA-Z가-힣\s]', '', str(text))
    text = text.lower()
    # Tokenize and remove stop words
    tokens = [word for word in text.split() if word not in ENGLISH_STOP_WORDS]
    # Remove Korean particles (josa) and non-nouns
    josa = ['은', '는', '이', '가', '을', '를', '에', '의', '도', '로', '과', '와', '에서', '까지', '부터', '하다', '고', '겠', '들', '보다', '뿐', '가', '이', '가', '을', '를', '가', '에', '의', '로', '과', '와']
    tokens = [word for word in tokens if word not in josa]
    # Use a simple rule to filter out non-nouns (e.g., remove general adverbs or adjectives)
    tokens = [word for word in tokens if len(word) > 1 and not re.match(r'.*적으로|하다|있다|없다|되다|하다$', word)]
    return tokens

review_data['Processed_Text'] = review_data['Review'].fillna('').astype(str) + ' ' + review_data['Menu'].fillna('')
review_data['Tokens'] = review_data['Processed_Text'].apply(preprocess_text)

sentences = review_data['Tokens'].tolist()

model = Word2Vec(sentences, vector_size=100, window=5, min_count=1, workers=4)

def text_to_vector(tokens, model):
    vectors = [model.wv[token] for token in tokens if token in model.wv]
    if vectors:
        return np.mean(vectors, axis=0)
    else:
        return np.zeros(model.vector_size)

review_data['Vector'] = review_data['Tokens'].apply(lambda x: text_to_vector(x, model))

def find_similar_words(word, top_n=5):
    if word not in model.wv:
        return f"The word '{word}' is not in the vocabulary."
    similar_words = model.wv.most_similar(word, topn=top_n)
    return similar_words

def recommend_similar_menu_with_restaurant(menu_name, top_n=5):
    # Find the vector for the given menu
    menu_index = review_data[review_data['Menu'].str.contains(menu_name, na=False)].index
    if len(menu_index) == 0:
        return f"No menu found matching: {menu_name}"
    
    menu_vector = review_data.loc[menu_index[0], 'Vector']
    
    # Calculate cosine similarity with all other menus
    similarities = cosine_similarity([menu_vector], np.stack(review_data['Vector'].values))
    
    # Get indices of top N most similar menus (excluding the input menu itself)
    similar_indices = similarities[0].argsort()[-top_n-1:-1][::-1]
    
    # Retrieve recommended menus with restaurant information
    recommendations = review_data.iloc[similar_indices][['Restaurant', 'Menu', 'Review']]
    return recommendations

similar_words_example = find_similar_words("짜장면")
print("Words similar to '냉면':", similar_words_example)

similar_menus_with_restaurants = recommend_similar_menu_with_restaurant("물냉면")
print("Menus similar to '물냉면' with restaurant information:")
print(similar_menus_with_restaurants)

def recommend_restaurants_for_similar_words(base_word, top_n=5):
    if base_word not in model.wv:
        return f"The word '{base_word}' is not in the vocabulary."
    
    # Find top N most similar words
    similar_words = model.wv.most_similar(base_word, topn=top_n)
    similar_words_list = [word for word, _ in similar_words]
    
    # Filter menus containing similar words
    matched_menus = review_data[review_data['Tokens'].apply(lambda tokens: any(word in tokens for word in similar_words_list))]
    
    # Retrieve restaurants and menus
    recommendations = matched_menus[['Restaurant', 'Menu', 'Review']]
    return recommendations

restaurants_for_similar_words = recommend_restaurants_for_similar_words("냉면")

# Display the results
print("Restaurants and menus for words similar to '냉면':")
print(restaurants_for_similar_words)

# Function to recommend specific menus with their similarity scores and restaurant info
def recommend_specific_menu_and_restaurants(base_word, top_n=5):
    if base_word not in model.wv:
        return f"The word '{base_word}' is not in the vocabulary."
    
    # Find top N most similar words
    similar_words = model.wv.most_similar(base_word, topn=top_n)
    
    # Prepare results with similarity scores and matching restaurant info
    results = []
    for similar_word, similarity_score in similar_words:
        # Filter rows where the similar word appears in the menu or tokens
        matching_rows = review_data[review_data['Tokens'].apply(lambda tokens: similar_word in tokens)]
        
        for _, row in matching_rows.iterrows():
            results.append({
                "Similar Word": similar_word,
                "Similarity Score": similarity_score,
                "Restaurant": row['Restaurant'],
                "Menu": row['Menu'],
                "Review": row['Review']
            })
    
    # Convert to a DataFrame for better readability
    results_df = pd.DataFrame(results)
    return results_df

# Example usage: Find menus and restaurants similar to "냉면"
specific_menu_recommendations = recommend_specific_menu_and_restaurants("냉면")

# Display the results
print("Recommendations for menus and restaurants similar to '냉면':")
print(specific_menu_recommendations.to_string(index=False))


Words similar to '냉면': [('짬뽕', 0.9934630393981934), ('쫄면도', 0.9919245839118958), ('냉면이', 0.9906197786331177), ('오늘따라', 0.989641547203064), ('돈까스가', 0.9883478879928589)]
Menus similar to '물냉면' with restaurant information:
              Restaurant                                               Menu  \
43997         맘스터치-군산나운점                                                NaN   
30458  베트남쌀국수미스사이공-군산나운점                             소고기쌀국수/1(사이즈(보통)),분짜/1   
11391              나운생곱창                                        양념곱창 200g/2   
3574           뽕뜨락피자-나운점  클래식 L 1＋1/1(L토핑1(슈프림콤비),엣지1(오리지널),L토핑2(스윗고구마),...   
27901          뚜레쥬르-군산대점  애플망고 빙수(망고)/1(스푼 선택(스푼 1개)),순진우유크림빵/1,빵속에리얼초코/...   

                                                  Review  
43997                              가까워서 픽업하기 좋아서 시켜먹게 되요  
30458  미스사이공 너무 좋아해서 종종 가서 먹는데 이번에 첨으로 배달시켜봤어요 맛도 그대로...  
11391                                 길게 안쓸께요 그냥 최고입니다 👍  
3574   피자는 뽕드락이 최고 에요~ 도우가 좋아서 인지 다른 피자보다 소화도 잘 되는것 같...  
27901  아니