# import

In [None]:
%%capture
# Install the Nanum font package and rebuild the system font cache.
!sudo apt-get install -y fonts-nanum
!sudo fc-cache -fv
# Remove matplotlib's cache directory (presumably so the newly installed font
# is discovered on the next import -- TODO confirm).
!rm ~/.cache/matplotlib -rf

import matplotlib.pyplot as plt
import matplotlib.font_manager as fm
# Register the NanumGothic TTF file with matplotlib and make it the default
# font family for all figures.
fm.fontManager.addfont('/usr/share/fonts/truetype/nanum/NanumGothic.ttf')
plt.rcParams['font.family'] = 'NanumGothic'

In [None]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import MinMaxScaler
import seaborn as sns

In [None]:
# Load the business table, then report its dimensions and column names.
business = pd.read_csv('/content/drive/MyDrive/yelp_df/df2.csv')
print(business.shape, business.columns, sep='\n')

(19534, 16)
Index(['business_id', 'name', 'address', 'city', 'state', 'postal_code',
       'latitude', 'longitude', 'stars', 'review_count', 'is_open',
       'attributes', 'categories', 'hours', 'StableIndex', 'RecoveryRate'],
      dtype='object')


In [None]:
# Load the filtered review table, then report its dimensions and column names.
review = pd.read_parquet('/content/drive/MyDrive/yelp_df/review_filtered.parquet')
print(review.shape, review.columns, sep='\n')

(2731190, 9)
Index(['review_id', 'user_id', 'business_id', 'stars', 'useful', 'funny',
       'cool', 'text', 'date'],
      dtype='object')


In [None]:
# Load the user table, then report its dimensions and column names.
user = pd.read_parquet('/content/drive/MyDrive/yelp_clean/user.parquet')
print(user.shape, user.columns, sep='\n')

(1058123, 22)
Index(['user_id', 'name', 'review_count', 'yelping_since', 'useful', 'funny',
       'cool', 'elite', 'friends', 'fans', 'average_stars', 'compliment_hot',
       'compliment_more', 'compliment_profile', 'compliment_cute',
       'compliment_list', 'compliment_note', 'compliment_plain',
       'compliment_cool', 'compliment_funny', 'compliment_writer',
       'compliment_photos'],
      dtype='object')


# 신뢰 지수

## 1. 전문가

In [None]:
# Turn each business's comma-separated category string into a list of names.
# Values that are already lists are passed through unchanged: this cell
# overwrites `business['categories']` in place, and re-running the original
# `.str.split` on the resulting lists would replace every entry with NaN
# (`.str.split` returns NaN for elements that are not strings).
# Missing values become [''] exactly as `fillna('').str.split(', ')` did.
business['categories'] = business['categories'].apply(
    lambda cats: cats if isinstance(cats, list)
    else ('' if pd.isna(cats) else cats).split(', ')
)

# One row per (business, category) pair, with surrounding whitespace removed.
business_exploded = business.explode('categories')
business_exploded['categories'] = business_exploded['categories'].str.strip()
# Drop the empty category produced by businesses without a category string.
business_exploded = business_exploded[business_exploded['categories'] != '']

In [None]:
# Categories to discard; matching is done on the lower-cased category name.
remove_keywords = ['restaurants', 'food']
business_exploded = business_exploded.loc[
    lambda frame: ~frame['categories'].str.lower().isin(remove_keywords)
]

In [None]:
# Count how many businesses carry each category (most frequent first).
# `rename_axis` + `reset_index(name=...)` name both output columns explicitly,
# so the result is always ['Category', 'Count']. The previous rename map
# ({'index': 'Category', 'categories': 'Count'}) depended on the labels that
# `value_counts()` emits; on pandas >= 2.0 those are 'categories' / 'count',
# which put the category names under 'Count' and left no 'Category' column.
category_count = (
    business_exploded['categories']
    .value_counts()
    .rename_axis('Category')
    .reset_index(name='Count')
)

In [None]:
# Rank is the 1-based position of each row's index label.
category_count = category_count.assign(Rank=category_count.index + 1)
category_count.head(10)

Unnamed: 0,Count,count,Rank
0,Nightlife,3975,1
1,Bars,3896,2
2,American (Traditional),3638,3
3,Sandwiches,3073,4
4,American (New),2892,5
5,Breakfast & Brunch,2821,6
6,Pizza,2515,7
7,Italian,2039,8
8,Burgers,2001,9
9,Mexican,1924,10


In [None]:
# Attach the (exploded) categories of the reviewed business to every review;
# a review matches one row per category of its business.
df = pd.merge(
    review,
    business_exploded[['business_id', 'categories']],
    how='left',
    on='business_id',
)

In [None]:
# Number of review rows per (user, category) pair.
user_cat = (
    df.groupby(['user_id', 'categories'], as_index=False)
      .size()
      .rename(columns={'size': 'review_count'})
)

In [None]:
# Share of each user's category-level review rows that falls in each category.
user_cat = user_cat.assign(
    total_reviews=lambda t: t.groupby('user_id')['review_count'].transform('sum'),
    category_ratio=lambda t: t['review_count'] / t['total_reviews'],
)

In [None]:
# Keep, for every user, the (up to) three categories with the highest ratio.
top3 = (
    user_cat
    .sort_values(by=['user_id', 'category_ratio'], ascending=[True, False])
    .loc[lambda ranked: ranked.groupby('user_id').cumcount() < 3]
)

In [None]:
# Collapse each user's top categories into a single list-valued column.
user_expert_cats = (
    top3.groupby('user_id')['categories']
        .agg(list)
        .reset_index(name='top3_categories')
)

user_expert_cats.head()

Unnamed: 0,user_id,top3_categories
0,---2PmXbF47D870stH1jqA,"[American (New), American (Traditional), Seafood]"
1,---UgP94gokyCDuB5zUssA,"[Seafood, American (New), Bars]"
2,---r61b7EpVPkb4UVme5tA,"[Italian, Pizza, American (New)]"
3,--0Jj_J_MmUJ51f1Y394Uw,[Seafood]
4,--0S2HVJui8bEa2iVgUisg,"[American (New), American (Traditional), Bars]"


In [None]:
# One row per (business, review); businesses without a matching review are
# kept because of the left join.
biz_reviews = pd.merge(
    business,
    review[['business_id', 'user_id', 'stars']],
    how='left',
    on='business_id',
)

In [None]:
# Attach each reviewer's top-3 category list to their review rows.
biz_reviews = pd.merge(
    biz_reviews,
    user_expert_cats,
    how='left',
    on='user_id',
)

In [None]:
def is_expert_user(expert_cats, biz_cats):
    """Return 1 if the two category lists share at least one entry, else 0.

    Parameters
    ----------
    expert_cats : the reviewer's category list.
    biz_cats : the business's category list.

    Returns
    -------
    int
        1 when both arguments are ``list`` instances with a common element;
        0 otherwise (including when either argument is not a ``list``).
    """
    # Anything that is not a plain list (e.g. NaN from a left join) never matches.
    if not isinstance(expert_cats, list) or not isinstance(biz_cats, list):
        return 0

    shared = set(expert_cats).intersection(biz_cats)
    return 1 if shared else 0

In [None]:
def clean_categories(val):
    """Normalise a categories value into a list of stripped, non-empty names.

    * ``list`` input: non-string items are dropped, the rest are stripped.
    * ``str`` input: 'nan', 'none' and '' (compared case-insensitively) give
      an empty list; any other string is split on commas.
    * Any other type gives an empty list.
    """
    if isinstance(val, str):
        if val.lower() in ('nan', 'none', ''):
            return []
        pieces = val.split(',')
    elif isinstance(val, list):
        pieces = [item for item in val if isinstance(item, str)]
    else:
        return []

    cleaned = []
    for piece in pieces:
        trimmed = piece.strip()
        if trimmed:
            cleaned.append(trimmed)
    return cleaned

# Normalise every row's categories value to a clean list of names.
biz_reviews['categories'] = biz_reviews['categories'].map(clean_categories)

In [None]:
# Flag every review row whose author's top-3 categories overlap the categories
# of the reviewed business (1 = overlap, 0 = none).
# Zipping the two columns produces the same values as a row-wise
# `DataFrame.apply(axis=1)` without constructing a Series object for each of
# the millions of review rows, which makes this cell far faster.
biz_reviews['is_expert_user'] = [
    is_expert_user(expert_cats, biz_cats)
    for expert_cats, biz_cats in zip(
        biz_reviews['top3_categories'], biz_reviews['categories']
    )
]

biz_reviews.head()

Unnamed: 0,business_id,name,address,city,state,postal_code,latitude,longitude,stars_x,review_count,is_open,attributes,categories,hours,StableIndex,RecoveryRate,user_id,stars_y,top3_categories,is_expert_user
0,MTSW4McQd7CbVtyjqoe9mw,St Honore Pastries,935 Race St,Philadelphia,PA,19107.0,39.955505,-75.155564,4.0,80,1,"{'AcceptsInsurance': None, 'AgesAllowed': None...","[Restaurants, Food, Bubble Tea, Coffee & Tea, ...","{'Friday': '7:0-21:0', 'Monday': '7:0-20:0', '...",57.431377,0.0,Q5I2xTcaQ22bmE_mp2q_Rw,4.0,"[Italian, Bars, Nightlife]",0
1,MTSW4McQd7CbVtyjqoe9mw,St Honore Pastries,935 Race St,Philadelphia,PA,19107.0,39.955505,-75.155564,4.0,80,1,"{'AcceptsInsurance': None, 'AgesAllowed': None...","[Restaurants, Food, Bubble Tea, Coffee & Tea, ...","{'Friday': '7:0-21:0', 'Monday': '7:0-20:0', '...",57.431377,0.0,0_FWb5d-EkJVaP0GAr0gcQ,4.0,"[Pizza, American (Traditional), Breakfast & Br...",0
2,MTSW4McQd7CbVtyjqoe9mw,St Honore Pastries,935 Race St,Philadelphia,PA,19107.0,39.955505,-75.155564,4.0,80,1,"{'AcceptsInsurance': None, 'AgesAllowed': None...","[Restaurants, Food, Bubble Tea, Coffee & Tea, ...","{'Friday': '7:0-21:0', 'Monday': '7:0-20:0', '...",57.431377,0.0,nnwBdqGHIAJQ5QX9lHOtrQ,3.0,"[Chinese, Bars, Nightlife]",0
3,MTSW4McQd7CbVtyjqoe9mw,St Honore Pastries,935 Race St,Philadelphia,PA,19107.0,39.955505,-75.155564,4.0,80,1,"{'AcceptsInsurance': None, 'AgesAllowed': None...","[Restaurants, Food, Bubble Tea, Coffee & Tea, ...","{'Friday': '7:0-21:0', 'Monday': '7:0-20:0', '...",57.431377,0.0,0_FWb5d-EkJVaP0GAr0gcQ,4.0,"[Pizza, American (Traditional), Breakfast & Br...",0
4,MTSW4McQd7CbVtyjqoe9mw,St Honore Pastries,935 Race St,Philadelphia,PA,19107.0,39.955505,-75.155564,4.0,80,1,"{'AcceptsInsurance': None, 'AgesAllowed': None...","[Restaurants, Food, Bubble Tea, Coffee & Tea, ...","{'Friday': '7:0-21:0', 'Monday': '7:0-20:0', '...",57.431377,0.0,nDFRVVcNLLUt-F_s0yBIPA,3.0,"[Bakeries, Bubble Tea, Burgers]",1


각 행은 한 명의 유저가 특정 식당(business)에 남긴 리뷰 하나를 의미하며, `is_expert_user` 컬럼은 그 리뷰가 전문가 리뷰인지 여부를 나타낸다.

- 1 : 유저의 top3 전문 카테고리 중 하나가 식당 카테고리에 포함 -> 전문가 리뷰
- 0 : 일치하는 카테고리 없음 -> 일반 리뷰

## 2. 엘리트

In [None]:
# Bring each reviewer's `elite` field onto their review rows.
biz_reviews = pd.merge(
    biz_reviews,
    user[['user_id', 'elite']],
    how='left',
    on='user_id',
)

In [None]:
# 1 when the stringified `elite` value is longer than two characters and is
# not one of the missing-value spellings ('nan', 'none', ''); otherwise 0.
biz_reviews['is_elite_user'] = (
    biz_reviews['elite']
    .astype(str)
    .pipe(lambda s: (s.str.len() > 2) & ~s.str.lower().isin(['nan', 'none', '']))
    .astype('int64')
)

## 3. 리뷰 활동량

In [None]:
# Independent copy of the two columns needed for the activity score.
user_activity = user.loc[:, ['user_id', 'review_count']].copy()

In [None]:
# Min-max scale each user's review_count into the activity score.
scaler = MinMaxScaler().fit(user_activity[['review_count']])
user_activity['activity_score'] = scaler.transform(user_activity[['review_count']])

In [None]:
# Attach each reviewer's activity score to their review rows.
biz_reviews = pd.merge(
    biz_reviews,
    user_activity[['user_id', 'activity_score']],
    how='left',
    on='user_id',
)

biz_reviews.head()

Unnamed: 0,business_id,name,address,city,state,postal_code,latitude,longitude,stars_x,review_count,...,hours,StableIndex,RecoveryRate,user_id,stars_y,top3_categories,is_expert_user,elite,is_elite_user,activity_score
0,MTSW4McQd7CbVtyjqoe9mw,St Honore Pastries,935 Race St,Philadelphia,PA,19107.0,39.955505,-75.155564,4.0,80,...,"{'Friday': '7:0-21:0', 'Monday': '7:0-20:0', '...",57.431377,0.0,Q5I2xTcaQ22bmE_mp2q_Rw,4.0,"[Italian, Bars, Nightlife]",0,200820092010,1,0.004407
1,MTSW4McQd7CbVtyjqoe9mw,St Honore Pastries,935 Race St,Philadelphia,PA,19107.0,39.955505,-75.155564,4.0,80,...,"{'Friday': '7:0-21:0', 'Monday': '7:0-20:0', '...",57.431377,0.0,0_FWb5d-EkJVaP0GAr0gcQ,4.0,"[Pizza, American (Traditional), Breakfast & Br...",0,20082009,1,0.00578
2,MTSW4McQd7CbVtyjqoe9mw,St Honore Pastries,935 Race St,Philadelphia,PA,19107.0,39.955505,-75.155564,4.0,80,...,"{'Friday': '7:0-21:0', 'Monday': '7:0-20:0', '...",57.431377,0.0,nnwBdqGHIAJQ5QX9lHOtrQ,3.0,"[Chinese, Bars, Nightlife]",0,"2008,2009,2010,2011,2012,2013,2014,2015,2016,2...",1,0.190408
3,MTSW4McQd7CbVtyjqoe9mw,St Honore Pastries,935 Race St,Philadelphia,PA,19107.0,39.955505,-75.155564,4.0,80,...,"{'Friday': '7:0-21:0', 'Monday': '7:0-20:0', '...",57.431377,0.0,0_FWb5d-EkJVaP0GAr0gcQ,4.0,"[Pizza, American (Traditional), Breakfast & Br...",0,20082009,1,0.00578
4,MTSW4McQd7CbVtyjqoe9mw,St Honore Pastries,935 Race St,Philadelphia,PA,19107.0,39.955505,-75.155564,4.0,80,...,"{'Friday': '7:0-21:0', 'Monday': '7:0-20:0', '...",57.431377,0.0,nDFRVVcNLLUt-F_s0yBIPA,3.0,"[Bakeries, Bubble Tea, Burgers]",1,,0,0.002232


## 최종

In [None]:
# Per-business means of the three review-level signals.
Reliability = biz_reviews.groupby('business_id', as_index=False).agg(
    expert_ratio=('is_expert_user', 'mean'),
    elite_ratio=('is_elite_user', 'mean'),
    user_activity_score=('activity_score', 'mean'),
)

In [None]:
# ln(1 + 9r) / ln(10), i.e. log10(1 + 9r): equals 0 at r = 0 and 1 at r = 1.
Reliability = Reliability.assign(
    expert_adjusted=lambda r: np.log1p(r['expert_ratio'] * 9) / np.log(10)
)

In [None]:
# Min-max scale the three component columns (each column independently).
scaler = MinMaxScaler().fit(
    Reliability[['expert_adjusted', 'elite_ratio', 'user_activity_score']]
)
Reliability[['expert_scaled', 'elite_scaled', 'activity_scaled']] = scaler.transform(
    Reliability[['expert_adjusted', 'elite_ratio', 'user_activity_score']]
)

In [None]:
# Weighted sum of the scaled components: expert 40, elite 30, activity 30.
Reliability['reliability_score'] = (
    40 * Reliability['expert_scaled']
    + 30 * Reliability['elite_scaled']
    + 30 * Reliability['activity_scaled']
)

In [None]:
# Stretch the weighted score so its observed minimum/maximum map to 0 and 100.
Reliability[['reliability_score']] = MinMaxScaler(feature_range=(0, 100)).fit_transform(
    Reliability[['reliability_score']]
)

In [None]:
# Add the scaled components and the final score to the business table.
biz_with_reliability = pd.merge(
    business,
    Reliability[['business_id', 'expert_scaled', 'elite_scaled', 'activity_scaled', 'reliability_score']],
    how='left',
    on='business_id',
)

In [None]:
# Write the enriched business table to the working directory, without the index.
biz_with_reliability.to_csv(path_or_buf='business_reliability.csv', index=False)

In [None]:
# Hand the exported CSV to the Colab file-download helper.
from google.colab import files
files.download('business_reliability.csv')

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

In [None]:
# Read the exported CSV back in (note: this rebinds the name `df`).
df = pd.read_csv(filepath_or_buffer='/content/business_reliability.csv')

In [None]:
# Preview the first five rows of the reloaded file.
df.head(5)

Unnamed: 0,business_id,name,address,city,state,postal_code,latitude,longitude,stars,review_count,is_open,attributes,categories,hours,StableIndex,RecoveryRate,expert_scaled,elite_scaled,activity_scaled,reliability_score
0,MTSW4McQd7CbVtyjqoe9mw,St Honore Pastries,935 Race St,Philadelphia,PA,19107.0,39.955505,-75.155564,4.0,80,1,"{'AcceptsInsurance': None, 'AgesAllowed': None...","['Restaurants', 'Food', 'Bubble Tea', 'Coffee ...","{'Friday': '7:0-21:0', 'Monday': '7:0-20:0', '...",57.431377,0.0,0.768051,0.538793,0.266023,64.011654
1,il_Ro8jwPlHresjw9EGmBg,Denny's,8901 US 31 S,Indianapolis,IN,46227.0,39.637133,-86.127217,2.5,28,1,"{'AcceptsInsurance': None, 'AgesAllowed': None...","['American (Traditional)', 'Restaurants', 'Din...","{'Friday': '6:0-22:0', 'Monday': '6:0-22:0', '...",26.924359,0.0,0.942436,0.179598,0.027779,48.460135
2,0bPLkL0QhhPO5kt1_EXmNQ,Zio's Italian Market,2575 E Bay Dr,Largo,FL,33771.0,27.916116,-82.760461,4.5,100,0,"{'AcceptsInsurance': None, 'AgesAllowed': None...","['Food', 'Delis', 'Italian', 'Bakeries', 'Rest...","{'Friday': '10:0-20:0', 'Monday': '10:0-18:0',...",60.376124,0.0,0.819792,0.186714,0.075608,43.832936
3,MUTTqe8uqyMdBl186RmNeA,Tuna Bar,205 Race St,Philadelphia,PA,19106.0,39.953949,-75.143226,4.0,245,1,"{'AcceptsInsurance': None, 'AgesAllowed': None...","['Sushi Bars', 'Restaurants', 'Japanese']","{'Friday': '13:30-23:0', 'Monday': None, 'Satu...",65.223712,0.2,0.692318,0.304167,0.078784,41.730439
4,ROeacJQwBeh05Rqg7F6TCg,BAP,1224 South St,Philadelphia,PA,19147.0,39.943223,-75.162568,4.5,205,1,"{'AcceptsInsurance': None, 'AgesAllowed': None...","['Korean', 'Restaurants']","{'Friday': '11:30-20:30', 'Monday': '11:30-20:...",74.454651,0.125,0.618953,0.325521,0.195003,43.424521
