In [1]:
import pandas as pd
import numpy as np
import re
import sys
import ast

# Lift NumPy's summarization threshold so printed arrays are never elided.
np.set_printoptions(threshold=sys.maxsize)

# Load the source CSV into the working DataFrame used by the following cells.
df = pd.read_csv(filepath_or_buffer='./data/p_combined_modified.csv')

In [2]:
## Category ##
def string_to_list(s):
    """Split a comma-separated string into a list of cleaned tokens.

    Surrounding whitespace and any leading/trailing square brackets are
    removed from the whole string first. Each comma-separated fragment is
    then stripped of whitespace and of surrounding single/double quotes.

    Args:
        s: The raw string, e.g. "[a, 'b']" or "a, b".

    Returns:
        A list of non-empty tokens. Empty input (or input that contains only
        brackets, commas, quotes and whitespace) yields an empty list.
    """
    # Strip whitespace before the brackets: " [a, b] " would otherwise keep
    # its brackets because the outermost characters are spaces.
    inner = s.strip().strip("[]")
    tokens = (part.strip().strip("'\"") for part in inner.split(','))
    # "".split(',') returns [''], so drop empty fragments instead of
    # emitting a phantom '' entry.
    return [token for token in tokens if token]

def convert_to_list_or_string(x):
    """Parse a raw cell value into a Python list where possible.

    Args:
        x: The raw cell value. Only ``str`` values are parsed.

    Returns:
        * ``x`` unchanged when it is not a string.
        * The result of ``ast.literal_eval(x)`` when ``x`` contains both
          '[' and ']' and is a valid Python literal.
        * Otherwise the result of ``string_to_list(x)``.
    """
    if not isinstance(x, str):
        return x

    if '[' in x and ']' in x:
        try:
            return ast.literal_eval(x)
        except (ValueError, SyntaxError, TypeError):
            # literal_eval raises SyntaxError for text that is not valid
            # Python at all (e.g. "[a b]"), ValueError for valid syntax that
            # is not a literal (e.g. "[a, b]"), and TypeError for some
            # malformed literals. All of them mean "not a literal", so fall
            # through to the plain comma split below.
            pass

    return string_to_list(x)

# Parse every raw category cell; non-string cells pass through unchanged.
df['category'] = df['category'].apply(func=convert_to_list_or_string)

In [3]:
## Services/Parking ##
# Treat a missing service list as the literal '[]' so that every cell can be
# parsed into a Python list.
df['services'] = df['services'].fillna('[]').apply(ast.literal_eval)

# Unify the two valet-parking spellings under '발레파킹'.
df['services'] = df['services'].apply(
    lambda entries: [entry if entry != '발렛파킹' else '발레파킹' for entry in entries]
)

def extract_parking_details(parking):
    """Split a raw parking value into ``(status, detail)``.

    Args:
        parking: The raw parking cell value.

    Returns:
        A 2-tuple:
        * ``('불가', None)`` when the value is null.
        * ``(parking, None)`` when the value is non-null but not a string.
        * ``(status, detail)`` when the stripped string starts with one of
          '가능', '불가' or '발레 파킹' followed by a parenthesised note;
          ``detail`` is the text inside the first pair of parentheses.
        * ``(stripped_string, None)`` for any other string.
    """
    if pd.isnull(parking):
        return '불가', None
    if not isinstance(parking, str):
        return parking, None

    text = parking.strip()
    # re.match anchors at the start only, so anything after the closing
    # parenthesis is ignored.
    found = re.match(r'(가능|불가|발레 파킹)\s*\(([^)]+)\)', text)
    if found is None:
        return text, None

    status, detail = found.groups()
    return status, detail

# Replace the raw parking text with its status and store the parenthesised
# note (if any) in a new 'parkingDetail' column.
df[['parking', 'parkingDetail']] = df['parking'].apply(
    lambda raw: pd.Series(extract_parking_details(raw))
)

def process_parking_and_services(row):
    """Fold a valet-style parking value into the services list.

    When ``row['parking']`` is a string containing '발레', parking becomes
    '가능' and '발레파킹' is added to the services if it is not already there.
    Any other row is returned as-is.

    Args:
        row: A mapping-like object (e.g. a DataFrame row) with 'parking' and
            'services' entries; 'services' is expected to be a list.

    Returns:
        A ``(parking, services)`` tuple. The input services list is never
        modified; a new list is returned when an entry has to be added.
    """
    parking = row['parking']
    services = row['services']

    if isinstance(parking, str) and '발레' in parking:
        parking = '가능'
        if '발레파킹' not in services:
            # Build a new list instead of appending in place: the list in
            # `row` is the same object stored in the DataFrame, so append()
            # would silently mutate the caller's data.
            services = [*services, '발레파킹']

    return parking, services

# Run the valet-parking reconciliation row by row and write both results back.
df[['parking', 'services']] = df.apply(
    lambda record: pd.Series(process_parking_and_services(record)),
    axis=1,
)

In [4]:
## Price ##
# Trim the priceRange_b labels so the exact-match tests below can hit.
df['priceRange_b'] = df['priceRange_b'].str.strip()

# Rules are listed in priority order; np.select uses the first rule that is
# True for a row. The four priceRange_b rules come first, and the four
# priceRange_m rules apply only where priceRange_b is missing.
conditions = [
    df['priceRange_b'].isin(['1만원 미만', '1~2만원대']),
    df['priceRange_b'] == '2~5만원대',
    df['priceRange_b'].isin(['5~10만원대', '10~15만원대']),
    df['priceRange_b'].isin(['15~25만원대', '25만원 이상']),
] + [
    df['priceRange_b'].isna() & (df['priceRange_m'] == won_symbols)
    for won_symbols in ('₩', '₩₩', '₩₩₩', '₩₩₩₩')
]

# One output tier per rule, in the same order as `conditions`.
values = ['$', '$$', '$$$', '$$$$'] * 2

# Rows that match no rule get an empty string.
df['price'] = np.select(conditions, values, default='')

In [5]:
# Distinct labels across all rows, flattened from the per-row lists and sorted.
category = sorted({label for labels in df['category'] for label in labels})
services = sorted({item for items in df['services'] for item in items})

# Distinct values of the scalar columns, in order of first appearance.
parking, price, ribbon_types, star_types = (
    df[column].unique()
    for column in ('parking', 'price', 'ribbonType', 'michelinType')
)

# Print each collection together with its Python type.
print(f"\n[category]\n{category}\ntype:{type(category)}")
print(f"\n[services]\n{services}\ntype:{type(services)}")
print(f"\n[parking]\n{parking}\ntype:{type(parking)}")
print(f"\n[price]\n{price}\ntype:{type(price)}")
print(f"\n[ribbon_types]\n{ribbon_types}\ntype:{type(ribbon_types)}")
print(f"\n[star_types]\n{star_types}\ntype:{type(star_types)}")


[category]
['LP바', '가맥집', '가오리', '가이세키', '가자미', '각재기', '간자미', '갈비탕', '갈치', '감자옹심이', '감자탕', '갓포요리', '개성음식', '갯장어', '게국지', '게장', '경상도음식', '경양식', '고기국수', '고등어', '고래', '고로케', '곤드레밥', '골뱅이', '곰장어', '곰치', '곰탕', '곱창국밥', '곱창전골', '과메기', '과일카페', '광동식중식', '구움과자', '국수', '굴', '굴국밥', '굴밥', '굴비', '그리스식', '그릴', '기타', '김밥', '김치찌개', '김치찜', '꼬리곰탕', '꼬막', '꽃게', '꽈배기', '꾹저구', '꿩', '낙지', '남도음식', '남아프리카공화국식', '냉면', '네팔식', '녹차', '뇨키', '뉴아메리칸', '뉴코리안', '다금바리', '다슬기', '다이닝바', '다이닝펍', '다이어트식', '단팥죽', '닭갈비', '닭강정', '닭개장', '닭곰탕', '닭구이', '닭꼬치', '닭내장', '닭도가니탕', '닭똥집', '닭무침', '닭발', '닭백숙', '닭볶음탕', '닭수육', '닭한마리', '대게', '대구', '대구탕', '대만식중식', '대통밥', '대하구이', '더덕', '덕자', '덮밥', '데판야키', '덴푸라', '델리', '도가니탕', '도넛', '도다리쑥국', '도루묵', '도리뱅뱅이', '독일식', '돈가스', '돔베고기', '동남아시아식', '동북식중식', '동태', '돼지갈비', '돼지갈비찜', '돼지고기구이', '돼지고기찌개', '돼지곰탕', '돼지곱창', '돼지국밥', '돼지껍데기', '돼지두루치기', '돼지등갈비', '돼지떡갈비', '돼지막창', '돼지불고기', '돼지족탕', '된장찌개', '두부', '두부두루치기', '두부전골', '디저트/차/베이커리', '디저트카페', '딤섬', '떡', '떡국', '떡볶이', '떡카페', '떡케이크', '라멘', '라면', '라오스식', '라운지바',

In [6]:
# Trim leading/trailing whitespace from the free-text columns.
for _text_column in ('businessHours', 'dayOff', 'menu', 'review_b', 'review_m'):
    df[_text_column] = df[_text_column].str.strip()

In [7]:
# Replace missing values with a single space, one column at a time.
# Note that 'services' is not part of this list.
for _column in (
    'id', 'name', 'category', 'phone', 'address',
    'latitude', 'longitude',
    'businessHours', 'dayOff',
    'parking', 'parkingDetail',
    'menu', 'price',
    'review_b', 'review_m',
    'ribbonType', 'michelinType',
):
    df[_column] = df[_column].fillna(' ')

In [8]:
# Keep only the output columns, in export order.
df = df[[
    'id', 'name', 'category', 'phone', 'address',
    'latitude', 'longitude',
    'businessHours', 'dayOff',
    'services', 'parking', 'parkingDetail',
    'menu', 'price',
    'review_b', 'review_m',
    'ribbonType', 'michelinType',
]]

# Write the result without the index column.
df.to_csv(path_or_buf='./data/processed.csv', index=False)