# Yelp Dataset Processing - 100 Records Test
Xử lý dữ liệu Yelp theo format yêu cầu đề tài

In [2]:
import json
import os
from datetime import datetime

import pandas as pd

In [3]:
# Path configuration - CHANGE THIS to point at the local copy of the dataset
DATA_PATH = "Yelp/yelp_dataset/"

# Number of records to load from each file
N_RECORDS = 100

# Full path of each input file, keyed by a short dataset name
FILE_PATHS = dict(
    business=f"{DATA_PATH}yelp_academic_dataset_business.json",
    review=f"{DATA_PATH}yelp_academic_dataset_review.json",
    user=f"{DATA_PATH}yelp_academic_dataset_user.json",
)

In [6]:
def load_json_lines(filepath, n_records=100):
    """Load up to ``n_records`` objects from a JSON Lines file into a DataFrame.

    Each non-blank line is parsed as one JSON document. Blank lines are
    skipped. Lines that fail to parse are reported on stdout with their
    1-based line number and skipped; they do not count towards ``n_records``.
    A summary line with the number of loaded records is printed at the end.

    Args:
        filepath: Path of the file to read, as a ``str`` or ``os.PathLike``.
        n_records: Maximum number of successfully parsed records to keep.
            ``None`` reads the whole file.

    Returns:
        A ``pandas.DataFrame`` built from the parsed records (empty when
        nothing was parsed).
    """
    records = []
    # 'utf-8-sig' decodes plain UTF-8 unchanged but also drops a leading BOM,
    # which would otherwise make the first record fail to parse.
    with open(filepath, 'r', encoding='utf-8-sig') as f:
        for line_no, line in enumerate(f, start=1):
            if n_records is not None and len(records) >= n_records:
                break

            line = line.strip()
            if not line:  # Skip empty lines
                continue

            try:
                records.append(json.loads(line))
            except json.JSONDecodeError as e:
                # Best effort: report the bad line and keep reading.
                print(f"⚠️ Error at line {line_no}: {e}")

    df = pd.DataFrame(records)
    # basename handles OS-native separators and Path objects, unlike split('/').
    filename = os.path.basename(os.fspath(filepath))
    print(f"✅ Loaded {len(df)} records from {filename}")
    return df

## 1. Load Business Data

In [7]:
# Read the first N_RECORDS businesses and show which columns came back
df_business = load_json_lines(FILE_PATHS['business'], N_RECORDS)
print(f"Columns: {df_business.columns.tolist()}")
df_business.head(5)

⚠️ Error at line 1: Expecting property name enclosed in double quotes: line 1 column 2 (char 1)
✅ Loaded 100 records from yelp_academic_dataset_business.json
Columns: ['business_id', 'name', 'address', 'city', 'state', 'postal_code', 'latitude', 'longitude', 'stars', 'review_count', 'is_open', 'attributes', 'categories', 'hours']


Unnamed: 0,business_id,name,address,city,state,postal_code,latitude,longitude,stars,review_count,is_open,attributes,categories,hours
0,Pns2l4eNsfO8kk83dixA6A,"Abby Rappoport, LAC, CMQ","1616 Chapala St, Ste 2",Santa Barbara,CA,93101,34.426679,-119.711197,5.0,7,0,{'ByAppointmentOnly': 'True'},"Doctors, Traditional Chinese Medicine, Naturop...",
1,mpf3x-BjTdTEA3yCZrAYPw,The UPS Store,87 Grasso Plaza Shopping Center,Affton,MO,63123,38.551126,-90.335695,3.0,15,1,{'BusinessAcceptsCreditCards': 'True'},"Shipping Centers, Local Services, Notaries, Ma...","{'Monday': '0:0-0:0', 'Tuesday': '8:0-18:30', ..."
2,tUFrWirKiKi_TAnsVWINQQ,Target,5255 E Broadway Blvd,Tucson,AZ,85711,32.223236,-110.880452,3.5,22,0,"{'BikeParking': 'True', 'BusinessAcceptsCredit...","Department Stores, Shopping, Fashion, Home & G...","{'Monday': '8:0-22:0', 'Tuesday': '8:0-22:0', ..."
3,MTSW4McQd7CbVtyjqoe9mw,St Honore Pastries,935 Race St,Philadelphia,PA,19107,39.955505,-75.155564,4.0,80,1,"{'RestaurantsDelivery': 'False', 'OutdoorSeati...","Restaurants, Food, Bubble Tea, Coffee & Tea, B...","{'Monday': '7:0-20:0', 'Tuesday': '7:0-20:0', ..."
4,mWMc6_wTdE0EUBKIGXDVfA,Perkiomen Valley Brewery,101 Walnut St,Green Lane,PA,18054,40.338183,-75.471659,4.5,13,1,"{'BusinessAcceptsCreditCards': 'True', 'Wheelc...","Brewpubs, Breweries, Food","{'Wednesday': '14:0-22:0', 'Thursday': '16:0-2..."


## 2. Load Review Data

In [8]:
# Read the first N_RECORDS reviews and show which columns came back
df_review = load_json_lines(FILE_PATHS['review'], N_RECORDS)
print(f"Columns: {df_review.columns.tolist()}")
df_review.head(5)

✅ Loaded 100 records from yelp_academic_dataset_review.json
Columns: ['review_id', 'user_id', 'business_id', 'stars', 'useful', 'funny', 'cool', 'text', 'date']


Unnamed: 0,review_id,user_id,business_id,stars,useful,funny,cool,text,date
0,KU_O5udG6zpxOg-VcAEodg,mh_-eMZ6K5RLWhZyISBhwA,XQfwVwDr-v0ZS3_CbbE5Xw,3.0,0,0,0,"If you decide to eat here, just be aware it is...",2018-07-07 22:09:11
1,BiTunyQ73aT9WBnpR9DZGw,OyoGAe7OKpv6SyGZT5g77Q,7ATYjTIgM3jUlt4UM3IypQ,5.0,1,0,1,I've taken a lot of spin classes over the year...,2012-01-03 15:28:18
2,saUsX_uimxRlCVr67Z4Jig,8g_iMtfSiwikVnbP2etR0A,YjUWPpI6HXG530lwP-fb2A,3.0,0,0,0,Family diner. Had the buffet. Eclectic assortm...,2014-02-05 20:30:30
3,AqPFMleE6RsU23_auESxiA,_7bHUi9Uuf5__HHc_Q8guQ,kxX2SOes4o-D3ZQBkiMRfA,5.0,1,0,1,"Wow! Yummy, different, delicious. Our favo...",2015-01-04 00:01:03
4,Sx8TMOWLNuJBWer-0pcmoA,bcjbaE6dDog4jkNY91ncLQ,e4Vwtrqf-wpJfwesgvdgxQ,4.0,1,0,1,Cute interior and owner (?) gave us tour of up...,2017-01-14 20:54:15


## 3. Load User Data

In [9]:
# Read the first N_RECORDS users
df_user = load_json_lines(FILE_PATHS['user'], N_RECORDS)

# Expose 'yelping_since' under the name 'since' (the original column is kept)
if 'yelping_since' in df_user:
    df_user['since'] = df_user['yelping_since']

print(f"Columns: {df_user.columns.tolist()}")
df_user.head(5)

✅ Loaded 100 records from yelp_academic_dataset_user.json
Columns: ['user_id', 'name', 'review_count', 'yelping_since', 'useful', 'funny', 'cool', 'elite', 'friends', 'fans', 'average_stars', 'compliment_hot', 'compliment_more', 'compliment_profile', 'compliment_cute', 'compliment_list', 'compliment_note', 'compliment_plain', 'compliment_cool', 'compliment_funny', 'compliment_writer', 'compliment_photos', 'since']


Unnamed: 0,user_id,name,review_count,yelping_since,useful,funny,cool,elite,friends,fans,...,compliment_profile,compliment_cute,compliment_list,compliment_note,compliment_plain,compliment_cool,compliment_funny,compliment_writer,compliment_photos,since
0,qVc8ODYU5SZjKXVBgXdI7w,Walker,585,2007-01-25 16:47:26,7217,1259,5994,2007,"NSCy54eWehBJyZdG2iE84w, pe42u7DcCH2QmI81NX-8qA...",267,...,55,56,18,232,844,467,467,239,180,2007-01-25 16:47:26
1,j14WgRoU_-2ZE1aw1dXrJg,Daniel,4333,2009-01-25 04:35:42,43091,13066,27281,"2009,2010,2011,2012,2013,2014,2015,2016,2017,2...","ueRPE0CX75ePGMqOFVj6IQ, 52oH4DrRvzzl8wh5UXyU0A...",3138,...,184,157,251,1847,7054,3131,3131,1521,1946,2009-01-25 04:35:42
2,2WnXYQFK0hXEoTxPtV2zvg,Steph,665,2008-07-25 10:41:00,2086,1010,1003,20092010201120122013,"LuO3Bn4f3rlhyHIaNfTlnA, j9B4XdHUhDfTKVecyWQgyA...",52,...,10,17,3,66,96,119,119,35,18,2008-07-25 10:41:00
3,SZDeASXq7o05mMNLshsdIA,Gwen,224,2005-11-29 04:38:33,512,330,299,200920102011,"enx1vVPnfdNUdPho6PH_wg, 4wOcvMLtU6a9Lslggq74Vg...",28,...,1,6,2,12,16,26,26,10,9,2005-11-29 04:38:33
4,hA5lMy-EnncsH4JoR-hFGQ,Karen,79,2007-01-05 19:40:59,29,15,7,,"PBK4q9KEEBHhFvSXCUirIw, 3FWPpM7KU1gXeOM_ZbYMbA...",1,...,0,0,0,1,1,0,0,0,0,2007-01-05 19:40:59


## 4. Tạo Training Dataset
Format: review (text) + label (0/1/2)
- Label 0: Tiêu cực (stars 1-2)
- Label 1: Tích cực (stars 4-5)
- Label 2: Trung lập (stars 3)

In [10]:
def _sentiment_label(stars):
    """Map a star rating to a class: <= 2 gives 0, exactly 3 gives 2, anything else gives 1."""
    if stars <= 2:
        return 0
    if stars == 3:
        return 2
    return 1


# Create sentiment labels from the star rating
df_review['label'] = df_review['stars'].apply(_sentiment_label)

# Create the training dataset: review text plus its label
df_training = df_review[['text', 'label']].rename(columns={'text': 'review'})

label_counts = df_training['label'].value_counts().sort_index()
print(f"\nTraining dataset: {len(df_training)} records")
print("\nLabel distribution:")
print(label_counts)

df_training.head(5)


Training dataset: 100 records

Label distribution:
label
0    12
1    76
2    12
Name: count, dtype: int64


Unnamed: 0,review,label
0,"If you decide to eat here, just be aware it is...",2
1,I've taken a lot of spin classes over the year...,1
2,Family diner. Had the buffet. Eclectic assortm...,2
3,"Wow! Yummy, different, delicious. Our favo...",1
4,Cute interior and owner (?) gave us tour of up...,1


## 5. Kiểm tra dữ liệu

In [11]:
# Banner around the report title
banner = "=" * 80
print(banner)
print("SUMMARY")
print(banner)

# Row counts of every frame built so far
record_lines = [
    f"Business records: {len(df_business)}",
    f"Review records: {len(df_review)}",
    f"User records: {len(df_user)}",
    f"Training records: {len(df_training)}",
]
print("\n".join(record_lines))

# Display name for each sentiment class id
label_names = {0: 'Tiêu cực', 1: 'Tích cực', 2: 'Trung lập'}

print("\nSentiment distribution:")
sentiment_counts = df_training['label'].value_counts().sort_index()
for label, count in sentiment_counts.items():
    print(f"  {label_names[label]} ({label}): {count}")

SUMMARY
Business records: 100
Review records: 100
User records: 100
Training records: 100

Sentiment distribution:
  Tiêu cực (0): 12
  Tích cực (1): 76
  Trung lập (2): 12


## 6. Export dữ liệu

In [12]:
import os

OUTPUT_DIR = "processed_data/"
os.makedirs(OUTPUT_DIR, exist_ok=True)

# User columns to export; only those actually present in df_user are written
user_cols = ['user_id', 'name', 'review_count', 'since', 'useful', 'fans', 'average_stars']
available_cols = [col for col in user_cols if col in df_user.columns]

# (file name, source frame, columns to keep or None for all), in export order
export_plan = [
    ('processed_business.csv', df_business, None),
    ('processed_review.csv', df_review,
     ['review_id', 'user_id', 'business_id', 'stars', 'date', 'text', 'useful']),
    ('processed_user.csv', df_user, available_cols),
    ('training_data.csv', df_training, None),
]

for filename, frame, columns in export_plan:
    # Column selection happens right before writing, one file at a time
    subset = frame if columns is None else frame[columns]
    subset.to_csv(OUTPUT_DIR + filename, index=False)
    print(f"✅ Exported: {OUTPUT_DIR}{filename}")

print(f"\n📁 All files saved to: {OUTPUT_DIR}")

✅ Exported: processed_data/processed_business.csv
✅ Exported: processed_data/processed_review.csv
✅ Exported: processed_data/processed_user.csv
✅ Exported: processed_data/training_data.csv

📁 All files saved to: processed_data/
