# Label Encoding



> business_id, review_id, user_id에 적용



## Dataset open

In [None]:
# Mount Google Drive at /content/drive so the dataset files under MyDrive
# (read in the cells below) are reachable from this Colab runtime.
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
# Install polars into the notebook runtime; it is imported in the next cell.
!pip install polars



In [None]:
import polars as pl
import pandas as pd

In [None]:
# Load the Yelp review dump (newline-delimited JSON) with polars.
review_df = pl.read_ndjson("/content/drive/MyDrive/ASAC 5기/woowahan/dataset/yelp_academic_dataset_review.json")

In [None]:
#review_df = pl.read_csv("/content/drive/MyDrive/ASAC 5기/woowahan/dataset/review_filter_final.csv")

In [None]:
# Load the Yelp user dump (newline-delimited JSON) with polars.
user_df = pl.read_ndjson("/content/drive/MyDrive/ASAC 5기/woowahan/dataset/yelp_academic_dataset_user.json")

In [None]:
# Load the Yelp business dump (newline-delimited JSON) with polars.
biz_df = pl.read_ndjson('/content/drive/MyDrive/ASAC 5기/woowahan/dataset/yelp_academic_dataset_business.json')

In [None]:
#biz_df = pd.read_excel('/content/drive/MyDrive/ASAC 5기/woowahan/dataset/business_add_category_v2_final.xlsx')

In [None]:
# Convert the polars frame to pandas; the later cells use the pandas API
# (nunique, in-place column assignment).
review_df=review_df.to_pandas()

In [None]:
# Convert the polars user frame to pandas for the pandas-based cells below.
user_df=user_df.to_pandas()

In [None]:
# Convert the polars business frame to pandas for the pandas-based cells below.
biz_df=biz_df.to_pandas()

In [None]:
# Show column names, dtypes and non-null counts of the business table.
biz_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 62590 entries, 0 to 62589
Data columns (total 22 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   Unnamed: 0    62590 non-null  int64  
 1   business_id   62590 non-null  object 
 2   name          62590 non-null  object 
 3   address       61900 non-null  object 
 4   city          62590 non-null  object 
 5   state         62590 non-null  object 
 6   postal_code   62564 non-null  object 
 7   latitude      62590 non-null  float64
 8   longitude     62590 non-null  float64
 9   stars         62590 non-null  float64
 10  review_count  62590 non-null  int64  
 11  is_open       62590 non-null  int64  
 12  attributes    61761 non-null  object 
 13  categories    62590 non-null  object 
 14  hours         53603 non-null  object 
 15  main1         62590 non-null  object 
 16  main2         26993 non-null  object 
 17  main1_sub1    60984 non-null  object 
 18  main1_sub2    31112 non-nu

In [None]:
# Drop the stray 'Unnamed: 0' column (presumably a saved index picked up when
# the table is read back from the Excel/CSV export -- confirm against that file).
# errors='ignore' makes this a no-op when the column is absent, e.g. when
# biz_df was loaded from the raw ndjson dump, and keeps the cell re-runnable.
biz_df = biz_df.drop(columns=['Unnamed: 0'], errors='ignore')

In [None]:
# Preview the first rows of the review table.
review_df.head()

## 테이블별 고유 ID 수(카테고리 수) 확인

In [None]:
# Report the number of distinct user / business IDs found in each table.
# Each entry is (label to print, source frame, ID column to count).
_id_count_targets = (
    ("유저 테이블 유저 수 : ", user_df, 'user_id'),
    ("리뷰 테이블 유저 수 : ", review_df, 'user_id'),
    ("비즈니스 테이블 비즈니스 수 : ", biz_df, 'business_id'),
    ("리뷰 테이블 비즈니스 수 : ", review_df, 'business_id'),
)

for _label, _frame, _id_column in _id_count_targets:
    print(_label)
    print(_frame[_id_column].nunique())

유저 테이블 유저 수 : 
1987897
리뷰 테이블 유저 수 : 
974982
비즈니스 테이블 비즈니스 수 : 
62590
리뷰 테이블 비즈니스 수 : 
53454


In [None]:
# Find the user_ids that exist in the user table but never appear in the
# review table.
def find_missing_user_ids(review_df, user_df):
    """Return the set of user_ids present in `user_df` but absent from `review_df`.

    Args:
        review_df: Frame with a 'user_id' column (one row per review).
        user_df: Frame with a 'user_id' column (one row per user).

    Returns:
        A set holding every 'user_id' value of `user_df` that does not occur
        in `review_df['user_id']`; empty when every user has a review.
    """
    known_users = set(user_df['user_id'])
    reviewers = set(review_df['user_id'])
    return known_users.difference(reviewers)

# Call the function
missing_user_ids = find_missing_user_ids(review_df, user_df)

# Print the result
print("User IDs in user_df but not in review_df:")
print(missing_user_ids)

In [None]:
# Find the business_ids that exist in the business table but never appear in
# the review table.
def find_missing_biz_ids(review_df, biz_df):
    """Return the set of business_ids present in `biz_df` but absent from `review_df`.

    Args:
        review_df: Frame with a 'business_id' column (one row per review).
        biz_df: Frame with a 'business_id' column (one row per business).

    Returns:
        A set holding every 'business_id' value of `biz_df` that does not
        occur in `review_df['business_id']`; empty when every business has a
        review.
    """
    known_businesses = set(biz_df['business_id'])
    reviewed_businesses = set(review_df['business_id'])
    return known_businesses.difference(reviewed_businesses)

# Call the function
missing_biz_ids = find_missing_biz_ids(review_df, biz_df)

# Print the result
print("Business IDs in biz_df but not in review_df:")
print(missing_biz_ids)

Business IDs in biz_df but not in review_df:
set()


## 각 df의 id값 인코딩

In [None]:
## 각 df의 id값 인코딩 함수

from sklearn.preprocessing import LabelEncoder

def label_encode_column(df, column, encoder=None):
    """Replace `df[column]` in place with integer label codes.

    Args:
        df: Frame whose column is overwritten with the encoded values.
        column: Name of the column to encode.
        encoder: Optional already-fitted encoder exposing `transform`. When
            omitted, a new LabelEncoder is fitted on `df[column]`.

    Returns:
        The encoder that produced the codes: the one passed in, or the newly
        fitted LabelEncoder. Reuse it to encode the same IDs in another frame.
    """
    # Reuse path: apply the caller's encoder without refitting it.
    if encoder is not None:
        df[column] = encoder.transform(df[column])
        return encoder

    # Fresh path: learn the label -> code mapping from this column.
    fitted = LabelEncoder()
    df[column] = fitted.fit_transform(df[column])
    return fitted

In [None]:
## Encode the ID columns of all three tables, sharing one encoder per ID type
## so that the same ID gets the same code in every table.

def _fit_shared_id_encoder(column, *frames):
    """Fit a LabelEncoder on every distinct `column` value found across `frames`."""
    all_ids = pd.concat([frame[column] for frame in frames], ignore_index=True)
    return LabelEncoder().fit(all_ids.unique())


def encode_review_data(review_df, user_df, biz_df):
    """Label-encode user_id, business_id and review_id consistently across tables.

    The user and business encoders are fitted on the union of the IDs found
    in the review table and in the matching entity table. Fitting on
    `review_df` alone makes `transform` raise ValueError for any user or
    business that has no review. When the entity table holds no such extra
    IDs, the resulting codes are the same as fitting on `review_df` alone.

    The three frames are modified in place (their ID columns are overwritten)
    and the same objects are returned.

    Args:
        review_df: Frame with 'user_id', 'business_id' and 'review_id' columns.
        user_df: Frame with a 'user_id' column.
        biz_df: Frame with a 'business_id' column.

    Returns:
        Tuple `(review_df, user_df, biz_df)` with the ID columns replaced by
        integer codes.
    """
    # One encoder per ID type, covering every ID either table can contain.
    user_encoder = _fit_shared_id_encoder('user_id', review_df, user_df)
    biz_encoder = _fit_shared_id_encoder('business_id', review_df, biz_df)

    # Apply the shared encoders to both sides so the codes line up for joins.
    label_encode_column(review_df, 'user_id', user_encoder)
    label_encode_column(user_df, 'user_id', user_encoder)

    label_encode_column(review_df, 'business_id', biz_encoder)
    label_encode_column(biz_df, 'business_id', biz_encoder)

    # review_id only lives in the review table, so it gets its own encoder.
    label_encode_column(review_df, 'review_id')

    return review_df, user_df, biz_df

In [None]:
# Call the function. The ID columns are overwritten in place, so the returned
# frames are the same objects as review_df, user_df and biz_df.
encoded_review_df, encoded_user_df, encoded_biz_df = encode_review_data(review_df, user_df, biz_df)

## 결과 csv 저장

In [None]:
#encoded_review_df.to_csv('/content/drive/MyDrive/ASAC 5기/woowahan/dataset/encoded_review_df.csv', index=False)

In [None]:
#encoded_user_df.to_csv('/content/drive/MyDrive/ASAC 5기/woowahan/dataset/encoded_user_df.csv', index=False)

In [None]:
#encoded_biz_df.to_csv('/content/drive/MyDrive/ASAC 5기/woowahan/dataset/encoded_biz_df.csv', index = False)