In [95]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import col, sum as sp_sum, array, concat_ws, regexp_replace, explode, split

In [69]:
# Spark 세션 생성
spark = SparkSession \
    .builder \
    .appName("PySparkTest") \
    .config("spark.some.config.option", "some-value") \
    .getOrCreate()

In [70]:
# meta_sample.csv 경로
csv_path = '/Users/b06/Desktop/yeardream/medi-05/data/meta_sample.csv'


# 데이터 로드
df = spark.read.csv(csv_path, header=True)

In [71]:
# count Nan
nan_cnt = df.select([sp_sum(col(c).isNull().cast("int")).alias(c) for c in df.columns])


# df에 nan_cnt 열 추가
df_nan_cnt = df.withColumn('nan_count', sum(col(c).isNull().cast("int") for c in df.columns))


# nan_cnt 값이 가장 적은 row
min_nan_row = df_nan_cnt.orderBy("nan_count").first()


# min_nan_row 출력
for k, v in min_nan_row.asDict().items():
    print(f"{k}: {v}")

id: 1070175780
name: 답십리경희한의원
review_settings_keyword: (답십리경희한의원) & (동대문구 | 답십리동 | 한의원-일반) & 답십리경희한의원
keywords_1: 답십리한의원
keywords_2: 전농동한의원
keywords_3: 청량리한의원
keywords_4: 답십리야간진료한의원
keywords_5: 답십리교통사고한의원
menu_name: None
menu_price: None
menu_description: None
menu_images: None
visitor_reviews_total: 1107
photo_review_ratio: 0.044263775971093
fsas_reviews_total: 383
images_count: 25
naver_booking_url: https://m.booking.naver.com/booking/13/bizes/536100
talktalk_url: None
road: -래미안 미드카운티 아파트 맞은편 카운티에비뉴 3층-건물 내 무료주차 가능합니다.
virtual_phone: 0507-1482-7946
phone: None
is_smart_phone: True
is_blog_exposed: True
zeropay_available: False
conveniences_1: 단체 이용 가능
conveniences_2: 주차
conveniences_3: 예약
conveniences_4: 무선 인터넷
conveniences_5: 남/녀 화장실 구분
conveniences_6: 장애인 편의시설
conveniences_7: None
conveniences_8: None
conveniences_9: None
conveniences_10: None
conveniences_11: None
homepages_url: https://blog.naver.com/dapsimni_kmc
homepages_landingUrl: https://blog.naver.com/dapsimni_kmc
homepage

In [72]:
lst = []
a = 'conveniences'
for n in range(1, 12):
    temp = f'{a}_{n}'
    lst.append(temp)
print(lst)

string = ""
for l in lst:
    string += f"'{l}', "
print(string)

['conveniences_1', 'conveniences_2', 'conveniences_3', 'conveniences_4', 'conveniences_5', 'conveniences_6', 'conveniences_7', 'conveniences_8', 'conveniences_9', 'conveniences_10', 'conveniences_11']
'conveniences_1', 'conveniences_2', 'conveniences_3', 'conveniences_4', 'conveniences_5', 'conveniences_6', 'conveniences_7', 'conveniences_8', 'conveniences_9', 'conveniences_10', 'conveniences_11', 


In [73]:
# Select specific columns
selected_columns = [
    'id', 'name', 'keywords_1', 'keywords_2', 'keywords_3', 'keywords_4', 'keywords_5', 
    'review_settings_keyword', 'visitor_reviews_total', 'description',
    'photo_review_ratio', 'fsas_reviews_total', 'images_count', 
    'talktalk_url', 'homepages_type', 'homepages_isDeadUrl', 
    'homepages_isRep', 'description_length', 'self_blog_present', 
    'qna_answer_count', 'crawled_at',
    'conveniences_1', 'conveniences_2', 'conveniences_3', 'conveniences_4', 'conveniences_5', 
    'conveniences_6', 'conveniences_7', 'conveniences_8', 'conveniences_9', 'conveniences_10', 
    'conveniences_11',     
]


# Create a new DataFrame with only the selected columns
df_selected = df.select(selected_columns)

In [98]:
# Table 1: id, review_settings_keyword
table1 = df_selected.select("id", "review_settings_keyword")
table1 = table1.withColumn("review_settings_keyword", regexp_replace("review_settings_keyword", " & ", ", "))
table1 = table1.withColumn("review_settings_keyword", regexp_replace("review_settings_keyword", "[()]", ""))
table1 = table1.withColumn("review_settings_keyword", explode(split(col("review_settings_keyword"), ", ")))

In [100]:
# Table 2: id, keywords
keywords_expr = array([col('keywords_{}'.format(i)) for i in range(1, 6)]).alias('keywords')
table2 = df_selected.select("id", keywords_expr)
table2 = table2.withColumn("keywords", explode(split(concat_ws(", ", "keywords"), ", ")))

In [101]:
# Table 3: id, conveniences
conveniences_expr = array([col('conveniences_{}'.format(i)) for i in range(1, 12)]).alias('conveniences')
table3 = df_selected.select("id", conveniences_expr)
table3 = table3.withColumn("conveniences", explode(split(concat_ws(", ", "conveniences"), ", ")))

In [102]:
# Table 4: id와 나머지 모든 컬럼
exclude_columns = ['review_settings_keyword', 'keywords_1', 'keywords_2', 'keywords_3', 'keywords_4', 'keywords_5',
                   'conveniences_1', 'conveniences_2', 'conveniences_3', 'conveniences_4', 'conveniences_5',
                   'conveniences_6', 'conveniences_7', 'conveniences_8', 'conveniences_9', 'conveniences_10', 'conveniences_11']
table4_columns = [col for col in df_selected.columns if col not in exclude_columns]
table4 = df_selected.select(*table4_columns)

In [103]:
table1.show()

+----------+--------------------------------+
|        id|         review_settings_keyword|
+----------+--------------------------------+
|  19518309|                      송정한의원|
|  19518309|   성동구 | 송정동 | 한의원-일반|
|  19518309|                      송정한의원|
|  20709457|               용산구 소망한의원|
|  19523171|                금호호랑이한의원|
|  19523171|성동구 | 금호동3가 | 한의원-일반|
|  19523171|                금호호랑이한의원|
|  18757770|                      약촌한의원|
|  18757770|   서초구 | 잠원동 | 한의원-일반|
|  18757770|                      약촌한의원|
|1079161835|                      장수한의원|
|1079161835|   종로구 | 창신동 | 한의원-일반|
|1079161835|                      장수한의원|
|1703239053|                  성동예본한의원|
|1703239053|   성동구 | 행당동 | 한의원-일반|
|1703239053|                  성동예본한의원|
|  19529164|                      태평한의원|
|  19529164|     중구 | 신당동 | 한의원-일반|
|  19529164|                      태평한의원|
| 230007709|                    알파스한의원|
+----------+--------------------------------+
only showing top 20 rows



In [104]:
table2.show()

+----------+--------------+
|        id|      keywords|
+----------+--------------+
|  19518309|              |
|  20709457|              |
|  19523171|  금호동한의원|
|  19523171|  금호역한의원|
|  19523171|금남시장한의원|
|  19523171|      도침치료|
|  18757770|              |
|1079161835|  창신동한의원|
|1079161835|  종로구한의원|
|1079161835|      추나교정|
|1079161835|      교통사고|
|1079161835|    동묘한의원|
|1703239053|   8체질한의원|
|1703239053|        한의원|
|1703239053|         8체질|
|1703239053|   왕십리8체질|
|1703239053|  왕십리한의원|
|  19529164|              |
| 230007709|          치매|
|  19518394|              |
+----------+--------------+
only showing top 20 rows



In [105]:
table3.show()

+----------+-----------------+
|        id|     conveniences|
+----------+-----------------+
|  19518309|                 |
|  20709457|                 |
|  19523171|             주차|
|  19523171|      무선 인터넷|
|  18757770|                 |
|1079161835|                 |
|1703239053|             주차|
|1703239053|남/녀 화장실 구분|
|  19529164|                 |
| 230007709|                 |
|  19518394|                 |
|  19529128|                 |
|  36523267|             주차|
|  36523267|             예약|
|  36523267|남/녀 화장실 구분|
|  36523267|  장애인 편의시설|
|  13100633|                 |
|  19518670|                 |
|1524076456|             주차|
|1524076456|             예약|
+----------+-----------------+
only showing top 20 rows



In [107]:
table4.show()

+----------+-----------------+---------------------+------------------------------------+------------------+------------------+------------+--------------------+--------------+-------------------+---------------+------------------+-----------------+----------------+--------------------+
|        id|             name|visitor_reviews_total|                         description|photo_review_ratio|fsas_reviews_total|images_count|        talktalk_url|homepages_type|homepages_isDeadUrl|homepages_isRep|description_length|self_blog_present|qna_answer_count|          crawled_at|
+----------+-----------------+---------------------+------------------------------------+------------------+------------------+------------+--------------------+--------------+-------------------+---------------+------------------+-----------------+----------------+--------------------+
|  19518309|       송정한의원|                   22|                                NULL| 0.409090909090909|                 0|           2| 

In [113]:
data_path = '/Users/b06/Desktop/yeardream/medi-05/data/pyspark-test'
table1_path = f"{data_path}/review_settings_keyword"
table2_path = f"{data_path}/keywords"
table3_path = f"{data_path}/conveniences"
table4_path = f"{data_path}/data"

In [114]:
def save(table, table_path):
    table\
        .write \
        .mode('overwrite') \
        .option("header", "true") \
        .option("charset", "cp949") \
        .csv(table_path)

In [115]:
save(table1, table1_path)
save(table2, table2_path)
save(table3, table3_path)
save(table4, table4_path)