# Brickstudy Homework2 -chanyoung

## custom ETL, and process

In [4]:
from pyspark.sql import SparkSession

# Create spark session   # spark session instance 구성 및 생성 # set spark application name, we can identify by name
spark = SparkSession.builder \
    .appName("OnboardingHomework2_C") \
    .getOrCreate()


# Read csv file
article = spark.read.csv("./data/article_info.csv", header=True, inferSchema=True)
view_log = spark.read.csv("./data/view_log.csv", header=True, inferSchema=True)


In [5]:
article.show()
view_log.show()

+------------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+
|   articleID|               Title|             Content|              Format|            Language|              userID|         userCountry|          userRegion|
+------------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+
|ARTICLE_0000|19 Tips For Every...|"I've been using ...| I once had a man...| edit it a little...| git has a --pret...|              author|            and date|
|ARTICLE_0001|Intel buys comput...|"Intel has acquir...|"" which allows c...| another Internet...| Yogitech works o...| which makes chip...|            software|
|ARTICLE_0002|Practical End-to-...|"One of the reaso...| then the form it...| we'll want to ta...| we can use the b...| we'll see that t...| but we can see t...|
|ARTICLE_0003|Corporate vent

In [43]:
# article_info 의 user_ID를 article.user_id로 변경 
article_selected = article.select(
    col("articleID"),
    col("Title"),
    col("Content"),
    col("userID").alias("article_userID")
)


In [34]:
# Join 데이터프레임
joined_df = view_log.join(article_selected, on="articleID", how="inner")

# 결과 출력
joined_df.show()
print(joined_df.count())

+------------+---------+----------+-----------+--------------------+--------------------+--------------------+--------------------+--------------------+
|   articleID|   userID|userRegion|userCountry|               Title|             Content|              Format|            Language|      article_userID|
+------------+---------+----------+-----------+--------------------+--------------------+--------------------+--------------------+--------------------+
|ARTICLE_0000|USER_1243|        SP|         BR|19 Tips For Every...|"I've been using ...| I once had a man...| edit it a little...| git has a --pret...|
|ARTICLE_0000|USER_1144|        MG|         BR|19 Tips For Every...|"I've been using ...| I once had a man...| edit it a little...| git has a --pret...|
|ARTICLE_0000|USER_1019|        SP|         BR|19 Tips For Every...|"I've been using ...| I once had a man...| edit it a little...| git has a --pret...|
|ARTICLE_0000|USER_1005|        SP|         BR|19 Tips For Every...|"I've been usi

In [35]:
from pyspark.sql.functions import col

# 결측치 및 이상치 제거
# 정규 표현식 패턴
user_pattern = r'^USER_\d{4}$'
article_pattern = r'^ARTICLE_\d{4}$'

# "USER_0683" 형태의 userID와 articleID 필터링
join_df_clean = joined_df.filter(
    (col("userID").rlike(user_pattern)) & (col("articleID").rlike(article_pattern))
)
join_df_clean = join_df_clean.filter(col("userRegion") != "Unknown")

join_df_clean = join_df_clean.dropna()

join_df_clean.show()

+------------+---------+----------+-----------+--------------------+--------------------+--------------------+--------------------+--------------------+
|   articleID|   userID|userRegion|userCountry|               Title|             Content|              Format|            Language|      article_userID|
+------------+---------+----------+-----------+--------------------+--------------------+--------------------+--------------------+--------------------+
|ARTICLE_0000|USER_1243|        SP|         BR|19 Tips For Every...|"I've been using ...| I once had a man...| edit it a little...| git has a --pret...|
|ARTICLE_0000|USER_1144|        MG|         BR|19 Tips For Every...|"I've been using ...| I once had a man...| edit it a little...| git has a --pret...|
|ARTICLE_0000|USER_1019|        SP|         BR|19 Tips For Every...|"I've been using ...| I once had a man...| edit it a little...| git has a --pret...|
|ARTICLE_0000|USER_1005|        SP|         BR|19 Tips For Every...|"I've been usi

In [36]:
from pyspark.ml.feature import StringIndexer
from pyspark.ml import Pipeline


# StringIndexer 설정
indexers = [
    StringIndexer(inputCol="userRegion", outputCol="userRegionIndex"),
    StringIndexer(inputCol="userCountry", outputCol="userCountryIndex")
]

# Pipeline 설정
pipeline = Pipeline(stages=indexers)

# Pipeline을 사용하여 데이터 변환
model = pipeline.fit(join_df_clean)
encoded_df = model.transform(join_df_clean)

encoded_df.select("userID", "articleID", "userRegionIndex", "userCountryIndex").show()

+---------+------------+---------------+----------------+
|   userID|   articleID|userRegionIndex|userCountryIndex|
+---------+------------+---------------+----------------+
|USER_1243|ARTICLE_0000|            0.0|             0.0|
|USER_1144|ARTICLE_0000|            1.0|             0.0|
|USER_1019|ARTICLE_0000|            0.0|             0.0|
|USER_1005|ARTICLE_0000|            0.0|             0.0|
|USER_0913|ARTICLE_0000|            0.0|             0.0|
|USER_0750|ARTICLE_0000|            0.0|             0.0|
|USER_0695|ARTICLE_0000|            1.0|             0.0|
|USER_0683|ARTICLE_0000|            1.0|             0.0|
|USER_0683|ARTICLE_0000|            1.0|             0.0|
|USER_0625|ARTICLE_0000|            0.0|             0.0|
|USER_0595|ARTICLE_0000|            0.0|             0.0|
|USER_0566|ARTICLE_0000|            1.0|             0.0|
|USER_0566|ARTICLE_0000|            1.0|             0.0|
|USER_0396|ARTICLE_0000|            1.0|             0.0|
|USER_0323|ART

In [37]:
from pyspark.ml.feature import Tokenizer, StopWordsRemover

# Title과 Content 컬럼 토크나이즈 및 불용어 제거
tokenizer = Tokenizer(inputCol="Title", outputCol="TitleTokens")
content_tokenizer = Tokenizer(inputCol="Content", outputCol="ContentTokens")
remover = StopWordsRemover(inputCol="TitleTokens", outputCol="FilteredTitleTokens")
content_remover = StopWordsRemover(inputCol="ContentTokens", outputCol="FilteredContentTokens")

pipeline = Pipeline(stages=[tokenizer, content_tokenizer, remover, content_remover])
model = pipeline.fit(encoded_df)
tokenized_df = model.transform(encoded_df)

# 결과 확인
tokenized_df.select("userID", "articleID", "FilteredTitleTokens", "FilteredContentTokens").show()


+---------+------------+--------------------+---------------------+
|   userID|   articleID| FilteredTitleTokens|FilteredContentTokens|
+---------+------------+--------------------+---------------------+
|USER_1243|ARTICLE_0000|[19, tips, everyd...| ["i've, using, gi...|
|USER_1144|ARTICLE_0000|[19, tips, everyd...| ["i've, using, gi...|
|USER_1019|ARTICLE_0000|[19, tips, everyd...| ["i've, using, gi...|
|USER_1005|ARTICLE_0000|[19, tips, everyd...| ["i've, using, gi...|
|USER_0913|ARTICLE_0000|[19, tips, everyd...| ["i've, using, gi...|
|USER_0750|ARTICLE_0000|[19, tips, everyd...| ["i've, using, gi...|
|USER_0695|ARTICLE_0000|[19, tips, everyd...| ["i've, using, gi...|
|USER_0683|ARTICLE_0000|[19, tips, everyd...| ["i've, using, gi...|
|USER_0683|ARTICLE_0000|[19, tips, everyd...| ["i've, using, gi...|
|USER_0625|ARTICLE_0000|[19, tips, everyd...| ["i've, using, gi...|
|USER_0595|ARTICLE_0000|[19, tips, everyd...| ["i've, using, gi...|
|USER_0566|ARTICLE_0000|[19, tips, everyd...| ["

In [40]:
# 데이터 선택 및 저장
result_df = tokenized_df.select(
    col("articleID"),
    col("userID"),
    col("userRegionIndex"),
    col("userCountryIndex"),
    col("FilteredTitleTokens"),
    col("FilteredContentTokens"),

)

In [41]:
output_path = "./result"
result_df.write.parquet(output_path, mode="overwrite")


In [42]:
saved_df = spark.read.parquet(output_path)
saved_df.show()

+------------+---------+---------------+----------------+--------------------+---------------------+
|   articleID|   userID|userRegionIndex|userCountryIndex| FilteredTitleTokens|FilteredContentTokens|
+------------+---------+---------------+----------------+--------------------+---------------------+
|ARTICLE_0753|USER_1357|            1.0|             0.0|[como, usar, o, g...| ["lição, 1:, nave...|
|ARTICLE_0753|USER_1357|            1.0|             0.0|[como, usar, o, g...| ["lição, 1:, nave...|
|ARTICLE_0753|USER_1353|            1.0|             0.0|[como, usar, o, g...| ["lição, 1:, nave...|
|ARTICLE_0753|USER_1317|            0.0|             0.0|[como, usar, o, g...| ["lição, 1:, nave...|
|ARTICLE_0753|USER_1235|            0.0|             0.0|[como, usar, o, g...| ["lição, 1:, nave...|
|ARTICLE_0753|USER_1213|            0.0|             0.0|[como, usar, o, g...| ["lição, 1:, nave...|
|ARTICLE_0753|USER_1210|            0.0|             0.0|[como, usar, o, g...| ["lição, 1:,