### Spark SQL JOIN 연습

https://hongong.hanbit.co.kr/sql-%EA%B8%B0%EB%B3%B8-%EB%AC%B8%EB%B2%95-joininner-outer-cross-self-join/

https://monawa.tistory.com/103

In [None]:
# Create (or reuse) the Spark session used by every cell below.
from pyspark.sql import SparkSession

spark = (
    SparkSession.builder
    .appName("Spark study - 231221")
    .getOrCreate()
)

In [2]:
# Load the five MovieLens CSV files into DataFrames.
# Every file sits in the same directory, so the location is defined once.
DATA_DIR = "file:///home/kjh/data/ml-latest"

# header / inferSchema are the same options as before. quote and escape are
# both set to '"' so a doubled quote ("") inside a quoted field is read as a
# literal quote; Spark's default escape character is a backslash.
# NOTE(review): DESCRIBE reports the tags timestamp (t_time) as string while
# the ratings timestamp (r_time) is int, which suggests some tags.csv rows
# were being mis-split -- confirm the inferred schema after this change.
CSV_OPTIONS = {"header": True, "inferSchema": True, "quote": '"', "escape": '"'}

df_genome_scores = spark.read.csv(f"{DATA_DIR}/genome-scores.csv", **CSV_OPTIONS)
df_genome_tags = spark.read.csv(f"{DATA_DIR}/genome-tags.csv", **CSV_OPTIONS)
df_movies = spark.read.csv(f"{DATA_DIR}/movies.csv", **CSV_OPTIONS)
df_ratings = spark.read.csv(f"{DATA_DIR}/ratings.csv", **CSV_OPTIONS)
df_tags = spark.read.csv(f"{DATA_DIR}/tags.csv", **CSV_OPTIONS)

                                                                                

In [3]:
# Preview the first four rows of each genome DataFrame.
for frame in (df_genome_scores, df_genome_tags):
    frame.show(4)

+-------+-----+--------------------+
|movieId|tagId|           relevance|
+-------+-----+--------------------+
|      1|    1| 0.03199999999999997|
|      1|    2|0.022249999999999992|
|      1|    3|                0.07|
|      1|    4|               0.059|
+-------+-----+--------------------+
only showing top 4 rows

+-----+------------+
|tagId|         tag|
+-----+------------+
|    1|         007|
|    2|007 (series)|
|    3|18th century|
|    4|       1920s|
+-----+------------+
only showing top 4 rows



In [4]:
# Print the row count of each genome DataFrame, one per line.
for frame in (df_genome_scores, df_genome_tags):
    print(frame.count())

18472128
1128


In [5]:
# Preview the first four rows of the movies, ratings and tags DataFrames.
for frame in (df_movies, df_ratings, df_tags):
    frame.show(4)

+-------+--------------------+--------------------+
|movieId|               title|              genres|
+-------+--------------------+--------------------+
|      1|    Toy Story (1995)|Adventure|Animati...|
|      2|      Jumanji (1995)|Adventure|Childre...|
|      3|Grumpier Old Men ...|      Comedy|Romance|
|      4|Waiting to Exhale...|Comedy|Drama|Romance|
+-------+--------------------+--------------------+
only showing top 4 rows

+------+-------+------+----------+
|userId|movieId|rating| timestamp|
+------+-------+------+----------+
|     1|      1|   4.0|1225734739|
|     1|    110|   4.0|1225865086|
|     1|    158|   4.0|1225733503|
|     1|    260|   4.5|1225735204|
+------+-------+------+----------+
only showing top 4 rows

+------+-------+-------------+----------+
|userId|movieId|          tag| timestamp|
+------+-------+-------------+----------+
|    10|    260| good vs evil|1430666558|
|    10|    260|Harrison Ford|1430666505|
|    10|    260|       sci-fi|1430666538|
| 

In [6]:
# Print the row count of movies, ratings and tags, one per line.
for frame in (df_movies, df_ratings, df_tags):
    print(frame.count())

86537
33832162
2328315


In [7]:
# Register every source DataFrame as a temporary view so the cells below can
# refer to it by name inside spark.sql() queries.
base_views = {
    'g_scores': df_genome_scores,
    'g_tags': df_genome_tags,
    'movies': df_movies,
    'ratings': df_ratings,
    'tags': df_tags,
}
for view_name, frame in base_views.items():
    frame.createOrReplaceTempView(view_name)

In [8]:
# Combine the genome tables: attach the tag text (g_tags) and the movie
# title/genres (movies) to every genome score row.
# The query carries no trailing ';' -- spark.sql() takes a single statement
# and the other queries in this notebook are written without one.

df_genome = spark.sql("""
                      SELECT s.movieId, m.title, m.genres, s.tagId, t.tag, s.relevance
                      FROM g_scores s
                      INNER JOIN g_tags t
                      ON s.tagId = t.tagId
                      INNER JOIN movies m
                      ON s.movieId = m.movieId
                      """)

In [9]:
df_genome.show(4)

+-------+----------------+--------------------+-----+------------+--------------------+
|movieId|           title|              genres|tagId|         tag|           relevance|
+-------+----------------+--------------------+-----+------------+--------------------+
|      1|Toy Story (1995)|Adventure|Animati...|    1|         007| 0.03199999999999997|
|      1|Toy Story (1995)|Adventure|Animati...|    2|007 (series)|0.022249999999999992|
|      1|Toy Story (1995)|Adventure|Animati...|    3|18th century|                0.07|
|      1|Toy Story (1995)|Adventure|Animati...|    4|       1920s|               0.059|
+-------+----------------+--------------------+-----+------------+--------------------+
only showing top 4 rows



In [10]:
# Combine the user tables: every rating row, plus the tag(s) the same user
# gave the same movie (LEFT JOIN, so ratings without a tag are kept with NULL
# tag/t_time), plus the movie title and genres. Rows are ordered by user and
# movie.
user_query = """
                    SELECT r.userId, r.movieId, m.title, m.genres, r.rating, r.timestamp AS r_time, t.tag, t.timestamp AS t_time
                    FROM ratings r
                    LEFT JOIN tags t
                    ON r.userId = t.userId 
                    AND r.movieId = t.movieId
                    INNER JOIN movies m
                    ON r.movieId = m.movieId
                    ORDER BY r.userId, r.movieId
                    """

df_user = spark.sql(user_query)

In [11]:
df_user.show(4)

[Stage 38:>                                                       (0 + 12) / 13]

+------+-------+--------------------+--------------------+------+----------+----+------+
|userId|movieId|               title|              genres|rating|    r_time| tag|t_time|
+------+-------+--------------------+--------------------+------+----------+----+------+
|     1|      1|    Toy Story (1995)|Adventure|Animati...|   4.0|1225734739|NULL|  NULL|
|     1|    110|   Braveheart (1995)|    Action|Drama|War|   4.0|1225865086|NULL|  NULL|
|     1|    158|       Casper (1995)|  Adventure|Children|   4.0|1225733503|NULL|  NULL|
|     1|    260|Star Wars: Episod...|Action|Adventure|...|   4.5|1225735204|NULL|  NULL|
+------+-------+--------------------+--------------------+------+----------+----+------+
only showing top 4 rows



                                                                                

In [12]:
# Expose the two joined DataFrames as temporary views for the SQL cells below.
for view_name, frame in (('genome', df_genome), ('user', df_user)):
    frame.createOrReplaceTempView(view_name)

In [13]:
# Row count of the joined genome DataFrame (DataFrame API).
df_genome.count()

                                                                                

18472128

In [14]:
# Same row count as above, this time computed through the `genome` view in SQL.
genome_count_query = """
          SELECT count(*) cnt
          FROM genome
          """
spark.sql(genome_count_query).show()



+--------+
|     cnt|
+--------+
|18472128|
+--------+



                                                                                

In [15]:
# Row count of the joined user DataFrame (DataFrame API).
df_user.count()

                                                                                

35228102

In [16]:
# Same row count as above, this time computed through the `user` view in SQL.
user_count_query = """
          SELECT count(*) cnt
          FROM user
          """
spark.sql(user_count_query).show()



+--------+
|     cnt|
+--------+
|35228102|
+--------+



                                                                                

In [17]:
# Check NULL values with Spark SQL: count the `user` rows whose tag is NULL,
# the rows whose tag is present, and the total. COUNT ignores the NULL that
# CASE yields when its condition is false, so each CASE counts matches only.
tag_null_query = """
          SELECT
            COUNT(CASE WHEN tag IS NULL THEN 1 END) AS null_count,
            COUNT(CASE WHEN tag IS NOT NULL THEN 1 END) AS not_null_count,
            COUNT(*) AS total_count
          FROM user
          """

spark.sql(tag_null_query).show()



+----------+--------------+-----------+
|null_count|not_null_count|total_count|
+----------+--------------+-----------+
|  33498810|       1729292|   35228102|
+----------+--------------+-----------+



                                                                                

In [18]:
# Show the column names and data types of the two joined views.
# show() prints the table itself and returns None, so it is called directly
# instead of being wrapped in print(), which only added a stray "None" line.
spark.sql("DESCRIBE genome").show()
spark.sql("DESCRIBE user").show()

+---------+---------+-------+
| col_name|data_type|comment|
+---------+---------+-------+
|  movieId|      int|   NULL|
|    title|   string|   NULL|
|   genres|   string|   NULL|
|    tagId|      int|   NULL|
|      tag|   string|   NULL|
|relevance|   double|   NULL|
+---------+---------+-------+

None
+--------+---------+-------+
|col_name|data_type|comment|
+--------+---------+-------+
|  userId|      int|   NULL|
| movieId|      int|   NULL|
|   title|   string|   NULL|
|  genres|   string|   NULL|
|  rating|   double|   NULL|
|  r_time|      int|   NULL|
|     tag|   string|   NULL|
|  t_time|   string|   NULL|
+--------+---------+-------+

None


In [30]:
# Combine the `user` and `genome` views on movieId.
# NOTE(review): the join key is movieId only, so each `user` row is paired
# with every `genome` row of the same movie; the count recorded for this
# result is 39,108,032,976 rows. The sample output of the `merge` view shows
# u_tag equal to g_tag on every row, which hints that a tag match may also
# have been intended -- confirm before relying on this DataFrame.
merge_query = """
                     SELECT u.userId, u.movieId, u.title, u.genres, u.rating, u.r_time, u.tag AS u_tag, u.t_time, g.tag AS g_tag, g.relevance
                     FROM user u
                     INNER JOIN genome g
                     ON u.movieId = g.movieId
                     """

df_merge = spark.sql(merge_query)

In [31]:
# Row count of the merged DataFrame.
df_merge.count()

                                                                                

39108032976

In [28]:
# Register the merged DataFrame as the temporary view `merge` for SQL access.
df_merge.createOrReplaceTempView('merge')

In [29]:
# Display the first rows of the `merge` view ordered by user and movie
# (show() with no argument prints up to 20 rows).
merge_preview_query = """
          SELECT * 
          FROM merge
          ORDER BY userId, movieId
          """
spark.sql(merge_preview_query).show()

[Stage 196:>                                                      (0 + 12) / 13]

+------+-------+--------------------+--------------------+------+----------+---------------+----------+---------------+-------------------+
|userId|movieId|               title|              genres|rating|    r_time|          u_tag|    t_time|          g_tag|          relevance|
+------+-------+--------------------+--------------------+------+----------+---------------+----------+---------------+-------------------+
|    10|    260|Star Wars: Episod...|Action|Adventure|...|   4.5|1430666645|         sci-fi|1430666538|         sci-fi| 0.9524999999999999|
|    14|  58559|Dark Knight, The ...|Action|Crime|Dram...|   5.0|1311530004|     psychology|1311530417|     psychology|             0.6865|
|    14|  58559|Dark Knight, The ...|Action|Crime|Dram...|   5.0|1311530004|   imdb top 250|1311530451|   imdb top 250|             0.9775|
|    14|  58559|Dark Knight, The ...|Action|Crime|Dram...|   5.0|1311530004|      superhero|1311530388|      superhero| 0.9790000000000001|
|    14|  58559|Dark

                                                                                