In [1]:
# Load the pre-processed US trending-videos CSV, using the first line as the header.
# The path is a raw string so the backslash before the space stays a literal backslash,
# exactly as before, without relying on Python's deprecated handling of the
# unrecognised escape sequence "\ ".
# NOTE(review): absolute, user-specific path; consider a configurable data directory.
df = spark.read.csv(r'/Users/zhujinghong/Downloads/5003\ project/pre_data/US_pre.csv',header=True)

In [2]:
# Ask Spark to cache df; it is queried repeatedly by the cells below.
df.cache()

DataFrame[category_id: string, trending_date: string, title: string, channel_title: string, publish_time: string, tags: string, views: string, likes: string, dislikes: string, comment_count: string, category_title: string]

In [3]:
# Inspect the column names and types of the freshly loaded frame.
df.printSchema()

root
 |-- category_id: string (nullable = true)
 |-- trending_date: string (nullable = true)
 |-- title: string (nullable = true)
 |-- channel_title: string (nullable = true)
 |-- publish_time: string (nullable = true)
 |-- tags: string (nullable = true)
 |-- views: string (nullable = true)
 |-- likes: string (nullable = true)
 |-- dislikes: string (nullable = true)
 |-- comment_count: string (nullable = true)
 |-- category_title: string (nullable = true)



In [4]:
from pyspark.sql.functions import col , column
# Columns that are read from the CSV as strings but hold integer values.
cols=['category_id','views','likes','dislikes','comment_count']
# Cast each of them to int in place (same column name, new type).
# The loop variable is deliberately NOT called `col`, so the `col` function
# imported above is not overwritten by a plain string.
for name in cols:
    df = df.withColumn(name, df[name].cast('int'))

In [5]:
from pyspark import SparkContext, SparkConf
from pyspark.sql import SparkSession
import pyspark.sql.functions as f
import os

In [6]:
# A DataFrame must first be registered as a table or view before it can be
# processed with SQL; register df under the view name "df".
df.createOrReplaceTempView("df") 

### preprocessing of category_title

In [7]:
# category_title may contain nulls; the original author's note gives the reason as
# "category_id=29 does not exist" (presumably in the id -> title lookup used
# upstream; confirm).
distinct_titles = df.select('category_title').distinct()
# List every distinct category title.
distinct_titles.show()
# Print the number of distinct titles explicitly: a bare expression in the middle
# of a cell is evaluated but never displayed.
print(distinct_titles.count())
# Show any rows whose category_title is null, together with their category_id.
df.select('category_title','category_id').filter(f.isnull('category_title')).show()

+--------------------+
|      category_title|
+--------------------+
|               Shows|
|           Education|
|              Gaming|
|       Entertainment|
|     Travel & Events|
|Science & Technology|
|              Sports|
|       Howto & Style|
|Nonprofits & Acti...|
|    Film & Animation|
|      People & Blogs|
|     News & Politics|
|      Pets & Animals|
|    Autos & Vehicles|
|               Music|
|              Comedy|
+--------------------+

+--------------+-----------+
|category_title|category_id|
+--------------+-----------+
+--------------+-----------+



In [8]:
# Total number of rows before filtering on category_title.
df.count()

40898

In [9]:
# Number of rows whose category_title is not the empty string, for comparison
# with the total row count.
# NOTE(review): in Spark SQL a comparison against NULL is not true, so rows with
# a null category_title are excluded by this predicate as well.
df.filter(df["category_title"]!='').count()

40898

In [10]:
# Keep only rows with a non-empty category_title. This rebinds `df`, so the
# unfiltered frame is no longer reachable under that name.
df=df.filter(df["category_title"]!='')

In [11]:
# Re-register the view so that SQL queries against "df" use the filtered frame
# rather than the one registered earlier.
df.createOrReplaceTempView("df") 

# Analysis

### channel_title

In [12]:
# The most popular channel_title: count rows per channel and sort by that count,
# largest first.
channel = spark.sql("SELECT channel_title, COUNT(*) as freq FROM df GROUP BY channel_title ORDER BY freq DESC")
# Show the first 10 rows without truncating long channel names.
channel.show(10, False)

+--------------------------------------+----+
|channel_title                         |freq|
+--------------------------------------+----+
|ESPN                                  |202 |
|The Tonight Show Starring Jimmy Fallon|197 |
|Netflix                               |193 |
|TheEllenShow                          |192 |
|Vox                                   |192 |
|The Late Show with Stephen Colbert    |187 |
|Jimmy Kimmel Live                     |185 |
|Late Night with Seth Meyers           |183 |
|Screen Junkies                        |182 |
|NBA                                   |181 |
+--------------------------------------+----+
only showing top 10 rows



In [13]:
# Top popular channel_title every day: count rows per (trending_date, channel_title).
# NOTE(review): the result is sorted by count across ALL dates; it is not a
# per-day ranking, so the top rows may all come from a handful of dates.
cha_day = spark.sql("SELECT trending_date, channel_title, COUNT(*) as freq FROM df GROUP BY trending_date, channel_title ORDER BY freq DESC")
cha_day.show() 

+-------------+--------------------+----+
|trending_date|       channel_title|freq|
+-------------+--------------------+----+
|     18.15.05|Kurzgesagt – In a...|   2|
|     18.15.05|       New Amsterdam|   2|
|     18.22.03|       Kensuke Koike|   1|
|     18.18.03|            The View|   1|
|     18.31.03|     Great Big Story|   1|
|     18.08.06|   Jimmy Kimmel Live|   1|
|     17.22.11|        RealLifeLore|   1|
|     18.15.05|    Marques Brownlee|   1|
|     18.18.02|          Bill Gates|   1|
|     18.24.05|        TheEllenShow|   1|
|     18.16.01|                 NFL|   1|
|     18.30.03|         Matt Stonie|   1|
|     18.22.05|    Lucas and Marcus|   1|
|     18.29.04|                 FBE|   1|
|     18.13.06|        Dude Perfect|   1|
|     17.25.12|            wdwmagic|   1|
|     18.18.02| Half as Interesting|   1|
|     18.28.02|        JamesBayVEVO|   1|
|     18.04.06|       BuzzFeedVideo|   1|
|     18.05.02| Philadelphia Eagles|   1|
+-------------+-------------------

In [14]:
# Average comment_count by channel_title, sorted from the highest average down.
avg_comment = spark.sql(
    "SELECT channel_title, AVG(comment_count)as avg_comment FROM df GROUP BY channel_title ORDER BY avg_comment DESC")
avg_comment.show()  

+-------------------+------------------+
|      channel_title|       avg_comment|
+-------------------+------------------+
|   Logan Paul Vlogs|         619598.75|
|ChildishGambinoVEVO|         406051.56|
|            ibighit|          397718.3|
|  YouTube Spotlight| 360841.8888888889|
|       David Dobrik|297095.44444444444|
|     The ACE Family|190508.55555555556|
|TheAngryGrandpaShow|        164311.375|
|        Collins Key|         143776.25|
|   jypentertainment|      118367.34375|
|       Call of Duty|103034.87804878049|
|          Manny Mua|          101972.5|
|   ArianaGrandeVevo| 99891.46511627907|
|        Dwarf Mamba|           96720.5|
|      LuisFonsiVEVO|           95342.7|
|        jeffreestar| 79203.52631578948|
|       Desimpedidos|           78330.0|
|  Алексей Навальный|           77462.0|
|CJENMMUSIC Official| 76064.36363636363|
|        TheOdd1sOut| 73067.71428571429|
|         EminemVEVO|           71878.0|
+-------------------+------------------+
only showing top

### category_title

In [15]:
# The number of distinct category_title values in the data.
distinct_ct = spark.sql("SELECT COUNT(DISTINCT(category_title)) FROM df")
distinct_ct.show()

+------------------------------+
|count(DISTINCT category_title)|
+------------------------------+
|                            16|
+------------------------------+



In [16]:
# 1-1 Top 10 category_title: count rows per category, largest count first.
top_cat = spark.sql("SELECT category_title, COUNT(*) as freq FROM df GROUP BY category_title ORDER BY freq DESC")
# Only the first 10 rows of the ranking are displayed.
top_cat.show(10)   

+--------------------+----+
|      category_title|freq|
+--------------------+----+
|       Entertainment|9944|
|               Music|6467|
|       Howto & Style|4142|
|              Comedy|3453|
|      People & Blogs|3208|
|     News & Politics|2485|
|Science & Technology|2397|
|    Film & Animation|2342|
|              Sports|2170|
|           Education|1655|
+--------------------+----+
only showing top 10 rows



In [17]:
# 1-2 Top popular category_title every day: count rows per (trending_date, category_title).
# NOTE(review): rows are sorted by count across all dates, not ranked within each day.
top_cat_day = spark.sql("SELECT trending_date, category_title, COUNT(*) as freq FROM df GROUP BY trending_date, category_title ORDER BY freq DESC")
top_cat_day.show()  

+-------------+--------------+----+
|trending_date|category_title|freq|
+-------------+--------------+----+
|     18.14.05| Entertainment|  61|
|     18.27.03| Entertainment|  60|
|     18.26.03| Entertainment|  60|
|     18.24.05| Entertainment|  59|
|     18.28.03| Entertainment|  59|
|     17.18.12| Entertainment|  59|
|     18.25.03| Entertainment|  59|
|     18.22.05| Entertainment|  59|
|     18.25.05| Entertainment|  58|
|     18.08.05| Entertainment|  58|
|     18.04.04| Entertainment|  58|
|     18.11.05| Entertainment|  58|
|     18.05.04| Entertainment|  57|
|     17.15.12| Entertainment|  57|
|     18.09.05| Entertainment|  57|
|     18.16.05| Entertainment|  57|
|     18.29.03| Entertainment|  57|
|     18.17.05| Entertainment|  57|
|     18.06.05| Entertainment|  57|
|     18.07.05| Entertainment|  57|
+-------------+--------------+----+
only showing top 20 rows



In [18]:
# 2-1 Average views count by category_title, highest average first.
avg_views = spark.sql(
    "SELECT category_title, AVG(views)as avg_views FROM df GROUP BY category_title ORDER BY avg_views DESC")
avg_views.show()  

+--------------------+------------------+
|      category_title|         avg_views|
+--------------------+------------------+
|               Music|  6204776.02304005|
|    Film & Animation| 3103216.127241674|
|Nonprofits & Acti...|2963884.0701754387|
|              Gaming|2607597.7708333335|
|       Entertainment| 2067689.247988737|
|              Sports|2029128.4059907834|
|      People & Blogs|1530550.0822942643|
|              Comedy|1480239.3831450911|
|Science & Technology|1449087.5064664164|
|    Autos & Vehicles|1355965.4088541667|
|       Howto & Style| 982861.3882182521|
|               Shows| 903527.3333333334|
|     Travel & Events| 855612.4912718205|
|      Pets & Animals| 831143.4663043479|
|           Education|  713097.177039275|
|     News & Politics|  592792.951307847|
+--------------------+------------------+



In [19]:
# 2-2 Average likes count by category_title, highest average first.
avg_likes = spark.sql(
    "SELECT category_title, AVG(likes)as avg_likes FROM df GROUP BY category_title ORDER BY avg_likes DESC")
avg_likes.show()  

+--------------------+------------------+
|      category_title|         avg_likes|
+--------------------+------------------+
|Nonprofits & Acti...|259923.61403508772|
|               Music|219046.34931189113|
|              Gaming| 84288.68137254902|
|    Film & Animation| 70708.76473099914|
|              Comedy|62593.776426295975|
|      People & Blogs| 58089.25654613466|
|       Entertainment|53221.361625100566|
|              Sports| 45444.66036866359|
|       Howto & Style|    39236.09150169|
|Science & Technology| 34315.81518564873|
|           Education|29757.073716012084|
|      Pets & Animals| 21055.11086956522|
|               Shows|18993.666666666668|
|     Travel & Events|12034.127182044887|
|    Autos & Vehicles|11056.395833333334|
|     News & Politics| 7298.667605633803|
+--------------------+------------------+



In [20]:
# 2-3 Average dislikes count by category_title, highest average first.
avg_dislikes = spark.sql(
    "SELECT category_title, AVG(dislikes)as avg_dislikes FROM df GROUP BY category_title ORDER BY avg_dislikes DESC")
avg_dislikes.show()  

+--------------------+------------------+
|      category_title|      avg_dislikes|
+--------------------+------------------+
|Nonprofits & Acti...| 58076.85964912281|
|              Gaming|11227.525735294117|
|               Music| 7912.998917581568|
|       Entertainment| 4314.622184231697|
|      People & Blogs|3172.9625935162094|
|    Film & Animation|2592.2818104184457|
|              Sports|2365.2981566820276|
|              Comedy| 2090.931364031277|
|Science & Technology|1890.6545682102628|
|     News & Politics| 1681.887726358149|
|       Howto & Style|1318.7595364558185|
|     Travel & Events| 848.1047381546134|
|           Education| 816.7021148036254|
|    Autos & Vehicles| 632.8385416666666|
|      Pets & Animals| 573.2380434782609|
|               Shows|429.96491228070175|
+--------------------+------------------+



#### Create df1: trending_date->year, month  

In [21]:
from pyspark.sql.functions import split, explode, concat
# Make sure trending_date is a string before slicing it.
df = df.withColumn('trending_date' ,df['trending_date'].cast('string'))
# year
# trending_date values look like "18.15.05" in the outputs above, so the two-digit
# year is the first two characters. substring() positions are 1-based.
# This replaces splitting the string on an empty pattern and concatenating
# array elements 0 and 1.
df1 = df.withColumn('year', f.substring('trending_date', 1, 2))
df1.printSchema()

root
 |-- category_id: integer (nullable = true)
 |-- trending_date: string (nullable = true)
 |-- title: string (nullable = true)
 |-- channel_title: string (nullable = true)
 |-- publish_time: string (nullable = true)
 |-- tags: string (nullable = true)
 |-- views: integer (nullable = true)
 |-- likes: integer (nullable = true)
 |-- dislikes: integer (nullable = true)
 |-- comment_count: integer (nullable = true)
 |-- category_title: string (nullable = true)
 |-- year: string (nullable = true)



In [22]:
# month
# The two-digit month is characters 7-8 of trending_date (e.g. "05" in "18.15.05");
# substring() positions are 1-based.
# Writing straight into the 'month' column (instead of a temporary 'date' column
# that is renamed afterwards) keeps this cell safe to re-run: withColumn replaces
# an existing 'month' column rather than producing a second one with the same name.
df1 = df1.withColumn('month', f.substring('trending_date', 7, 2))
df1.printSchema()

root
 |-- category_id: integer (nullable = true)
 |-- trending_date: string (nullable = true)
 |-- title: string (nullable = true)
 |-- channel_title: string (nullable = true)
 |-- publish_time: string (nullable = true)
 |-- tags: string (nullable = true)
 |-- views: integer (nullable = true)
 |-- likes: integer (nullable = true)
 |-- dislikes: integer (nullable = true)
 |-- comment_count: integer (nullable = true)
 |-- category_title: string (nullable = true)
 |-- year: string (nullable = true)
 |-- month: string (nullable = true)



In [23]:
# Expose df1 (df plus the year and month columns) to Spark SQL as "df1".
df1.createOrReplaceTempView("df1") 

In [24]:
# Top popular category title in every year: count rows per (year, category_title),
# sorted by count across both years.
top_cat_year = spark.sql("SELECT year, category_title, COUNT(*) as freq FROM df1 GROUP BY year, category_title ORDER BY freq DESC")
top_cat_year.show()  

+----+--------------------+----+
|year|      category_title|freq|
+----+--------------------+----+
|  18|       Entertainment|7605|
|  18|               Music|4904|
|  18|       Howto & Style|3263|
|  18|              Comedy|2559|
|  18|      People & Blogs|2443|
|  17|       Entertainment|2339|
|  18|Science & Technology|1871|
|  18|    Film & Animation|1808|
|  18|     News & Politics|1717|
|  18|              Sports|1697|
|  17|               Music|1563|
|  18|           Education|1293|
|  17|              Comedy| 894|
|  17|       Howto & Style| 879|
|  17|     News & Politics| 768|
|  17|      People & Blogs| 765|
|  18|      Pets & Animals| 737|
|  18|              Gaming| 731|
|  17|    Film & Animation| 534|
|  17|Science & Technology| 526|
+----+--------------------+----+
only showing top 20 rows



In [25]:
# Display the category ranking separately for each trending year, 2017 first.
for yr in ('17', '18'):
    top_cat_year.filter(top_cat_year['year'] == yr).show()

+----+--------------------+----+
|year|      category_title|freq|
+----+--------------------+----+
|  17|       Entertainment|2339|
|  17|               Music|1563|
|  17|              Comedy| 894|
|  17|       Howto & Style| 879|
|  17|     News & Politics| 768|
|  17|      People & Blogs| 765|
|  17|    Film & Animation| 534|
|  17|Science & Technology| 526|
|  17|              Sports| 473|
|  17|           Education| 362|
|  17|      Pets & Animals| 183|
|  17|    Autos & Vehicles| 105|
|  17|     Travel & Events| 101|
|  17|              Gaming|  85|
|  17|Nonprofits & Acti...|  14|
|  17|               Shows|   9|
+----+--------------------+----+

+----+--------------------+----+
|year|      category_title|freq|
+----+--------------------+----+
|  18|       Entertainment|7605|
|  18|               Music|4904|
|  18|       Howto & Style|3263|
|  18|              Comedy|2559|
|  18|      People & Blogs|2443|
|  18|Science & Technology|1871|
|  18|    Film & Animation|1808|
|  18|   

In [26]:
# Split df1 by trending year so each year can be analysed per month in SQL.
# Rows trending in 2017 -> view "df2".
df2 = df1.filter(df1['year'] == '17')
df2.createOrReplaceTempView("df2")
# Rows trending in 2018 -> view "df3".
df3 = df1.filter(df1['year'] == '18')
df3.createOrReplaceTempView("df3")

In [27]:
# 2017
# Count rows per (month, category_title) among the 2017 rows (view "df2"),
# sorted by count across all months.
top_cat_month_17 = spark.sql("SELECT month, category_title,\
                              COUNT(*) as freq FROM df2 GROUP BY month,\
                              category_title ORDER BY freq DESC")
top_cat_month_17.show()

+-----+--------------------+----+
|month|      category_title|freq|
+-----+--------------------+----+
|   12|       Entertainment|1549|
|   12|               Music| 950|
|   11|       Entertainment| 790|
|   11|               Music| 613|
|   12|              Comedy| 579|
|   12|       Howto & Style| 560|
|   12|     News & Politics| 547|
|   12|      People & Blogs| 518|
|   12|    Film & Animation| 360|
|   11|       Howto & Style| 319|
|   11|              Comedy| 315|
|   12|Science & Technology| 289|
|   12|              Sports| 278|
|   11|      People & Blogs| 247|
|   11|Science & Technology| 237|
|   12|           Education| 224|
|   11|     News & Politics| 221|
|   11|              Sports| 195|
|   11|    Film & Animation| 174|
|   11|           Education| 138|
+-----+--------------------+----+
only showing top 20 rows



In [28]:
# 2017.12
# Category ranking restricted to December 2017. The former select('*') was a
# no-op (it selects every column that is already there) and has been dropped.
top_cat_month_17.filter(top_cat_month_17['month']=='12').show()

+-----+--------------------+----+
|month|      category_title|freq|
+-----+--------------------+----+
|   12|       Entertainment|1549|
|   12|               Music| 950|
|   12|              Comedy| 579|
|   12|       Howto & Style| 560|
|   12|     News & Politics| 547|
|   12|      People & Blogs| 518|
|   12|    Film & Animation| 360|
|   12|Science & Technology| 289|
|   12|              Sports| 278|
|   12|           Education| 224|
|   12|      Pets & Animals| 127|
|   12|     Travel & Events|  76|
|   12|              Gaming|  63|
|   12|    Autos & Vehicles|  58|
|   12|Nonprofits & Acti...|  13|
|   12|               Shows|   9|
+-----+--------------------+----+



In [29]:
# 2018
# Count rows per (month, category_title) among the 2018 rows (view "df3"),
# sorted by count across all months.
top_cat_month_18 = spark.sql("SELECT month, category_title, COUNT(*) as freq FROM df3 GROUP BY month, category_title ORDER BY freq DESC")
top_cat_month_18.show()

+-----+---------------+----+
|month| category_title|freq|
+-----+---------------+----+
|   05|  Entertainment|1681|
|   03|  Entertainment|1537|
|   01|  Entertainment|1288|
|   02|  Entertainment|1243|
|   04|  Entertainment|1160|
|   05|          Music|1116|
|   03|          Music| 810|
|   03|  Howto & Style| 810|
|   02|          Music| 777|
|   01|          Music| 758|
|   04|          Music| 747|
|   06|          Music| 696|
|   06|  Entertainment| 696|
|   05|  Howto & Style| 613|
|   04|  Howto & Style| 557|
|   01|  Howto & Style| 551|
|   03|         Comedy| 524|
|   02|News & Politics| 521|
|   03| People & Blogs| 508|
|   05|         Comedy| 506|
+-----+---------------+----+
only showing top 20 rows



In [30]:
# 2018.05
# Category ranking restricted to May 2018. The former select('*') was a
# no-op (it selects every column that is already there) and has been dropped.
top_cat_month_18.filter(top_cat_month_18['month']=='05').show()

+-----+--------------------+----+
|month|      category_title|freq|
+-----+--------------------+----+
|   05|       Entertainment|1681|
|   05|               Music|1116|
|   05|       Howto & Style| 613|
|   05|              Comedy| 506|
|   05|      People & Blogs| 399|
|   05|    Film & Animation| 353|
|   05|Science & Technology| 332|
|   05|              Sports| 330|
|   05|     News & Politics| 274|
|   05|           Education| 206|
|   05|              Gaming| 179|
|   05|      Pets & Animals| 101|
|   05|     Travel & Events|  28|
|   05|               Shows|  19|
|   05|    Autos & Vehicles|   9|
|   05|Nonprofits & Acti...|   5|
+-----+--------------------+----+



### title_length

In [31]:
from pyspark.sql.types import StringType, DoubleType, IntegerType, StructField, StructType
# Build (title, character length) pairs for every distinct title, longest first.
#  - flatMap unwraps the single-column rows and skips null titles, which would
#    otherwise make len() raise a TypeError (the title column is nullable).
#  - reduceByKey keeps one entry per title, so repeated titles count once.
#  - repartition(1) + sortBy gives one partition ordered by length, descending.
titleLength = spark.sql("SELECT title as title from df")\
                   .rdd.flatMap(lambda row: [t for t in row if t is not None])\
                   .map(lambda title: (title, len(title)))\
                   .reduceByKey(lambda first, _dup: first)\
                   .repartition(1).sortBy(lambda pair: pair[1], False)
titleLengthSchema = StructType([StructField("Title", StringType(), True),\
                                StructField("Length", IntegerType(), True)])
titleLengthDF = spark.createDataFrame(titleLength, titleLengthSchema)
# Drop empty titles.
titleLengthDF = titleLengthDF.filter(titleLengthDF["Title"]!='')
# Preview only the 20 longest titles; collecting the whole frame would pull
# every row to the driver and flood the notebook output.
titleLengthDF.take(20)

[Row(Title="Stephen A.: Cavs 'were an absolute disgrace' in 2nd half of Game 2 vs. Celtics | SportsCenter | ESPN", Length=100),
 Row(Title='The epic late-night Fortnite stream featuring Drake, JuJu Smith-Schuster, Ninja, Travis Scott | ESPN', Length=100),
 Row(Title='Stephen A.: Kevin Durant looked like he wanted no part of LeBron James in Game 1 | First Take | ESPN', Length=100),
 Row(Title='Patrick Beverley calls in to tell Will he knows nothing about basketball | The Will Cain Show | ESPN', Length=100),
 Row(Title='Boxing with Evander Holyfield & Joel McHale | Kevin Hart: What The Fit Ep 8 | Laugh Out Loud Network', Length=100),
 Row(Title='Ronaldo knocks out Juventus with last minute penalty kick | 2017-18 UEFA Champions League Highlights', Length=100),
 Row(Title='Anthony Davis 53 Pts, 17 Rebs, 5 Blks! 2018.02.26 New Orleans Pelicans vs Phoenix Suns | FreeDawkins', Length=100),
 Row(Title='The Twelfth Doctor Regenerates: Peter Capaldi to Jodie Whittaker - Doctor Who Christmas Spec

In [32]:
# Expose the (Title, Length) frame to SQL and count how many distinct titles have
# each length, most common length first.
titleLengthDF.createOrReplaceTempView("titleLengthDF")
# The grouping key is listed once; the original "GROUP BY Length, Length"
# repeated it to no effect.
length = spark.sql("SELECT Length, COUNT(*) as freq FROM titleLengthDF GROUP BY Length ORDER BY freq DESC")
length.show()

+------+----+
|Length|freq|
+------+----+
|    42| 157|
|    44| 145|
|    48| 144|
|    43| 143|
|    51| 142|
|    41| 140|
|    39| 139|
|    47| 137|
|    45| 131|
|    49| 130|
|    46| 129|
|    36| 127|
|    52| 122|
|    37| 120|
|    54| 119|
|    33| 116|
|    34| 116|
|    50| 116|
|    38| 115|
|    53| 115|
+------+----+
only showing top 20 rows



In [33]:
# word count
from pyspark.sql.types import StringType, DoubleType, IntegerType, StructField, StructType
# Count whitespace-separated, lower-cased tokens across all titles, most frequent first.
# A null title is treated as an empty string so that .split() cannot raise
# AttributeError (the title column is nullable); it contributes no tokens.
wordCount = spark.sql("SELECT LOWER(title) as title from df")\
                 .rdd.flatMap(lambda line: (line['title'] or '').split())\
                 .map(lambda word: (word, 1))\
                 .reduceByKey(lambda a, b: a + b)\
                 .repartition(1).sortBy(lambda x: x[1], False)
wordCountSchema = StructType([StructField("word", StringType(), True),
                              StructField("count", IntegerType(), True)])
wordCountDF = spark.createDataFrame(wordCount, wordCountSchema)
# Drop empty tokens.
wordCountDF = wordCountDF.filter(wordCountDF["word"]!='')
# Preview the 300 most frequent tokens.
wordCountDF.take(300)

[Row(word='-', count=11440),
 Row(word='|', count=10649),
 Row(word='the', count=9935),
 Row(word='a', count=4682),
 Row(word='to', count=4272),
 Row(word='in', count=3358),
 Row(word='of', count=3129),
 Row(word='with', count=2756),
 Row(word='on', count=2344),
 Row(word='and', count=2296),
 Row(word='&', count=2020),
 Row(word='my', count=2009),
 Row(word='i', count=1988),
 Row(word='trailer', count=1943),
 Row(word='video)', count=1938),
 Row(word='how', count=1816),
 Row(word='for', count=1765),
 Row(word='is', count=1623),
 Row(word='(official', count=1613),
 Row(word='2018', count=1610),
 Row(word='official', count=1589),
 Row(word='you', count=1589),
 Row(word='at', count=1129),
 Row(word='from', count=1123),
 Row(word='ft.', count=1049),
 Row(word='what', count=923),
 Row(word='new', count=908),
 Row(word='this', count=856),
 Row(word='makeup', count=810),
 Row(word='me', count=789),
 Row(word='it', count=774),
 Row(word='music', count=736),
 Row(word='we', count=735),
 Row(wor

### tags

In [34]:
from pyspark.sql.types import StringType, DoubleType, IntegerType, StructField, StructType
# Count individual lower-cased tags (the tags field is "|"-separated), most frequent first.
# NOTE: this rebinds wordCount / wordCountSchema / wordCountDF from the title word count.
# A null tags value is treated as an empty string so that .split() cannot raise
# AttributeError; the resulting empty token is removed by the filter below.
wordCount = spark.sql("SELECT LOWER(tags) as tags from df")\
                 .rdd.flatMap(lambda line: (line['tags'] or '').split("|"))\
                 .map(lambda word: (word, 1))\
                 .reduceByKey(lambda a, b: a + b)\
                 .repartition(1).sortBy(lambda x: x[1], False)
wordCountSchema = StructType([StructField("word", StringType(), True),\
                              StructField("count", IntegerType(), True)])
wordCountDF = spark.createDataFrame(wordCount, wordCountSchema)
# Drop empty tags.
wordCountDF = wordCountDF.filter(wordCountDF["word"]!='')
# Preview the 300 most frequent tags.
wordCountDF.take(300)

[Row(word='"funny"', count=4136),
 Row(word='"comedy"', count=3642),
 Row(word='"how to"', count=1700),
 Row(word='"music"', count=1659),
 Row(word='"pop"', count=1632),
 Row(word='[none]', count=1532),
 Row(word='"trailer"', count=1411),
 Row(word='"food"', count=1279),
 Row(word='"2018"', count=1271),
 Row(word='"news"', count=1253),
 Row(word='"review"', count=1238),
 Row(word='"science"', count=1237),
 Row(word='"makeup"', count=1236),
 Row(word='"humor"', count=1209),
 Row(word='"celebrity"', count=1130),
 Row(word='"diy"', count=1109),
 Row(word='"tutorial"', count=1054),
 Row(word='"video"', count=1029),
 Row(word='"television"', count=1021),
 Row(word='"live"', count=1017),
 Row(word='"interview"', count=988),
 Row(word='"vlog"', count=956),
 Row(word='"entertainment"', count=949),
 Row(word='"animation"', count=945),
 Row(word='"beauty"', count=922),
 Row(word='"movie"', count=908),
 Row(word='"official"', count=895),
 Row(word='"comedian"', count=877),
 Row(word='"cooking"', 

In [35]:
# The most popular tags: this groups by the WHOLE tags string of a row (the full
# "|"-separated list), not by individual tags.
tags = spark.sql("SELECT tags, COUNT(*) as freq FROM df GROUP BY tags ORDER BY freq DESC")
tags.show(10)

+------------------------+----+
|                    tags|freq|
+------------------------+----+
|                  [none]|1532|
|    "ABC"|"americanid...|  86|
|    "Jacksfilms"|"Jac...|  79|
|    "James Corden"|"T...|  71|
|    "The Late Show"|"...|  66|
|"BIGHIT"|"빅히트"|"방...|  54|
|    "Super Slow Show"...|  50|
|    "Viral"|"Video"|"...|  46|
|    "best vines 2018"...|  46|
|    "Pentatonix"|"Pen...|  43|
+------------------------+----+
only showing top 10 rows



In [36]:
# Look for rows whose tags value is the empty string, shown without truncation.
df.filter("tags==''").show(truncate=False)

+-----------+-------------+-----+-------------+------------+----+-----+-----+--------+-------------+--------------+
|category_id|trending_date|title|channel_title|publish_time|tags|views|likes|dislikes|comment_count|category_title|
+-----------+-------------+-----+-------------+------------+----+-----+-----+--------+-------------+--------------+
+-----------+-------------+-----+-------------+------------+----+-----+-----+--------+-------------+--------------+



##### other data

In [45]:
# Re-read the raw CSV, using the first line as the header.
# NOTE(review): this rebinds `df`, replacing the frame whose numeric columns were
# cast to int and whose empty category titles were filtered out.
# The path is a raw string so the backslash before the space stays a literal
# backslash, exactly as before, without relying on Python's deprecated handling
# of the unrecognised escape sequence "\ ".
df = spark.read.csv(r'/Users/zhujinghong/Downloads/5003\ project/pre_data/US_pre.csv',header=True)

In [46]:
# Point the SQL view "df" at the current `df` frame.
df.createOrReplaceTempView("df") 

In [48]:
from pyspark.sql.types import StringType, DoubleType, IntegerType, StructField, StructType
# Count individual lower-cased tags (the tags field is "|"-separated) in the view
# "df", most frequent first.
# A null tags value is treated as an empty string so that .split() cannot raise
# AttributeError; the resulting empty token is removed by the filter below.
wordCount = spark.sql("SELECT LOWER(tags) as tags from df")\
                 .rdd.flatMap(lambda line: (line['tags'] or '').split("|"))\
                 .map(lambda word: (word, 1))\
                 .reduceByKey(lambda a, b: a + b)\
                 .repartition(1).sortBy(lambda x: x[1], False)
wordCountSchema = StructType([StructField("word", StringType(), True),
                              StructField("count", IntegerType(), True)])
wordCountDF = spark.createDataFrame(wordCount, wordCountSchema)
# Drop empty tags.
wordCountDF = wordCountDF.filter(wordCountDF["word"]!='')
# Preview the 300 most frequent tags.
wordCountDF.take(300)

[Row(word='"funny"', count=4136),
 Row(word='"comedy"', count=3642),
 Row(word='"how to"', count=1700),
 Row(word='"music"', count=1659),
 Row(word='"pop"', count=1632),
 Row(word='[none]', count=1532),
 Row(word='"trailer"', count=1411),
 Row(word='"food"', count=1279),
 Row(word='"2018"', count=1271),
 Row(word='"news"', count=1253),
 Row(word='"review"', count=1238),
 Row(word='"science"', count=1237),
 Row(word='"makeup"', count=1236),
 Row(word='"humor"', count=1209),
 Row(word='"celebrity"', count=1130),
 Row(word='"diy"', count=1109),
 Row(word='"tutorial"', count=1054),
 Row(word='"video"', count=1029),
 Row(word='"television"', count=1021),
 Row(word='"live"', count=1017),
 Row(word='"interview"', count=988),
 Row(word='"vlog"', count=956),
 Row(word='"entertainment"', count=949),
 Row(word='"animation"', count=945),
 Row(word='"beauty"', count=922),
 Row(word='"movie"', count=908),
 Row(word='"official"', count=895),
 Row(word='"comedian"', count=877),
 Row(word='"cooking"', 

### publish_time

In [38]:
# Distribution of publish hour.
# Pair every publish_time string with characters 11-12 of itself.
# NOTE(review): presumably publish_time is an ISO-8601 timestamp, which makes that
# slice the hour; the 00..23 values in the output are consistent with this. Confirm.
# Null publish_time values are skipped, since slicing None would raise a TypeError.
# StructType / StructField / StringType come from an import in an earlier cell.
pubTime = spark.sql("SELECT publish_time as pub from df")\
               .rdd.flatMap(lambda row: [ts for ts in row if ts is not None])\
               .map(lambda ts: (ts, ts[11:13]))
pubTimeSchema = StructType([StructField("publish_time", StringType(), True),\
                            StructField("time", StringType(), True)])
pubTimeDF = spark.createDataFrame(pubTime, pubTimeSchema)
# Drop rows where the slice is empty (publish_time shorter than 12 characters).
pubTimeDF = pubTimeDF.filter(pubTimeDF["time"]!='')
pubTimeDF.createOrReplaceTempView("pubTimeDF")
# Count rows per hour, most common hour first. The grouping key is listed once;
# the original "GROUP BY time, time" repeated it to no effect.
time_distribution = spark.sql("SELECT time, COUNT(*) as freq FROM pubTimeDF "
                              "GROUP BY time ORDER BY freq DESC")
time_distribution.show()

+----+----+
|time|freq|
+----+----+
|  16|3664|
|  15|3478|
|  17|3446|
|  18|2885|
|  14|2805|
|  20|2133|
|  19|2124|
|  21|2101|
|  13|2098|
|  22|1957|
|  12|1550|
|  23|1495|
|  00|1436|
|  01|1318|
|  04|1259|
|  05|1252|
|  03| 944|
|  02| 935|
|  11| 855|
|  08| 790|
+----+----+
only showing top 20 rows

