In [1]:
# Load CA_pre.csv; the first CSV row supplies the column names.
# A raw string keeps the literal backslash in the path (same runtime value as before)
# without relying on the invalid "\ " escape sequence, which newer Pythons warn about.
# NOTE(review): this is an absolute, machine-specific path — consider a configurable data dir.
df = spark.read.csv(r'/Users/zhujinghong/Downloads/5003\ project/pre_data/CA_pre.csv', header=True)

In [2]:
# Mark df for caching. As the cell's last expression, the returned DataFrame's repr is displayed.
df.cache()

DataFrame[category_id: string, trending_date: string, title: string, channel_title: string, publish_time: string, tags: string, views: string, likes: string, dislikes: string, comment_count: string, category_title: string]

In [3]:
# Print column names and types; straight after the CSV read every column is a string.
df.printSchema()

root
 |-- category_id: string (nullable = true)
 |-- trending_date: string (nullable = true)
 |-- title: string (nullable = true)
 |-- channel_title: string (nullable = true)
 |-- publish_time: string (nullable = true)
 |-- tags: string (nullable = true)
 |-- views: string (nullable = true)
 |-- likes: string (nullable = true)
 |-- dislikes: string (nullable = true)
 |-- comment_count: string (nullable = true)
 |-- category_title: string (nullable = true)



In [4]:
from pyspark.sql.functions import col , column
# Columns that were read from the CSV as strings and are cast to integers below.
cols=['category_id','views','likes','dislikes','comment_count']
# The loop variable is deliberately not named `col`: that would rebind the
# `col` function imported above to a plain string for the rest of the notebook.
for col_name in cols:
    df = df.withColumn(col_name, df[col_name].cast('int'))

In [5]:
from pyspark import SparkContext, SparkConf
from pyspark.sql import SparkSession
import pyspark.sql.functions as f
import os

In [6]:
# A DataFrame must first be registered as a table or view before it can be processed with SQL
df.createOrReplaceTempView("df") 

### preprocessing of category_title

In [7]:
# category_title contains nulls; stated cause: category_id=29 does not exist
df.select('category_title').distinct().show()
# Only a cell's last expression is echoed, so print the distinct count explicitly;
# as a bare expression in the middle of the cell it was computed and then discarded.
print(df.select('category_title').distinct().count())
# List the rows whose category_title is null together with their category_id.
df.select('category_title','category_id').filter(f.isnull('category_title')).show()

+--------------------+
|      category_title|
+--------------------+
|               Shows|
|           Education|
|              Gaming|
|       Entertainment|
|     Travel & Events|
|Science & Technology|
|              Sports|
|                null|
|       Howto & Style|
|    Film & Animation|
|      People & Blogs|
|     News & Politics|
|      Pets & Animals|
|              Movies|
|    Autos & Vehicles|
|               Music|
|              Comedy|
+--------------------+

+--------------+-----------+
|category_title|category_id|
+--------------+-----------+
|          null|         29|
|          null|         29|
|          null|         29|
|          null|         29|
|          null|         29|
|          null|         29|
|          null|         29|
|          null|         29|
|          null|         29|
|          null|         29|
|          null|         29|
|          null|         29|
|          null|         29|
|          null|         29|
|          null|       

In [8]:
# Total number of rows in df at this point (before the category_title filter in the cells below).
df.count()

40870

In [9]:
# How many rows pass the category_title != '' comparison.
# NOTE(review): rows with a null title presumably also fail this comparison — confirm.
df.where(df.category_title != '').count()

40796

In [10]:
# Keep only the rows that pass the category_title != '' comparison and rebind df to the result.
df = df.where(df.category_title != '')

In [11]:
# Re-register the "df" view so that SQL queries see the DataFrame currently bound to df.
df.createOrReplaceTempView("df") 

# Analysis

### channel_title

In [22]:
# Channels ordered by how many rows they have in the df view, most frequent first.
channel = spark.sql(
    "SELECT channel_title, COUNT(*) as freq "
    "FROM df "
    "GROUP BY channel_title "
    "ORDER BY freq DESC"
)
# Show the first 10 rows without truncating long channel names.
channel.show(n=10, truncate=False)

+----------------------------------+----+
|channel_title                     |freq|
+----------------------------------+----+
|SET India                         |191 |
|MSNBC                             |189 |
|FBE                               |188 |
|The Young Turks                   |186 |
|REACT                             |183 |
|VikatanTV                         |182 |
|CNN                               |180 |
|The Late Show with Stephen Colbert|172 |
|ARY Digital                       |168 |
|RadaanMedia                       |168 |
+----------------------------------+----+
only showing top 10 rows



In [62]:
# Row count per (trending_date, channel_title) pair, ordered by count across all
# dates (highest first) — the ordering is global, not per day.
cha_day = spark.sql(
    "SELECT trending_date, channel_title, COUNT(*) as freq "
    "FROM df "
    "GROUP BY trending_date, channel_title "
    "ORDER BY freq DESC"
)
cha_day.show()

+-------------+-------------------+----+
|trending_date|      channel_title|freq|
+-------------+-------------------+----+
|     17.17.12|    Merry Christmas|   3|
|     18.15.04|        Nicki Minaj|   2|
|     18.14.04|        Nicki Minaj|   2|
|     17.18.12|    Merry Christmas|   2|
|     17.24.11|        Ximo Pierto|   2|
|     17.19.12|    Merry Christmas|   2|
|     18.24.05|       TheEllenShow|   1|
|     17.26.11|purported freelance|   1|
|     17.15.12|                 GQ|   1|
|     18.26.04|    Channel Awesome|   1|
|     18.19.03|        Guava Juice|   1|
|     17.17.11| The King of Random|   1|
|     18.17.02|            DALLMYD|   1|
|     18.20.03|   ikawikawikawikaw|   1|
|     18.26.01| TheStaticShiftVEVO|   1|
|     18.14.02|           NBA Show|   1|
|     18.22.05|   Lucas and Marcus|   1|
|     18.18.04|       Talent Recap|   1|
|     17.29.12|    Linus Tech Tips|   1|
|     18.10.02|      Nick Crompton|   1|
+-------------+-------------------+----+
only showing top

In [63]:
# Mean comment_count per channel_title, largest average first.
avg_comment = spark.sql(
    "SELECT channel_title, AVG(comment_count)as avg_comment "
    "FROM df "
    "GROUP BY channel_title "
    "ORDER BY avg_comment DESC"
)
avg_comment.show()

+--------------------+------------------+
|       channel_title|       avg_comment|
+--------------------+------------------+
|             ibighit|481934.36842105264|
|   YouTube Spotlight| 415868.3076923077|
|Sơn Tùng M-TP Off...|          350702.0|
|        David Dobrik|          234461.5|
| ChildishGambinoVEVO|         193142.75|
|    ArianaGrandeVevo|155179.16666666666|
|     SuperMarioLogan|          145083.0|
|      The ACE Family|          144699.0|
|    Logan Paul Vlogs|          143946.1|
|         Collins Key|          136154.4|
|           DrakeVEVO|133422.33333333334|
|     TaylorSwiftVEVO|109415.86666666667|
|         Queen Naija|108976.33333333333|
|    jypentertainment|          106640.1|
|Marvel Entertainment|105739.64102564103|
|          Bruno Mars|           89598.0|
|            Lil pump| 88006.84615384616|
|   Salman Khan Films|           72696.2|
|Saad Lamjarred | ...|           72228.2|
|         TheOdd1sOut|           70243.4|
+--------------------+------------

### category_title

In [12]:
# How many distinct category_title values the df view contains.
distinct_ct = spark.sql(
    "SELECT COUNT(DISTINCT(category_title)) "
    "FROM df"
)
distinct_ct.show()

+------------------------------+
|count(DISTINCT category_title)|
+------------------------------+
|                            16|
+------------------------------+



In [13]:
# 1-1 Categories ordered by row count; only the first 10 are displayed.
top_cat = spark.sql(
    "SELECT category_title, COUNT(*) as freq "
    "FROM df "
    "GROUP BY category_title "
    "ORDER BY freq DESC"
)
top_cat.show(n=10)

+--------------------+-----+
|      category_title| freq|
+--------------------+-----+
|       Entertainment|13447|
|     News & Politics| 4159|
|      People & Blogs| 4103|
|              Comedy| 3771|
|               Music| 3731|
|              Sports| 2786|
|    Film & Animation| 2060|
|       Howto & Style| 2006|
|              Gaming| 1344|
|Science & Technology| 1155|
+--------------------+-----+
only showing top 10 rows



In [14]:
# 1-2 Row count per (trending_date, category_title) pair, ordered by count across
# all dates (highest first) — the ordering is global, not per day.
top_cat_day = spark.sql(
    "SELECT trending_date, category_title, COUNT(*) as freq "
    "FROM df "
    "GROUP BY trending_date, category_title "
    "ORDER BY freq DESC"
)
top_cat_day.show()

+-------------+--------------+----+
|trending_date|category_title|freq|
+-------------+--------------+----+
|     18.20.03| Entertainment|  89|
|     18.16.05| Entertainment|  85|
|     18.17.05| Entertainment|  83|
|     18.15.05| Entertainment|  80|
|     18.27.05| Entertainment|  80|
|     18.18.05| Entertainment|  78|
|     18.01.06| Entertainment|  76|
|     18.29.05| Entertainment|  76|
|     18.08.05| Entertainment|  76|
|     18.31.05| Entertainment|  76|
|     18.06.06| Entertainment|  76|
|     18.09.05| Entertainment|  75|
|     18.18.03| Entertainment|  75|
|     18.27.03| Entertainment|  74|
|     18.20.05| Entertainment|  74|
|     18.19.03| Entertainment|  74|
|     18.19.05| Entertainment|  74|
|     18.23.04| Entertainment|  74|
|     18.24.03| Entertainment|  74|
|     18.09.01| Entertainment|  74|
+-------------+--------------+----+
only showing top 20 rows



In [15]:
# 2-1 Mean views per category_title, largest average first.
avg_views = spark.sql(
    "SELECT category_title, AVG(views)as avg_views "
    "FROM df "
    "GROUP BY category_title "
    "ORDER BY avg_views DESC"
)
avg_views.show()

+--------------------+------------------+
|      category_title|         avg_views|
+--------------------+------------------+
|               Music|3532524.8442776734|
|              Movies|         2853415.0|
|    Film & Animation|1426728.5650485437|
|Science & Technology|1233844.5203463202|
|              Sports|1075963.6051687007|
|       Entertainment|1016599.6940581542|
|              Comedy| 982248.0766374967|
|              Gaming| 923759.2150297619|
|      People & Blogs| 786733.5622715086|
|       Howto & Style| 781356.9496510469|
|      Pets & Animals| 638461.1734417344|
|               Shows| 577012.1612903225|
|    Autos & Vehicles| 567784.2840909091|
|           Education| 536602.7679112008|
|     News & Politics| 388220.7364751142|
|     Travel & Events| 366701.4081632653|
+--------------------+------------------+



In [32]:
# 2-2 Mean likes per category_title, largest average first.
avg_likes = spark.sql(
    "SELECT category_title, AVG(likes)as avg_likes "
    "FROM df "
    "GROUP BY category_title "
    "ORDER BY avg_likes DESC"
)
avg_likes.show()

+--------------------+------------------+
|      category_title|         avg_likes|
+--------------------+------------------+
|               Music|151285.85633878317|
|              Comedy|51895.282683638296|
|              Movies|42150.833333333336|
|              Gaming|38487.017857142855|
|    Film & Animation| 37767.96262135922|
|Science & Technology| 37068.91601731602|
|       Entertainment|28797.192979846805|
|      People & Blogs| 25685.11991225932|
|       Howto & Style|25494.451146560317|
|      Pets & Animals|23931.533875338755|
|              Sports|21338.610552763817|
|           Education|21162.867810292635|
|    Autos & Vehicles| 13439.15909090909|
|     Travel & Events| 9568.658163265307|
|     News & Politics| 7863.682135128637|
|               Shows| 4752.120967741936|
+--------------------+------------------+



In [33]:
# 2-3 Mean dislikes per category_title, largest average first.
avg_dislikes = spark.sql(
    "SELECT category_title, AVG(dislikes)as avg_dislikes "
    "FROM df "
    "GROUP BY category_title "
    "ORDER BY avg_dislikes DESC"
)
avg_dislikes.show()

+--------------------+------------------+
|      category_title|      avg_dislikes|
+--------------------+------------------+
|               Music| 5922.859823103726|
|      People & Blogs| 2310.640506946137|
|              Movies|2046.6666666666667|
|       Entertainment|1917.9835651074588|
|              Gaming|1820.5967261904761|
|Science & Technology|1550.7818181818182|
|    Film & Animation|1510.3199029126213|
|              Comedy|1422.7022010076903|
|              Sports|1179.5987078248386|
|               Shows|  971.258064516129|
|       Howto & Style| 861.2218344965105|
|           Education| 800.6538849646821|
|     News & Politics| 784.8138975715316|
|    Autos & Vehicles| 561.6676136363636|
|      Pets & Animals| 501.3631436314363|
|     Travel & Events| 331.2780612244898|
+--------------------+------------------+



#### Create df1: trending_date->year, month  

In [16]:
from pyspark.sql.functions import split, explode, concat
df = df.withColumn('trending_date' ,df['trending_date'].cast('string'))
# year: the first two characters of trending_date (e.g. "18" in "18.20.03").
# substring() is 1-based and reads the characters directly, replacing the earlier
# split-on-empty-pattern + array-index + concat approach to the same two characters.
df1 = df.withColumn('year', f.substring(f.col('trending_date'), 1, 2))
df1.printSchema()

root
 |-- category_id: integer (nullable = true)
 |-- trending_date: string (nullable = true)
 |-- title: string (nullable = true)
 |-- channel_title: string (nullable = true)
 |-- publish_time: string (nullable = true)
 |-- tags: string (nullable = true)
 |-- views: integer (nullable = true)
 |-- likes: integer (nullable = true)
 |-- dislikes: integer (nullable = true)
 |-- comment_count: integer (nullable = true)
 |-- category_title: string (nullable = true)
 |-- year: string (nullable = true)



In [17]:
# month: characters 7-8 of trending_date (e.g. "03" in "18.20.03"), i.e. the same
# positions the previous split()-based version read as array elements [6] and [7].
# The column is taken from df1 itself rather than reaching back to the parent df.
df1 = df1.withColumn('month', f.substring(f.col('trending_date'), 7, 2))
df1.printSchema()

root
 |-- category_id: integer (nullable = true)
 |-- trending_date: string (nullable = true)
 |-- title: string (nullable = true)
 |-- channel_title: string (nullable = true)
 |-- publish_time: string (nullable = true)
 |-- tags: string (nullable = true)
 |-- views: integer (nullable = true)
 |-- likes: integer (nullable = true)
 |-- dislikes: integer (nullable = true)
 |-- comment_count: integer (nullable = true)
 |-- category_title: string (nullable = true)
 |-- year: string (nullable = true)
 |-- month: string (nullable = true)



In [18]:
# Register df1 (df plus the year and month columns) as the SQL view "df1".
df1.createOrReplaceTempView("df1") 

In [19]:
# Row count per (year, category_title) pair, highest count first.
top_cat_year = spark.sql(
    "SELECT year, category_title, COUNT(*) as freq "
    "FROM df1 "
    "GROUP BY year, category_title "
    "ORDER BY freq DESC"
)
top_cat_year.show()

+----+--------------------+-----+
|year|      category_title| freq|
+----+--------------------+-----+
|  18|       Entertainment|10426|
|  18|     News & Politics| 3284|
|  18|      People & Blogs| 3092|
|  17|       Entertainment| 3021|
|  18|               Music| 2847|
|  18|              Comedy| 2747|
|  18|              Sports| 2179|
|  18|    Film & Animation| 1566|
|  18|       Howto & Style| 1557|
|  17|              Comedy| 1024|
|  17|      People & Blogs| 1011|
|  18|              Gaming|  994|
|  17|               Music|  884|
|  17|     News & Politics|  875|
|  18|Science & Technology|  864|
|  18|           Education|  736|
|  17|              Sports|  607|
|  17|    Film & Animation|  494|
|  17|       Howto & Style|  449|
|  17|              Gaming|  350|
+----+--------------------+-----+
only showing top 20 rows



In [20]:
# Show the per-category counts separately for year '17' and then year '18'.
for year_code in ('17', '18'):
    top_cat_year.where(top_cat_year['year'] == year_code).show()

+----+--------------------+----+
|year|      category_title|freq|
+----+--------------------+----+
|  17|       Entertainment|3021|
|  17|              Comedy|1024|
|  17|      People & Blogs|1011|
|  17|               Music| 884|
|  17|     News & Politics| 875|
|  17|              Sports| 607|
|  17|    Film & Animation| 494|
|  17|       Howto & Style| 449|
|  17|              Gaming| 350|
|  17|Science & Technology| 291|
|  17|           Education| 255|
|  17|     Travel & Events| 102|
|  17|      Pets & Animals|  96|
|  17|    Autos & Vehicles|  72|
|  17|               Shows|  41|
+----+--------------------+----+

+----+--------------------+-----+
|year|      category_title| freq|
+----+--------------------+-----+
|  18|       Entertainment|10426|
|  18|     News & Politics| 3284|
|  18|      People & Blogs| 3092|
|  18|               Music| 2847|
|  18|              Comedy| 2747|
|  18|              Sports| 2179|
|  18|    Film & Animation| 1566|
|  18|       Howto & Style| 1557

In [21]:
# Split df1 by its two-digit year: df2 holds the '17' rows, df3 the '18' rows.
df2, df3 = (df1.where(df1['year'] == yy) for yy in ('17', '18'))
# Expose both subsets to SQL under the view names "df2" and "df3".
df2.createOrReplaceTempView("df2")
df3.createOrReplaceTempView("df3")

In [22]:
# 2017: row count per (month, category_title) pair in the df2 view, highest first.
# The query is assembled from adjacent string literals rather than backslash
# continuations inside one literal: those embed the source indentation in the SQL
# text and break if any whitespace follows the backslash.
top_cat_month_17 = spark.sql(
    "SELECT month, category_title, COUNT(*) as freq "
    "FROM df2 "
    "GROUP BY month, category_title "
    "ORDER BY freq DESC"
)
top_cat_month_17.show()

+-----+--------------------+----+
|month|      category_title|freq|
+-----+--------------------+----+
|   12|       Entertainment|1949|
|   11|       Entertainment|1072|
|   12|      People & Blogs| 649|
|   12|              Comedy| 638|
|   12|               Music| 592|
|   12|     News & Politics| 568|
|   11|              Comedy| 386|
|   12|              Sports| 371|
|   11|      People & Blogs| 362|
|   12|    Film & Animation| 328|
|   11|     News & Politics| 307|
|   12|       Howto & Style| 302|
|   11|               Music| 292|
|   12|              Gaming| 257|
|   11|              Sports| 236|
|   12|           Education| 173|
|   11|    Film & Animation| 166|
|   12|Science & Technology| 164|
|   11|       Howto & Style| 147|
|   11|Science & Technology| 127|
+-----+--------------------+----+
only showing top 20 rows



In [23]:
# 2017.12 — only the rows of the 2017 monthly counts whose month is '12'.
top_cat_month_17.where(top_cat_month_17['month'] == '12').show()

+-----+--------------------+----+
|month|      category_title|freq|
+-----+--------------------+----+
|   12|       Entertainment|1949|
|   12|      People & Blogs| 649|
|   12|              Comedy| 638|
|   12|               Music| 592|
|   12|     News & Politics| 568|
|   12|              Sports| 371|
|   12|    Film & Animation| 328|
|   12|       Howto & Style| 302|
|   12|              Gaming| 257|
|   12|           Education| 173|
|   12|Science & Technology| 164|
|   12|     Travel & Events|  68|
|   12|      Pets & Animals|  61|
|   12|    Autos & Vehicles|  37|
|   12|               Shows|  24|
+-----+--------------------+----+



In [24]:
# 2018: row count per (month, category_title) pair in the df3 view, highest first.
top_cat_month_18 = spark.sql(
    "SELECT month, category_title, COUNT(*) as freq "
    "FROM df3 "
    "GROUP BY month, category_title "
    "ORDER BY freq DESC"
)
top_cat_month_18.show()

+-----+---------------+----+
|month| category_title|freq|
+-----+---------------+----+
|   05|  Entertainment|2230|
|   03|  Entertainment|2132|
|   01|  Entertainment|1841|
|   02|  Entertainment|1736|
|   04|  Entertainment|1520|
|   06|  Entertainment| 967|
|   01|News & Politics| 678|
|   03| People & Blogs| 670|
|   05|News & Politics| 641|
|   01| People & Blogs| 628|
|   05|          Music| 612|
|   02|News & Politics| 598|
|   02| People & Blogs| 580|
|   03|         Comedy| 549|
|   04|News & Politics| 535|
|   05|         Comedy| 531|
|   01|          Music| 526|
|   04| People & Blogs| 518|
|   03|News & Politics| 515|
|   02|         Comedy| 507|
+-----+---------------+----+
only showing top 20 rows



In [25]:
# 2018.05 — only the rows of the 2018 monthly counts whose month is '05'.
top_cat_month_18.where(top_cat_month_18['month'] == '05').show()

+-----+--------------------+----+
|month|      category_title|freq|
+-----+--------------------+----+
|   05|       Entertainment|2230|
|   05|     News & Politics| 641|
|   05|               Music| 612|
|   05|              Comedy| 531|
|   05|      People & Blogs| 490|
|   05|              Sports| 346|
|   05|    Film & Animation| 317|
|   05|       Howto & Style| 311|
|   05|Science & Technology| 220|
|   05|              Gaming| 148|
|   05|           Education| 126|
|   05|     Travel & Events|  56|
|   05|      Pets & Animals|  54|
|   05|    Autos & Vehicles|  49|
|   05|               Shows|  34|
|   05|              Movies|   1|
+-----+--------------------+----+



### title_length

In [55]:
from pyspark.sql.types import StringType, DoubleType, IntegerType, StructField, StructType
# Build (title, character count) pairs for every distinct title, longest first.
titleLength = (
    spark.sql("SELECT title as title from df")
    .rdd.map(lambda row: row[0])
    # title is a nullable column; len(None) would raise TypeError inside the Spark job
    .filter(lambda title: title is not None)
    .map(lambda title: (title, len(title)))
    # equal titles carry equal lengths, so keeping either value just de-duplicates
    .reduceByKey(lambda a, b: a)
    .repartition(1)
    .sortBy(lambda pair: pair[1], False)
)
titleLengthSchema = StructType([
    StructField("Title", StringType(), True),
    StructField("Length", IntegerType(), True),
])
titleLengthDF = spark.createDataFrame(titleLength, titleLengthSchema)
# Drop empty-string titles.
titleLengthDF = titleLengthDF.filter(titleLengthDF["Title"] != '')
# Preview the longest titles instead of collect()-ing every row into the notebook output.
titleLengthDF.take(20)

[Row(Title='Stephen A. asks Ray Allen about relationship with Rajon Rondo and former Celtics | First Take | ESPN', Length=100),
 Row(Title='Emotional Virat Kohli CRIES In Front Of Anushka Sharma Night Before Wedding. Watch What Anushka Does', Length=100),
 Row(Title='88RISING - midsummer madness ft. Joji, Rich Brian, Higher Brothers, AUGUST 08 (official music video)', Length=100),
 Row(Title='Guitar Music Cover - Greatest Love Songs Of All Time -Love Songs Greatest Hits - Most Beautiful Love', Length=100),
 Row(Title='Hát mãi ước mơ 2| tập 4 full:Cẩm Ly ngưỡng mộ niềm lạc quan của đôi vợ chồng 40 năm làm nghề vớt xác', Length=100),
 Row(Title="LA PIRE PARTIE ft Le Rire Jaune, Pierre Croce, Fabien Olicard, Nad Rich' Hard, Max Bird, Studio Vrac", Length=100),
 Row(Title='About that life (Full Video) Elly Mangat I Vadda Grewal | Only Jashan |  | Latest Punjabi Songs 2018', Length=100),
 Row(Title='Charlotte Flair & Carmella sign contract for their match at Backlash: SmackDown LIVE, April 

In [61]:
# Register the title/length frame for SQL and count how many titles have each length.
titleLengthDF.createOrReplaceTempView("titleLengthDF")
# Group by Length once; the previous "GROUP BY Length, Length" repeated the same key.
length = spark.sql(
    "SELECT Length, COUNT(*) as freq "
    "FROM titleLengthDF "
    "GROUP BY Length "
    "ORDER BY freq DESC"
)
length.show()

+------+----+
|Length|freq|
+------+----+
|    43| 556|
|    41| 473|
|    47| 467|
|    51| 461|
|    48| 461|
|    42| 459|
|    45| 455|
|    46| 455|
|    34| 450|
|    44| 449|
|    50| 444|
|    39| 428|
|    58| 422|
|    49| 419|
|    40| 414|
|    54| 409|
|    53| 398|
|    35| 397|
|    36| 397|
|    52| 392|
+------+----+
only showing top 20 rows



In [38]:
# word count: frequency of each whitespace-separated, lower-cased token in the titles
from pyspark.sql.types import StringType, DoubleType, IntegerType, StructField, StructType
wordCount = (
    spark.sql("SELECT LOWER(title) as title from df")
    .rdd
    # title is nullable; calling .split() on None would raise inside the Spark job
    .filter(lambda line: line['title'] is not None)
    .flatMap(lambda line: line['title'].split())
    .map(lambda word: (word, 1))
    .reduceByKey(lambda a, b: a + b)
    .repartition(1)
    # most frequent token first
    .sortBy(lambda pair: pair[1], False)
)
wordCountSchema = StructType([
    StructField("word", StringType(), True),
    StructField("count", IntegerType(), True),
])
wordCountDF = spark.createDataFrame(wordCount, wordCountSchema)
wordCountDF = wordCountDF.filter(wordCountDF["word"] != '')
wordCountDF.take(300)

[Row(word='-', count=12659),
 Row(word='|', count=12546),
 Row(word='the', count=8318),
 Row(word='2018', count=3748),
 Row(word='to', count=3740),
 Row(word='in', count=2948),
 Row(word='a', count=2800),
 Row(word='of', count=2670),
 Row(word='episode', count=2445),
 Row(word='&', count=2284),
 Row(word='with', count=2222),
 Row(word='on', count=2221),
 Row(word='and', count=2217),
 Row(word='for', count=1504),
 Row(word='is', count=1453),
 Row(word='vs', count=1430),
 Row(word='full', count=1395),
 Row(word='you', count=1268),
 Row(word='2017', count=1194),
 Row(word='game', count=1172),
 Row(word='new', count=1141),
 Row(word='trump', count=1037),
 Row(word='2', count=1030),
 Row(word='video)', count=973),
 Row(word='official', count=965),
 Row(word='i', count=944),
 Row(word='trailer', count=917),
 Row(word='at', count=897),
 Row(word='10', count=871),
 Row(word='my', count=835),
 Row(word='all', count=812),
 Row(word='(official', count=811),
 Row(word='music', count=801),
 Row(wor

### tags

In [141]:
from pyspark.sql.types import StringType, DoubleType, IntegerType, StructField, StructType
# Frequency of each individual lower-cased tag; a row's tags are separated by "|".
wordCount = (
    spark.sql("SELECT LOWER(tags) as tags from df")
    .rdd
    # tags is nullable; calling .split() on None would raise inside the Spark job
    .filter(lambda line: line['tags'] is not None)
    .flatMap(lambda line: line['tags'].split("|"))
    .map(lambda word: (word, 1))
    .reduceByKey(lambda a, b: a + b)
    .repartition(1)
    # most frequent tag first
    .sortBy(lambda pair: pair[1], False)
)
wordCountSchema = StructType([
    StructField("word", StringType(), True),
    StructField("count", IntegerType(), True),
])
wordCountDF = spark.createDataFrame(wordCount, wordCountSchema)
wordCountDF = wordCountDF.filter(wordCountDF["word"] != '')
wordCountDF.take(300)

[Row(word='"funny"', count=3890),
 Row(word='"comedy"', count=2953),
 Row(word='[none]', count=2384),
 Row(word='"news"', count=2180),
 Row(word='"politics"', count=1471),
 Row(word='"donald trump"', count=1345),
 Row(word='"music"', count=1153),
 Row(word='"2018"', count=1147),
 Row(word='"video"', count=1141),
 Row(word='"entertainment"', count=1123),
 Row(word='"trump"', count=1120),
 Row(word='"food"', count=1090),
 Row(word='"humor"', count=1033),
 Row(word='"review"', count=1013),
 Row(word='"talk show"', count=947),
 Row(word='"nba"', count=918),
 Row(word='"reaction"', count=918),
 Row(word='"interview"', count=884),
 Row(word='"television"', count=851),
 Row(word='"trailer"', count=848),
 Row(word='"rap"', count=839),
 Row(word='"drama"', count=828),
 Row(word='"family friendly"', count=775),
 Row(word='"2017"', count=767),
 Row(word='"comedian"', count=757),
 Row(word='"how to"', count=732),
 Row(word='"tv"', count=713),
 Row(word='"react"', count=697),
 Row(word='"basketball

In [24]:
# Full tags strings (not individual tags) ordered by how many rows share them.
tags = spark.sql(
    "SELECT tags, COUNT(*) as freq "
    "FROM df "
    "GROUP BY tags "
    "ORDER BY freq DESC"
)
tags.show(n=10)

+------------------------------+----+
|                          tags|freq|
+------------------------------+----+
|                        [none]|2378|
|          "the real"|"dayti...| 127|
|          "best vines 2018"...| 120|
|          "Vaani rani"|"ran...| 116|
|          "senegal"|"video"...|  85|
|          "hkayet tounsia"|...|  74|
|          "reaction time"|"...|  72|
|"江苏卫视"|"非诚勿扰"|"姜振...|  72|
|          "sab tv channel"|...|  70|
|          "The Late Show"|"...|  69|
+------------------------------+----+
only showing top 10 rows



In [41]:
# Show, untruncated, any rows whose tags value is the empty string.
df.filter("tags==''").show(truncate=False)

+-----------+-------------+-----+-------------+------------+----+-----+-----+--------+-------------+--------------+
|category_id|trending_date|title|channel_title|publish_time|tags|views|likes|dislikes|comment_count|category_title|
+-----------+-------------+-----+-------------+------------+----+-----+-----+--------+-------------+--------------+
+-----------+-------------+-----+-------------+------------+----+-----+-----+--------+-------------+--------------+



##### other data

In [45]:
# Rebind df to US_pre.csv; the first CSV row supplies the column names.
# A raw string keeps the literal backslash in the path (same runtime value as before)
# without relying on the invalid "\ " escape sequence, which newer Pythons warn about.
# NOTE(review): from here on df no longer refers to the CA data loaded at the top.
df = spark.read.csv(r'/Users/zhujinghong/Downloads/5003\ project/pre_data/US_pre.csv', header=True)

In [46]:
# Point the "df" SQL view at the DataFrame currently bound to df, replacing any earlier registration.
df.createOrReplaceTempView("df") 

In [48]:
from pyspark.sql.types import StringType, DoubleType, IntegerType, StructField, StructType
# Frequency of each individual lower-cased tag in the current df view; tags are "|"-separated.
wordCount = (
    spark.sql("SELECT LOWER(tags) as tags from df")
    .rdd
    # tags is nullable; calling .split() on None would raise inside the Spark job
    .filter(lambda line: line['tags'] is not None)
    .flatMap(lambda line: line['tags'].split("|"))
    .map(lambda word: (word, 1))
    .reduceByKey(lambda a, b: a + b)
    .repartition(1)
    # most frequent tag first
    .sortBy(lambda pair: pair[1], False)
)
wordCountSchema = StructType([
    StructField("word", StringType(), True),
    StructField("count", IntegerType(), True),
])
wordCountDF = spark.createDataFrame(wordCount, wordCountSchema)
wordCountDF = wordCountDF.filter(wordCountDF["word"] != '')
wordCountDF.take(300)

[Row(word='"funny"', count=4136),
 Row(word='"comedy"', count=3642),
 Row(word='"how to"', count=1700),
 Row(word='"music"', count=1659),
 Row(word='"pop"', count=1632),
 Row(word='[none]', count=1532),
 Row(word='"trailer"', count=1411),
 Row(word='"food"', count=1279),
 Row(word='"2018"', count=1271),
 Row(word='"news"', count=1253),
 Row(word='"review"', count=1238),
 Row(word='"science"', count=1237),
 Row(word='"makeup"', count=1236),
 Row(word='"humor"', count=1209),
 Row(word='"celebrity"', count=1130),
 Row(word='"diy"', count=1109),
 Row(word='"tutorial"', count=1054),
 Row(word='"video"', count=1029),
 Row(word='"television"', count=1021),
 Row(word='"live"', count=1017),
 Row(word='"interview"', count=988),
 Row(word='"vlog"', count=956),
 Row(word='"entertainment"', count=949),
 Row(word='"animation"', count=945),
 Row(word='"beauty"', count=922),
 Row(word='"movie"', count=908),
 Row(word='"official"', count=895),
 Row(word='"comedian"', count=877),
 Row(word='"cooking"', 

### publish_time

In [148]:
# Preview up to 300 (publish_time, time) rows.
# NOTE(review): in the visible notebook pubTimeDF is first assigned in the cell below
# (In [149]); on a fresh kernel this cell has to run after that one.
pubTimeDF.take(300)

[Row(publish_time='2018-03-27T16:08:41.000Z', time='16'),
 Row(publish_time='2018-04-14T19:53:33.000Z', time='19'),
 Row(publish_time='2018-05-02T17:43:56.000Z', time='17'),
 Row(publish_time='2018-02-21T00:55:41.000Z', time='00'),
 Row(publish_time='2018-01-29T03:11:56.000Z', time='03'),
 Row(publish_time='2018-03-27T13:00:02.000Z', time='13'),
 Row(publish_time='2018-05-29T05:37:33.000Z', time='05'),
 Row(publish_time='2018-02-04T09:16:55.000Z', time='09'),
 Row(publish_time='2018-01-24T16:22:24.000Z', time='16'),
 Row(publish_time='2017-12-25T04:29:43.000Z', time='04'),
 Row(publish_time='2017-12-13T06:52:20.000Z', time='06'),
 Row(publish_time='2018-01-19T17:00:05.000Z', time='17'),
 Row(publish_time='2018-04-03T04:52:48.000Z', time='04'),
 Row(publish_time='2018-03-18T11:47:57.000Z', time='11'),
 Row(publish_time='2017-12-01T18:41:10.000Z', time='18'),
 Row(publish_time='2017-12-29T18:30:00.000Z', time='18'),
 Row(publish_time='2018-01-06T06:23:43.000Z', time='06'),
 Row(publish_t

In [149]:
# Pair every publish_time string with characters 11-12 of it (the two-digit hour
# in values such as '2018-03-27T16:08:41.000Z').
pubTime = (
    spark.sql("SELECT publish_time as pub from df")
    .rdd.map(lambda row: row[0])
    # publish_time is nullable; slicing None would raise inside the Spark job
    .filter(lambda stamp: stamp is not None)
    .map(lambda stamp: (stamp, stamp[11:13]))
)
pubTimeSchema = StructType([
    StructField("publish_time", StringType(), True),
    StructField("time", StringType(), True),
])
pubTimeDF = spark.createDataFrame(pubTime, pubTimeSchema)
# Drop rows whose value was too short to contain an hour part.
pubTimeDF = pubTimeDF.filter(pubTimeDF["time"] != '')
pubTimeDF.createOrReplaceTempView("pubTimeDF")
# Rows per hour, busiest hour first. Grouped by time once (the key was previously listed
# twice), and built from adjacent literals instead of a backslash continuation inside the string.
time_distribution = spark.sql(
    "SELECT time, COUNT(*) as freq "
    "FROM pubTimeDF "
    "GROUP BY time "
    "ORDER BY freq DESC"
)
time_distribution.show()

+----+----+
|time|freq|
+----+----+
|  16|3988|
|  17|3206|
|  15|2679|
|  19|2595|
|  20|2595|
|  18|2346|
|  21|2330|
|  22|2284|
|  14|2194|
|  13|1730|
|  23|1698|
|  01|1441|
|  12|1361|
|  04|1325|
|  03|1324|
|  02|1314|
|  00|1265|
|  05|1062|
|  11|1023|
|  08| 690|
+----+----+
only showing top 20 rows

