In [1]:
import pyspark
from pyspark.sql import SparkSession
from pyspark.sql import functions as f
from pyspark.sql import types as t
from pyspark.sql.types import StructType, StructField, StringType, BooleanType, ArrayType

In [2]:
spark = (
    SparkSession
    .builder
    .appName("Youtube Video Stats Analysis")
    .master("local[*]")
    .config('spark.sql.adaptive.enabled', 'true')
    .getOrCreate()
)

24/05/13 14:06:25 WARN Utils: Your hostname, codespaces-0d4183 resolves to a loopback address: 127.0.0.1; using 172.16.5.4 instead (on interface eth0)
24/05/13 14:06:25 WARN Utils: Set SPARK_LOCAL_IP if you need to bind to another address
Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
24/05/13 14:06:27 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable
24/05/13 14:06:42 WARN GarbageCollectionMetrics: To enable non-built-in garbage collector(s) List(G1 Concurrent GC), users should configure it(them) to spark.eventLog.gcMetrics.youngGenerationGarbageCollectors or spark.eventLog.gcMetrics.oldGenerationGarbageCollectors


In [4]:
# Define the schema
json_schema = StructType([
    StructField("kind", StringType(), nullable=True),
    StructField("etag", StringType(), nullable=True),
    StructField("items", ArrayType(StructType([
        StructField("kind", StringType(), nullable=True),
        StructField("etag", StringType(), nullable=True),
        StructField("id", StringType(), nullable=True),
        StructField("snippet", StructType([
            StructField("channelId", StringType(), nullable=True),
            StructField("title", StringType(), nullable=True),
            StructField("assignable", BooleanType(), nullable=True)
        ]), nullable=True)
    ])), nullable=True)
])

In [7]:
ca_videos_df = (
    spark
    .read
    .option('header', 'true')
    .csv('../input_data/CAvideos.csv')
)

ca_category_csv_df = (
    spark.read
    .option("header", "true")
    .csv("../input_data/CA_category_id.csv")
)

# not rendering values for some unknown reason
ca_category_df = (
    spark
    .read
    .schema(json_schema)
    .json('../input_data/CA_category_id.json')
)
# Filter out corrupt records
# ca_category_clean_df = ca_category_df.filter(ca_category_df["_corrupt_record"].isNull())
# Cache or save parsed results
ca_category_df.cache()

24/05/13 14:22:00 WARN CacheManager: Asked to cache already cached data.


DataFrame[kind: string, etag: string, items: array<struct<kind:string,etag:string,id:string,snippet:struct<channelId:string,title:string,assignable:boolean>>>]

In [6]:
ca_videos_df.show(5)

+-----------+-------------+--------------------+-------------+-----------+--------------------+--------------------+--------+-------+--------+-------------+--------------------+-----------------+----------------+----------------------+--------------------+
|   video_id|trending_date|               title|channel_title|category_id|        publish_time|                tags|   views|  likes|dislikes|comment_count|      thumbnail_link|comments_disabled|ratings_disabled|video_error_or_removed|         description|
+-----------+-------------+--------------------+-------------+-----------+--------------------+--------------------+--------+-------+--------+-------------+--------------------+-----------------+----------------+----------------------+--------------------+
|n1WpP7iowLc|     17.14.11|Eminem - Walk On ...|   EminemVEVO|         10|2017-11-10T17:00:...|"Eminem"|"Walk"|"...|17158579| 787425|   43420|       125882|https://i.ytimg.c...|            False|           False|                 

In [10]:
ca_videos_df.printSchema()

root
 |-- video_id: string (nullable = true)
 |-- trending_date: string (nullable = true)
 |-- title: string (nullable = true)
 |-- channel_title: string (nullable = true)
 |-- category_id: string (nullable = true)
 |-- publish_time: string (nullable = true)
 |-- tags: string (nullable = true)
 |-- views: string (nullable = true)
 |-- likes: string (nullable = true)
 |-- dislikes: string (nullable = true)
 |-- comment_count: string (nullable = true)
 |-- thumbnail_link: string (nullable = true)
 |-- comments_disabled: string (nullable = true)
 |-- ratings_disabled: string (nullable = true)
 |-- video_error_or_removed: string (nullable = true)
 |-- description: string (nullable = true)



In [8]:
ca_category_csv_df.show(4)

+---+----------------+
| id|           title|
+---+----------------+
|  1|Film & Animation|
|  2|Autos & Vehicles|
| 10|           Music|
| 15|  Pets & Animals|
+---+----------------+
only showing top 4 rows



## Medium Level

In [12]:
# 1. Find the total number of videos in each category.
video_joined_df = (
    ca_videos_df.alias("video").join(
        ca_category_csv_df.alias("category")
        ,on = (f.col("video.category_id")==f.col("category.id"))
        ,how = "left"
    )
    .selectExpr("video.*", "category.title as category_title")
)

In [15]:
video_count_per_category_df = (
    video_joined_df
    .groupBy('category_title')
    .agg(f.count(f.col('*')).alias('video_count'))
    .orderBy(f.desc('video_count'))
)

video_count_per_category_df.show()

[Stage 15:>                                                         (0 + 2) / 2]

+--------------------+-----------+
|      category_title|video_count|
+--------------------+-----------+
|       Entertainment|      13451|
|                NULL|       4753|
|     News & Politics|       4159|
|      People & Blogs|       4105|
|              Comedy|       3773|
|               Music|       3731|
|              Sports|       2787|
|    Film & Animation|       2060|
|       Howto & Style|       2007|
|              Gaming|       1344|
|Science & Technology|       1155|
|           Education|        991|
|     Travel & Events|        392|
|      Pets & Animals|        369|
|    Autos & Vehicles|        353|
|               Shows|        124|
|              Movies|          6|
+--------------------+-----------+



                                                                                

In [18]:
# 2. Calculate the average number of views, likes, dislikes, and comments for each channel.
avg_likes_dislikes_comment_df = (
    video_joined_df
    .groupBy("channel_title")
    .agg(f.avg('likes').alias("likes_count")
         , f.avg('dislikes').alias("dislikes_count")
         , f.avg('comment_count').alias('comment_count')
        )
    .orderBy(f.desc("likes_count"))
)
avg_likes_dislikes_comment_df.show()



+--------------------+------------------+------------------+------------------+
|       channel_title|       likes_count|    dislikes_count|     comment_count|
+--------------------+------------------+------------------+------------------+
|             ibighit|2645256.5263157897|           48049.0|481934.36842105264|
|           DrakeVEVO|         2027493.0|           22755.5|133422.33333333334|
| ChildishGambinoVEVO|         1824239.5|           86243.5|         193142.75|
|    ArianaGrandeVevo|         1616078.0|53161.666666666664|155179.16666666666|
|        David Dobrik|1426158.8333333333| 67102.16666666667|          234461.5|
|   YouTube Spotlight|1298430.3846153845| 684501.5384615385| 415868.3076923077|
|     TaylorSwiftVEVO|         1218075.4|64185.666666666664|109415.86666666667|
|          Bruno Mars|1211541.6666666667|39243.333333333336|           89598.0|
|       LuisFonsiVEVO| 894564.6666666666|38091.666666666664|52159.444444444445|
|           Bad Bunny|          886007.0

                                                                                

In [24]:
# 3. Identify the top 10 trending videos based on the highest number of views.
top_trending_liked_videos_df = (
    video_joined_df
    .orderBy(f.desc("views"))
    .limit(10)
)
top_trending_liked_videos_df.show(3, truncate=False)

[Stage 41:>                                                         (0 + 2) / 2]

+----------------------+---------------------------------------+----------------------+--------------------+-------------------------------------+------------------------------------+------------------------------------+-----------------------------------------+---------------------------------------+------------------------------------------+---------------------------------------+---------------------------------------------+-----------------------------------------------+-----------------------+----------------------+-----------+--------------+
|video_id              |trending_date                          |title                 |channel_title       |category_id                          |publish_time                        |tags                                |views                                    |likes                                  |dislikes                                  |comment_count                          |thumbnail_link                               |comments_disabl

                                                                                

In [47]:
# 4. Investigate the distribution of trending videos over different days of the week
# video_joined_df.show(2)
split_date = f.split(video_joined_df["trending_date"], "\\.")
new_date_format = f.concat_ws("-", split_date[0], split_date[2], split_date[1])

trending_video_over_day_of_week_df = (
    video_joined_df
    .withColumn("day_of_week_new1", f.to_date(f.col('trending_date'), "yy.MM.dd"))
    .withColumn("trending_date_new2", f.to_date(new_date_format, "yy-MM-dd"))
    .withColumn("trending_day_of_week", f.date_format(f.col("trending_date_new2"), "EEEE"))
    .groupBy('trending_day_of_week')
    .agg(f.count('video_id').alias('trending_videos_count'))
    .orderBy(f.desc('trending_videos_count'))
    
)
trending_video_over_day_of_week_df.show()

+--------------------+---------------------+
|trending_day_of_week|trending_videos_count|
+--------------------+---------------------+
|            Saturday|                 5990|
|             Tuesday|                 5988|
|              Sunday|                 5794|
|              Friday|                 5787|
|            Thursday|                 5783|
|           Wednesday|                 5780|
|              Monday|                 5759|
|                NULL|                 4679|
+--------------------+---------------------+



[Stage 71:>                                                         (0 + 2) / 2]                                                                                

In [37]:
new_date_format.show()

TypeError: 'Column' object is not callable