In [1]:
import pyspark
from pyspark.sql.functions import *
from pyspark.sql.types import *
from pyspark.sql import SparkSession

import pandas as pd
import random


In [79]:
# Local single-node Spark session for this notebook.
# NOTE(review): a broadcast threshold of 0 and disabled adaptive execution
# presumably keep Spark from rewriting the joins studied below — TODO confirm.
spark = (
    SparkSession.builder
    .master("local")
    .config('spark.sql.autoBroadcastJoinThreshold', 0)
    .config('spark.sql.adaptive.enabled', 'false')
    .getOrCreate()
)

In [80]:
# Read the videos CSV: first line is the header, column types are inferred by Spark.
videos = (
    spark.read
    .option('header', 'true')
    .option("inferSchema", "true")
    .csv('../datasets/USvideos.csv')
)
videos.show()

+-----------+--------------------+--------------------+-----------+--------------------+-------+------+--------+-------------+--------------------+-----+
|   video_id|               title|       channel_title|category_id|                tags|  views| likes|dislikes|comment_total|      thumbnail_link| date|
+-----------+--------------------+--------------------+-----------+--------------------+-------+------+--------+-------------+--------------------+-----+
|XpVt6Z1Gjjo|1 YEAR OF VLOGGIN...|    Logan Paul Vlogs|         24|logan paul vlog|l...|4394029|320053|    5931|        46245|https://i.ytimg.c...|13.09|
|K4wEI5zhHB0|iPhone X — Introd...|               Apple|         28|Apple|iPhone 10|i...|7860119|185853|   26679|            0|https://i.ytimg.c...|13.09|
|cLdxuaxaQwc|         My Response|           PewDiePie|         22|              [none]|5845909|576597|   39774|       170708|https://i.ytimg.c...|13.09|
|WYYvHb03Eog|Apple iPhone X fi...|           The Verge|         28|apple iph

In [81]:
videos.count()

7998

In [82]:
videos.drop_duplicates().count()

7997

In [83]:
# Drop fully identical rows; per the counts shown above this removes one row (7998 -> 7997).
videos = videos.drop_duplicates()

In [84]:
videos.select("video_id").drop_duplicates().count()

2364

In [85]:
# Explicit schema for the comments CSV; every field is declared nullable.
comments_schema = StructType([
    StructField("video_id", StringType(), True),
    StructField("comment_text", StringType(), True),
    StructField("likes", IntegerType(), True),
    StructField("replies", IntegerType(), True),
])

# DROPMALFORMED tells the reader to discard rows that do not fit the schema.
comments = (
    spark.read
    .option('header', 'true')
    .option("mode", "DROPMALFORMED")
    .schema(comments_schema)
    .csv('../datasets/UScomments.csv')
)
comments.show()

+-----------+--------------------+-----+-------+
|   video_id|        comment_text|likes|replies|
+-----------+--------------------+-----+-------+
|XpVt6Z1Gjjo|Logan Paul it's y...|    4|      0|
|XpVt6Z1Gjjo|I've been followi...|    3|      0|
|XpVt6Z1Gjjo|Say hi to Kong an...|    3|      0|
|XpVt6Z1Gjjo| MY FAN . attendance|    3|      0|
|XpVt6Z1Gjjo|         trending 😉|    3|      0|
|XpVt6Z1Gjjo|#1 on trending AY...|    3|      0|
|XpVt6Z1Gjjo|The end though 😭...|    4|      0|
|XpVt6Z1Gjjo|#1 trending!!!!!!!!!|    3|      0|
|XpVt6Z1Gjjo|Happy one year vl...|    3|      0|
|XpVt6Z1Gjjo|You and your shit...|    0|      0|
|XpVt6Z1Gjjo|There should be a...|    0|      0|
|XpVt6Z1Gjjo|Dear Logan, I rea...|    0|      0|
|XpVt6Z1Gjjo|Honestly Evan is ...|    0|      0|
|XpVt6Z1Gjjo|Casey is still be...|    0|      0|
|XpVt6Z1Gjjo|aw geez rick this...|    0|      0|
|XpVt6Z1Gjjo|He happy cause he...|    0|      0|
|XpVt6Z1Gjjo|Ayyyyoooo Logang ...|    1|      0|
|XpVt6Z1Gjjo|Bro y did

In [86]:
# Directory used below for the category JSON input and for the CSV outputs.
DATA_PATH = "../datasets"

In [87]:
def rand(buckets=4):
    """Return a random salt index between 0 and ``buckets - 1`` inclusive.

    NOTE(review): this name shadows ``pyspark.sql.functions.rand`` brought in
    by the star import at the top of the notebook.

    Args:
        buckets: Number of distinct salt values. Defaults to 4, so a call
            without arguments yields 0, 1, 2 or 3 exactly as before.

    Returns:
        int: An integer drawn uniformly from ``[0, buckets - 1]``.

    Raises:
        ValueError: If ``buckets`` is smaller than 1.
    """
    if buckets < 1:
        raise ValueError("buckets must be a positive integer")
    return random.randint(0, buckets - 1)
    
# The salt is random, so the UDF must be flagged as non-deterministic: Spark
# assumes UDFs are deterministic by default and may otherwise eliminate or
# repeat invocations while optimising the plan.
rand_udf = udf(rand).asNondeterministic()
# One row per salt value (ids 0..3); cross-joined with the cat videos further down.
salt_df = spark.range(0, 4)


In [88]:
# Per-video totals of comment likes and replies (columns 'sum(likes)' and 'sum(replies)').
comments_aggregated = comments.groupBy('video_id').sum('likes', 'replies')


In [89]:
comments_aggregated.columns

['video_id', 'sum(likes)', 'sum(replies)']

In [90]:
# Left join so that videos without any aggregated comments are kept.
videos_with_comments = videos.join(comments_aggregated, on="video_id", how="left")

# task 1

score = (1/100) * views + 10 * likes - 15 * dislikes + sqrt(comment_total) + 0.5 * sum(likes) + 3 * sum(replies)

In [91]:
# Score formula from the task statement, one term per line.
score_expr = (
    0.01 * col('views')
    + 10 * col('likes')
    - 15 * col('dislikes')
    + sqrt(col('comment_total'))
    + 0.5 * col('sum(likes)')
    + 3 * col('sum(replies)')
)

# fillna(0) runs first so that missing numeric values do not turn the score into null.
scored_videos = videos_with_comments.fillna(0).withColumn('score', score_expr)

In [92]:
scored_videos.select('video_id', 'title', 'views', 'likes', 'dislikes', 'comment_total', 'score').show(10)

+-----------+--------------------+-------+------+--------+-------------+-------------------+
|   video_id|               title|  views| likes|dislikes|comment_total|              score|
+-----------+--------------------+-------+------+--------+-------------+-------------------+
|4yCkkOvIkUI|EXCLUSIVE: Zonniq...|   4937|    19|      21|           12|-42.165898384862246|
|4yCkkOvIkUI|EXCLUSIVE: Zonniq...|   5662|    33|      21|           13|   105.225551275464|
|4yCkkOvIkUI|EXCLUSIVE: Zonniq...|   2306|     7|       1|            0|             108.06|
|7TN09IP5JuI|Terry Crews Hallu...|3866789|168343|    1750|        24438| 1696021.2165812328|
|7TN09IP5JuI|Terry Crews Hallu...|4496301|182221|    1910|        25963| 1838701.1403819893|
|7TN09IP5JuI|Terry Crews Hallu...|2513012|130944|    1101|        19133| 1318210.4420878964|
|7TN09IP5JuI|Terry Crews Hallu...|5066207|196868|    2083|        22920| 1988265.4635269423|
|7TN09IP5JuI|Terry Crews Hallu...|4842696|190716|    2011|        2224

In [93]:
# Add up the scores of all rows that share the same video_id and title.
scored_videos_agg = (
    scored_videos
    .groupBy('video_id', 'title')
    .sum('score')
)

In [95]:
scored_videos_agg.show(10)

+-----------+--------------------+------------------+
|   video_id|               title|        sum(score)|
+-----------+--------------------+------------------+
|4yCkkOvIkUI|EXCLUSIVE: Zonniq...|171.11965289060174|
|7TN09IP5JuI|Terry Crews Hallu...|   8766786.3768606|
|Bo-qp-Zu0OY|Meeting Talking D...|2056.5172255750517|
|JkqTeQHFoBY|Guardians of the ...| 53506.11910356655|
|K7pQsR8WFSo|Schlieren Imaging...|1113704.7444387542|
|RE-far-FvRs|PUPPIES FIRST BAT...| 629923.5713838206|
|TzyraAp3jaY|Martin Scorsese T...|31533.503134435457|
|WQjO1mMCPg4|EVERY MCDONALD'S ...| 2730754.823610126|
|_r5eTelhpmQ|Darius Rucker - L...| 79841.71751014146|
|aRgTLb5EbiQ| My Mama Wears Timbs| 94809.52876149536|
+-----------+--------------------+------------------+
only showing top 10 rows



In [96]:
# Collect the aggregate to the driver as a pandas DataFrame, then write it as CSV.
scored_videos_pdf = scored_videos_agg.toPandas()
scored_videos_pdf.to_csv(f'{DATA_PATH}/scored_videos.csv')

# task 2

In [97]:
@pandas_udf("double", PandasUDFType.GROUPED_AGG)
def median_udf(v):
    """Grouped-aggregate pandas UDF that reduces one group's values to their median.

    Args:
        v: The values of the aggregated column for a single group.

    Returns:
        The median of ``v``; the UDF is declared to Spark as returning ``double``.
    """
    group_median = v.median()
    return group_median



In [98]:
# Category metadata as a pandas DataFrame; the next cell reads its 'items' column.
category_id = pd.read_json(f'{DATA_PATH}/US_category_id.json')


In [99]:
# Build (id, title) pairs from every entry of the 'items' column.
categories = []
for item in category_id['items'].to_list():
    categories.append((item['id'], item['snippet']['title']))

In [100]:
# Turn the (id, title) pairs into a Spark lookup table.
columns = ["category_id", "title"]
category_df = spark.createDataFrame(categories, schema=columns)

In [102]:
# Median score per category, computed by the grouped-aggregate pandas UDF.
categories_score = (
    scored_videos
    .groupBy('category_id')
    .agg(median_udf(scored_videos['score']))
)

In [103]:
# Attach the category title; the left join keeps categories absent from the lookup table.
categories_score = categories_score.join(category_df, on="category_id", how="left")

In [104]:
categories_score.show()

+-----------+------------------+--------------------+
|category_id| median_udf(score)|               title|
+-----------+------------------+--------------------+
|         28|102903.86358260454|Science & Technology|
|         26|126200.46018588994|       Howto & Style|
|         27| 79998.44713047001|           Education|
|         22|101496.21589471283|      People & Blogs|
|          1| 66313.44580448601|    Film & Animation|
|         20| 40213.25209422822|              Gaming|
|         19| 82802.32745776343|     Travel & Events|
|         15| 76319.85694380733|      Pets & Animals|
|         43|1195.0077625302981|               Shows|
|         17|16948.421900349353|              Sports|
|         23|252882.44692637963|              Comedy|
|         10|107185.15181449405|               Music|
|         25| 6684.407448713916|     News & Politics|
|         24| 85696.83696125855|       Entertainment|
|         29| 5696.682886552931|Nonprofits & Acti...|
|          2| 13876.75154961

In [105]:
# Collect the per-category medians to the driver and write them as CSV.
categories_score_pdf = categories_score.toPandas()
categories_score_pdf.to_csv(f'{DATA_PATH}/categories_score.csv')

# task 3

In [107]:
videos.printSchema()

root
 |-- video_id: string (nullable = true)
 |-- title: string (nullable = true)
 |-- channel_title: string (nullable = true)
 |-- category_id: integer (nullable = true)
 |-- tags: string (nullable = true)
 |-- views: integer (nullable = true)
 |-- likes: integer (nullable = true)
 |-- dislikes: integer (nullable = true)
 |-- comment_total: integer (nullable = true)
 |-- thumbnail_link: string (nullable = true)
 |-- date: string (nullable = true)



In [111]:
import timeit

@pandas_udf("array<string>", PandasUDFType.SCALAR)
def split_udf(v):
    """Scalar pandas UDF that splits every string of ``v`` on the ``|`` separator.

    Args:
        v: Batch of strings; the code uses its ``.str`` accessor.

    Returns:
        The per-element lists of substrings, declared to Spark as ``array<string>``.
    """
    tag_lists = v.str.split("|")
    return tag_lists

In [113]:
# NOTE(review): split_udf(col("tags")) only builds a Column expression and no
# Spark action runs inside the loop, so this presumably times expression
# construction rather than the UDF itself — confirm the intended benchmark.
%timeit for x in range(100): split_udf(col("tags"))

407 ms ± 43.6 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)


In [115]:
videos.select(split_udf(col("tags"))).show()

+--------------------+
|     split_udf(tags)|
+--------------------+
|[America's Got Ta...|
|[alien, alien cov...|
|            [[none]]|
|[mexico, earthqua...|
|[adventure, adven...|
|[Access Hollywood...|
|[penn state nitta...|
|[taking everythin...|
|[administracja, l...|
|[fox, fox sports,...|
|[What What Happen...|
|[kungs, more mess...|
|[iphone x by pine...|
|[jeffree star, ve...|
|[iphone 8, iphone...|
|[fleurdeforce, fl...|
|[cars, climbkhana...|
|[Shay Mitchell, S...|
|[a24, a24 films, ...|
|[Shania Twain, Sh...|
+--------------------+
only showing top 20 rows



In [120]:
from pyspark.sql.column import _to_java_column
from pyspark.sql.column import _to_seq

sc = spark.sparkContext


def splitTagsUDFWrapper(row):
    """Apply the JVM-side ``CustomUDFs.splitTagsUDF`` to a column.

    Args:
        row: The argument converted with ``_to_java_column`` and passed to the
            JVM UDF (despite the name it is used as a column, e.g. ``col("tags")``).

    Returns:
        Column: Python wrapper around the Java column produced by the UDF.
    """
    jvm_split_tags = sc._jvm.CustomUDFs.splitTagsUDF()
    java_args = _to_seq(sc, [row], _to_java_column)
    return Column(jvm_split_tags.apply(java_args))

In [123]:
# NOTE(review): the wrapper only builds a Column expression and no Spark action
# runs inside the loop, so this presumably times expression construction
# rather than the JVM UDF itself — confirm the intended benchmark.
%timeit for x in range(100): splitTagsUDFWrapper(col("tags"))

447 ms ± 38.5 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)


In [124]:
map = videos.select(split_udf(col("tags"))).rdd.flatMap(lambda x: 
                                                        x).flatMap(lambda x: x).countByValue().items()

In [125]:
popular_tags = spark.createDataFrame(data=map, schema = ["tag", "count"])
popular_tags.printSchema()
popular_tags.show()

root
 |-- tag: string (nullable = true)
 |-- count: long (nullable = true)

+--------------------+-----+
|                 tag|count|
+--------------------+-----+
|America's Got Tal...|   13|
|america's got talent|   16|
|america's got tal...|   13|
|america's got tal...|   13|
|america's got tal...|   13|
|                 AGT|   13|
|  AGT 2017 auditions|   13|
|  AGT best auditions|   13|
|                 NBC|  118|
|                  TV|   49|
|            TV Shows|   13|
|          Highlights|   21|
|            Previews|   13|
|        Simon Cowell|   13|
|        Howie Mandel|   13|
|          Tyra Banks|   13|
|          Heidi Klum|   14|
|               Mel B|   13|
|           season 12|   50|
|           America's|   13|
+--------------------+-----+
only showing top 20 rows



In [126]:
# Collect the tag counts to the driver and write them as CSV.
popular_tags_pdf = popular_tags.toPandas()
popular_tags_pdf.to_csv(f'{DATA_PATH}/popular_tags.csv')

# task 4

In [127]:
# A video counts as a cat video when one of its '|'-separated tags is exactly
# "cat" (case-insensitive). The tag may be the only one, the first, somewhere
# in the middle, or the last; the "only tag" case was previously missed.
tags_lower = lower(col('tags'))
cat_tags = (
    (tags_lower == 'cat')
    | tags_lower.startswith('cat|')
    | tags_lower.contains('|cat|')
    | tags_lower.endswith('|cat')
)

# Keep only videos that report at least one comment.
has_comments = (col('comment_total') > 0)

cat_video = videos.filter(cat_tags & has_comments)

In [128]:
cat_video.select('tags').show(10)

+--------------------+
|                tags|
+--------------------+
|cats|cat|kittens|...|
|cat|dog|cute|gami...|
|cartoon|simons ca...|
|cartoon|simons ca...|
|colleen ballinger...|
|Husky's First How...|
|cat|dog|cute|gami...|
|cartoon|simons ca...|
|cartoon|simons ca...|
|cat|dog|cute|gami...|
+--------------------+
only showing top 10 rows



In [129]:
# Append "_<salt>" to each comment's video_id; the salt comes from rand_udf.
salt_suffix = rand_udf()
salted_comments = comments.withColumn(
    "salted_video_id",
    concat("video_id", lit("_"), salt_suffix),
)

In [130]:
salted_comments.show(10)

+-----------+--------------------+-----+-------+---------------+
|   video_id|        comment_text|likes|replies|salted_video_id|
+-----------+--------------------+-----+-------+---------------+
|XpVt6Z1Gjjo|Logan Paul it's y...|    4|      0|  XpVt6Z1Gjjo_2|
|XpVt6Z1Gjjo|I've been followi...|    3|      0|  XpVt6Z1Gjjo_1|
|XpVt6Z1Gjjo|Say hi to Kong an...|    3|      0|  XpVt6Z1Gjjo_3|
|XpVt6Z1Gjjo| MY FAN . attendance|    3|      0|  XpVt6Z1Gjjo_3|
|XpVt6Z1Gjjo|         trending 😉|    3|      0|  XpVt6Z1Gjjo_1|
|XpVt6Z1Gjjo|#1 on trending AY...|    3|      0|  XpVt6Z1Gjjo_3|
|XpVt6Z1Gjjo|The end though 😭...|    4|      0|  XpVt6Z1Gjjo_3|
|XpVt6Z1Gjjo|#1 trending!!!!!!!!!|    3|      0|  XpVt6Z1Gjjo_3|
|XpVt6Z1Gjjo|Happy one year vl...|    3|      0|  XpVt6Z1Gjjo_0|
|XpVt6Z1Gjjo|You and your shit...|    0|      0|  XpVt6Z1Gjjo_3|
+-----------+--------------------+-----+-------+---------------+
only showing top 10 rows



In [131]:
# Replicate every cat video once per salt value so that each salted comment key
# has a matching row on the video side.
salted_cat_video = (
    cat_video
    .join(salt_df, how="cross")
    .withColumn("salted_video_id", concat("video_id", lit("_"), "id"))
    .drop("id")
)

# On the comment side, rename 'likes' and drop 'video_id' so they do not clash
# with the video columns of the same name.
comments_for_join = salted_comments.withColumnRenamed('likes', 'comment_likes').drop('video_id')
salted_joined_df = salted_cat_video.join(comments_for_join, on="salted_video_id", how="inner")


In [132]:
salted_joined_df.orderBy(col('comment_likes').desc()).show(20)

+---------------+-----------+--------------------+-------------+-----------+--------------------+------+-----+--------+-------------+--------------------+-----+--------------------+-------------+-------+
|salted_video_id|   video_id|               title|channel_title|category_id|                tags| views|likes|dislikes|comment_total|      thumbnail_link| date|        comment_text|comment_likes|replies|
+---------------+-----------+--------------------+-------------+-----------+--------------------+------+-----+--------+-------------+--------------------+-----+--------------------+-------------+-------+
|  xbBMVa2A68s_3|xbBMVa2A68s|Cat vs Dog - Best...|      TierZoo|         20|cat|dog|cute|gami...|320175|19593|     608|         4904|https://i.ytimg.c...|17.10|The second I read...|         2355|     15|
|  xbBMVa2A68s_3|xbBMVa2A68s|Cat vs Dog - Best...|      TierZoo|         20|cat|dog|cute|gami...|370320|21325|     694|         5356|https://i.ytimg.c...|20.10|The second I read...|   

In [133]:
salted_joined_df.orderBy(col('comment_likes').desc()).select('comment_total', 'thumbnail_link').show(20)

+-------------+--------------------+
|comment_total|      thumbnail_link|
+-------------+--------------------+
|         5202|https://i.ytimg.c...|
|         5293|https://i.ytimg.c...|
|         5356|https://i.ytimg.c...|
|         4904|https://i.ytimg.c...|
|         5412|https://i.ytimg.c...|
|         4904|https://i.ytimg.c...|
|         5356|https://i.ytimg.c...|
|         5202|https://i.ytimg.c...|
|         5412|https://i.ytimg.c...|
|         5293|https://i.ytimg.c...|
|         4904|https://i.ytimg.c...|
|         5356|https://i.ytimg.c...|
|         5202|https://i.ytimg.c...|
|         5412|https://i.ytimg.c...|
|         5293|https://i.ytimg.c...|
|         4904|https://i.ytimg.c...|
|         5356|https://i.ytimg.c...|
|         5202|https://i.ytimg.c...|
|         5412|https://i.ytimg.c...|
|         5293|https://i.ytimg.c...|
+-------------+--------------------+
only showing top 20 rows



В данных есть полудубликаты, которые всё же различаются по полям thumbnail_link и comment_total, так что для финального результата я их дропаю.

In [134]:
# Final answer: keep one row per comment text, then show the five most liked comments.
(
    salted_joined_df
    .drop_duplicates(['comment_text'])
    .orderBy(col('comment_likes').desc())
    .select('video_id', 'comment_text', 'comment_likes', 'replies')
    .show(5)
)

+-----------+--------------------+-------------+-------+
|   video_id|        comment_text|comment_likes|replies|
+-----------+--------------------+-------------+-------+
|xbBMVa2A68s|The second I read...|         2355|     15|
|-1fzGnFwz9M|I make interestin...|          839|      5|
|xbBMVa2A68s|talk about the oc...|          802|     27|
|tp9aQXDFHbY|Make sure to chec...|          304|     38|
|tp9aQXDFHbY|If Simon will be ...|           37|      1|
+-----------+--------------------+-------------+-------+
only showing top 5 rows

