In [42]:
import pyspark
from pyspark.sql.functions import *
from pyspark.sql.types import *
from pyspark.sql import SparkSession

import pandas as pd
import random


In [94]:
spark = SparkSession.builder.master("local") \
    .config('spark.sql.autoBroadcastJoinThreshold', 0) \
    .config('spark.sql.adaptive.enabled', 'false') \
    .getOrCreate()

In [95]:
videos = spark.read.option('header', 'true').option("inferSchema", "true").csv('../datasets/USvideos.csv')
videos.show()

+-----------+--------------------+--------------------+-----------+--------------------+-------+------+--------+-------------+--------------------+-----+
|   video_id|               title|       channel_title|category_id|                tags|  views| likes|dislikes|comment_total|      thumbnail_link| date|
+-----------+--------------------+--------------------+-----------+--------------------+-------+------+--------+-------------+--------------------+-----+
|XpVt6Z1Gjjo|1 YEAR OF VLOGGIN...|    Logan Paul Vlogs|         24|logan paul vlog|l...|4394029|320053|    5931|        46245|https://i.ytimg.c...|13.09|
|K4wEI5zhHB0|iPhone X — Introd...|               Apple|         28|Apple|iPhone 10|i...|7860119|185853|   26679|            0|https://i.ytimg.c...|13.09|
|cLdxuaxaQwc|         My Response|           PewDiePie|         22|              [none]|5845909|576597|   39774|       170708|https://i.ytimg.c...|13.09|
|WYYvHb03Eog|Apple iPhone X fi...|           The Verge|         28|apple iph

In [96]:
comments_schema = StructType([ \
    StructField("video_id", StringType(), True), \
    StructField("comment_text", StringType(), True), \
    StructField("likes", IntegerType(), True), \
    StructField("replies", IntegerType(), True)])
comments = spark.read.option('header', 'true').option("mode", "DROPMALFORMED").schema(comments_schema).csv('../datasets/UScomments.csv')
comments.show()

+-----------+--------------------+-----+-------+
|   video_id|        comment_text|likes|replies|
+-----------+--------------------+-----+-------+
|XpVt6Z1Gjjo|Logan Paul it's y...|    4|      0|
|XpVt6Z1Gjjo|I've been followi...|    3|      0|
|XpVt6Z1Gjjo|Say hi to Kong an...|    3|      0|
|XpVt6Z1Gjjo| MY FAN . attendance|    3|      0|
|XpVt6Z1Gjjo|         trending 😉|    3|      0|
|XpVt6Z1Gjjo|#1 on trending AY...|    3|      0|
|XpVt6Z1Gjjo|The end though 😭...|    4|      0|
|XpVt6Z1Gjjo|#1 trending!!!!!!!!!|    3|      0|
|XpVt6Z1Gjjo|Happy one year vl...|    3|      0|
|XpVt6Z1Gjjo|You and your shit...|    0|      0|
|XpVt6Z1Gjjo|There should be a...|    0|      0|
|XpVt6Z1Gjjo|Dear Logan, I rea...|    0|      0|
|XpVt6Z1Gjjo|Honestly Evan is ...|    0|      0|
|XpVt6Z1Gjjo|Casey is still be...|    0|      0|
|XpVt6Z1Gjjo|aw geez rick this...|    0|      0|
|XpVt6Z1Gjjo|He happy cause he...|    0|      0|
|XpVt6Z1Gjjo|Ayyyyoooo Logang ...|    1|      0|
|XpVt6Z1Gjjo|Bro y did

In [97]:
DATA_PATH = "../datasets"

# task 1

In [98]:
videos.columns

['video_id',
 'title',
 'channel_title',
 'category_id',
 'tags',
 'views',
 'likes',
 'dislikes',
 'comment_total',
 'thumbnail_link',
 'date']

score = (1/100) * views + 10 * likes - 15 * dislikes + sqrt(comment_total)

In [99]:
scored_videos = videos.withColumn('score', 0.01 * col('views') + 10 * col('likes') 
                           - 15 * col('dislikes') + sqrt(col('comment_total')))

In [100]:
scored_videos.select('video_id', 'title', 'views', 'likes', 'dislikes', 'comment_total', 'score').show(10)

+-----------+--------------------+-------+------+--------+-------------+------------------+
|   video_id|               title|  views| likes|dislikes|comment_total|             score|
+-----------+--------------------+-------+------+--------+-------------+------------------+
|XpVt6Z1Gjjo|1 YEAR OF VLOGGIN...|4394029|320053|    5931|        46245| 3155720.336506598|
|K4wEI5zhHB0|iPhone X — Introd...|7860119|185853|   26679|            0|        1536946.19|
|cLdxuaxaQwc|         My Response|5845909|576597|   39774|       170708| 5228232.258246602|
|WYYvHb03Eog|Apple iPhone X fi...|2642103| 24975|    4542|        12829| 208154.2951755837|
|sjlHnJvXdQs|   iPhone X (parody)|1168130| 96666|     568|         6666| 969902.9455755078|
|cMKX2tE5Luk|The Disaster Arti...|1311445| 34507|     544|         3040|350079.58619500836|
|8wNr-NQImFg|The Check In: HUD...| 666169|  9985|     297|         1071| 102089.4161363439|
|_HTXMhKWqnA|iPhone X Impressi...|1728614| 74062|    2180|        15297| 725329.

In [101]:
scored_videos.toPandas().to_csv(f'{DATA_PATH}/scored_videos.csv')

# task 2

In [102]:
@pandas_udf("double", PandasUDFType.GROUPED_AGG)
def median_udf(v):
    return v.median()



In [105]:
categories_score = scored_videos.groupBy('category_id').agg(median_udf(scored_videos['score']))

In [106]:
categories_score.show()

+-----------+------------------+
|category_id| median_udf(score)|
+-----------+------------------+
|         28|102567.77283471046|
|         26|126031.98011930188|
|         27| 79894.36059730663|
|         22|101447.21589471283|
|          1|  65593.8399499765|
|         20| 39963.75209422822|
|         19| 82743.32745776343|
|         15| 74935.35694380733|
|         43|1088.5077625302981|
|         17| 16810.32820955084|
|         23|252648.44692637963|
|         10|106786.23916534703|
|         25| 5973.396500846582|
|         24|  84878.1513075057|
|         29| 5566.432886552931|
|          2| 13116.75154961165|
+-----------+------------------+



In [107]:
categories_score.toPandas().to_csv(f'{DATA_PATH}/categories_score.csv')

не id, он непонятный для аналитиков! -- я не совсем понял, чего хотелось вместо id здесь, так что оставил id

# task 3

In [108]:
videos.printSchema()

root
 |-- video_id: string (nullable = true)
 |-- title: string (nullable = true)
 |-- channel_title: string (nullable = true)
 |-- category_id: integer (nullable = true)
 |-- tags: string (nullable = true)
 |-- views: integer (nullable = true)
 |-- likes: integer (nullable = true)
 |-- dislikes: integer (nullable = true)
 |-- comment_total: integer (nullable = true)
 |-- thumbnail_link: string (nullable = true)
 |-- date: string (nullable = true)



In [109]:
@pandas_udf("array<string>", PandasUDFType.SCALAR)
def split_udf(v):
    return v.str.split("|")



In [110]:
videos.select(split_udf(col("tags"))).show()

+--------------------+
|     split_udf(tags)|
+--------------------+
|[logan paul vlog,...|
|[Apple, iPhone 10...|
|            [[none]]|
|[apple iphone x h...|
|[jacksfilms, paro...|
|[a24, a24 films, ...|
|[Late night, Seth...|
|[iPhone X, iphone...|
|[Roman Atwood, Ro...|
|[screenjunkies, s...|
|[Collegehumor, CH...|
|[best floyd maywe...|
|[The Tonight Show...|
|[mtv, video, onli...|
|[America's Got Ta...|
|[Apple, iPhone X,...|
|[panda, what shou...|
|[skit, korean, la...|
|[how to, cooking,...|
|[downsizing, prev...|
+--------------------+
only showing top 20 rows



In [111]:
videos.select(splitTagsUDF(col("tags"))).show()

NameError: name 'splitTagsUDF' is not defined

In [112]:
map = videos.select(split_udf(col("tags"))).rdd.flatMap(lambda x: 
                                                        x).flatMap(lambda x: x).countByValue().items()

In [113]:
popular_tags = spark.createDataFrame(data=map, schema = ["tag", "count"])
popular_tags.printSchema()
popular_tags.show()

root
 |-- tag: string (nullable = true)
 |-- count: long (nullable = true)

+--------------------+-----+
|                 tag|count|
+--------------------+-----+
|     logan paul vlog|   13|
|          logan paul|   29|
|               logan|   29|
|                paul|   25|
|            olympics|   14|
|  logan paul youtube|   13|
|                vlog|  273|
|               daily|   73|
|              comedy|  572|
|           hollywood|  150|
|              parrot|   13|
|            maverick|   18|
|                bird|   13|
|    maverick clothes|   13|
| diamond play button|    7|
|logan paul diamon...|    7|
|     10M subscribers|    7|
|logan paul 1 year...|    7|
|         1 year vlog|    7|
|dwarf mamba play ...|    7|
+--------------------+-----+
only showing top 20 rows



In [114]:
popular_tags.toPandas().to_csv(f'{DATA_PATH}/popular_tags.csv')

# task 4

In [74]:
cat_tags = (lower(col('tags')).startswith('cat|') | lower(col('tags')).contains('|cat|') 
           | lower(col('tags')).endswith('|cat'))


has_comments = (col('comment_total')>0)

cat_video = videos.filter(cat_tags & has_comments)

In [75]:
cat_video.select('tags').show(10)

+--------------------+
|                tags|
+--------------------+
|Maru|cat|kitty|pe...|
|SciShow|science|H...|
|cartoon|simons ca...|
|Maru|cat|kitty|pe...|
|SciShow|science|H...|
|Maru|cat|kitty|pe...|
|Maru|cat|kitty|pe...|
|cartoon|simons ca...|
|Husky's First How...|
|cartoon|simons ca...|
+--------------------+
only showing top 10 rows



In [76]:
def rand(): 
    return random.randint(0, 3) 
    
rand_udf = udf(rand)

salt_df = spark.range(0, 4)

salted_comments = comments.withColumn("salted_video_id", concat("video_id", lit("_"), lit(rand_udf())))

In [77]:
salted_comments.show(10)

+-----------+--------------------+-----+-------+---------------+
|   video_id|        comment_text|likes|replies|salted_video_id|
+-----------+--------------------+-----+-------+---------------+
|XpVt6Z1Gjjo|Logan Paul it's y...|    4|      0|  XpVt6Z1Gjjo_3|
|XpVt6Z1Gjjo|I've been followi...|    3|      0|  XpVt6Z1Gjjo_2|
|XpVt6Z1Gjjo|Say hi to Kong an...|    3|      0|  XpVt6Z1Gjjo_0|
|XpVt6Z1Gjjo| MY FAN . attendance|    3|      0|  XpVt6Z1Gjjo_2|
|XpVt6Z1Gjjo|         trending 😉|    3|      0|  XpVt6Z1Gjjo_0|
|XpVt6Z1Gjjo|#1 on trending AY...|    3|      0|  XpVt6Z1Gjjo_3|
|XpVt6Z1Gjjo|The end though 😭...|    4|      0|  XpVt6Z1Gjjo_2|
|XpVt6Z1Gjjo|#1 trending!!!!!!!!!|    3|      0|  XpVt6Z1Gjjo_3|
|XpVt6Z1Gjjo|Happy one year vl...|    3|      0|  XpVt6Z1Gjjo_1|
|XpVt6Z1Gjjo|You and your shit...|    0|      0|  XpVt6Z1Gjjo_0|
+-----------+--------------------+-----+-------+---------------+
only showing top 10 rows



In [78]:
salted_cat_video = cat_video.join(salt_df, 
                            how="cross").withColumn("salted_video_id", concat("video_id", lit("_"), "id")).drop("id")


In [83]:
salted_joined_df = salted_cat_video.join(salted_comments.withColumnRenamed( 'likes', 'comment_likes').drop('video_id'), on="salted_video_id", how="inner")


In [88]:
salted_joined_df.orderBy(col('comment_likes').desc()).show(20)

+---------------+-----------+--------------------+-------------+-----------+--------------------+------+-----+--------+-------------+--------------------+-----+--------------------+-------------+-------+
|salted_video_id|   video_id|               title|channel_title|category_id|                tags| views|likes|dislikes|comment_total|      thumbnail_link| date|        comment_text|comment_likes|replies|
+---------------+-----------+--------------------+-------------+-----------+--------------------+------+-----+--------+-------------+--------------------+-----+--------------------+-------------+-------+
|  xbBMVa2A68s_0|xbBMVa2A68s|Cat vs Dog - Best...|      TierZoo|         20|cat|dog|cute|gami...|320175|19593|     608|         4904|https://i.ytimg.c...|17.10|The second I read...|         2355|     15|
|  xbBMVa2A68s_0|xbBMVa2A68s|Cat vs Dog - Best...|      TierZoo|         20|cat|dog|cute|gami...|356211|20865|     680|         5202|https://i.ytimg.c...|18.10|The second I read...|   

In [90]:
salted_joined_df.orderBy(col('comment_likes').desc()).select('comment_total', 'thumbnail_link').show(20)

+-------------+--------------------+
|comment_total|      thumbnail_link|
+-------------+--------------------+
|         4904|https://i.ytimg.c...|
|         5202|https://i.ytimg.c...|
|         5293|https://i.ytimg.c...|
|         5356|https://i.ytimg.c...|
|         5412|https://i.ytimg.c...|
|         4904|https://i.ytimg.c...|
|         5202|https://i.ytimg.c...|
|         5293|https://i.ytimg.c...|
|         5356|https://i.ytimg.c...|
|         5412|https://i.ytimg.c...|
|         4904|https://i.ytimg.c...|
|         5202|https://i.ytimg.c...|
|         5293|https://i.ytimg.c...|
|         5356|https://i.ytimg.c...|
|         5412|https://i.ytimg.c...|
|         4904|https://i.ytimg.c...|
|         5202|https://i.ytimg.c...|
|         5293|https://i.ytimg.c...|
|         5356|https://i.ytimg.c...|
|         5412|https://i.ytimg.c...|
+-------------+--------------------+
only showing top 20 rows



В данных есть полудубликаты, которые все же различаются по полям thumbnail_link и comment_total, так что финального результата дропаю их

In [93]:
salted_joined_df.drop_duplicates(['comment_text']).orderBy(col('comment_likes').desc()).select('video_id', 'comment_text', 'comment_likes', 'replies').show(5)

+-----------+--------------------+-------------+-------+
|   video_id|        comment_text|comment_likes|replies|
+-----------+--------------------+-------------+-------+
|xbBMVa2A68s|The second I read...|         2355|     15|
|-1fzGnFwz9M|I make interestin...|          839|      5|
|tp9aQXDFHbY|Make sure to chec...|          194|     22|
|xbBMVa2A68s|talk about the oc...|          118|     11|
|tp9aQXDFHbY|1:51 so your nuts...|          100|      5|
+-----------+--------------------+-------------+-------+
only showing top 5 rows

