In [1]:
import getpass
import os

from pyspark.sql import SparkSession

# The notebook environment may pre-create a SparkSession whose configuration can
# no longer be changed once it is running, so stop its context (when one exists)
# before building a fresh session with the MongoDB settings below.
# [Ref] https://www.edureka.co/community/5268/how-to-change-the-spark-session-configuration-in-pyspark
if "spark" in globals():
    spark.sparkContext.stop()

# Credentials must not live in the notebook. Supply the full connection string,
# e.g. mongodb+srv://<user>:<password>@<cluster-host>/Database0.US_pre, through
# the MONGODB_INPUT_URI environment variable; when it is unset, fall back to a
# non-echoing interactive prompt.
# NOTE(review): the password that used to be embedded here has been exposed and
# should be rotated.
mongo_input_uri = os.environ.get("MONGODB_INPUT_URI") or getpass.getpass("MongoDB input URI: ")

# create a new SparkSession and connect to MongoDB database & collection
spark = (
    SparkSession.builder
    .appName("YouTube Trending Videos Analysis and Prediction")
    .config("spark.mongodb.input.uri", mongo_input_uri)
    .getOrCreate()
)

spark  # check if sparksession created successfully



In [2]:
# Read the collection configured on the session (spark.mongodb.input.uri)
# through the "mongo" data source.
df = (
    spark.read
    .format("mongo")
    .load()
)

In [3]:
# Keep every column except the `_id` column, preserving the original column
# order, then preview the first rows.
df1 = df.select(*[name for name in df.columns if name != "_id"])
df1.show()

+-----------+----------------+--------------------+-------------+--------+------+--------------------+--------------------+--------------------+-------------+--------+
|category_id|  category_title|       channel_title|comment_count|dislikes| likes|        publish_time|                tags|               title|trending_date|   views|
+-----------+----------------+--------------------+-------------+--------+------+--------------------+--------------------+--------------------+-------------+--------+
|          1|Film & Animation|    Vote The Process|           52|      27|   593|2017-11-30T01:29:...|iMovie|"Cleveland...|Josh Gordon - I'm...|     17.05.12|   88657|
|          1|Film & Animation|          Arden Rose|          108|      77|  3424|2017-12-02T18:14:...|day in my life|"s...|A VERY SPECIAL DA...|     17.08.12|   75279|
|          1|Film & Animation|          Arden Rose|          314|     248|  8342|2018-02-12T18:31:...|love|"advice"|"da...|The Best Love Adv...|     18.17.02|  

In [4]:
# Keep only the columns used for modelling; `views` is the regression target
# and is exposed under the name `label`.
df2 = df1.select('category_id', 'tags', df1['views'].alias("label"))
# Cache because df2 feeds the tokenisation step that follows.
df2.cache()

DataFrame[category_id: int, tags: string, label: int]

In [8]:
from pyspark.ml.regression import RandomForestRegressor
from pyspark.ml.feature import VectorIndexer
from pyspark.ml.evaluation import RegressionEvaluator
from pyspark.ml.linalg import Vectors
from pyspark.ml.feature import *
from pyspark.sql import Row
from pyspark.sql.functions import *
from pyspark.sql.types import *
from pyspark.sql import functions as F
from pyspark.ml.evaluation import RegressionEvaluator

In [9]:
# Split the pipe-delimited `tags` string into an array column `words`.
# The pattern is written as a raw string: in a normal literal "\|" is an
# invalid escape sequence (a warning today, an error in future Python versions).
# The regex itself is unchanged: a literal "|" used as the separator.
# NOTE(review): the hashed output later in this notebook shows tokens that keep
# their surrounding double quotes (e.g. `"advice"` next to an unquoted `love`),
# so quoted and unquoted spellings of one tag are treated as different tokens.
# Consider stripping the quotes; left as-is here to keep results unchanged.
tokenizer = RegexTokenizer(inputCol="tags", outputCol="words", pattern=r"\|")
tokenized = tokenizer.transform(df2)
# Cached because the frame is used twice: once to count tag frequencies and
# once to build the model input.
tokenized.cache()

DataFrame[category_id: int, tags: string, label: int, words: array<string>]

In [10]:
# One row per individual tag: explode the `words` array into a single column `exp`.
word = tokenized.select(F.explode('words').alias('exp'))

In [11]:
# Collect to the driver the tags that occur more than 10 times overall;
# `fq` ends up as a plain Python list of tag strings.
fq = [
    row['exp']
    for row in word.groupBy('exp').count().where(F.col('count') > 10).collect()
]

In [12]:
def _to_model_row(row, keep=frozenset(fq)):
    """Turn one tokenized row into an ``(id, words, label)`` tuple.

    Only tags present in ``keep`` (the frequent tags) are retained, in their
    original order and with duplicates preserved. A row left with no tags gets
    the single placeholder tag ``'[none]'``.

    ``keep`` is a frozenset so each membership test is a hash lookup instead of
    a scan over the whole frequent-tag list.
    """
    kept = [tag for tag in row.words if tag in keep]
    return (row.category_id, kept if kept else ['[none]'], row.label)


# Single pass over the data: filtering and the empty-list fallback happen
# together, so the RDD is converted back to a DataFrame only once and schema
# inference never sees an empty `words` list.
final = spark.createDataFrame(tokenized.rdd.map(_to_model_row)).toDF('id', 'words', 'label')

In [13]:
# Wrap the category id in a one-element array and prepend it to the tag list,
# so the id becomes one more token in `merged`.
final = (
    final
    .withColumn('id_list', F.array(F.col('id')))
    .withColumn('merged', F.concat(F.col('id_list'), F.col('words')))
)

In [14]:
# Hash the tokens of `merged` into a 2048-dimensional `features` vector.
hashingTF = (
    HashingTF()
    .setInputCol('merged')
    .setOutputCol("features")
    .setNumFeatures(2048)
)
hashed = hashingTF.transform(final)
hashed.show()

+---+--------------------+--------+-------+--------------------+--------------------+
| id|               words|   label|id_list|              merged|            features|
+---+--------------------+--------+-------+--------------------+--------------------+
|  1|["weed", "addicti...|   88657|    [1]|[1, "weed", "addi...|(2048,[491,615,74...|
|  1|["vlog", "vloggin...|   75279|    [1]|[1, "vlog", "vlog...|(2048,[321,491,51...|
|  1|[love, "advice", ...|  131016|    [1]|[1, love, "advice...|(2048,[112,382,49...|
|  1|["lionsgate", "ky...| 7138640|    [1]|[1, "lionsgate", ...|(2048,[149,267,29...|
|  1|["bruce willis", ...| 3652424|    [1]|[1, "bruce willis...|(2048,[491,524,68...|
|  1|["stephen colbert...| 1609627|    [1]|[1, "stephen colb...|(2048,[120,139,34...|
|  1|["uncle drew", "l...| 7891175|    [1]|[1, "uncle drew",...|(2048,[149,183,26...|
|  1|            [[none]]| 1494486|    [1]|         [1, [none]]|(2048,[465,491],[...|
|  1|[pacific rim, "pa...| 5335815|    [1]|[1, pacific

In [15]:
# Fixed seed so the train/test split and the forest's random sampling are
# repeatable from run to run; without it the reported metric changes on every
# execution.
SEED = 42

# 80/20 train/test split.
# NOTE(review): a seeded randomSplit is only repeatable when `hashed` returns
# its rows in a stable order/partitioning -- consider caching `hashed` first; confirm.
trainingData, testData = hashed.randomSplit([0.8, 0.2], seed=SEED)

# Random forest regression on the hashed features, predicting `label` (view count).
rf = RandomForestRegressor(featuresCol="features", labelCol="label", maxDepth=8, seed=SEED)

model = rf.fit(trainingData)

# Score the held-out rows; adds a `prediction` column.
predictionsDf = model.transform(testData)
predictionsDf.show()

+---+--------------------+-------+-------+--------------------+--------------------+----------------+
| id|               words|  label|id_list|              merged|            features|      prediction|
+---+--------------------+-------+-------+--------------------+--------------------+----------------+
|  1|["3d", "vfx", "fx...|  24543|    [1]|[1, "3d", "vfx", ...|(2048,[71,221,222...|1440458.71157173|
|  1|["a star wars sto...|1752481|    [1]|[1, "a star wars ...|(2048,[189,254,49...|1440458.71157173|
|  1|["amy adams", "je...|1054285|    [1]|[1, "amy adams", ...|(2048,[189,254,47...|1440458.71157173|
|  1|["amy adams", "je...|1760535|    [1]|[1, "amy adams", ...|(2048,[189,254,47...|1440458.71157173|
|  1|["animation", "ro...|  25096|    [1]|[1, "animation", ...|(2048,[491,657,70...|1440458.71157173|
|  1|["animation", "ro...|  25961|    [1]|[1, "animation", ...|(2048,[491,657,70...|1440458.71157173|
|  1|["animation", "st...|  61240|    [1]|[1, "animation", ...|(2048,[491,657,72..

In [16]:
# Coefficient of determination (r2) of `prediction` against `label` on the test rows.
rf_evaluator = (
    RegressionEvaluator()
    .setMetricName("r2")
    .setLabelCol("label")
    .setPredictionCol("prediction")
)
rf_evaluator.evaluate(predictionsDf)

0.5650155304128361