In [1]:
from pyspark.ml.regression import RandomForestRegressor
from pyspark.ml.feature import VectorIndexer
from pyspark.ml.evaluation import RegressionEvaluator
from pyspark.ml import Pipeline
from pyspark.ml.linalg import Vectors
from pyspark.ml.classification import LogisticRegression
from pyspark.ml.feature import *
from pyspark.ml.evaluation import BinaryClassificationEvaluator
from pyspark.ml.tuning import CrossValidator, ParamGridBuilder
from pyspark.sql import Row
from pyspark.sql.functions import *
from pyspark.sql.types import *
from pyspark.sql import functions as F
from pyspark.mllib.evaluation import RegressionMetrics
from pyspark.ml.evaluation import RegressionEvaluator

In [2]:
vids = spark.read.csv('./US_pre.csv', inferSchema=True, header='True')

In [9]:
vids.printSchema()

root
 |-- category_id: integer (nullable = true)
 |-- trending_date: string (nullable = true)
 |-- title: string (nullable = true)
 |-- channel_title: string (nullable = true)
 |-- publish_time: timestamp (nullable = true)
 |-- tags: string (nullable = true)
 |-- views: integer (nullable = true)
 |-- likes: integer (nullable = true)
 |-- dislikes: integer (nullable = true)
 |-- comment_count: integer (nullable = true)
 |-- category_title: string (nullable = true)



In [10]:
vids.show()

+-----------+-------------+--------------------+------------------+-------------------+--------------------+-------+------+--------+-------------+--------------------+
|category_id|trending_date|               title|     channel_title|       publish_time|                tags|  views| likes|dislikes|comment_count|      category_title|
+-----------+-------------+--------------------+------------------+-------------------+--------------------+-------+------+--------+-------------+--------------------+
|         10|     18.30.01|DJ Khaled, Rihann...|      Super Netvid|2018-01-29 11:11:56|              [none]|3014847| 42556|    2791|            0|               Music|
|         22|     18.03.01|The Queen's Chris...|  The Royal Family|2017-12-25 23:02:43|              [none]|2497549| 51959|    6265|            0|      People & Blogs|
|         24|     17.10.12|Little Mix bring ...|   The X Factor UK|2017-12-04 03:47:01|"the x factor"|"x...|5275288|169795|    3706|            0|       Enterta

In [3]:
# Drop columns we are not using.
df = vids.select('category_id','tags','views').withColumnRenamed("views","label")
df.cache()
df.show()

+-----------+--------------------+-------+
|category_id|                tags|  label|
+-----------+--------------------+-------+
|         10|              [none]|3014847|
|         22|              [none]|2497549|
|         24|"the x factor"|"x...|5275288|
|         27|              [none]|  13630|
|         25|"CBS 4 News Morni...|   1095|
|         17|              [none]|   7288|
|         17|"Philadelphia"|"7...|  14609|
|         23|"barstool sports"...|  20063|
|          1|"iMovie"|"Clevela...|  88657|
|         24|"ana patricia gam...|  16976|
|         24|"car chase"|"los ...|  41471|
|         26|"travel"|"stunt"|...|  18246|
|         26|"travel smarter"|...|  41265|
|         10|"drake"|"21 savag...|  19722|
|          1|"day in my life"|...|  75279|
|         10|"Ciara"|"cici"|"c...|  27765|
|         17|"nba"|"basketball...| 117586|
|         26|"refinery29"|"ref...|  50538|
|         24|"jimmy"|"kimmel"|...|  82052|
|         28|"SpaceX"|"Test Fi...| 147899|
+----------

In [4]:
tokenizer = RegexTokenizer(inputCol="tags", outputCol="words", pattern="\|")
tokenized = tokenizer.transform(df)
tokenized.cache()
tokenized.show()

+-----------+--------------------+-------+--------------------+
|category_id|                tags|  label|               words|
+-----------+--------------------+-------+--------------------+
|         10|              [none]|3014847|            [[none]]|
|         22|              [none]|2497549|            [[none]]|
|         24|"the x factor"|"x...|5275288|["the x factor", ...|
|         27|              [none]|  13630|            [[none]]|
|         25|"CBS 4 News Morni...|   1095|["cbs 4 news morn...|
|         17|              [none]|   7288|            [[none]]|
|         17|"Philadelphia"|"7...|  14609|["philadelphia", ...|
|         23|"barstool sports"...|  20063|["barstool sports...|
|          1|"iMovie"|"Clevela...|  88657|["imovie", "cleve...|
|         24|"ana patricia gam...|  16976|["ana patricia ga...|
|         24|"car chase"|"los ...|  41471|["car chase", "lo...|
|         26|"travel"|"stunt"|...|  18246|["travel", "stunt...|
|         26|"travel smarter"|...|  4126

In [5]:
word = tokenized.withColumn("exp", F.explode('words')).select('exp')
word.show()

+--------------------+
|                 exp|
+--------------------+
|              [none]|
|              [none]|
|      "the x factor"|
|          "x factor"|
|       "x factor uk"|
|     "x factor 2017"|
|      "simon cowell"|
|            "nicole"|
|            "sharon"|
|             "louis"|
|            "talent"|
|         "auditions"|
|            "judges"|
|         "season 14"|
|         "series 14"|
|  "x factor uk 2017"|
|"x factor 2017 au...|
| "the x factor 2017"|
|      "xfactor 2017"|
|               "itv"|
+--------------------+
only showing top 20 rows



In [6]:
fq = word.groupby('exp').count()
fq = fq.filter(col('count') > 10).collect()
fq = [i['exp'] for i in fq]
print(fq)



In [7]:
final = tokenized.rdd.map(lambda x: (x.category_id, [i for i in x.words if i in fq], x.label))
final = spark.createDataFrame(final).toDF('id', 'words','label')
final = final.rdd.map(lambda x: (x.id, x.words if (len(x.words)>0) else ['[none]'] , x.label))
final = spark.createDataFrame(final).toDF('id', 'words','label')
final.show()

+---+--------------------+-------+
| id|               words|  label|
+---+--------------------+-------+
| 10|            [[none]]|3014847|
| 22|            [[none]]|2497549|
| 24|["the x factor", ...|5275288|
| 27|            [[none]]|  13630|
| 25|            [[none]]|   1095|
| 17|            [[none]]|   7288|
| 17|["philadelphia", ...|  14609|
| 23|["sports news", "...|  20063|
|  1|["weed", "addicti...|  88657|
| 24|["gerard butler",...|  16976|
| 24|["los angeles", "...|  41471|
| 26|["travel", "stunt...|  18246|
| 26|["travel smarter"...|  41265|
| 10|["drake", "21 sav...|  19722|
|  1|["vlog", "vloggin...|  75279|
| 10|["ciara", "christ...|  27765|
| 17|["nba", "basketba...| 117586|
| 26|["refinery29", "r...|  50538|
| 24|["jimmy", "kimmel...|  82052|
| 28|["spacex", "nasa"...| 147899|
+---+--------------------+-------+
only showing top 20 rows



In [8]:
final = final.withColumn('id_list', F.array(final.id))
final = final.withColumn('merged', concat(final.id_list, final.words))
final.show()

+---+--------------------+-------+-------+--------------------+
| id|               words|  label|id_list|              merged|
+---+--------------------+-------+-------+--------------------+
| 10|            [[none]]|3014847|   [10]|        [10, [none]]|
| 22|            [[none]]|2497549|   [22]|        [22, [none]]|
| 24|["the x factor", ...|5275288|   [24]|[24, "the x facto...|
| 27|            [[none]]|  13630|   [27]|        [27, [none]]|
| 25|            [[none]]|   1095|   [25]|        [25, [none]]|
| 17|            [[none]]|   7288|   [17]|        [17, [none]]|
| 17|["philadelphia", ...|  14609|   [17]|[17, "philadelphi...|
| 23|["sports news", "...|  20063|   [23]|[23, "sports news...|
|  1|["weed", "addicti...|  88657|    [1]|[1, "weed", "addi...|
| 24|["gerard butler",...|  16976|   [24]|[24, "gerard butl...|
| 24|["los angeles", "...|  41471|   [24]|[24, "los angeles...|
| 26|["travel", "stunt...|  18246|   [26]|[26, "travel", "s...|
| 26|["travel smarter"...|  41265|   [26

In [9]:
hashingTF = HashingTF(inputCol='merged', outputCol="features", numFeatures = 2048)
hashed = hashingTF.transform(final)
hashed.show()

+---+--------------------+-------+-------+--------------------+--------------------+
| id|               words|  label|id_list|              merged|            features|
+---+--------------------+-------+-------+--------------------+--------------------+
| 10|            [[none]]|3014847|   [10]|        [10, [none]]|(2048,[465,1969],...|
| 22|            [[none]]|2497549|   [22]|        [22, [none]]|(2048,[465,1325],...|
| 24|["the x factor", ...|5275288|   [24]|[24, "the x facto...|(2048,[45,51,94,2...|
| 27|            [[none]]|  13630|   [27]|        [27, [none]]|(2048,[465,1928],...|
| 25|            [[none]]|   1095|   [25]|        [25, [none]]|(2048,[465,1497],...|
| 17|            [[none]]|   7288|   [17]|        [17, [none]]|(2048,[465,1551],...|
| 17|["philadelphia", ...|  14609|   [17]|[17, "philadelphi...|(2048,[261,696,77...|
| 23|["sports news", "...|  20063|   [23]|[23, "sports news...|(2048,[181,193,14...|
|  1|["weed", "addicti...|  88657|    [1]|[1, "weed", "addi...|(2

In [10]:
trainingData, testData = hashed.randomSplit([0.8, 0.2])

rf = RandomForestRegressor(featuresCol="features", maxDepth=8)

model = rf.fit(trainingData)

predictionsDf = model.transform(testData)
predictionsDf.show()


+---+--------------------+-------+-------+--------------------+--------------------+------------------+
| id|               words|  label|id_list|              merged|            features|        prediction|
+---+--------------------+-------+-------+--------------------+--------------------+------------------+
|  1|["a wrinkle in ti...|  14197|    [1]|[1, "a wrinkle in...|(2048,[34,60,491,...|1561525.7843502255|
|  1|["a wrinkle in ti...|  17253|    [1]|[1, "a wrinkle in...|(2048,[34,60,491,...|1561525.7843502255|
|  1|["a wrinkle in ti...|  18006|    [1]|[1, "a wrinkle in...|(2048,[34,60,491,...|1561525.7843502255|
|  1|["a wrinkle in ti...|1921050|    [1]|[1, "a wrinkle in...|(2048,[60,343,411...|1561525.7843502255|
|  1|["a24", "a24 film...|1500855|    [1]|[1, "a24", "a24 f...|(2048,[25,230,291...|1680840.2286862873|
|  1|["a24", "a24 film...|1797044|    [1]|[1, "a24", "a24 f...|(2048,[25,230,291...|1680840.2286862873|
|  1|["a24", "a24 film...| 703513|    [1]|[1, "a24", "a24 f...|(

In [11]:
tupl = predictionsDf.withColumn('rating', F.struct(predictionsDf.prediction, predictionsDf.label))
tupl = tupl.select(tupl.rating)
tupl.show()

+--------------------+
|              rating|
+--------------------+
|[1854167.24813995...|
|[1320217.57966757...|
|[1320217.57966757...|
|[3033768.04201716...|
|[972018.881174920...|
|[972018.881174920...|
|[3413029.98778842...|
|[3413029.98778842...|
|[3413029.98778842...|
|[3413029.98778842...|
|[3079057.92764185...|
|[3079057.92764185...|
|[3079057.92764185...|
|[3049448.71549754...|
|[1831435.46182333...|
|[2104210.73858578...|
|[2104210.73858578...|
|[2104210.73858578...|
|[2683292.90973709...|
|[2683292.90973709...|
+--------------------+
only showing top 20 rows



In [12]:
metrics = RegressionMetrics(tupl.rdd)

In [11]:
rf_evaluator = RegressionEvaluator(predictionCol="prediction", labelCol="label",metricName="r2")
rf_evaluator.evaluate(predictionsDf)

0.6111431927056059