In [1]:
from pyspark.sql import SparkSession, SQLContext
from pyspark import SparkConf, SparkContext
import matplotlib.pyplot as plt
%matplotlib inline
import numpy as np
import pandas as pd
from pyspark.ml import Pipeline
from pyspark.ml.feature import OneHotEncoder, StringIndexer, VectorAssembler
from pyspark.ml.regression import LinearRegression, DecisionTreeRegressor, RandomForestRegressor, GBTRegressor, GeneralizedLinearRegression
from pyspark.ml.evaluation import RegressionEvaluator

# Read data for training

In [2]:

DATA_PATH = 'gs://6893_bucket/project/all_movie_data_with_tweet_imdb_sentiment.csv'

# conf = SparkConf()
# sc = SparkContext(conf=conf)
sc.setLogLevel("ERROR")
sqlContext = SQLContext(sc)
data = sqlContext.read.csv(DATA_PATH, inferSchema=True, header=True)
data.show(3)

21/12/22 19:57:27 WARN org.apache.spark.sql.catalyst.util.package: Truncated the string representation of a plan since it was too large. This behavior can be adjusted by setting 'spark.sql.debug.maxToStringFields'.


+---+---------------+----+-----------+----------+--------------------+-------------------+------+----------+-----------+--------------------+------+---------+---------+---------+------+-----+-----+-------+-------+------+-----+-------+-------+------+-----+--------+---+-------+------------------+------------------+------------------+-----------------+------------------+-------------------+--------------------+-------------------+
|_c0|           name|year|movie_rated|run_length|              genres|       release_date|rating|num_raters|num_reviews|          review_url|Action|Adventure|Animation|Biography|Comedy|Crime|Drama|Fantasy|History|Horror|Music|Mystery|Romance|Sci-Fi|Sport|Thriller|War|Western|               pos|               neg|           neutral|    pos_neg_ratio|         tweet_pos|          tweet_neg|       tweet_neutral|tweet_pos_neg_ratio|
+---+---------------+----+-----------+----------+--------------------+-------------------+------+----------+-----------+----------------

In [3]:
data.printSchema()

root
 |-- _c0: integer (nullable = true)
 |-- name: string (nullable = true)
 |-- year: integer (nullable = true)
 |-- movie_rated: string (nullable = true)
 |-- run_length: string (nullable = true)
 |-- genres: string (nullable = true)
 |-- release_date: string (nullable = true)
 |-- rating: double (nullable = true)
 |-- num_raters: integer (nullable = true)
 |-- num_reviews: integer (nullable = true)
 |-- review_url: string (nullable = true)
 |-- Action: integer (nullable = true)
 |-- Adventure: integer (nullable = true)
 |-- Animation: integer (nullable = true)
 |-- Biography: integer (nullable = true)
 |-- Comedy: integer (nullable = true)
 |-- Crime: integer (nullable = true)
 |-- Drama: integer (nullable = true)
 |-- Fantasy: integer (nullable = true)
 |-- History: integer (nullable = true)
 |-- Horror: integer (nullable = true)
 |-- Music: integer (nullable = true)
 |-- Mystery: integer (nullable = true)
 |-- Romance: integer (nullable = true)
 |-- Sci-Fi: integer (nullable = tr

# Define Models for experiments

In [6]:
lr = LinearRegression(labelCol='rating', maxIter=10)
dt = DecisionTreeRegressor(labelCol='rating')
rf = RandomForestRegressor(labelCol='rating')
gbt = GBTRegressor(labelCol='rating')
# glr = GeneralizedLinearRegression(labelCol='rating')
lr_ridge = LinearRegression(labelCol='rating', maxIter=10, elasticNetParam=1, regParam=0.005)
evaluator = RegressionEvaluator(labelCol='rating', predictionCol='prediction', metricName='rmse')

# 1. Using all features: IMDb sentiments + IMDb Metadata + Tweet sentiments

In [7]:
# numericCols + genreCols + sentimentCols + tweetSentimentCols
rmse = []
dataset = data
stages = []
categoricalColumns = ['movie_rated']
for categoricalCol in categoricalColumns:
    stringIndexer = StringIndexer(inputCol=categoricalCol, outputCol=categoricalCol + "Index")
    encoder = OneHotEncoder(inputCols=[stringIndexer.getOutputCol()], 
                            outputCols=[categoricalCol + "classVec"])
    stages += [stringIndexer, encoder]

numericCols = ['year', 'num_raters', 'num_reviews']
genreCols = ['Action', 'Adventure', 'Animation', 'Biography', 'Comedy', 
             'Crime', 'Drama', 'Fantasy', 'History', 'Horror', 'Music', 
             'Mystery', 'Romance', 'Sci-Fi', 'Sport', 'Thriller', 'War', 'Western']
sentimentCols = ['pos', 'neg', 'neutral', 'pos_neg_ratio']
tweetSentimentCols = ['tweet_pos', 'tweet_neg', 'tweet_neutral', 
                      'tweet_pos_neg_ratio']
assemblerInputs = [c + "classVec" for c in categoricalColumns] + numericCols + genreCols + sentimentCols + tweetSentimentCols
assembler = VectorAssembler(inputCols=assemblerInputs, outputCol="features")
stages = stages + [assembler]
pipeline = Pipeline(stages=stages)
pipelineModel = pipeline.fit(dataset)
preppedDataDF = pipelineModel.transform(dataset)
cols = dataset.columns
selectedcols = ["features"] + cols
dataset = preppedDataDF.select(selectedcols)
display(dataset)
trainingData, testData = dataset.randomSplit(weights=[0.85, 0.15], seed=100)

print('training data count', trainingData.count())
print('test data count', testData.count())

print('Fitting Linear Regression')
lrModel = lr.fit(trainingData)
predictions = lrModel.transform(testData)
rmse.append(evaluator.evaluate(predictions))

print('Fitting Ridge Regression')
lrridgeModel = lr_ridge.fit(trainingData)
predictions = lrridgeModel.transform(testData)
rmse.append(evaluator.evaluate(predictions))

print('Fitting Decision Trees')
dtModel = dt.fit(trainingData)
predictions = dtModel.transform(testData)
rmse.append(evaluator.evaluate(predictions))

print('Fitting Random Forest')
rfModel = rf.fit(trainingData)
predictions = rfModel.transform(testData)
rmse.append(evaluator.evaluate(predictions))

print('Fitting GBTs')
gbtModel = gbt.fit(trainingData)
predictions = gbtModel.transform(testData)
rmse.append(evaluator.evaluate(predictions))

# print('Fitting GLR')
# glrModel = glr.fit(trainingData)
# predictions = glrModel.transform(testData)
# rmse.append(evaluator.evaluate(predictions))
print('Linear Regression, Ridge Regression, Decision Trees, Random Forests, GBTs')
print('Error: ', rmse)

DataFrame[features: vector, _c0: int, name: string, year: int, movie_rated: string, run_length: string, genres: string, release_date: string, rating: double, num_raters: int, num_reviews: int, review_url: string, Action: int, Adventure: int, Animation: int, Biography: int, Comedy: int, Crime: int, Drama: int, Fantasy: int, History: int, Horror: int, Music: int, Mystery: int, Romance: int, Sci-Fi: int, Sport: int, Thriller: int, War: int, Western: int, pos: double, neg: double, neutral: double, pos_neg_ratio: double, tweet_pos: double, tweet_neg: double, tweet_neutral: double, tweet_pos_neg_ratio: double]

training data count 990
test data count 155
Fitting Linear Regression


21/12/22 19:59:47 WARN org.apache.spark.ml.util.Instrumentation: [4470533d] regParam is zero, which might cause numerical instability and overfitting.
21/12/22 19:59:48 WARN org.apache.spark.ml.util.Instrumentation: [4470533d] Cholesky solver failed due to singular covariance matrix. Retrying with Quasi-Newton solver.


Fitting Ridge Regression
Fitting Decision Trees
Fitting Random Forest
Fitting GBTs
Linear Regression, Ridge Regression, Decision Trees, Random Forests, GBTs
Error:  [0.39407731281395264, 0.39226265284466627, 0.46411417072625494, 0.4301824479601999, 0.4069568078527399]


# 2. Using IMDb Metadata + Tweet sentiments as features

In [8]:
# numericCols + genreCols + tweetSentimentCols
rmse = []
dataset = data
stages = []
categoricalColumns = ['movie_rated']
for categoricalCol in categoricalColumns:
    stringIndexer = StringIndexer(inputCol=categoricalCol, outputCol=categoricalCol + "Index")
    encoder = OneHotEncoder(inputCols=[stringIndexer.getOutputCol()], outputCols=[categoricalCol + "classVec"])
    stages += [stringIndexer, encoder]

numericCols = ['year', 'num_raters', 'num_reviews']
genreCols = ['Action', 'Adventure', 'Animation', 'Biography', 'Comedy', 'Crime', 'Drama', 'Fantasy', 
        'History', 'Horror', 'Music', 'Mystery', 'Romance', 'Sci-Fi', 'Sport', 'Thriller', 'War', 'Western']
sentimentCols = ['pos', 'neg', 'neutral', 'pos_neg_ratio']
tweetSentimentCols = ['tweet_pos', 'tweet_neg', 'tweet_neutral', 'tweet_pos_neg_ratio']
assemblerInputs = [c + "classVec" for c in categoricalColumns] + numericCols + genreCols + tweetSentimentCols
assembler = VectorAssembler(inputCols=assemblerInputs, outputCol="features")
stages = stages + [assembler]
pipeline = Pipeline(stages=stages)
pipelineModel = pipeline.fit(dataset)
preppedDataDF = pipelineModel.transform(dataset)
cols = dataset.columns
selectedcols = ["features"] + cols
dataset = preppedDataDF.select(selectedcols)
display(dataset)
trainingData, testData = dataset.randomSplit(weights=[0.85, 0.15], seed=100)

print('training data count', trainingData.count())
print('test data count', testData.count())
print('Fitting Linear Regression')

lrModel = lr.fit(trainingData)
predictions = lrModel.transform(testData)
rmse.append(evaluator.evaluate(predictions))

print('Fitting Ridge Regression')
lrridgeModel = lr_ridge.fit(trainingData)
predictions = lrridgeModel.transform(testData)
rmse.append(evaluator.evaluate(predictions))

print('Fitting Decision Trees')
dtModel = dt.fit(trainingData)
predictions = dtModel.transform(testData)
rmse.append(evaluator.evaluate(predictions))

print('Fitting Random Forest')
rfModel = rf.fit(trainingData)
predictions = rfModel.transform(testData)
rmse.append(evaluator.evaluate(predictions))

print('Fitting GBTs')
gbtModel = gbt.fit(trainingData)
predictions = gbtModel.transform(testData)
rmse.append(evaluator.evaluate(predictions))

# print('Fitting GLR')
# glrModel = glr.fit(trainingData)
# predictions = glrModel.transform(testData)
# rmse.append(evaluator.evaluate(predictions))
print('Linear Regression, Ridge Regression, Decision Trees, Random Forests, GBTs')
print('Error: ', rmse)

DataFrame[features: vector, _c0: int, name: string, year: int, movie_rated: string, run_length: string, genres: string, release_date: string, rating: double, num_raters: int, num_reviews: int, review_url: string, Action: int, Adventure: int, Animation: int, Biography: int, Comedy: int, Crime: int, Drama: int, Fantasy: int, History: int, Horror: int, Music: int, Mystery: int, Romance: int, Sci-Fi: int, Sport: int, Thriller: int, War: int, Western: int, pos: double, neg: double, neutral: double, pos_neg_ratio: double, tweet_pos: double, tweet_neg: double, tweet_neutral: double, tweet_pos_neg_ratio: double]

training data count 990
test data count 155
Fitting Linear Regression


21/12/22 20:07:33 WARN org.apache.spark.ml.util.Instrumentation: [0be00314] regParam is zero, which might cause numerical instability and overfitting.
21/12/22 20:07:33 WARN org.apache.spark.ml.util.Instrumentation: [0be00314] Cholesky solver failed due to singular covariance matrix. Retrying with Quasi-Newton solver.


Fitting Ridge Regression
Fitting Decision Trees
Fitting Random Forest
Fitting GBTs
Linear Regression, Ridge Regression, Decision Trees, Random Forests, GBTs
Error:  [0.506439565380298, 0.5062682506160198, 0.6134001963912572, 0.5210772542345369, 0.5941677442453148]


# 3. Using IMDb Metadata + IMDb sentiments as features

In [47]:
# numericCols + genreCols + sentimentCols
dataset = data
stages = []
categoricalColumns = ['movie_rated']
for categoricalCol in categoricalColumns:
    stringIndexer = StringIndexer(inputCol=categoricalCol, outputCol=categoricalCol + "Index")
    encoder = OneHotEncoder(inputCols=[stringIndexer.getOutputCol()], outputCols=[categoricalCol + "classVec"])
    stages += [stringIndexer, encoder]

numericCols = ['year', 'num_raters', 'num_reviews']
genreCols = ['Action', 'Adventure', 'Animation', 'Biography', 'Comedy', 'Crime', 'Drama', 'Fantasy', 
        'History', 'Horror', 'Music', 'Mystery', 'Romance', 'Sci-Fi', 'Sport', 'Thriller', 'War', 'Western']
sentimentCols = ['pos', 'neg', 'neutral', 'pos_neg_ratio']
assemblerInputs = [c + "classVec" for c in categoricalColumns] + numericCols + genreCols + sentimentCols
assembler = VectorAssembler(inputCols=assemblerInputs, outputCol="features")
stages = stages + [assembler]
pipeline = Pipeline(stages=stages)
pipelineModel = pipeline.fit(dataset)
preppedDataDF = pipelineModel.transform(dataset)
cols = dataset.columns
selectedcols = ["features"] + cols
dataset = preppedDataDF.select(selectedcols)
display(dataset)
trainingData, testData = dataset.randomSplit(weights=[0.85, 0.15], seed=100)

print('training data count', trainingData.count())
print('test data count', testData.count())

rmse = []
lrModel = lr.fit(trainingData)
predictions = lrModel.transform(testData)
rmse.append(evaluator.evaluate(predictions))

print('Fitting Ridge Regression')
lrridgeModel = lr_ridge.fit(trainingData)
predictions = lrridgeModel.transform(testData)
rmse.append(evaluator.evaluate(predictions))

print('Fitting Decision Trees')
dtModel = dt.fit(trainingData)
predictions = dtModel.transform(testData)
rmse.append(evaluator.evaluate(predictions))

print('Fitting Random Forest')
rfModel = rf.fit(trainingData)
predictions = rfModel.transform(testData)
rmse.append(evaluator.evaluate(predictions))

print('Fitting GBTs')
gbtModel = gbt.fit(trainingData)
predictions = gbtModel.transform(testData)
rmse.append(evaluator.evaluate(predictions))

# print('Fitting GLR')
# glrModel = glr.fit(trainingData)
# predictions = glrModel.transform(testData)
# rmse.append(evaluator.evaluate(predictions))
print('Linear Regression, Ridge Regression, Decision Trees, Random Forests, GBTs')
print('Error: ', rmse)

DataFrame[features: vector, _c0: int, name: string, year: int, movie_rated: string, run_length: string, genres: string, release_date: string, rating: double, num_raters: int, num_reviews: int, review_url: string, Action: int, Adventure: int, Animation: int, Biography: int, Comedy: int, Crime: int, Drama: int, Fantasy: int, History: int, Horror: int, Music: int, Mystery: int, Romance: int, Sci-Fi: int, Sport: int, Thriller: int, War: int, Western: int, pos: double, neg: double, neutral: double, pos_neg_ratio: double, tweet_pos: double, tweet_neg: double, tweet_neutral: double, tweet_pos_neg_ratio: double]

training data count 990
test data count 155


21/12/17 16:55:43 WARN org.apache.spark.ml.util.Instrumentation: [8780ac9c] regParam is zero, which might cause numerical instability and overfitting.
21/12/17 16:55:43 WARN org.apache.spark.ml.util.Instrumentation: [8780ac9c] Cholesky solver failed due to singular covariance matrix. Retrying with Quasi-Newton solver.


Fitting Ridge Regression
Fitting Decision Trees
Fitting Random Forest
Fitting GBTs
Linear Regression, Ridge Regression, Decision Trees, Random Forests, GBTs
Error:  [0.37240843328857465, 0.37160094491028195, 0.4421741250217632, 0.40106209651668395, 0.40372598231200846]


# 4. Using IMDb Metadata as features (dropping sentiments)

In [48]:
# numericCols + genreCols
dataset = data
stages = []
categoricalColumns = ['movie_rated']
for categoricalCol in categoricalColumns:
    stringIndexer = StringIndexer(inputCol=categoricalCol, outputCol=categoricalCol + "Index")
    encoder = OneHotEncoder(inputCols=[stringIndexer.getOutputCol()], outputCols=[categoricalCol + "classVec"])
    stages += [stringIndexer, encoder]

numericCols = ['year', 'num_raters', 'num_reviews']
genreCols = ['Action', 'Adventure', 'Animation', 'Biography', 'Comedy', 'Crime', 'Drama', 'Fantasy', 
        'History', 'Horror', 'Music', 'Mystery', 'Romance', 'Sci-Fi', 'Sport', 'Thriller', 'War', 'Western']
sentimentCols = ['pos', 'neg', 'neutral', 'pos_neg_ratio']
assemblerInputs = [c + "classVec" for c in categoricalColumns] + numericCols + genreCols
assembler = VectorAssembler(inputCols=assemblerInputs, outputCol="features")
stages = stages + [assembler]
pipeline = Pipeline(stages=stages)
pipelineModel = pipeline.fit(dataset)
preppedDataDF = pipelineModel.transform(dataset)
cols = dataset.columns
selectedcols = ["features"] + cols
dataset = preppedDataDF.select(selectedcols)
display(dataset)
trainingData, testData = dataset.randomSplit(weights=[0.85, 0.15], seed=100)

print('training data count', trainingData.count())
print('test data count', testData.count())

rmse = []
lrModel = lr.fit(trainingData)
predictions = lrModel.transform(testData)
rmse.append(evaluator.evaluate(predictions))

print('Fitting Ridge Regression')
lrridgeModel = lr_ridge.fit(trainingData)
predictions = lrridgeModel.transform(testData)
rmse.append(evaluator.evaluate(predictions))

print('Fitting Decision Trees')
dtModel = dt.fit(trainingData)
predictions = dtModel.transform(testData)
rmse.append(evaluator.evaluate(predictions))

print('Fitting Random Forest')
rfModel = rf.fit(trainingData)
predictions = rfModel.transform(testData)
rmse.append(evaluator.evaluate(predictions))

print('Fitting GBTs')
gbtModel = gbt.fit(trainingData)
predictions = gbtModel.transform(testData)
rmse.append(evaluator.evaluate(predictions))

# print('Fitting GLR')
# glrModel = glr.fit(trainingData)
# predictions = glrModel.transform(testData)
# rmse.append(evaluator.evaluate(predictions))
print('Linear Regression, Ridge Regression, Decision Trees, Random Forests, GBTs')
print('Error: ', rmse)


DataFrame[features: vector, _c0: int, name: string, year: int, movie_rated: string, run_length: string, genres: string, release_date: string, rating: double, num_raters: int, num_reviews: int, review_url: string, Action: int, Adventure: int, Animation: int, Biography: int, Comedy: int, Crime: int, Drama: int, Fantasy: int, History: int, Horror: int, Music: int, Mystery: int, Romance: int, Sci-Fi: int, Sport: int, Thriller: int, War: int, Western: int, pos: double, neg: double, neutral: double, pos_neg_ratio: double, tweet_pos: double, tweet_neg: double, tweet_neutral: double, tweet_pos_neg_ratio: double]

training data count 990
test data count 155


21/12/17 16:56:06 WARN org.apache.spark.ml.util.Instrumentation: [62b78b46] regParam is zero, which might cause numerical instability and overfitting.


Fitting Ridge Regression
Fitting Decision Trees
Fitting Random Forest
Fitting GBTs
Linear Regression, Ridge Regression, Decision Trees, Random Forests, GBTs
Error:  [0.5168388194986238, 0.5165219391102391, 0.5830288497318813, 0.5352417506351766, 0.5253910652231695]


# 5. Using partial IMDb Metadata as features (Dropping genre and sentiments)

In [49]:
# numericCols
dataset = data
stages = []
categoricalColumns = ['movie_rated']
for categoricalCol in categoricalColumns:
    stringIndexer = StringIndexer(inputCol=categoricalCol, outputCol=categoricalCol + "Index")
    encoder = OneHotEncoder(inputCols=[stringIndexer.getOutputCol()], outputCols=[categoricalCol + "classVec"])
    stages += [stringIndexer, encoder]

numericCols = ['year', 'num_raters', 'num_reviews']
genreCols = ['Action', 'Adventure', 'Animation', 'Biography', 'Comedy', 'Crime', 'Drama', 'Fantasy', 
        'History', 'Horror', 'Music', 'Mystery', 'Romance', 'Sci-Fi', 'Sport', 'Thriller', 'War', 'Western']
sentimentCols = ['pos', 'neg', 'neutral', 'pos_neg_ratio']
assemblerInputs = [c + "classVec" for c in categoricalColumns] + numericCols
assembler = VectorAssembler(inputCols=assemblerInputs, outputCol="features")
stages = stages + [assembler]
pipeline = Pipeline(stages=stages)
pipelineModel = pipeline.fit(dataset)
preppedDataDF = pipelineModel.transform(dataset)
cols = dataset.columns
selectedcols = ["features"] + cols
dataset = preppedDataDF.select(selectedcols)
display(dataset)
trainingData, testData = dataset.randomSplit(weights=[0.85, 0.15], seed=100)

print('training data count', trainingData.count())
print('test data count', testData.count())
rmse = []
lrModel = lr.fit(trainingData)
predictions = lrModel.transform(testData)
rmse.append(evaluator.evaluate(predictions))

print('Fitting Ridge Regression')
lrridgeModel = lr_ridge.fit(trainingData)
predictions = lrridgeModel.transform(testData)
rmse.append(evaluator.evaluate(predictions))

print('Fitting Decision Trees')
dtModel = dt.fit(trainingData)
predictions = dtModel.transform(testData)
rmse.append(evaluator.evaluate(predictions))

print('Fitting Random Forest')
rfModel = rf.fit(trainingData)
predictions = rfModel.transform(testData)
rmse.append(evaluator.evaluate(predictions))

print('Fitting GBTs')
gbtModel = gbt.fit(trainingData)
predictions = gbtModel.transform(testData)
rmse.append(evaluator.evaluate(predictions))

# print('Fitting GLR')
# glrModel = glr.fit(trainingData)
# predictions = glrModel.transform(testData)
# rmse.append(evaluator.evaluate(predictions))
print('Linear Regression, Ridge Regression, Decision Trees, Random Forests, GBTs')
print('Error: ', rmse)

DataFrame[features: vector, _c0: int, name: string, year: int, movie_rated: string, run_length: string, genres: string, release_date: string, rating: double, num_raters: int, num_reviews: int, review_url: string, Action: int, Adventure: int, Animation: int, Biography: int, Comedy: int, Crime: int, Drama: int, Fantasy: int, History: int, Horror: int, Music: int, Mystery: int, Romance: int, Sci-Fi: int, Sport: int, Thriller: int, War: int, Western: int, pos: double, neg: double, neutral: double, pos_neg_ratio: double, tweet_pos: double, tweet_neg: double, tweet_neutral: double, tweet_pos_neg_ratio: double]

training data count 990
test data count 155


21/12/17 16:56:19 WARN org.apache.spark.ml.util.Instrumentation: [165078a0] regParam is zero, which might cause numerical instability and overfitting.


Fitting Ridge Regression
Fitting Decision Trees
Fitting Random Forest
Fitting GBTs
Linear Regression, Ridge Regression, Decision Trees, Random Forests, GBTs
Error:  [0.6015184079478466, 0.6036145055515468, 0.6224879179215658, 0.6088167848136277, 0.6055161504057649]


# Target predictions and evaluation

In [9]:
DATA_PATH = 'gs://6893_bucket/project/target_movies.csv'

target_data = sqlContext.read.csv(DATA_PATH, inferSchema=True, header=True)
target_data.show(3)

+---+--------------+----+-----------+----------+--------------------+------------+------+----------+-----------+----------+------+---------+---------+---------+------+-----+-----+-------+-------+------+-----+-------+-------+------+-----+--------+---+-------+---+---+-------+-------------+-----------+-----------+-------------+-------------------+
|_c0|          name|year|movie_rated|run_length|              genres|release_date|rating|num_raters|num_reviews|review_url|Action|Adventure|Animation|Biography|Comedy|Crime|Drama|Fantasy|History|Horror|Music|Mystery|Romance|Sci-Fi|Sport|Thriller|War|Western|pos|neg|neutral|pos_neg_ratio|  tweet_pos|  tweet_neg|tweet_neutral|tweet_pos_neg_ratio|
+---+--------------+----+-----------+----------+--------------------+------------+------+----------+-----------+----------+------+---------+---------+---------+------+-----+-----+-------+-------+------+-----+-------+-------+------+-----+--------+---+-------+---+---+-------+-------------+-----------+------

In [51]:
# numericCols + genreCols + tweetSentimentCols
rmse = []
dataset = data
stages = []
categoricalColumns = ['movie_rated']
for categoricalCol in categoricalColumns:
    stringIndexer = StringIndexer(inputCol=categoricalCol, outputCol=categoricalCol + "Index")
    encoder = OneHotEncoder(inputCols=[stringIndexer.getOutputCol()], outputCols=[categoricalCol + "classVec"])
    stages += [stringIndexer, encoder]

numericCols = ['year', 'num_raters', 'num_reviews']
genreCols = ['Action', 'Adventure', 'Animation', 'Biography', 'Comedy', 'Crime', 'Drama', 'Fantasy', 
        'History', 'Horror', 'Music', 'Mystery', 'Romance', 'Sci-Fi', 'Sport', 'Thriller', 'War', 'Western']
sentimentCols = ['pos', 'neg', 'neutral', 'pos_neg_ratio']
tweetSentimentCols = ['tweet_pos', 'tweet_neg', 'tweet_neutral', 'tweet_pos_neg_ratio']
assemblerInputs = [c + "classVec" for c in categoricalColumns] + numericCols + genreCols + tweetSentimentCols
assembler = VectorAssembler(inputCols=assemblerInputs, outputCol="features")
stages = stages + [assembler]
pipeline = Pipeline(stages=stages)

pipelineModel = pipeline.fit(dataset)
preppedDataDF = pipelineModel.transform(dataset)
cols = dataset.columns
selectedcols = ["features"] + cols
dataset = preppedDataDF.select(selectedcols)
display(dataset)
trainingData, testData = dataset.randomSplit(weights=[0.85, 0.15], seed=100)

print('training data count', trainingData.count())
print('test data count', testData.count())
print('Fitting Linear Regression')

lrModel = lr.fit(trainingData)
predictions = lrModel.transform(testData)
rmse.append(evaluator.evaluate(predictions))

print('Fitting Ridge Regression')
lrridgeModel = lr_ridge.fit(trainingData)
predictions = lrridgeModel.transform(testData)
rmse.append(evaluator.evaluate(predictions))

print('Fitting Decision Trees')
dtModel = dt.fit(trainingData)
predictions = dtModel.transform(testData)
rmse.append(evaluator.evaluate(predictions))

print('Fitting Random Forest')
rfModel = rf.fit(trainingData)
predictions = rfModel.transform(testData)
rmse.append(evaluator.evaluate(predictions))

print('Fitting GBTs')
gbtModel = gbt.fit(trainingData)
predictions = gbtModel.transform(testData)
rmse.append(evaluator.evaluate(predictions))

# print('Fitting GLR')
# glrModel = glr.fit(trainingData)
# predictions = glrModel.transform(testData)
# rmse.append(evaluator.evaluate(predictions))
print('Linear Regression, Ridge Regression, Decision Trees, Random Forests, GBTs')
print('Error: ', rmse)

DataFrame[features: vector, _c0: int, name: string, year: int, movie_rated: string, run_length: string, genres: string, release_date: string, rating: double, num_raters: int, num_reviews: int, review_url: string, Action: int, Adventure: int, Animation: int, Biography: int, Comedy: int, Crime: int, Drama: int, Fantasy: int, History: int, Horror: int, Music: int, Mystery: int, Romance: int, Sci-Fi: int, Sport: int, Thriller: int, War: int, Western: int, pos: double, neg: double, neutral: double, pos_neg_ratio: double, tweet_pos: double, tweet_neg: double, tweet_neutral: double, tweet_pos_neg_ratio: double]

training data count 990
test data count 155
Fitting Linear Regression


21/12/17 18:03:34 WARN org.apache.spark.ml.util.Instrumentation: [6dcd38dd] regParam is zero, which might cause numerical instability and overfitting.
21/12/17 18:03:34 WARN org.apache.spark.ml.util.Instrumentation: [6dcd38dd] Cholesky solver failed due to singular covariance matrix. Retrying with Quasi-Newton solver.


Fitting Ridge Regression
Fitting Decision Trees
Fitting Random Forest
Fitting GBTs
Linear Regression, Ridge Regression, Decision Trees, Random Forests, GBTs
Error:  [0.506439565380298, 0.5062682506160198, 0.6134001963912572, 0.5210772542345369, 0.5941677442453148]


In [10]:
target_dataset = target_data
preppedTargetDataDF = pipelineModel.transform(target_dataset)
target_dataset = preppedTargetDataDF.select(selectedcols)
display(target_dataset)

DataFrame[features: vector, _c0: int, name: string, year: int, movie_rated: string, run_length: string, genres: string, release_date: string, rating: double, num_raters: int, num_reviews: int, review_url: string, Action: int, Adventure: int, Animation: int, Biography: int, Comedy: int, Crime: int, Drama: int, Fantasy: int, History: int, Horror: int, Music: int, Mystery: int, Romance: int, Sci-Fi: int, Sport: int, Thriller: int, War: int, Western: int, pos: int, neg: int, neutral: int, pos_neg_ratio: int, tweet_pos: double, tweet_neg: double, tweet_neutral: double, tweet_pos_neg_ratio: double]

In [11]:
latest_movie_preds = lrModel.transform(target_dataset)

In [64]:
predictions.take(5)

[Row(features=SparseVector(38, {0: 1.0, 13: 2015.0, 14: 678829.0, 15: 1594.0, 16: 1.0, 17: 1.0, 22: 1.0, 34: 0.6783, 35: 0.2762, 36: 0.0455, 37: 2.456}), _c0=38, name='The Revenant', year=2015, movie_rated='R', run_length='2h 36min', genres='Action; Adventure; Drama; ', release_date='8 January 2016 (USA)', rating=8.0, num_raters=678829, num_reviews=1594, review_url='https://www.imdb.com/title/tt1663202/reviews/_ajax?ref_=undefined&paginationKey=', Action=1, Adventure=1, Animation=0, Biography=0, Comedy=0, Crime=0, Drama=1, Fantasy=0, History=0, Horror=0, Music=0, Mystery=0, Romance=0, Sci-Fi=0, Sport=0, Thriller=0, War=0, Western=0, pos=0.7181933842239185, neg=0.2735368956743002, neutral=0.0082697201017811, pos_neg_ratio=2.625581395348837, tweet_pos=0.6783004552352049, tweet_neg=0.27617602427921095, tweet_neutral=0.04552352048558422, tweet_pos_neg_ratio=2.456043956043956, prediction=7.482564143267479),
 Row(features=SparseVector(38, {0: 1.0, 13: 1995.0, 14: 355455.0, 15: 384.0, 16: 1.0