In [1]:
from secrets import HADOOP_USER_NAME, SPARK_URI, HADOOP_NAMENODE

In [2]:
import os
os.environ['HADOOP_USER_NAME'] = HADOOP_USER_NAME

In [3]:
import pyspark.sql.functions as F
from pyspark.ml import Pipeline
from pyspark.ml.classification import RandomForestClassifier
from pyspark.ml.feature import IndexToString, StringIndexer, VectorIndexer, VectorAssembler
from pyspark.ml.evaluation import MulticlassClassificationEvaluator
from pyspark.ml.tuning import CrossValidator, ParamGridBuilder

from hdfs import InsecureClient
from pyspark import SparkContext, SparkConf
from pyspark.sql import SparkSession

In [4]:
client_hdfs = InsecureClient(f'http://{HADOOP_NAMENODE}:50070', user=HADOOP_USER_NAME)

In [5]:
# get preprocessed opusdata filename
hdfs_path = "/processed/opusdata_omdb_00.csv"

filename = [f for f in client_hdfs.list(hdfs_path) if f.endswith('.csv')][0]

In [6]:
conf = SparkConf().set("spark.jars.packages", "org.mongodb.spark:mongo-spark-connector_2.11:2.4.1")
sc = SparkContext(SPARK_URI, conf=conf)
sparkSession = SparkSession\
    .builder\
    .config("spark.mongodb.output.uri", "mongodb://127.0.0.1/bigdata.movie_popularity") \
    .appName("RandomForestClassifierExample")\
    .getOrCreate()

['/home/utente/spark-2.4.5-bin-hadoop2.7/./bin/spark-submit', '--conf', 'spark.jars.packages=org.mongodb.spark:mongo-spark-connector_2.11:2.4.1', 'pyspark-shell'] {'CONDA_SHLVL': '2', 'LS_COLORS': 'rs=0:di=01;34:ln=01;36:mh=00:pi=40;33:so=01;35:do=01;35:bd=40;33;01:cd=40;33;01:or=40;31;01:mi=00:su=37;41:sg=30;43:ca=30;41:tw=30;42:ow=34;42:st=37;44:ex=01;32:*.tar=01;31:*.tgz=01;31:*.arc=01;31:*.arj=01;31:*.taz=01;31:*.lha=01;31:*.lz4=01;31:*.lzh=01;31:*.lzma=01;31:*.tlz=01;31:*.txz=01;31:*.tzo=01;31:*.t7z=01;31:*.zip=01;31:*.z=01;31:*.Z=01;31:*.dz=01;31:*.gz=01;31:*.lrz=01;31:*.lz=01;31:*.lzo=01;31:*.xz=01;31:*.zst=01;31:*.tzst=01;31:*.bz2=01;31:*.bz=01;31:*.tbz=01;31:*.tbz2=01;31:*.tz=01;31:*.deb=01;31:*.rpm=01;31:*.jar=01;31:*.war=01;31:*.ear=01;31:*.sar=01;31:*.rar=01;31:*.alz=01;31:*.ace=01;31:*.zoo=01;31:*.cpio=01;31:*.7z=01;31:*.rz=01;31:*.cab=01;31:*.wim=01;31:*.swm=01;31:*.dwm=01;31:*.esd=01;31:*.jpg=01;35:*.jpeg=01;35:*.mjpg=01;35:*.mjpeg=01;35:*.gif=01;35:*.bmp=01;35:*.pbm=01;

In [7]:
data = sparkSession.read.csv(
    f"hdfs://{HADOOP_NAMENODE}:8020{hdfs_path}/{filename}", header=True, inferSchema=True
)


In [8]:
columns_to_index = ['rating', 'genre', 'country']

In [9]:
for col in columns_to_index:
    labelIndexer = StringIndexer(inputCol=col, outputCol=f"{col}_indexed").fit(data)
    data = labelIndexer.transform(data)
data.show()

+--------------------+---------------+-----------------+------+-----------------+------+----------------+-------+--------------------+-------+----------+---------+-------------------------------+-----------------------+------------------+-----------+----+-----------------+-----------------------+----------+----------------+----------+----------------+----------+----------+----------+----------+-------+--------------+-------------+---------------+
|          movie_name|production_year|production_budget|rating|            genre|sequel|total_box_office|runtime|            director|country|imdb_votes|  imdb_id|ratings_internet_movie_database|ratings_rotten_tomatoes|ratings_metacritic|nominations|wins|won_golden_globes|nominated_golden_globes|won_oscars|nominated_oscars|won_baftas|nominated_baftas|actor_id_0|actor_id_1|actor_id_2|actor_id_3|success|rating_indexed|genre_indexed|country_indexed|
+--------------------+---------------+-----------------+------+-----------------+------+----------

In [10]:
feature_cols = [
    "sequel",
    "runtime",
    "imdb_votes",
    "ratings_internet_movie_database",
    "ratings_rotten_tomatoes",
    "nominations",
    "wins",
    "won_golden_globes",
    "nominated_golden_globes",
    "won_oscars",
    "nominated_oscars",
    "won_baftas",
    "nominated_baftas",
    "actor_id_0",
    "actor_id_1",
    "actor_id_2",
    "actor_id_3",
    "rating_indexed",
    "genre_indexed",
    "country_indexed",
]

In [11]:
assembler = VectorAssembler(inputCols=feature_cols, outputCol="features") 
assembled_df = assembler.setHandleInvalid("skip").transform(data)

In [12]:
# Split the data into training and test sets (30% held out for testing)
(training_data, test_data) = assembled_df.randomSplit([0.7, 0.3], seed=1234)

In [13]:
num_folds = 5

In [14]:
evaluator = MulticlassClassificationEvaluator(
    labelCol="success", predictionCol="prediction", metricName="accuracy")

In [15]:
# Train a RandomForest model.
rf = RandomForestClassifier(labelCol="success", featuresCol="features", numTrees=500)

In [16]:
paramGrid = (
    ParamGridBuilder().addGrid(param=rf.numTrees, values=[100, 300, 500]).build()
)

crossval = CrossValidator(
    estimator=rf,
    estimatorParamMaps=paramGrid,
    evaluator=evaluator,
    numFolds=num_folds,
    seed=1234,
)

In [17]:
model = crossval.fit(training_data)

In [18]:
predictions_train = model.transform(training_data)

In [19]:
predictions_test = model.transform(test_data)

In [20]:
predictions_train.select('movie_name', 'imdb_id', 'prediction', 'probability').show(5)

+--------------------+---------+----------+--------------------+
|          movie_name|  imdb_id|prediction|         probability|
+--------------------+---------+----------+--------------------+
|  30 Minutes or Less|tt1622547|       0.0|[0.55247573694000...|
|            47 Ronin|tt1335975|       1.0|[0.49470116593373...|
|       5 Days of War|tt1486193|       0.0|[0.78503742920047...|
|A Bad Moms Christmas|tt6359956|       1.0|[0.26140076374327...|
|  A Dangerous Method|tt1571222|       1.0|[0.46332827877048...|
+--------------------+---------+----------+--------------------+
only showing top 5 rows



In [21]:
predictions_test.select('probability').collect()[0]

Row(probability=DenseVector([0.1079, 0.8921]))

In [22]:
accuracy = evaluator.evaluate(predictions_train)
print("Train Accuracy = %g" % (accuracy))

Train Accuracy = 0.847727


In [23]:
accuracy = evaluator.evaluate(predictions_test)
print("Test Accuracy = %g" % (accuracy))

Test Accuracy = 0.741379


In [24]:
best_model = model.bestModel

In [25]:
feat_importances = list(((col, imp) for col, imp in zip(feature_cols, best_model.featureImportances.values)))

In [26]:
feat_importances.sort(key=lambda x: x[1], reverse=True)

In [27]:
feat_importances

[('imdb_votes', 0.2226554191538051),
 ('genre_indexed', 0.13307773276872722),
 ('nominations', 0.09781662859600385),
 ('sequel', 0.07678091299930563),
 ('rating_indexed', 0.06798217197761458),
 ('ratings_rotten_tomatoes', 0.060881318087953364),
 ('runtime', 0.05349233605587218),
 ('ratings_internet_movie_database', 0.050968474060129054),
 ('wins', 0.049490174589421815),
 ('actor_id_2', 0.041136655326437954),
 ('actor_id_1', 0.038232649195723716),
 ('actor_id_0', 0.03805951472506702),
 ('nominated_baftas', 0.03547734677419303),
 ('actor_id_3', 0.017780239310818832),
 ('won_oscars', 0.010660738892726791),
 ('nominated_oscars', 0.003745221762519059),
 ('nominated_golden_globes', 0.0015829076275183243),
 ('won_golden_globes', 9.61421132707854e-05),
 ('won_baftas', 8.341598289149327e-05)]

In [28]:
predictions_train_clean = predictions_train.select('movie_name', 'imdb_id', predictions_train['prediction'].cast('integer')) 
predictions_test_clean = predictions_test.select('movie_name', 'imdb_id',  predictions_test['prediction'].cast('integer')) 

In [29]:
predictions_train_clean.show()

+--------------------+---------+----------+
|          movie_name|  imdb_id|prediction|
+--------------------+---------+----------+
|  30 Minutes or Less|tt1622547|         0|
|            47 Ronin|tt1335975|         1|
|       5 Days of War|tt1486193|         0|
|A Bad Moms Christmas|tt6359956|         1|
|  A Dangerous Method|tt1571222|         1|
|A Million Ways to...|tt2557490|         1|
|     A Monster Calls|tt3416532|         0|
| A Most Violent Year|tt2937898|         0|
|   A Most Wanted Man|tt1972571|         0|
|A Nightmare on El...|tt1179056|         1|
|          About Time|tt2194499|         1|
|          Alex Cross|tt1712170|         0|
|Alice Through the...|tt2567026|         1|
| Alice in Wonderland|tt1014759|         1|
|     All Good Things|tt1175709|         0|
|              Allied|tt3640424|         1|
|    Almost Christmas|tt4649416|         0|
|Alvin and the Chi...|tt1615918|         1|
|     American Hustle|tt1800241|         1|
|       American Made|tt3532216|

In [30]:
predictions_clean = predictions_train_clean.unionByName(predictions_test_clean)

In [31]:
predictions_clean = predictions_clean.withColumn("updated_on", F.current_date())

In [32]:
predictions_clean.show()

+--------------------+---------+----------+----------+
|          movie_name|  imdb_id|prediction|updated_on|
+--------------------+---------+----------+----------+
|  30 Minutes or Less|tt1622547|         0|2020-05-28|
|            47 Ronin|tt1335975|         1|2020-05-28|
|       5 Days of War|tt1486193|         0|2020-05-28|
|A Bad Moms Christmas|tt6359956|         1|2020-05-28|
|  A Dangerous Method|tt1571222|         1|2020-05-28|
|A Million Ways to...|tt2557490|         1|2020-05-28|
|     A Monster Calls|tt3416532|         0|2020-05-28|
| A Most Violent Year|tt2937898|         0|2020-05-28|
|   A Most Wanted Man|tt1972571|         0|2020-05-28|
|A Nightmare on El...|tt1179056|         1|2020-05-28|
|          About Time|tt2194499|         1|2020-05-28|
|          Alex Cross|tt1712170|         0|2020-05-28|
|Alice Through the...|tt2567026|         1|2020-05-28|
| Alice in Wonderland|tt1014759|         1|2020-05-28|
|     All Good Things|tt1175709|         0|2020-05-28|
|         

In [33]:
predictions_clean.write.format("mongo").mode("append").save()