In [None]:
from secrets import HADOOP_USER_NAME, SPARK_URI, HADOOP_NAMENODE

In [None]:
import os
# Export the Hadoop user name into the process environment before any
# Hadoop/Spark client is created in the cells below.
os.environ["HADOOP_USER_NAME"] = HADOOP_USER_NAME

In [None]:
import pyspark.sql.functions as F
from pyspark.ml import Pipeline
from pyspark.ml.classification import RandomForestClassifier
from pyspark.ml.feature import IndexToString, StringIndexer, VectorIndexer, VectorAssembler
from pyspark.ml.evaluation import MulticlassClassificationEvaluator
from pyspark.ml.tuning import CrossValidator, ParamGridBuilder

from hdfs import InsecureClient
from pyspark import SparkContext, SparkConf
from pyspark.sql import SparkSession

In [None]:
# HTTP endpoint on the NameNode host (port 50070) used by the HDFS client.
namenode_url = f'http://{HADOOP_NAMENODE}:50070'
client_hdfs = InsecureClient(namenode_url, user=HADOOP_USER_NAME)

In [None]:
# Locate the preprocessed opusdata CSV on HDFS.
# `hdfs_path` is listed like a directory and the first entry ending in ".csv"
# is used (presumably a Spark-written output directory holding a part file —
# TODO confirm).
hdfs_path = "/processed/opusdata_omdb_00.csv"

csv_files = [f for f in client_hdfs.list(hdfs_path) if f.endswith('.csv')]
if not csv_files:
    # Fail with an actionable message instead of a bare IndexError.
    raise FileNotFoundError(f"No .csv file found under HDFS path {hdfs_path!r}")
filename = csv_files[0]

In [None]:
# Spark configuration: request the MongoDB Spark connector package so the
# "mongo" data source is available to the session.
conf = SparkConf()
conf.set("spark.jars.packages", "org.mongodb.spark:mongo-spark-connector_2.11:2.4.1")

# Create the context against the configured master, then obtain a session
# carrying the MongoDB output URI used by the final write.
sc = SparkContext(SPARK_URI, conf=conf)
sparkSession = (
    SparkSession.builder
    .config("spark.mongodb.output.uri", "mongodb://127.0.0.1/bigdata.movie_popularity")
    .appName("RandomForestClassifierExample")
    .getOrCreate()
)

In [None]:
# Read the selected CSV file from HDFS; the first row is the header and
# column types are inferred by Spark.
csv_uri = f"hdfs://{HADOOP_NAMENODE}:8020{hdfs_path}/{filename}"
data = sparkSession.read.csv(csv_uri, header=True, inferSchema=True)


In [None]:
# Columns that are passed through StringIndexer before assembling features.
columns_to_index = [
    "rating",
    "genre",
    "country",
]

In [None]:
# Encode every column in `columns_to_index` as a numeric "<name>_indexed"
# column using StringIndexer.
#
# This cell reassigns `data`, so on a re-run the indexed columns from the
# previous execution would already exist and StringIndexer would reject the
# duplicate output column. Dropping them first makes the cell idempotent
# (DataFrame.drop ignores names that are not present).
indexed_names = [f"{name}_indexed" for name in columns_to_index]
data = data.drop(*indexed_names)

for column_name, indexed_name in zip(columns_to_index, indexed_names):
    labelIndexer = StringIndexer(inputCol=column_name, outputCol=indexed_name).fit(data)
    data = labelIndexer.transform(data)
data.show()

In [None]:
# Input columns for the feature vector, in the order they are assembled.
feature_cols = [
    "sequel",
    "runtime",
    "imdb_votes",
    "ratings_internet_movie_database",
    "ratings_rotten_tomatoes",
    "nominations",
    "wins",
    # won_<award>, nominated_<award> for each award, in that order
    *(
        f"{outcome}_{award}"
        for award in ("golden_globes", "oscars", "baftas")
        for outcome in ("won", "nominated")
    ),
    # actor_id_0 .. actor_id_3
    *(f"actor_id_{slot}" for slot in range(4)),
    # outputs of the StringIndexer step
    "rating_indexed",
    "genre_indexed",
    "country_indexed",
]

In [None]:
# Combine the feature columns into a single "features" vector column.
# With handleInvalid set to "skip", rows with invalid feature values are
# dropped from the assembled output.
assembler = VectorAssembler(inputCols=feature_cols, outputCol="features")
assembler.setHandleInvalid("skip")
assembled_df = assembler.transform(data)

In [None]:
# Hold out 30% of the assembled rows for testing; the fixed seed keeps the
# split stable between runs.
split_weights = [0.7, 0.3]
training_data, test_data = assembled_df.randomSplit(split_weights, seed=1234)

In [None]:
# Number of cross-validation folds, passed to CrossValidator as numFolds.
num_folds = 5

In [None]:
# Accuracy of the "prediction" column against the "success" label; used both
# for model selection in cross-validation and for the train/test reports.
evaluator = MulticlassClassificationEvaluator(
    metricName="accuracy",
    labelCol="success",
    predictionCol="prediction",
)

In [None]:
# Random forest estimator predicting the "success" label from "features".
# numTrees here is only the estimator's own setting; the cross-validation
# parameter grid supplies the values that are actually searched.
# The seed is fixed (matching the split and CrossValidator seeds) so that the
# trained forest, its accuracy and its feature importances are reproducible.
rf = RandomForestClassifier(
    labelCol="success",
    featuresCol="features",
    numTrees=500,
    seed=1234,
)

In [None]:
# Candidate forest sizes explored during cross-validation.
num_trees_candidates = [100, 300, 500]

grid_builder = ParamGridBuilder()
grid_builder.addGrid(rf.numTrees, num_trees_candidates)
paramGrid = grid_builder.build()

# Cross-validator that scores each parameter map with `evaluator` over
# `num_folds` folds; seeded for repeatable fold assignment.
crossval = CrossValidator(
    estimator=rf,
    estimatorParamMaps=paramGrid,
    evaluator=evaluator,
    numFolds=num_folds,
    seed=1234,
)

In [None]:
# Run the cross-validated parameter search on the training split.
model = crossval.fit(training_data)

In [None]:
# Score the training split with the fitted cross-validation model.
predictions_train = model.transform(training_data)

In [None]:
# Score the held-out test split with the fitted cross-validation model.
predictions_test = model.transform(test_data)

In [None]:
# Preview the first five training predictions with their class probabilities.
preview_columns = ['movie_name', 'imdb_id', 'prediction', 'probability']
predictions_train.select(*preview_columns).show(5)

In [None]:
# Inspect the probability vector of the first test prediction.
# first() fetches a single row instead of collecting the entire test set to
# the driver just to index element 0.
predictions_test.select('probability').first()

In [None]:
# Accuracy on the data the model was trained on.
accuracy = evaluator.evaluate(predictions_train)
print(f"Train Accuracy = {accuracy:g}")

In [None]:
# Accuracy on the held-out test split.
accuracy = evaluator.evaluate(predictions_test)
print(f"Test Accuracy = {accuracy:g}")

In [None]:
# Best model selected by cross-validation; used below for feature importances.
best_model = model.bestModel

In [None]:
# Pair every feature name with its importance in the best model.
# featureImportances is an ML vector that may be sparse; on a sparse vector
# `.values` contains only the explicitly stored entries, so zipping it with
# `feature_cols` would attach importances to the wrong names whenever some
# feature has zero importance. toArray() yields one value per feature.
feat_importances = list(zip(feature_cols, best_model.featureImportances.toArray()))

In [None]:
# Order the (feature, importance) pairs from most to least important.
feat_importances = sorted(feat_importances, key=lambda pair: pair[1], reverse=True)

In [None]:
# Display the (feature, importance) pairs as the cell output.
feat_importances

In [None]:
def _select_clean_predictions(predictions):
    """Return movie name, IMDb id and the prediction cast to integer."""
    return predictions.select(
        'movie_name',
        'imdb_id',
        predictions['prediction'].cast('integer'),
    )


# Same projection applied to both the training and the test predictions.
predictions_train_clean = _select_clean_predictions(predictions_train)
predictions_test_clean = _select_clean_predictions(predictions_test)

In [None]:
# Preview the cleaned training predictions.
predictions_train_clean.show()

In [None]:
# Stack training and test predictions into one DataFrame, matching columns by name.
predictions_clean = predictions_train_clean.unionByName(predictions_test_clean)

In [None]:
# Stamp every row with the current date in an "updated_on" column.
predictions_clean = predictions_clean.withColumn("updated_on", F.current_date())

In [None]:
# Preview the combined predictions before they are written out.
predictions_clean.show()

In [None]:
# Persist the predictions through the "mongo" data source. No target is given
# here, so it presumably comes from the "spark.mongodb.output.uri" setting on
# the session — confirm.
# NOTE(review): mode("append") adds rows on every run; re-running the notebook
# looks like it will store the same movies again — confirm this is intended.
predictions_clean.write.format("mongo").mode("append").save()