In [1]:
from secrets import HADOOP_USER_NAME, SPARK_URI, HADOOP_NAMENODE

In [2]:
import os
os.environ['HADOOP_USER_NAME'] = HADOOP_USER_NAME

In [3]:
from pyspark.ml import Pipeline
from pyspark.ml.classification import RandomForestClassifier
from pyspark.ml.feature import IndexToString, StringIndexer, VectorIndexer, VectorAssembler
from pyspark.ml.evaluation import MulticlassClassificationEvaluator
from pyspark.ml.tuning import CrossValidator, ParamGridBuilder

from hdfs import InsecureClient
from pyspark import SparkContext
from pyspark.sql import SparkSession

In [4]:
client_hdfs = InsecureClient(f'http://{HADOOP_NAMENODE}:50070', user=HADOOP_USER_NAME)

In [5]:
# get preprocessed opusdata filename
hdfs_path = "/processed/opusdata_omdb_00.csv"

filename = [f for f in client_hdfs.list(hdfs_path) if f.endswith('.csv')][0]

In [6]:
sc = SparkContext(SPARK_URI)
sparkSession = SparkSession\
    .builder\
    .appName("RandomForestClassifierExample")\
    .getOrCreate()

In [7]:
data = sparkSession.read.csv(
    f"hdfs://{HADOOP_NAMENODE}:8020{hdfs_path}/{filename}", header=True, inferSchema=True
)


In [8]:
columns_to_index = ['rating', 'genre', 'country']

In [9]:
for col in columns_to_index:
    labelIndexer = StringIndexer(inputCol=col, outputCol=f"{col}_indexed").fit(data)
    data = labelIndexer.transform(data)
data.show()

+--------------------+---------------+-----------------+------+-----------------+------+----------------+-------+--------------------+-------+----------+---------+-------------------------------+-----------------------+------------------+-----------+----+-----------------+-----------------------+----------+----------------+----------+----------------+----------+----------+----------+----------+-------+--------------+-------------+---------------+
|          movie_name|production_year|production_budget|rating|            genre|sequel|total_box_office|runtime|            director|country|imdb_votes|  imdb_id|ratings_internet_movie_database|ratings_rotten_tomatoes|ratings_metacritic|nominations|wins|won_golden_globes|nominated_golden_globes|won_oscars|nominated_oscars|won_baftas|nominated_baftas|actor_id_0|actor_id_1|actor_id_2|actor_id_3|success|rating_indexed|genre_indexed|country_indexed|
+--------------------+---------------+-----------------+------+-----------------+------+----------

In [10]:
feature_cols = [
    "sequel",
    "runtime",
    "imdb_votes",
    "ratings_internet_movie_database",
    "ratings_rotten_tomatoes",
    "nominations",
    "wins",
    "won_golden_globes",
    "nominated_golden_globes",
    "won_oscars",
    "nominated_oscars",
    "won_baftas",
    "nominated_baftas",
    "actor_id_0",
    "actor_id_1",
    "actor_id_2",
    "actor_id_3",
    "rating_indexed",
    "genre_indexed",
    "country_indexed",
]

In [11]:
assembler = VectorAssembler(inputCols=feature_cols, outputCol="features") 
assembled_df = assembler.setHandleInvalid("skip").transform(data)

In [12]:
# Split the data into training and test sets (30% held out for testing)
(training_data, test_data) = assembled_df.randomSplit([0.7, 0.3], seed=1234)

In [13]:
num_folds = 5

In [14]:
evaluator = MulticlassClassificationEvaluator(
    labelCol="success", predictionCol="prediction", metricName="accuracy")

In [15]:
# Train a RandomForest model.
rf = RandomForestClassifier(labelCol="success", featuresCol="features", numTrees=500)

In [16]:
paramGrid = (
    ParamGridBuilder().addGrid(param=rf.numTrees, values=[100, 300, 500]).build()
)

crossval = CrossValidator(
    estimator=rf,
    estimatorParamMaps=paramGrid,
    evaluator=evaluator,
    numFolds=num_folds,
    seed=1234,
)

In [17]:
model = crossval.fit(training_data)

In [18]:
predictions_train = model.transform(training_data)

In [19]:
predictions_test = model.transform(test_data)

In [21]:
predictions_test.select('movie_name', 'rawPrediction', 'prediction', 'probability').show(5)

+----------------+--------------------+----------+--------------------+
|      movie_name|       rawPrediction|prediction|         probability|
+----------------+--------------------+----------+--------------------+
|12 Years a Slave|[10.1473193222404...|       1.0|[0.10147319322240...|
|       127 Hours|[8.36548148875023...|       1.0|[0.08365481488750...|
|  22 Jump Street|[16.1721309196209...|       1.0|[0.16172130919620...|
|About Last Night|[55.1612355088167...|       0.0|[0.55161235508816...|
|  American Ultra|[43.9907665953758...|       1.0|[0.43990766595375...|
+----------------+--------------------+----------+--------------------+
only showing top 5 rows



In [22]:
predictions_test.select('probability').collect()[0]

Row(probability=DenseVector([0.1015, 0.8985]))

In [23]:
accuracy = evaluator.evaluate(predictions_train)
print("Train Accuracy = %g" % (accuracy))

Train Accuracy = 0.854545


In [24]:
accuracy = evaluator.evaluate(predictions_test)
print("Test Accuracy = %g" % (accuracy))

Test Accuracy = 0.729885


In [25]:
best_model = model.bestModel

In [26]:
feat_importances = list(((col, imp) for col, imp in zip(feature_cols, best_model.featureImportances.values)))

In [27]:
feat_importances.sort(key=lambda x: x[1], reverse=True)

In [28]:
feat_importances

[('imdb_votes', 0.22764393378476178),
 ('genre_indexed', 0.15842658597130765),
 ('nominations', 0.08876340636030738),
 ('sequel', 0.08351526606190471),
 ('ratings_rotten_tomatoes', 0.07033680619809499),
 ('rating_indexed', 0.0564600268711082),
 ('runtime', 0.05260551598115901),
 ('wins', 0.05042679521912936),
 ('ratings_internet_movie_database', 0.039189450029561715),
 ('nominated_baftas', 0.035365942978525675),
 ('actor_id_1', 0.034825671725164146),
 ('actor_id_2', 0.034421006599139686),
 ('actor_id_0', 0.033552641945929496),
 ('actor_id_3', 0.01803217209641499),
 ('won_oscars', 0.008950525745148468),
 ('nominated_oscars', 0.004289329321917007),
 ('nominated_golden_globes', 0.0024399677370223866),
 ('won_golden_globes', 0.0005136035590450551),
 ('won_baftas', 0.00024135181435823886)]