In [None]:
from secrets import HADOOP_USER_NAME, SPARK_URI, HADOOP_NAMENODE

In [None]:
import os
# Export the Hadoop user name into this process's environment before any
# Spark/HDFS client is created further down.
os.environ.update({"HADOOP_USER_NAME": HADOOP_USER_NAME})

In [None]:
import pyspark
from pyspark.sql import SparkSession
from pyspark import SparkContext
import pyspark.sql.functions as F
import pyspark.sql.types as t

In [None]:
# Build (or reuse) the Spark session for this notebook.
# The master URL and the HDFS client option both go through the builder so that
# they are part of the configuration in place when the underlying SparkContext
# is first created. getOrCreate() also keeps this cell safe to re-run:
# constructing SparkContext(...) directly raises if a context is already active
# in the kernel.
sparkSession = (
    SparkSession.builder.master(SPARK_URI)
    .appName("processing-opusdata")
    .config("spark.hadoop.dfs.client.use.datanode.hostname", "true")
    .getOrCreate()
)
# Keep `sc` available for any cell that needs the low-level context.
sc = sparkSession.sparkContext

In [None]:
# Load the raw OpusData CSV from HDFS: the first row supplies the column names
# and column types are inferred from the data.
opusdata = (
    sparkSession.read.option("header", True)
    .option("inferSchema", True)
    .csv(f"hdfs://{HADOOP_NAMENODE}:8020/raw/opusdata.csv")
)
# Preview the first rows as a quick sanity check of the load.
opusdata.show()

In [None]:
# Keep only rows whose production budget is non-zero (rows where the value is
# null are dropped too, because the comparison is not true for them).
opusdata_filter_0 = opusdata.where(F.col("production_budget") != 0)

In [None]:
# Narrow further to rows with a non-zero domestic box office figure.
opusdata_filter_0 = opusdata_filter_0.where(F.col("domestic_box_office") != 0)

In [None]:
# Narrow further to rows with a non-zero international box office figure.
opusdata_filter_0 = opusdata_filter_0.where(F.col("international_box_office") != 0)

In [None]:
# Remove the listed columns from the filtered data.
columns_to_drop = [
    "movie_odid",
    "running_time",
    "production_method",
    "creative_type",
    "source",
]
opusdata_dropped = opusdata_filter_0.drop(*columns_to_drop)

In [None]:
# Restrict the data to rows with a production year of 2010 or later.
opusdata_years = opusdata_dropped.where(F.col("production_year") >= 2010)

In [None]:
# Collapse rows that share the same movie name and production year into one.
identity_columns = ["movie_name", "production_year"]
opusdata_distinct = opusdata_years.dropDuplicates(identity_columns)

In [None]:
# Add domestic and international takings into a single total, then discard the
# two source columns.
box_office_sum = F.col("domestic_box_office") + F.col("international_box_office")
opusdata_total_box_office = opusdata_distinct.withColumn(
    "total_box_office", box_office_sum
).drop("domestic_box_office", "international_box_office")

In [None]:
# Discard rows that have a missing `genre` or `sequel` value.
opusdata_droppped_na = opusdata_total_box_office.dropna(subset=["genre", "sequel"])

### Get "success" [1]
[1] _Rhee, Travis Ginmu, and Farhana Zulkernine. "Predicting movie box office profitability: A neural network approach." 2016 15th IEEE International Conference on Machine Learning and Applications (ICMLA). IEEE, 2016._

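Profit = (1/2 × `total_box_office`) − `production_budget`, where `total_box_office` is the sum of the domestic and international box office. A movie is labelled a success (1) when Profit > 0, and 0 otherwise.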

In [None]:
@F.udf(returnType=t.IntegerType())
def success(arguments):
    """Label a movie as profitable (1) or not (0).

    Profit is computed as ``0.5 * total_box_office - production_budget``.

    Args:
        arguments: Two-element sequence ``(total_box_office, production_budget)``,
            e.g. the value of ``F.array("total_box_office", "production_budget")``.

    Returns:
        1 when the profit is strictly positive, 0 otherwise, or None (SQL NULL)
        when the array or either of its values is missing.
    """
    # Spark hands SQL NULLs to Python UDFs as None; arithmetic on None would
    # raise TypeError inside the worker and fail the whole job, so propagate
    # NULL instead of crashing.
    if arguments is None:
        return None
    total_box_office, production_budget = arguments
    if total_box_office is None or production_budget is None:
        return None

    profit = (0.5 * total_box_office) - production_budget
    return 1 if profit > 0 else 0

In [None]:
# Attach the binary "success" label, computed by the `success` UDF from the
# total box office and the production budget.
profit_inputs = F.array("total_box_office", "production_budget")
opusdata_success = opusdata_droppped_na.withColumn("success", success(profit_inputs))

In [None]:
# Write the processed data back to HDFS as CSV with a header row, from a single
# partition, replacing any previous output at this location.
processed_path = f"hdfs://{HADOOP_NAMENODE}:8020/processed/opusdata.csv"
opusdata_success.repartition(1).write.csv(processed_path, mode="overwrite", header=True)