In [1]:
from secrets import HADOOP_USER_NAME, SPARK_URI, HADOOP_NAMENODE

In [2]:
import os
# Publish the configured Hadoop user name through the process environment.
# NOTE(review): presumably the HDFS client picks the acting user up from this
# variable; confirm against the cluster's authentication mode.
os.environ.update({"HADOOP_USER_NAME": HADOOP_USER_NAME})

In [3]:
import pyspark
from pyspark.sql import SparkSession
from pyspark import SparkContext
import pyspark.sql.functions as F
import pyspark.sql.types as t

In [4]:
# Build (or reuse) the Spark session in a single step.
#
# The master URL, application name and HDFS client option are all handed to the
# builder so that they are in place when the underlying SparkContext is
# created. Constructing SparkContext(SPARK_URI) first meant the builder options
# were applied to an already-running context, and re-running the cell raised
# because a second SparkContext cannot be started in the same process.
# getOrCreate() makes the cell safe to re-run.
sparkSession = (
    SparkSession.builder.master(SPARK_URI)
    .appName("processing-opusdata")
    # Ask the HDFS client to reach datanodes by hostname rather than by the
    # address reported by the namenode.
    .config("spark.hadoop.dfs.client.use.datanode.hostname", "true")
    .getOrCreate()
)

# Keep `sc` available for any cell that expects the raw SparkContext.
sc = sparkSession.sparkContext

['/home/utente/spark-2.4.5-bin-hadoop2.7/./bin/spark-submit', 'pyspark-shell'] {'CONDA_SHLVL': '2', 'LS_COLORS': 'rs=0:di=01;34:ln=01;36:mh=00:pi=40;33:so=01;35:do=01;35:bd=40;33;01:cd=40;33;01:or=40;31;01:mi=00:su=37;41:sg=30;43:ca=30;41:tw=30;42:ow=34;42:st=37;44:ex=01;32:*.tar=01;31:*.tgz=01;31:*.arc=01;31:*.arj=01;31:*.taz=01;31:*.lha=01;31:*.lz4=01;31:*.lzh=01;31:*.lzma=01;31:*.tlz=01;31:*.txz=01;31:*.tzo=01;31:*.t7z=01;31:*.zip=01;31:*.z=01;31:*.Z=01;31:*.dz=01;31:*.gz=01;31:*.lrz=01;31:*.lz=01;31:*.lzo=01;31:*.xz=01;31:*.zst=01;31:*.tzst=01;31:*.bz2=01;31:*.bz=01;31:*.tbz=01;31:*.tbz2=01;31:*.tz=01;31:*.deb=01;31:*.rpm=01;31:*.jar=01;31:*.war=01;31:*.ear=01;31:*.sar=01;31:*.rar=01;31:*.alz=01;31:*.ace=01;31:*.zoo=01;31:*.cpio=01;31:*.7z=01;31:*.rz=01;31:*.cab=01;31:*.wim=01;31:*.swm=01;31:*.dwm=01;31:*.esd=01;31:*.jpg=01;35:*.jpeg=01;35:*.mjpg=01;35:*.mjpeg=01;35:*.gif=01;35:*.bmp=01;35:*.pbm=01;35:*.pgm=01;35:*.ppm=01;35:*.tga=01;35:*.xbm=01;35:*.xpm=01;35:*.tif=01;35:*.tiff=01

In [None]:
# Load the raw OpusData CSV from HDFS: the first row holds the column names
# and column types are inferred from the data.
opusdata = sparkSession.read.options(header=True, inferSchema=True).csv(
    f"hdfs://{HADOOP_NAMENODE}:8020/raw/opusdata.csv"
)

# Print the first rows as a quick sanity check of the load.
opusdata.show()

In [None]:
# Keep only the movies whose production budget is non-zero.
opusdata_filter_0 = opusdata.where(F.col("production_budget") != 0)

In [None]:
# Narrow the selection further: the domestic box office must be non-zero.
opusdata_filter_0 = opusdata_filter_0.where(F.col("domestic_box_office") != 0)

In [None]:
# Narrow the selection further: the international box office must be non-zero.
opusdata_filter_0 = opusdata_filter_0.where(
    F.col("international_box_office") != 0
)

In [None]:
# Discard the columns that the rest of the notebook does not use.
opusdata_dropped = opusdata_filter_0.drop(
    *(
        "movie_odid",
        "running_time",
        "production_method",
        "creative_type",
        "source",
    )
)

In [None]:
# Restrict the data set to movies produced in 2010 or later.
opusdata_years = opusdata_dropped.where(F.col("production_year") >= 2010)

In [None]:
# Keep a single row per (movie_name, production_year) pair.
opusdata_distinct = opusdata_years.drop_duplicates(
    subset=["movie_name", "production_year"]
)

In [None]:
# Collapse the two box-office figures into one "total_box_office" column and
# remove the originals.
opusdata_total_box_office = (
    opusdata_distinct
    .withColumn(
        "total_box_office",
        F.col("domestic_box_office") + F.col("international_box_office"),
    )
    .drop("domestic_box_office", "international_box_office")
)

In [None]:
# Remove rows where "genre" or "sequel" is missing.
# (The variable name keeps its existing spelling because a later cell reads it.)
opusdata_droppped_na = opusdata_total_box_office.dropna(
    subset=["genre", "sequel"]
)

### Get "success" [1]
[1] _Rhee, Travis Ginmu, and Farhana Zulkernine. "Predicting movie box office profitability: A neural network approach." 2016 15th IEEE International Conference on Machine Learning and Applications (ICMLA). IEEE, 2016._

`Profit = (1/2 * total_box_office) - production_budget`

A movie is labelled a success (1) when Profit > 0, and 0 otherwise.

In [None]:
@F.udf(returnType=t.IntegerType())
def success(arguments):
    """Label a movie as profitable (1) or not (0).

    Profit is computed as ``0.5 * total_box_office - production_budget``.

    Args:
        arguments: A two-element sequence ``(total_box_office,
            production_budget)``, as produced by ``F.array(...)`` on the two
            columns.

    Returns:
        1 if the profit is strictly positive, 0 otherwise. ``None`` (a SQL
        NULL) when either input is missing, so that a single incomplete row
        does not raise inside the Python worker and fail the whole job.
    """
    if arguments is None:
        return None

    total_box_office, production_budget = arguments

    # Arithmetic on None raises TypeError; propagate "unknown" as NULL instead.
    if total_box_office is None or production_budget is None:
        return None

    profit = (0.5 * total_box_office) - production_budget
    return 1 if profit > 0 else 0

In [None]:
# Attach the binary "success" label, computed by the `success` UDF from the
# total box office and the production budget of each row.
opusdata_success = opusdata_droppped_na.withColumn(
    "success",
    success(
        F.array(F.col("total_box_office"), F.col("production_budget"))
    ),
)

In [None]:
# Persist the processed data set to HDFS as CSV with a header row, replacing
# any previous output; the data is brought into a single partition first.
opusdata_success.repartition(1).write.csv(
    f"hdfs://{HADOOP_NAMENODE}:8020/processed/opusdata.csv",
    mode="overwrite",
    header=True,
)