In [4]:
# import dependencies for spark
# findspark.init() is deliberately called before the pyspark imports below;
# per the findspark documentation it makes pyspark importable by adding it
# to sys.path at runtime.
import findspark
findspark.init()

# SparkSession is the entry point used to build the session;
# SparkFiles resolves the path of files registered through SparkContext.addFile.
from pyspark.sql import SparkSession
from pyspark import SparkFiles

from pathlib import Path

import warnings
# NOTE: 'ignore' suppresses every Python warning raised after this point in
# the kernel, not only the ones coming from Spark.
warnings.filterwarnings('ignore')

In [5]:
# Build the Spark session for this notebook (getOrCreate reuses an
# already-running session if there is one).
# Settings applied: debug max-to-string field limit of 2000 and
# 2g of driver memory.
spark = (
    SparkSession.builder
    .appName("SparkSQL")
    .config("spark.sql.debug.maxToStringFields", 2000)
    .config("spark.driver.memory", "2g")
    .getOrCreate()
)

In [6]:
# Register the movies CSV with the Spark context; the file itself is read
# in the next cell through SparkFiles.get.
file = "Resources/movies.csv"
spark.sparkContext.addFile(path=file)

In [7]:
# Read the registered CSV into a Spark DataFrame.
# SparkFiles (imported in the first cell) resolves the local path of the file
# that was added with addFile. The lookup key is the file's base name, derived
# from `file` so the name is written in only one place.
# No schema / inferSchema is passed, so every column is loaded as a string.
df = spark.read.csv(SparkFiles.get(Path(file).name), sep=",", header=True)

# show dataframe (df.show() prints the first 20 rows by default)
df.show()

+--------------------+------+---------+----+--------------------+-----+---------+------------------+--------------------+-----------------+--------------+----------+-----------+--------------------+-------+
|                name|rating|    genre|year|            released|score|    votes|          director|              writer|             star|       country|    budget|      gross|             company|runtime|
+--------------------+------+---------+----+--------------------+-----+---------+------------------+--------------------+-----------------+--------------+----------+-----------+--------------------+-------+
|         The Shining|     R|    Drama|1980|June 13, 1980 (Un...|  8.4| 927000.0|   Stanley Kubrick|        Stephen King|   Jack Nicholson|United Kingdom|19000000.0| 46998772.0|        Warner Bros.|  146.0|
|     The Blue Lagoon|     R|Adventure|1980|July 2, 1980 (Uni...|  5.8|  65000.0|    Randal Kleiser|Henry De Vere Sta...|   Brooke Shields| United States| 4500000.0| 588531

In [8]:
# Discard every row that has a null in at least one column, then preview
# what is left.
df = df.dropna(how="any")
df.show()

+--------------------+------+---------+----+--------------------+-----+---------+------------------+--------------------+------------------+--------------+----------+-----------+--------------------+-------+
|                name|rating|    genre|year|            released|score|    votes|          director|              writer|              star|       country|    budget|      gross|             company|runtime|
+--------------------+------+---------+----+--------------------+-----+---------+------------------+--------------------+------------------+--------------+----------+-----------+--------------------+-------+
|         The Shining|     R|    Drama|1980|June 13, 1980 (Un...|  8.4| 927000.0|   Stanley Kubrick|        Stephen King|    Jack Nicholson|United Kingdom|19000000.0| 46998772.0|        Warner Bros.|  146.0|
|     The Blue Lagoon|     R|Adventure|1980|July 2, 1980 (Uni...|  5.8|  65000.0|    Randal Kleiser|Henry De Vere Sta...|    Brooke Shields| United States| 4500000.0| 5

In [9]:
# Collect the Spark DataFrame into a pandas DataFrame.
movies_cleaned = df.toPandas()

# Preview the first five rows.
movies_cleaned.head(n=5)

Unnamed: 0,name,rating,genre,year,released,score,votes,director,writer,star,country,budget,gross,company,runtime
0,The Shining,R,Drama,1980,"June 13, 1980 (United States)",8.4,927000.0,Stanley Kubrick,Stephen King,Jack Nicholson,United Kingdom,19000000.0,46998772.0,Warner Bros.,146.0
1,The Blue Lagoon,R,Adventure,1980,"July 2, 1980 (United States)",5.8,65000.0,Randal Kleiser,Henry De Vere Stacpoole,Brooke Shields,United States,4500000.0,58853106.0,Columbia Pictures,104.0
2,Star Wars: Episode V - The Empire Strikes Back,PG,Action,1980,"June 20, 1980 (United States)",8.7,1200000.0,Irvin Kershner,Leigh Brackett,Mark Hamill,United States,18000000.0,538375067.0,Lucasfilm,124.0
3,Airplane!,PG,Comedy,1980,"July 2, 1980 (United States)",7.7,221000.0,Jim Abrahams,Jim Abrahams,Robert Hays,United States,3500000.0,83453539.0,Paramount Pictures,88.0
4,Caddyshack,R,Comedy,1980,"July 25, 1980 (United States)",7.3,108000.0,Harold Ramis,Brian Doyle-Murray,Chevy Chase,United States,6000000.0,39846344.0,Orion Pictures,98.0


In [10]:
# Removing the four extreme outliers
# Rows are matched by title: every row whose "name" equals one of the titles
# below is removed. A single isin() mask replaces one drop() call per title,
# so adding or removing a title only means editing this list.
outlier_titles = [
    "Paranormal Activity",
    "The Blair Witch Project",
    "The Gallows",
    "El Mariachi",
]

is_outlier = movies_cleaned["name"].isin(outlier_titles)

# .copy() gives an independent frame (as drop() did) rather than a slice of
# the original; the surviving rows keep their original index labels.
movies_cleaned = movies_cleaned.loc[~is_outlier].copy()

movies_cleaned

Unnamed: 0,name,rating,genre,year,released,score,votes,director,writer,star,country,budget,gross,company,runtime
0,The Shining,R,Drama,1980,"June 13, 1980 (United States)",8.4,927000.0,Stanley Kubrick,Stephen King,Jack Nicholson,United Kingdom,19000000.0,46998772.0,Warner Bros.,146.0
1,The Blue Lagoon,R,Adventure,1980,"July 2, 1980 (United States)",5.8,65000.0,Randal Kleiser,Henry De Vere Stacpoole,Brooke Shields,United States,4500000.0,58853106.0,Columbia Pictures,104.0
2,Star Wars: Episode V - The Empire Strikes Back,PG,Action,1980,"June 20, 1980 (United States)",8.7,1200000.0,Irvin Kershner,Leigh Brackett,Mark Hamill,United States,18000000.0,538375067.0,Lucasfilm,124.0
3,Airplane!,PG,Comedy,1980,"July 2, 1980 (United States)",7.7,221000.0,Jim Abrahams,Jim Abrahams,Robert Hays,United States,3500000.0,83453539.0,Paramount Pictures,88.0
4,Caddyshack,R,Comedy,1980,"July 25, 1980 (United States)",7.3,108000.0,Harold Ramis,Brian Doyle-Murray,Chevy Chase,United States,6000000.0,39846344.0,Orion Pictures,98.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5416,Bad Boys for Life,R,Action,2020,"January 17, 2020 (United States)",6.6,140000.0,Adil El Arbi,Peter Craig,Will Smith,United States,90000000.0,426505244.0,Columbia Pictures,124.0
5417,Sonic the Hedgehog,PG,Action,2020,"February 14, 2020 (United States)",6.5,102000.0,Jeff Fowler,Pat Casey,Ben Schwartz,United States,85000000.0,319715683.0,Paramount Pictures,99.0
5418,Dolittle,PG,Adventure,2020,"January 17, 2020 (United States)",5.6,53000.0,Stephen Gaghan,Stephen Gaghan,Robert Downey Jr.,United States,175000000.0,245487753.0,Universal Pictures,101.0
5419,The Call of the Wild,PG,Adventure,2020,"February 21, 2020 (United States)",6.8,42000.0,Chris Sanders,Michael Green,Harrison Ford,Canada,135000000.0,111105497.0,20th Century Studios,100.0


In [None]:
# Write the pandas DataFrame out as a CSV file; index=False leaves the
# row index out of the file.

movies_cleaned.to_csv(path_or_buf='Resources/movies_cleaned.csv', index=False)