## Retrieve data using Spark

In [48]:
#Dependencies
from pathlib import Path
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler,OneHotEncoder

In [49]:
# Import the packages needed to retrieve data with Spark.
# findspark.init() must run BEFORE pyspark is imported: its job is to put the
# Spark installation on sys.path so that the pyspark imports below can resolve.
import findspark
findspark.init()

from pyspark import SparkFiles
from pyspark.sql import SparkSession
from pathlib import Path
import time

In [50]:
# Initialize Spark session
spark = SparkSession.builder.appName("SparkSQL").getOrCreate()


def read_movies_csv(csv_path):
    """Read one movies CSV file into a Spark DataFrame.

    Every dataset part is read with the same parsing options, so the options
    live in one place instead of being repeated per file.

    Parameters
    ----------
    csv_path : str or pathlib.Path
        Location of the CSV file to read.

    Returns
    -------
    pyspark.sql.DataFrame
        The parsed file, with the header row used for column names.
    """
    return spark.read.csv(
        str(csv_path),
        sep=",",
        header=True,
        inferSchema=True,
        quote='"',       # Handles quoted strings
        escape='"',      # Escape character for embedded quotes
        multiLine=True,  # Handles multiline fields in case of long text
    )


# Define the absolute paths of the four dataset parts
data0 = Path('Resources/movies_dataset_0.csv').resolve()
data1 = Path('Resources/movies_dataset_1.csv').resolve()
data2 = Path('Resources/movies_dataset_2.csv').resolve()
data3 = Path('Resources/movies_dataset_3.csv').resolve()

# Read each CSV file using its absolute path
df0 = read_movies_csv(data0)
df1 = read_movies_csv(data1)
df2 = read_movies_csv(data2)
df3 = read_movies_csv(data3)

# Merge the DataFrames. unionByName matches columns by name, whereas union
# matches them by position and would silently misalign the data if one file
# listed its columns in a different order.
merged_df = df0.unionByName(df1).unionByName(df2).unionByName(df3)
merged_df.show()

+----------+--------------------+-------+--------------------+---------+----------+----------+-----------+----------------------+-----------------+---------+----------+-----------------+-------------------+--------------------+--------------------+---------+----------+----------+------------------+--------------------+------------+-------+
|   imdb_id|               title|runtime|            overview|    rated|imdb_votes|popularity|imdb_rating|rotten_tomatoes_rating|metacritic_rating|   budget|   revenue|financial_success|             star_1|              star_2|              star_3|  genre_1|   genre_2|   genre_3|        director_1|          director_2|release_year|outcome|
+----------+--------------------+-------+--------------------+---------+----------+----------+-----------+----------------------+-----------------+---------+----------+-----------------+-------------------+--------------------+--------------------+---------+----------+----------+------------------+-----------------

## Clean, normalize, and standardize data before modeling

In [51]:
# Convert the merged Spark DataFrame into a pandas DataFrame so the rest of
# the preprocessing can use pandas and scikit-learn.
movies_df = merged_df.toPandas()
# Preview the first rows of the converted frame
movies_df.head()

Unnamed: 0,imdb_id,title,runtime,overview,rated,imdb_votes,popularity,imdb_rating,rotten_tomatoes_rating,metacritic_rating,...,star_1,star_2,star_3,genre_1,genre_2,genre_3,director_1,director_2,release_year,outcome
0,tt0094721,Beetlejuice,92 min,A newly dead New England couple seeks help fro...,PG,348874,941.557,7.5,86.0,70.0,...,Alec Baldwin,Geena Davis,Michael Keaton,Comedy,Fantasy,,Tim Burton,,1988,True
1,tt27682129,Prey,100 min,A young couple is compelled to leave their Chr...,R,233550,436.919,7.1,94.0,71.0,...,Amber Midthunder,Dakota Beavers,Dane DiLiegro,Action,Adventure,Horror,Dan Trachtenberg,,2024,False
2,tt0295701,xXx,124 min,Xander Cage is your standard adrenaline junkie...,PG-13,187525,369.083,5.8,48.0,48.0,...,Vin Diesel,Asia Argento,Marton Csokas,Action,Adventure,Thriller,Rob Cohen,,2002,False
3,tt4154756,Avengers: Infinity War,149 min,As the Avengers and their allies have continue...,PG-13,1226533,270.163,8.4,85.0,68.0,...,Robert Downey Jr.,Chris Hemsworth,Mark Ruffalo,Action,Adventure,Sci-Fi,Anthony Russo,Joe Russo,2018,True
4,tt23778968,Thelma,116 min,When 93-year-old Thelma Post gets duped by a p...,Not Rated,37145,263.208,7.0,92.0,74.0,...,Eili Harboe,Kaya Wilkins,Henrik Rafaelsen,Drama,Fantasy,Horror,Joachim Trier,,2024,True


In [52]:
# List the column names available in movies_df
movies_df.columns

Index(['imdb_id', 'title', 'runtime', 'overview', 'rated', 'imdb_votes',
       'popularity', 'imdb_rating', 'rotten_tomatoes_rating',
       'metacritic_rating', 'budget', 'revenue', 'financial_success', 'star_1',
       'star_2', 'star_3', 'genre_1', 'genre_2', 'genre_3', 'director_1',
       'director_2', 'release_year', 'outcome'],
      dtype='object')

In [53]:
# Drop the columns that are not used as features for the model
columns_to_drop = [
    'imdb_id', 'title', 'overview', 'imdb_votes', 'popularity',
    'imdb_rating', 'rotten_tomatoes_rating', 'metacritic_rating',
    'revenue', 'financial_success',
]
# Rebind to the result instead of mutating in place (inplace=True), so the
# frame produced by this step is explicit.
movies_df = movies_df.drop(columns=columns_to_drop)

In [54]:
# Check the data type of each remaining column
movies_df.dtypes

runtime         object
rated           object
budget           int32
star_1          object
star_2          object
star_3          object
genre_1         object
genre_2         object
genre_3         object
director_1      object
director_2      object
release_year     int32
outcome           bool
dtype: object

In [55]:
# Reorder the columns and convert the boolean 'outcome' column to integers
# (True -> 1, False -> 0)
ordered_columns = [
    'release_year', 'runtime', 'rated', 'budget',
    'star_1', 'star_2', 'star_3',
    'genre_1', 'genre_2', 'genre_3',
    'director_1', 'director_2',
    'outcome',
]
# .copy() makes the reordered frame independent of the original, so the column
# assignment below does not trigger a SettingWithCopyWarning.
movies_df = movies_df[ordered_columns].copy()
movies_df['outcome'] = movies_df['outcome'].astype(int)
movies_df.head(3)

Unnamed: 0,release_year,runtime,rated,budget,star_1,star_2,star_3,genre_1,genre_2,genre_3,director_1,director_2,outcome
0,1988,92 min,PG,15000000,Alec Baldwin,Geena Davis,Michael Keaton,Comedy,Fantasy,,Tim Burton,,1
1,2024,100 min,R,0,Amber Midthunder,Dakota Beavers,Dane DiLiegro,Action,Adventure,Horror,Dan Trachtenberg,,0
2,2002,124 min,PG-13,70000000,Vin Diesel,Asia Argento,Marton Csokas,Action,Adventure,Thriller,Rob Cohen,,0


In [56]:
# Extract the leading minutes value from a 'runtime' entry such as "92 min"
def runtime_cleaner(string):
    """Return the runtime in minutes as an int.

    Parameters
    ----------
    string : str or int
        A runtime such as ``"92 min"``. Values that are already integers are
        accepted too, so re-running the cleaning cell on an already converted
        column does not fail.

    Returns
    -------
    int
        The first whitespace-separated token parsed as an integer.

    Raises
    ------
    ValueError
        If the value is blank or its first token is not an integer.
    """
    # str() lets integer input through; split() with no argument also
    # tolerates leading or repeated whitespace, unlike split(" ").
    tokens = str(string).split()
    if not tokens:
        raise ValueError(f"cannot parse runtime from {string!r}")
    return int(tokens[0])

# Apply runtime_cleaner to every entry of the 'runtime' column
movies_df['runtime'] = movies_df['runtime'].map(runtime_cleaner)
# Preview the result
movies_df.head(3)

Unnamed: 0,release_year,runtime,rated,budget,star_1,star_2,star_3,genre_1,genre_2,genre_3,director_1,director_2,outcome
0,1988,92,PG,15000000,Alec Baldwin,Geena Davis,Michael Keaton,Comedy,Fantasy,,Tim Burton,,1
1,2024,100,R,0,Amber Midthunder,Dakota Beavers,Dane DiLiegro,Action,Adventure,Horror,Dan Trachtenberg,,0
2,2002,124,PG-13,70000000,Vin Diesel,Asia Argento,Marton Csokas,Action,Adventure,Thriller,Rob Cohen,,0


In [57]:
# Collect the names of the object-typed (categorical) columns
movies_categorical = list(movies_df.select_dtypes(include="object").columns)

In [58]:
# Display the list of categorical column names
movies_categorical

['rated',
 'star_1',
 'star_2',
 'star_3',
 'genre_1',
 'genre_2',
 'genre_3',
 'director_1',
 'director_2']

In [59]:
# Create a OneHotEncoder instance that returns a dense array
enc = OneHotEncoder(sparse_output=False)

# Fit the encoder on the categorical columns and transform them
encoded_values = enc.fit_transform(movies_df[movies_categorical])

# Build the encoded DataFrame with the generated feature names. The index of
# movies_df is reused so the rows stay aligned when the two frames are later
# combined on their index.
encode_df = pd.DataFrame(
    encoded_values,
    columns=enc.get_feature_names_out(movies_categorical),
    index=movies_df.index,
)

# Display the first few rows of the encoded DataFrame
encode_df.head()


Unnamed: 0,rated_Approved,rated_G,rated_GP,rated_N/A,rated_NC-17,rated_Not Rated,rated_PG,rated_PG-13,rated_Passed,rated_R,...,director_2_ Tyler Gillett,director_2_ Vicky Jenson,"director_2_ Vicky Jenson, Rob Letterman",director_2_ Walt Dohrn,"director_2_ Warren Coleman, Judy Morris","director_2_ Wilfred Jackson, Hamilton Luske",director_2_ Will Merrick,director_2_ Will Speck,"director_2_ Wolfgang Reitherman, Art Stevens",director_2_None
0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0


In [60]:
# Attach the one-hot encoded features by index, then remove the original
# categorical columns they replace
movies_df = (
    movies_df
    .merge(encode_df, left_index=True, right_index=True)
    .drop(columns=movies_categorical)
)
movies_df.head()

Unnamed: 0,release_year,runtime,budget,outcome,rated_Approved,rated_G,rated_GP,rated_N/A,rated_NC-17,rated_Not Rated,...,director_2_ Tyler Gillett,director_2_ Vicky Jenson,"director_2_ Vicky Jenson, Rob Letterman",director_2_ Walt Dohrn,"director_2_ Warren Coleman, Judy Morris","director_2_ Wilfred Jackson, Hamilton Luske",director_2_ Will Merrick,director_2_ Will Speck,"director_2_ Wolfgang Reitherman, Art Stevens",director_2_None
0,1988,92,15000000,1,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
1,2024,100,0,0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
2,2002,124,70000000,0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
3,2018,149,300000000,1,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,2024,116,5000000,1,0.0,0.0,0.0,0.0,0.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0


In [65]:
# Collect the column names into a plain list and preview the first five
column_list = list(movies_df.columns)
column_list[:5]

['release_year', 'runtime', 'budget', 'outcome', 'rated_Approved']

In [66]:
# Separate the preprocessed data into the target array (y) and the feature array (X)
y = movies_df["outcome"].to_numpy()
X = movies_df.drop(columns="outcome").to_numpy()

## Split and Scale the Data

In [67]:
# Split the features and target derived from movies_df into training and testing sets.
# random_state is fixed so the same split is produced on every run.
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=78)

In [None]:
# Build the scaler and fit it on the training split only
scaler = StandardScaler()
X_scaler = scaler.fit(X_train)

# Apply the fitted scaler to the training and testing features
X_train_scaled, X_test_scaled = (
    X_scaler.transform(split) for split in (X_train, X_test)
)

In [22]:
# NOTE(review): `numerical_cols` is not assigned in any cell visible in this
# part of the notebook, and this cell's execution count is out of sequence with
# the cells above - confirm where it is defined, or this cell will raise
# NameError on a fresh kernel.
numerical_cols

Index(['runtime', 'outcome', 'rated_Approved', 'rated_G', 'rated_GP',
       'rated_N/A', 'rated_NC-17', 'rated_Not Rated', 'rated_PG',
       'rated_PG-13',
       ...
       'director_2_ Tyler Gillett', 'director_2_ Vicky Jenson',
       'director_2_ Vicky Jenson, Rob Letterman', 'director_2_ Walt Dohrn',
       'director_2_ Warren Coleman, Judy Morris',
       'director_2_ Wilfred Jackson, Hamilton Luske',
       'director_2_ Will Merrick', 'director_2_ Will Speck',
       'director_2_ Wolfgang Reitherman, Art Stevens', 'director_2_None'],
      dtype='object', length=4839)

In [19]:
# Split our preprocessed data into our features and target arrays
# NOTE(review): the same two statements already appear in an earlier cell of
# this notebook; this looks like a leftover copy.
y = movies_df["outcome"].values
X = movies_df.drop(["outcome"], axis=1).values

## ------------


In [5]:
# NOTE(review): `df` is not created in any cell visible in this part of the
# notebook - presumably it was loaded in a cell that is no longer present;
# confirm before relying on this section.
df.head()

Unnamed: 0,release_year,runtime,rated,budget,star_1,star_2,star_3,genre_1,genre_2,genre_3,director_1,director_2
0,2020,116 min,R,65000000,Chris Hemsworth,Bryon Lerum,Ryder Lerum,Action,Crime,Thriller,Sam Hargrave,
1,2018,124 min,R,19800000,Sandra Bullock,Trevante Rhodes,John Malkovich,Horror,Mystery,Sci-Fi,Susanne Bier,
2,2004,146 min,R,70000000,Denzel Washington,Christopher Walken,Dakota Fanning,Action,Crime,Drama,Tony Scott,
3,2016,120 min,PG-13,110000000,Shailene Woodley,Theo James,Jeff Daniels,Action,Adventure,Mystery,Robert Schwentke,
4,1987,103 min,R,28000000,Eddie Murphy,Judge Reinhold,Jürgen Prochnow,Action,Comedy,Crime,Tony Scott,


In [7]:
# Inspect the distinct values of the 'rated' column before encoding them
df['rated'].unique()

array(['R', 'PG-13', 'TV-14', 'G', 'PG', 'Not Rated', 'Approved', 'TV-MA',
       'NC-17', 'Unrated', nan], dtype=object)

In [8]:
# Distinct rating labels, in order of first appearance
rated_list = df['rated'].unique().tolist()

# Replace each label with its position in rated_list. The result is assigned
# back to the column: calling replace(..., inplace=True) on df['rated'] is a
# chained operation that acts on a temporary Series under pandas
# Copy-on-Write and would leave df unchanged.
for i, rating in enumerate(rated_list):
    df['rated'] = df['rated'].replace(rating, i)


In [9]:
# NOTE(review): this loop repeats the rating encoding from the previous cell.
# If that cell already converted the labels, this pass finds nothing to replace.
# The result is assigned back to the column because a chained
# replace(..., inplace=True) acts on a temporary Series under pandas
# Copy-on-Write and would leave df unchanged.
for i, rating in enumerate(rated_list):
    df['rated'] = df['rated'].replace(rating, i)

df.head()

Unnamed: 0,release_year,runtime,rated,budget,star_1,star_2,star_3,genre_1,genre_2,genre_3,director_1,director_2
0,2020,116,0.0,65000000,Chris Hemsworth,Bryon Lerum,Ryder Lerum,Action,Crime,Thriller,Sam Hargrave,
1,2018,124,0.0,19800000,Sandra Bullock,Trevante Rhodes,John Malkovich,Horror,Mystery,Sci-Fi,Susanne Bier,
2,2004,146,0.0,70000000,Denzel Washington,Christopher Walken,Dakota Fanning,Action,Crime,Drama,Tony Scott,
3,2016,120,1.0,110000000,Shailene Woodley,Theo James,Jeff Daniels,Action,Adventure,Mystery,Robert Schwentke,
4,1987,103,0.0,28000000,Eddie Murphy,Judge Reinhold,Jürgen Prochnow,Action,Comedy,Crime,Tony Scott,


In [10]:
# Columns that share one pool of star names
star_columns = ["star_1", "star_2", "star_3"]

# Every star entry, column after column
stars_list = [name for column in star_columns for name in df[column].tolist()]

# Order-preserving de-duplication; dict.fromkeys avoids the quadratic
# "not in list" scan while keeping first-appearance order
unique_stars_list = list(dict.fromkeys(stars_list))

print(len(unique_stars_list))

# Replace each name with its position in unique_stars_list, using the same
# code in all three columns. The result is assigned back to the column because
# a chained replace(..., inplace=True) acts on a temporary Series under pandas
# Copy-on-Write and would leave df unchanged.
for i, star in enumerate(unique_stars_list):
    for column in star_columns:
        df[column] = df[column].replace(star, i)

df.head()


1623


Unnamed: 0,release_year,runtime,rated,budget,star_1,star_2,star_3,genre_1,genre_2,genre_3,director_1,director_2
0,2020,116,0.0,65000000,0,474,1091,Action,Crime,Thriller,Sam Hargrave,
1,2018,124,0.0,19800000,1,475,982,Horror,Mystery,Sci-Fi,Susanne Bier,
2,2004,146,0.0,70000000,2,476,1092,Action,Crime,Drama,Tony Scott,
3,2016,120,1.0,110000000,3,477,531,Action,Adventure,Mystery,Robert Schwentke,
4,1987,103,0.0,28000000,4,478,1093,Action,Comedy,Crime,Tony Scott,


In [11]:
# Columns that share one pool of genre labels
genre_columns = ["genre_1", "genre_2", "genre_3"]

# Every genre entry, column after column
genre_list = [label for column in genre_columns for label in df[column].tolist()]

# Order-preserving de-duplication; dict.fromkeys avoids the quadratic
# "not in list" scan while keeping first-appearance order
unique_genre_list = list(dict.fromkeys(genre_list))

print(len(unique_genre_list))

# Replace each genre with its position in unique_genre_list, using the same
# code in all three columns. The result is assigned back to the column because
# a chained replace(..., inplace=True) acts on a temporary Series under pandas
# Copy-on-Write and would leave df unchanged.
for i, genre in enumerate(unique_genre_list):
    for column in genre_columns:
        df[column] = df[column].replace(genre, i)

df.head()

32


Unnamed: 0,release_year,runtime,rated,budget,star_1,star_2,star_3,genre_1,genre_2,genre_3,director_1,director_2
0,2020,116,0.0,65000000,0,474,1091,0,12,24,Sam Hargrave,
1,2018,124,0.0,19800000,1,475,982,1,13,20,Susanne Bier,
2,2004,146,0.0,70000000,2,476,1092,0,12,18,Tony Scott,
3,2016,120,1.0,110000000,3,477,531,0,14,13,Robert Schwentke,
4,1987,103,0.0,28000000,4,478,1093,0,15,12,Tony Scott,


In [12]:
# Columns that share one pool of director names
director_columns = ["director_1", "director_2"]

# Every director entry, column after column
director_list = [name for column in director_columns for name in df[column].tolist()]

# Order-preserving de-duplication; dict.fromkeys avoids the quadratic
# "not in list" scan while keeping first-appearance order
unique_director_list = list(dict.fromkeys(director_list))

print(len(unique_director_list))

# Replace each name with its position in unique_director_list, using the same
# code in both columns. The result is assigned back to the column because a
# chained replace(..., inplace=True) acts on a temporary Series under pandas
# Copy-on-Write and would leave df unchanged.
for i, director in enumerate(unique_director_list):
    for column in director_columns:
        df[column] = df[column].replace(director, i)

df.head()

594


Unnamed: 0,release_year,runtime,rated,budget,star_1,star_2,star_3,genre_1,genre_2,genre_3,director_1,director_2
0,2020,116,0.0,65000000,0,474,1091,0,12,24,0,527
1,2018,124,0.0,19800000,1,475,982,1,13,20,1,527
2,2004,146,0.0,70000000,2,476,1092,0,12,18,2,527
3,2016,120,1.0,110000000,3,477,531,0,14,13,3,527
4,1987,103,0.0,28000000,4,478,1093,0,15,12,2,527


## Separate the Features(X) from the Target (y)


 ## Split our data into training and testing

 ## Create a Logistic Regression Model

 ## Fit (train) our model using the training data

 ## Score the model using the test data

 ## Make predictions

## Calculate the Accuracy Score