# Movie Data Analysis with PySpark

## Setup and Imports

In [2]:
# Add the parent directory to sys.path to import from src
import sys
import os
sys.path.append(os.path.abspath(os.path.join(os.getcwd(), '..')))

# Import necessary libraries
import json
from dotenv import load_dotenv
from pyspark.sql import functions as F

# Import project modules
from src.data_extraction import initialize_spark, fetch_all_movies, create_spark_dataframe

# Load variables from a .env file (if one is found) into the process environment
load_dotenv()

# Read the API token from the environment so it is never hard-coded here
API_ACCESS_TOKEN = os.getenv('API_ACCESS_TOKEN')
if not API_ACCESS_TOKEN:
    # Fail fast: without a token every later API request would be rejected,
    # and the cause would be buried in per-movie error logs.
    raise RuntimeError(
        "API_ACCESS_TOKEN is not set. Define it in the environment or in a "
        ".env file before running this notebook."
    )

# Base endpoint for single-movie lookups on the TMDB v3 API
BASE_URL = "https://api.themoviedb.org/3/movie"

print("Setup complete")


Setup complete


## Initialize Spark Session

In [3]:
# Make PySpark launch its Python processes (workers and driver) with the same
# interpreter that runs this kernel, so both sides share one environment.
os.environ.update({
    'PYSPARK_PYTHON': sys.executable,
    'PYSPARK_DRIVER_PYTHON': sys.executable,
})

# Start the Spark session through the project helper and report its version
spark = initialize_spark(app_name="Movie Data Analysis")
print(f"PySpark version: {spark.version}")



2025-04-24 05:18:08,535 - src.data_extraction - INFO - PySpark session initialized: version 3.5.5


PySpark version: 3.5.5


## Step 1: Fetch Movie Data from API

In [4]:
# List of movie IDs to fetch (as specified in the project requirements).
# NOTE(review): ID 0 came back as a 404 in the recorded run and was skipped
# (19 IDs requested, 18 movies fetched). Presumably it is kept on purpose to
# exercise the error-handling path - confirm against the requirements.
movie_ids = [0, 299534, 19995, 140607, 299536, 597, 135397, 420818, 24428,
             168259, 99861, 284054, 12445, 181808, 330457, 351286, 109445,
             321612, 260513]

# Fetch movie data from the API (one request per ID via the project helper)
all_movies_data = fetch_all_movies(movie_ids, BASE_URL, API_ACCESS_TOKEN)

# Save raw data to a JSON file for backup.
# Create the target directory first so this cell also works on a fresh
# checkout where ../data does not exist yet.
raw_json_path = "../data/movies_raw.json"
os.makedirs(os.path.dirname(raw_json_path), exist_ok=True)
with open(raw_json_path, "w", encoding="utf-8") as json_file:
    json.dump(all_movies_data, json_file, indent=4)
print(f"Raw data saved to file, fetched {len(all_movies_data)} movies")

# Convert movie data to a PySpark DataFrame
movies_df = create_spark_dataframe(spark, all_movies_data)

# Display the schema to understand the data structure
movies_df.printSchema()

# Display a sample of the data
movies_df.select("id", "title", "release_date", "budget", "revenue").show(5)



2025-04-24 05:18:17,059 - src.data_extraction - INFO - Fetching data for 19 movies
2025-04-24 05:18:17,784 - src.data_extraction - ERROR - Error 404 for movie_id=0
2025-04-24 05:18:18,623 - src.data_extraction - INFO - Successfully fetched data for movie_id=299534
2025-04-24 05:18:19,612 - src.data_extraction - INFO - Successfully fetched data for movie_id=19995
2025-04-24 05:18:20,568 - src.data_extraction - INFO - Successfully fetched data for movie_id=140607
2025-04-24 05:18:21,512 - src.data_extraction - INFO - Successfully fetched data for movie_id=299536
2025-04-24 05:18:22,422 - src.data_extraction - INFO - Successfully fetched data for movie_id=597
2025-04-24 05:18:23,338 - src.data_extraction - INFO - Successfully fetched data for movie_id=135397
2025-04-24 05:18:24,144 - src.data_extraction - INFO - Successfully fetched data for movie_id=420818
2025-04-24 05:18:25,231 - src.data_extraction - INFO - Successfully fetched data for movie_id=24428
2025-04-24 05:18:26,204 - src.dat

Raw data saved to file, fetched 18 movies


2025-04-24 05:18:53,830 - src.data_extraction - INFO - Created DataFrame with 18 movies and 27 columns


root
 |-- adult: boolean (nullable = true)
 |-- backdrop_path: string (nullable = true)
 |-- belongs_to_collection: map (nullable = true)
 |    |-- key: string
 |    |-- value: long (valueContainsNull = true)
 |-- budget: long (nullable = true)
 |-- credits: map (nullable = true)
 |    |-- key: string
 |    |-- value: array (valueContainsNull = true)
 |    |    |-- element: map (containsNull = true)
 |    |    |    |-- key: string
 |    |    |    |-- value: boolean (valueContainsNull = true)
 |-- genres: array (nullable = true)
 |    |-- element: map (containsNull = true)
 |    |    |-- key: string
 |    |    |-- value: long (valueContainsNull = true)
 |-- homepage: string (nullable = true)
 |-- id: long (nullable = true)
 |-- imdb_id: string (nullable = true)
 |-- origin_country: array (nullable = true)
 |    |-- element: string (containsNull = true)
 |-- original_language: string (nullable = true)
 |-- original_title: string (nullable = true)
 |-- overview: string (nullable = true)
 

In [None]:
# Mark the DataFrame for caching so later operations can reuse it.
# cache() is lazy: nothing is stored until an action runs, such as the write below.
movies_df.cache()

# Persist the raw DataFrame as Parquet, replacing any previous export
movies_df.write.parquet("../data/movies_raw.parquet", mode="overwrite")
print("DataFrame saved to Parquet file")
