In [0]:
# %pip install beautifulsoup4 tqdm lxml
# dbutils.library.restartPython()

# Install all dependencies from requirements.txt into the notebook's Python environment
%pip install -r requirements.txt
# Restart the Python process so the packages installed above are picked up by later cells
dbutils.library.restartPython()

# Pull Data

1. Get links associated with each movie in the database
2. Pull scripts and relevant metadata
3. Print out errors for any movies that are not obtained properly

In [0]:
# Unity Catalog location that the table names and volume paths below are built from.
CATALOG = "movie_scripts"  # Ex. wesley_pasfield
SCHEMA = "ad_placement_agent"  # Ex. aandeworkshop

# Make the configured catalog, then the schema, the defaults for this session.
for _use_statement in (f"USE CATALOG {CATALOG}", f"USE SCHEMA {SCHEMA}"):
  spark.sql(_use_statement)

In [0]:
import re
import bs4
import json
import requests

from tqdm import tqdm

BASE_URL = "https://imsdb.com"

def get_all_links():
  """
  Fetches all movie script links from the IMSDb website.

  Returns:
  list: A list of URLs pointing to individual movie scripts. The list is
    empty when the index page cannot be fetched or does not contain the
    expected table cell.
  """
  try:
    # Bound the request so a stalled connection cannot hang the notebook.
    response = requests.get(BASE_URL + '/all-scripts.html', timeout=30)
    response.raise_for_status()
  except requests.RequestException as e:
    print(f"Error fetching all links: {e}")
    return []

  soup = bs4.BeautifulSoup(response.text, 'lxml')
  # The links are read from the last top-aligned table cell on the page.
  cells = soup.find_all('td', {'valign': 'top'})
  if not cells:
    print("Error fetching all links: no <td valign='top'> cell found on the index page")
    return []

  # Skip anchors without an href so one malformed tag cannot abort the whole run.
  hrefs = (anchor.get('href') for anchor in cells[-1].find_all('a'))
  return [BASE_URL + href for href in hrefs if href]

def retrieve_script(url):
  """
  Retrieves the script text from a given URL.

  Args:
    url (str): The URL of the movie script.

  Returns:
    str: The text content of the movie script.

  Raises:
    ValueError: If the page cannot be fetched or does not contain the
      expected ``<td class="scrtext"><pre>`` markup. The original error is
      attached as the exception's cause.
  """
  try:
    # Bound the request so a stalled connection cannot hang the notebook.
    response = requests.get(url, timeout=30)
    response.raise_for_status()
    soup = bs4.BeautifulSoup(response.text, 'lxml')
    container = soup.find('td', {'class': 'scrtext'})
    script_block = container.find('pre') if container is not None else None
    if script_block is None:
      # Name the missing markup instead of surfacing an opaque AttributeError.
      raise ValueError("script text (<td class='scrtext'><pre>) not found")
    return script_block.text
  except Exception as e:
    raise ValueError(f"Error retrieving script from {url}: {e}") from e

# Labelled fields read from the details block of a movie page. Each pattern
# captures the text that follows its label.
_DETAIL_PATTERNS = {
  'IMSDb_opinion': r"IMSDb opinion\s+(?P<opinion>.+?)\n",
  'IMSDb_rating': r"IMSDb rating\s+(?P<rating>.+?)\n",
  'average_user_rating': r"Average user rating\s+\((?P<rating>[\d.]+)(?: out of \d+)?",
  'writers': r"Writers\s+(?P<writers>.+?)\n",
  'genres': r"Genres\s+(?P<genres>.+?)\n",
  'script_date': r"Script Date : (?P<date>[\w\s]+?)\n",
  'movie_release_date': r"Movie Release Date : (?P<date>[\w\s]+?)\n",
  'submitted_by': r"Submitted by: (?P<submitter>\w+)\n"
}
# Fields whose captured text holds several values separated by runs of whitespace.
_LIST_FIELDS = ('writers', 'genres')
# Suffix removed from the page heading to obtain the movie title.
_TITLE_SUFFIX = ' Script'


def _parse_details(texts):
  """Extracts the labelled metadata fields from the details block's text."""
  # List fields always exist (possibly empty) so later flattening never hits a missing key.
  details = {field: [] for field in _LIST_FIELDS}
  for field, pattern in _DETAIL_PATTERNS.items():
    match = re.search(pattern, texts)
    if not match:
      continue
    value = match.group(1)
    if field in _LIST_FIELDS:
      # Drop surrounding whitespace and empty pieces left over by the split.
      details[field] = [part for part in re.split(r'\s{2,}', value.strip()) if part]
    else:
      details[field] = value
  return details


def process_link(url):
  """
  Extracts metadata and script content from a given movie script URL.

  Args:
    url (str): The URL of the movie script page.

  Returns:
    dict: A dictionary containing metadata and script content of the movie.
      'writers' and 'genres' are always present as lists (empty when the page
      does not list them); other metadata keys are present only when found.
      Returns None, after printing the error, if any step fails.
  """
  try:
    # Bound the request so a stalled connection cannot hang the notebook.
    response = requests.get(url, timeout=30)
    response.raise_for_status()
    soup = bs4.BeautifulSoup(response.text, 'lxml')
    table = soup.find('table', {'class': 'script-details'})
    title = table.find('h1').text
    poster = table.find('td', {'align': 'right', 'valign': 'top'}).find('img').get('src')

    texts = table.find('b', string='IMSDb opinion').parent.text
    d = _parse_details(texts)
    # Strip the suffix only when it is really there, so other headings are not truncated.
    d['title'] = title[:-len(_TITLE_SUFFIX)] if title.endswith(_TITLE_SUFFIX) else title
    d['poster'] = poster
    script_url = BASE_URL + soup.find('a', href=re.compile(r'/scripts/')).get('href')
    d['script'] = retrieve_script(script_url)
    return d
  except Exception as e:
    # Include the URL so the failing movie can be identified from the output.
    print(f'Error!: {url}: {e}')
    return None
  
def get_poster_data(poster_url: str, movie_id: int) -> list:
    """
    Fetches image data from a URL and returns it along with the movie_id.
    This function does NOT save any files to disk.

    Args:
        poster_url (str): The URL or site-relative path of the movie poster
            image. Relative paths are resolved against BASE_URL.
        movie_id (int): A unique identifier for the movie.

    Returns:
        list: A list containing [movie_id, img_data_bytes] where
              img_data_bytes is the raw binary content of the image.
              Returns [movie_id, None] if there is no poster (missing value
              or the site's placeholder image) or if fetching fails.
    """
    # Local import: urljoin is not part of the notebook's import cell.
    from urllib.parse import urljoin

    try:
        # Nothing to fetch for a missing src or the "no poster" placeholder.
        if not poster_url or poster_url.endswith("/images/no-poster.gif"):
            return [movie_id, None]
        # Resolve site-relative paths; absolute URLs pass through unchanged.
        absolute_url = urljoin(BASE_URL, poster_url)
        response = requests.get(absolute_url, timeout=30)
        # Do not store an HTTP error page as if it were image bytes.
        response.raise_for_status()
        return [movie_id, response.content]
    except requests.exceptions.RequestException as e:
        print(f"Error fetching poster for movie ID {movie_id} from {poster_url}: {e}")
        return [movie_id, None]  # Return None for image data if fetching fails
    except Exception as e:
        print(f"An unexpected error occurred for movie ID {movie_id} from {poster_url}: {e}")
        return [movie_id, None]

In [0]:
# Get all links from the database

links = get_all_links()

# Pull in all metadata and scripts for each movie obtained
# Print out errors - should be 1222 that pull successfully

movie_data = []
movie_images = []
# The ID is the link's 1-based position in `links`, so a link that fails to
# process leaves a gap in the IDs instead of shifting the ones after it.
for i, link in enumerate(tqdm(links, desc="Processing links"), start=1):
  movie_result = process_link(link)
  if not movie_result:
    continue
  movie_result['unique_movie_id'] = i
  movie_data.append(movie_result)
  # get_poster_data always returns a [movie_id, image_bytes_or_None] pair.
  movie_images.append(get_poster_data(poster_url=movie_result['poster'], movie_id=i))

# Normalize Data

1. Create a unique identifier for each movie
2. Flatten out genre & writers information as they are stored as arrays
3. Separate out the script into its own dataset

In [0]:
# Display the first scraped record to sanity-check the parsed fields.
movie_data[0]

In [0]:
# Pull out all keys from the obtained database

all_keys = set().union(*(d.keys() for d in movie_data))
keys = movie_data[0].keys()

# Isolate the script data to be saved in a volume

script_volume = [{'unique_movie_id': d['unique_movie_id'], 'script': d['script']} for d in movie_data]

# Flatten out writers & genre data
# A record may lack 'writers' or 'genres' when the page did not list them,
# so a missing key is treated as an empty list instead of raising KeyError.

flattened_writers = [{'unique_movie_id': d['unique_movie_id'], 'writer': writer} for d in movie_data for writer in d.get('writers', [])]
flattened_genres = [{'unique_movie_id': d['unique_movie_id'], 'genre': genre} for d in movie_data for genre in d.get('genres', [])]

print('Flattened writers \n')
print(flattened_writers[0:3])
print('Flattened genres \n')
print(flattened_genres[0:3])

# Drop script, writer & genre columns from original data
# NOTE: this mutates movie_data in place; because 'script' is removed here,
# this cell cannot be re-run without re-running the scraping cell first.

for d in movie_data:
  for dropped_key in ('writers', 'genres', 'script'):
    d.pop(dropped_key, None)

print('General metadata')
print(movie_data[0:3])

# Save Data

Below are the final datasets created. They are written to the catalog and schema configured in `CATALOG` and `SCHEMA` (for example, `wesley_pasfield.AandEWorkshop`).

1. movie_metadata

```
unique_movie_id is primary key

unique_movie_id: Unique Identifier for a movie
IMSDb_opinion: Quick summary of the movie
IMSDb_rating: Summary rating
average_user_rating: Average user rating
title: Movie Title
poster: Source path of the movie's poster image
```
2. writers

```
unique_movie_id and writer together are composite primary key
There can be multiple writers associated with each movie

unique_movie_id: Unique identifier for the movie
writer: Writer name associated with the movie
```
3. genres
```
unique_movie_id and genre together are composite primary key
There can be multiple genres associated with each movie

unique_movie_id: Unique identifier for the movie
genre: Genre associated with the movie
```
4. scripts
```
unique_movie_id is the primary key

unique_movie_id: Unique identifier for the movie
script: Script associated with the movie
```



In [0]:
from pyspark.sql.types import StructType, StructField, LongType, BinaryType

# Build one DataFrame per normalized dataset: writers, genres, metadata, scripts.
writers_df, genres_df, movie_metadata_df, scripts_df = [
  spark.createDataFrame(records)
  for records in (flattened_writers, flattened_genres, movie_data, script_volume)
]

# Persist the three relational datasets as tables, replacing any existing contents.
for _frame, _table_name in (
  (writers_df, f"{CATALOG}.{SCHEMA}.writers"),
  (genres_df, f"{CATALOG}.{SCHEMA}.genres"),
  (movie_metadata_df, f"{CATALOG}.{SCHEMA}.movie_metadata"),
):
  _frame.write.mode("overwrite").saveAsTable(_table_name)

# Scripts are saved in Delta format under a volume path rather than as a table.
scripts_writer = scripts_df.write.format("delta").mode("overwrite")
scripts_writer.save(f"/Volumes/{CATALOG}/{SCHEMA}/scripts")

# Posters: each row is [id, image bytes]; both columns are declared nullable.
image_schema = StructType([
  StructField("id", LongType(), True),
  StructField("image_data", BinaryType(), True),
])
images_df = spark.createDataFrame(movie_images, schema=image_schema)
images_writer = images_df.write.format("delta").mode("overwrite")
images_writer.save(f"/Volumes/{CATALOG}/{SCHEMA}/movie_posters")

In [0]:
# Load data to test out

# Read the three tables back into DataFrames.
writers_df, genres_df, movie_metadata_df = [
  spark.table(qualified_name)
  for qualified_name in (
    f"{CATALOG}.{SCHEMA}.writers",
    f"{CATALOG}.{SCHEMA}.genres",
    f"{CATALOG}.{SCHEMA}.movie_metadata",
  )
]

# Read the scripts back from their Delta location in the volume.
scripts_reader = spark.read.format("delta")
scripts_df = scripts_reader.load(f"/Volumes/{CATALOG}/{SCHEMA}/scripts")

In [0]:
# Set the Spark SQL shuffle partition count for this session.
# NOTE(review): the value 2000 is unexplained and no later cell is visible here that
# would benefit from it — confirm it is still needed, or move it to the config cell.
spark.conf.set("spark.sql.shuffle.partitions", "2000")