#Install Packages

In [0]:
pip install lxml html5lib beautifulsoup4

#Initialize URLs

In [0]:
# Root endpoints of the external sources read by this notebook.
DATASET_URL = "https://datasets.imdbws.com/"
BOXOFFICE_URL = "https://www.boxofficemojo.com"
LANGUAGE_URL = "https://en.wikipedia.org/wiki/List_of_ISO_639-1_codes"
IMDB_URL = "https://www.imdb.com/"

# IMDb pages, all derived from the IMDb root.
CHART_URL = IMDB_URL + "chart/"
INDIA_URL = IMDB_URL + "india/"
IMDB_SEARCH_URL = IMDB_URL + "search/title/"
TOP_1000_URL = IMDB_SEARCH_URL + "?groups=top_1000"
LANG_URL = IMDB_SEARCH_URL + "?primary_language"

# Box Office Mojo pages; the chart URL ends with "offset=" so a page offset can be appended.
BOXOFFICE_CHART = BOXOFFICE_URL + "/chart/ww_top_lifetime_gross/?area=XWW&offset="
BOXOFFICE_YEAR = BOXOFFICE_URL + "/year/world/"

# Request headers passed to requests.get by the scraper.
HEADERS = dict([
    ('User-Agent', 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/121.0.0.0 Safari/537.36 Edg/121.0.0.0'),
])

#Initialize variables

In [0]:
# Storage layout: raw, silver and gold folders under one common root.
basePath = "/user/IMDB/"
rawPath = basePath + "raw/"
silverPath = basePath + "silver/"
goldPath = basePath + "gold/"

# Names of the tables built by the notebook.
topTableName = "t_imdb_top"
boTableName = "t_bo"
goldTableName = "t_imdb"
goldTablePath = f"{goldPath}{goldTableName}"

# IMDb dataset files to download.
file_list = [
    'name.basics.tsv.gz',
    'title.basics.tsv.gz',
    'title.crew.tsv.gz',
    'title.principals.tsv.gz',
    'title.ratings.tsv.gz',
]

# Raw folder name = file name without its last 7 characters (".tsv.gz").
RawFolderList = [name[:-7].replace("-co", "") for name in file_list]
# Silver table name = "t_" + folder name with dots turned into underscores.
SilverTableList = ["t_" + folder.replace(".", "_").replace("-co", "") for folder in RawFolderList]
FullTableList = [*SilverTableList, goldTableName, topTableName, boTableName]

# Import Required Functions

In [0]:
from pyspark.sql import functions as F
from pyspark.sql.window import Window
import pyspark.pandas as ps
import requests
from bs4 import BeautifulSoup
from datetime import date
from urllib.request import urlretrieve
import re
import concurrent.futures

#Drop all tables/folders if they exist

In [0]:
# Start from a clean slate: delete every stored file, then the table definitions.
dbutils.fs.rm(f"dbfs:{basePath}", recurse = True)
for tbl in FullTableList:
  # Drop any table of this name in the current schema (original behaviour, kept for compatibility).
  spark.sql(f"DROP TABLE IF EXISTS {tbl}")
  # The notebook writes its tables as silver.<name> / gold.<name>, so those are the ones
  # that actually need dropping. Only the gold table lives in the gold schema.
  tbl_schema = "gold" if tbl == goldTableName else "silver"
  # On a first run the schema does not exist yet (it is created in a later cell).
  if spark.catalog.databaseExists(tbl_schema):
    spark.sql(f"DROP TABLE IF EXISTS {tbl_schema}.{tbl}")

# Create Schema

In [0]:
# Create the silver and gold schemas, each bound to its own storage folder.
for schema_name, schema_location in (("silver", silverPath), ("gold", goldPath)):
    spark.sql(f"CREATE SCHEMA IF NOT EXISTS {schema_name} location '{schema_location}'");

# Extract Language Codes and names from Wiki

In [0]:
# Take the first HTML table on the page, keep the language name and its two-letter
# code column, rename both, and hand the result to Spark.
lang_df = (
    ps.read_html(LANGUAGE_URL)[0][["ISO language name", "Set 1"]]
    .rename(columns={"ISO language name": "lang_name", "Set 1": "lang_code"})
    .to_spark()
)

# Function to Scrape Movie Data

In [0]:
def _to_int(text):
    """Parse scraped cell text such as "$1,234" or "1,001" into an int; return None if it is not numeric."""
    cleaned = text.replace("$", "").replace(",", "").strip()
    try:
        return int(cleaned)
    except ValueError:
        return None


def _parse_box_office_rows(soup, url):
    """Build movie records from the rows of the first table on a Box Office Mojo page."""
    tables = soup.find_all('table')
    if not tables:
        # A page without a table means the layout changed or the request was blocked.
        raise ValueError(f"No table found at {url}")

    is_yearly_box_office = BOXOFFICE_YEAR in url
    url_parts = url.split('/')
    records = []

    for row in tables[0].find_all("tr"):
        row_cols = row.find_all('td')
        # Rows without enough data cells (e.g. a header row) carry no movie.
        if len(row_cols) < 3:
            continue
        box_office = _to_int(row_cols[2].text)
        if box_office is None:
            continue  # gross is missing or not a number; skip this row only

        if is_yearly_box_office:
            # Yearly pages: the year is the last path segment of the URL.
            records.append({
                "movie_name": row_cols[1].text,
                "movie_year": int(url_parts[-1]),
                "box_office_usd": box_office
            })
            continue

        # Chart pages: the title link carries the IMDb id (tt...).
        movie_rank = _to_int(row_cols[0].text)
        anchor = row_cols[1].find("a")
        id_match = re.search(r"(tt\d+)", anchor.get('href', '')) if anchor is not None else None
        if movie_rank is None or id_match is None:
            continue
        records.append({
            "tconst": id_match.group(1),
            "rnk": movie_rank,
            "type": url_parts[-2],
            "rating": -0.1, # Placeholder value
            "votes": -1, # Placeholder value
            "boxoffice": box_office
        })

    return records


def _parse_imdb_links(soup, url):
    """Build ranking records from the title links of an IMDb chart, India or search page."""
    is_india_url = INDIA_URL in url
    # India pages and chart/search pages mark their title links with different CSS classes.
    class_value = 'ipc-metadata-list-item__icon-link' if is_india_url else 'ipc-title-link-wrapper'
    url_parts = url.split('/')
    # Search URLs keep their query string in the last segment; other URLs end with "/".
    movie_type = url_parts[-1] if IMDB_SEARCH_URL in url else url_parts[-2]
    records = []

    for entry in soup.find_all('a', class_=class_value, href=True):
        link = entry.attrs.get('href')
        if not (link and link.startswith('/title/tt')):
            continue

        if is_india_url:
            movie_name = entry.attrs.get('aria-label')
        else:
            heading = entry.find('h3', class_='ipc-title__text')
            movie_name = heading.text.strip() if heading is not None else None

        id_match = re.search(r"(tt\d+)", link)
        # The rank is the leading "<number>." of the displayed title.
        rank_match = re.search(r"^(\d+)\.", movie_name) if movie_name else None
        if id_match is None or rank_match is None:
            continue  # not a ranked title entry; skip it instead of failing the whole page

        records.append({
            "tconst": id_match.group(1),
            "rnk": int(rank_match.group(1)),
            "type": movie_type,
            "rating": -0.1, # Placeholder value
            "votes": -1, # Placeholder value
            "boxoffice": -1 # Placeholder value
        })

    return records


def scrape_movie_data(url):
    """
    Scrapes movie data from the given URL and returns one record per movie.

    The record shape depends on the URL:
      * Box Office Mojo yearly pages (URL contains BOXOFFICE_YEAR):
        {"movie_name", "movie_year", "box_office_usd"}
      * Other Box Office Mojo pages:
        {"tconst", "rnk", "type", "rating", "votes", "boxoffice"} with the
        scraped gross in "boxoffice" and placeholder rating/votes.
      * IMDb pages:
        the same keys, with placeholder rating, votes and boxoffice.

    Rows or links that cannot be parsed are skipped individually.

    Args:
        url (str): The URL to scrape.

    Returns:
        list[dict]: A list of dictionaries containing movie details
        (empty if the page has no usable entries).

    Raises:
        requests.RequestException: On timeout, connection failure or an
            HTTP error status.
        ValueError: If a Box Office Mojo page contains no table.
    """

    # A timeout keeps a stalled connection from blocking a worker thread forever;
    # raise_for_status surfaces blocked/failed requests instead of parsing an error page.
    response = requests.get(url, headers=HEADERS, timeout=30)
    response.raise_for_status()
    soup = BeautifulSoup(response.text, "html.parser")

    if BOXOFFICE_URL in url:
        return _parse_box_office_rows(soup, url)
    return _parse_imdb_links(soup, url)

# Function for concurrent scraping

In [0]:
def scrape_movie_data_concurrently(urls, max_workers=5):
    """
    Scrape several URLs in parallel and merge their records into one list.

    Args:
        urls: Iterable of URLs, each passed to scrape_movie_data.
        max_workers (int): Size of the thread pool.

    Returns:
        list: Records from every URL that was scraped successfully, in
        completion order. A URL whose scrape raises is reported with
        print() and contributes nothing.
    """
    collected = []
    pool = concurrent.futures.ThreadPoolExecutor(max_workers=max_workers)
    with pool:
        # Remember which URL each future belongs to so failures can be reported.
        pending = {}
        for target in urls:
            pending[pool.submit(scrape_movie_data, target)] = target

        for finished in concurrent.futures.as_completed(pending):
            source = pending[finished]
            try:
                collected += finished.result()
            except Exception as e:
                print(f"Error scraping data from {source}: {e}")
    return collected

#Extract all Top Rated Movies

In [0]:
# Start the scrape list with the two IMDb chart pages, then the India "top rated" pages.
url_list = []
url_list.extend(CHART_URL + suffix for suffix in ("toptv/", "top/"))
url_list.extend(
    INDIA_URL + "top-rated-" + language + "-movies/"
    for language in ("indian", "malayalam", "tamil", "telugu")
)

#Extract Top 1000 Movies + Popular Language-wise Movies list

In [0]:
# Search pages to add: the top-1000 group, all releases, and one search per language code.
top_search_urls = [TOP_1000_URL, IMDB_SEARCH_URL + "?release_date=,9999-12-31"]
top_search_urls.extend(LANG_URL + "=" + language for language in ("ml", "ta", "hi", "te", "kn"))
sort_syntax = ["moviemeter,asc", "num_votes,desc"]
rng = range(1, 1000, 50)

# One URL per (search, sort order, start offset). The top-1000 and release-date
# searches are only requested with the ascending sort.
for top_url in top_search_urls:
    asc_only = 'top_1000' in top_url or 'release_date' in top_url
    for sort in sort_syntax:
        if asc_only and 'desc' in sort:
            continue
        url_list.extend(f'{top_url}&sort={sort}&start={n}' for n in rng)

# Extract All Time Boxoffice Details

In [0]:
# Add the all-time worldwide chart pages, stepping the offset by 200 from 0 to 800.
rng = range(0, 1000, 200)
url_list.extend(f"{BOXOFFICE_CHART}{n}" for n in rng)

# Scrape Movie Data

In [0]:
# Scrape every page in url_list in parallel. A page whose scrape raises is
# reported by the helper via print() and contributes no records.
movie_data = scrape_movie_data_concurrently(url_list)

# Create Dataframe and build t_imdb_top table

In [0]:
# Build silver.t_imdb_top from the scraped ranking records.
# For search pages, "type" holds the URL's query string, e.g.
# "?primary_language=ml&sort=moviemeter,asc&start=1"; for chart/India pages it
# holds the last path segment, e.g. "top" or "top-rated-tamil-movies".
imdb_df = spark.createDataFrame(movie_data)

imdb_df = (
  imdb_df
  .filter(~((F.col("type").like("%top_1000%")) & (F.col("type").like("%desc%"))))
  # Language code: read it from the primary_language parameter itself. Taking the
  # text after the last "=" would return the "start" offset, because "&start=N"
  # is the final parameter of every search URL.
  .withColumn("lang_code", F.when(F.col("type").like("%primary_language%,%"), F.regexp_extract(F.col("type"), r"primary_language=([a-z]{2})", 1))
                            .when(F.col("type").like("%malayalam%"), F.lit('ml'))
                            .when(F.col("type").like("%telugu%"), F.lit('te'))
                            .when(F.col("type").like("%tamil%"), F.lit('ta')))
)

# Attach the language name; rows without a language code keep a null lang_name.
imdb_df = imdb_df.join(lang_df, "lang_code", 'leftouter')

imdb_df = (
  imdb_df
  .select("tconst",
          "lang_name",
          F.col("rnk").cast("int"),
          F.col("type").alias("url"),
          # Negative values are the scraper's placeholders; store them as null.
          F.when(F.col("rating") >= 0, F.col("rating")).cast("float").alias("rating"),
          F.when(F.col("votes") >= 0, F.col("votes")).cast("int").alias("votes"),
          # Worldwide grosses can exceed the 32-bit int range (about 2.147 billion), so keep them as long.
          F.when(F.col("boxoffice") >= 0, F.col("boxoffice")).cast("long").alias("box_office"),
          F.when(F.col("type").isin(['toptv','top','top-rated-indian-movies','top-rated-malayalam-movies','top-rated-tamil-movies','top-rated-telugu-movies']), F.lit('Y')).otherwise(F.lit('N')).alias("is_in_top_250"),
          F.when(F.col("type").like("%top_1000%"), F.lit('Y')).otherwise(F.lit('N')).alias("is_in_top_1000"),
          F.when(F.col("type").like("%release_date%moviemeter%"), F.lit('Y')).otherwise(F.lit('N')).alias("is_popular"),
          F.when(F.col("type").like("%primary_language%,%"), F.lit('Y')).otherwise(F.lit('N')).alias("is_primary_lang"),
          # The sort direction sits in the middle of the query string ("sort=<key>,asc&start=N"),
          # so match it there rather than at the end of the string.
          F.when(F.col("type").like("%,asc%"), F.lit('Y')).otherwise(F.lit('N')).alias("is_asc"),
          F.when(F.col("type").like("%,desc%"), F.lit('Y')).otherwise(F.lit('N')).alias("is_desc"))
)

(
  imdb_df
  .write
  .format("delta")
  .mode("overwrite")
  .option("overwriteSchema","true")
  .saveAsTable(f"silver.{topTableName}")
)

# Extract Yearly Box Office Details

In [0]:
# One worldwide yearly box office page per year, from 1977 through the current year.
current_year = date.today().year
rng = range(1977, current_year + 1)
yearly_box_office_url_list = [f"{BOXOFFICE_YEAR}{n}" for n in rng]

# Scrape yearly box office

In [0]:
# Scrape all yearly pages in parallel. A page whose scrape raises is reported
# by the helper via print() and contributes no records.
yearly_box_office_data = scrape_movie_data_concurrently(yearly_box_office_url_list)

# Create Dataframe and build t_bo table

In [0]:
# Build silver.t_bo: one row per (movie, year) taken from the yearly box office pages.
bo_df = spark.createDataFrame(yearly_box_office_data)

(
  bo_df
 .select("movie_name",
        F.col("movie_year").cast("string"),
        # Worldwide grosses can exceed the 32-bit int range (about 2.147 billion), so keep them as long.
        F.col("box_office_usd").cast("long"))
 .write
 .format("delta")
 .mode("overwrite")
 .option("overwriteSchema","true")
 .saveAsTable(f"silver.{boTableName}")
)

#Download the datasets to driver and move to Raw storage folders

In [0]:
# Download each dataset to the driver's local disk, then move it into its raw folder on DBFS.
for file in file_list:
  tablename = file[:-7].replace("-co","")  # file name without the ".tsv.gz" suffix
  BaseURL = DATASET_URL + file
  # Download to an explicit path: the move below reads from /databricks/driver, which is
  # not guaranteed to be the notebook's current working directory.
  localPath = f"/databricks/driver/{file}"
  DriverPath = f"file:{localPath}"
  dbfsPath = f"dbfs:{rawPath}{tablename}/{file}"
  urlretrieve(BaseURL, localPath)
  dbutils.fs.mv(DriverPath, dbfsPath)

#Create Silver Delta tables

## Create function for reading from raw and writing to silver

In [0]:
def load_table(rawFolderName):
  """
  Load one raw dataset folder into a silver Delta table.

  The tab-separated files under rawPath + rawFolderName are read with a header
  and every column as a string. The known numeric columns present in the data
  are then converted with try_cast, and the result overwrites the table
  silver.t_<rawFolderName with dots replaced by underscores>.

  Args:
      rawFolderName (str): Folder name under the raw path, e.g. "title.basics".
  """
  targetTable = "t_" + rawFolderName.replace(".", "_")

  reader = spark.read.format("csv").options(inferSchema="false", header="true", delimiter="\t")
  df = reader.load(rawPath + rawFolderName)

  # Columns to convert from string, where the dataset has them.
  typedColumns = {'averageRating':'float', 'numVotes':'int', 'startYear':'int', 'runtimeMinutes':'int'}
  availableColumns = df.columns
  for colName, dataType in typedColumns.items():
    if colName in availableColumns:
      df = df.withColumn(colName, F.expr(f"try_cast({colName} as {dataType})"))

  writer = df.write.format("delta").mode("overwrite").option("overwriteSchema","true")
  writer.saveAsTable(f"silver.{targetTable}")

## Execute the load_table function for multiple tables in parallel

In [0]:
from threading import Thread
from queue import Queue, Empty

q = Queue()

worker_count = 5 # Number of tables which will be loaded in parallel

load_errors = []  # (queued value, exception) pairs recorded by the worker threads

def run_tasks(function, q):
    """
    Worker loop: apply `function` to queued values until the queue is empty.

    Every value taken from the queue is marked done even when `function`
    raises, so q.join() can never hang on a failed task. Failures are
    recorded in load_errors and raised by the cell after all workers finish.

    Args:
        function: Callable invoked with each queued value.
        q (Queue): Queue of values to process.
    """
    while True:
        # get_nowait avoids the race between checking empty() and calling get():
        # another worker may take the last item in between and leave this one blocked.
        try:
            value = q.get_nowait()
        except Empty:
            return
        try:
            function(value)
        except Exception as exc:
            load_errors.append((value, exc))
        finally:
            q.task_done()

for rawFolderName in RawFolderList:
    q.put(rawFolderName)

for i in range(worker_count):
    t=Thread(target=run_tasks, args=(load_table, q))
    t.daemon = True
    t.start()

q.join()

# Fail the cell if any table could not be loaded, instead of continuing silently.
if load_errors:
    failed_names = ", ".join(str(name) for name, _ in load_errors)
    raise RuntimeError(f"Failed to load table(s): {failed_names}") from load_errors[0][1]

#Create Gold Table (PySpark approach)

In [0]:
# Declare DataFrames: every silver table is read under a short alias so the
# columns can be referenced with that qualifier (e.g. "tbs.tconst") after joining.
tbs = spark.table('silver.t_title_basics').alias("tbs")
trt = spark.table('silver.t_title_ratings').alias("trt")
tnb = spark.table('silver.t_name_basics').alias("tnb")
imd = spark.table('silver.t_imdb_top').alias("imd")
tps = spark.table('silver.t_title_principals').alias("tps")
tcr = spark.table('silver.t_title_crew').alias("tcr")
tbo = spark.table('silver.t_bo').alias("tbo")

# Get crew information
# Principals: keep only the actor and actress rows.
tps = (
     tps
     .filter(F.col("category").isin(['actor', 'actress']))
     .select("tconst", 
             "category", 
             "nconst")
)
# Directors: drop rows whose value matches the LIKE pattern "_N" (any single
# character followed by N), presumably the dataset's \N null marker (TODO confirm),
# then emit one row per id in the comma-separated list.
tcr = (
     tcr
     .filter(~F.col("directors").like("_N"))
     .select("tconst", 
             F.lit("director").alias("category"), 
             F.explode(F.split(F.col("directors"), ',')).alias("nconst"))
)
# Both selects emit (tconst, category, nconst) in the same order, which is
# what the union below relies on.
tps = tps.union(tcr)

# Get box office information
# Match each box office row to titles of type "movie" with the same primary
# title and start year; the inner join to ratings keeps only rated titles.
tbo = (
     tbo
     .join(tbs,(tbo.movie_name == tbs.primaryTitle) & (tbo.movie_year == tbs.startYear) & (tbs.titleType == F.lit("movie")), 'inner')
     .join(trt, trt.tconst == tbs.tconst, 'inner')
)
# Several titles can share a name and year: number them by vote count, highest first.
tbo = tbo.withColumn("rnk", F.row_number().over(Window
                                               .partitionBy("tbs.primaryTitle","tbs.startYear")
                                               .orderBy(F.col("trt.numvotes").desc())))
# Keep only the most-voted title per (title, year) along with its box office value.
tbo = (
     tbo
     .filter(F.col("rnk") == 1)
     .select(F.col("tbs.tconst").alias("tconst"), "tbo.box_office_usd")
)

# Final dataframe which is one row per title
# Restrict to the title types of interest, then left-join everything else so
# titles without ratings, crew, rankings or box office data are still kept.
tbs = tbs.filter(F.col("tbs.titletype").isin(['movie','tvMiniSeries','short','tvSeries','tvShort','tvSpecial']))
tbs = (
     tbs
     .join(trt, trt.tconst == tbs.tconst, 'leftouter')
     .join(tps, tps.tconst == tbs.tconst, 'leftouter')
     .join(tnb, tnb.nconst == tps.nconst, 'leftouter')
     .join(imd, imd.tconst == tbs.tconst, 'leftouter')
     .join(tbo, tbo.tconst == tbs.tconst, 'leftouter')
)
# The joins produce several rows per title (one per crew member and per
# ranking row); grouping by tconst collapses them back to one row, using max()
# for single values and collect_set() for the name lists.
tbs = (
     tbs
     .groupBy("tbs.tconst")
     .agg(F.max(F.regexp_replace(F.initcap("tbs.titletype"), 'Tv', 'TV ')).alias("title_type"),
          F.max("tbs.primarytitle").alias("primary_title"),
          F.max("tbs.originaltitle").alias("original_title"),
          F.max("tbs.startyear").alias("yr"),
          F.max(F.when(F.col("tbs.isadult") == 1,"Y").otherwise("N")).alias("is_adult"),
          F.max("tbs.runtimeminutes").alias("runtime_min"),
          F.max(F.when(~F.col("tbs.genres").like("_N"), F.col("genres"))).alias("genres"),
          # Scraped values take precedence over the dataset values when both exist.
          F.coalesce(F.max("imd.rating"), F.max("trt.averagerating")).alias("avg_rating"),
          F.coalesce(F.max("imd.votes"), F.max("trt.numvotes")).alias("num_votes"),
          # NOTE(review): abs() presumably hides negative values caused by 32-bit
          # overflow of large grosses in the silver tables - confirm.
          F.abs(F.coalesce(F.max("imd.box_office"), F.max("tbo.box_office_usd"))).alias("box_office"),
          # Each rank is taken only from the ranking rows carrying the matching flag.
          F.max(F.when(F.col("imd.is_in_top_250") == 'Y', F.col("imd.rnk"))).alias("top_250_rnk"),
          F.max(F.when(F.col("imd.is_in_top_1000") == 'Y', F.col("imd.rnk"))).alias("top_1000_rnk"),
          F.max(F.when(F.col("imd.is_popular") == 'Y', F.col("imd.rnk"))).alias("popularity_rnk"),
          F.max(F.when((F.col("imd.is_primary_lang") == 'Y') & (F.col("imd.is_asc") == 'Y'), F.col("rnk"))).alias("language_popularity_rnk"),
          F.max(F.when((F.col("imd.is_primary_lang") == 'Y') & (F.col("imd.is_desc") == 'Y'), F.col("rnk"))).alias("language_votes_rnk"),
          F.coalesce(F.max("imd.is_in_top_1000"), F.lit('N')).alias("is_top_1000_movies"),
          # Distinct names joined with "; "; when() without otherwise() yields null
          # for the other categories, so each list only holds its own category.
          F.concat_ws('; ', F.collect_set("imd.lang_name")).alias("language_lst"),
          F.concat_ws('; ', F.collect_set(F.when(F.col("tps.category") == 'director', F.col("tnb.primaryname")))).alias("director_lst"),
          F.concat_ws('; ', F.collect_set(F.when(F.col("tps.category") == 'actor', F.col("tnb.primaryname")))).alias("actor_lst"),
          F.concat_ws('; ', F.collect_set(F.when(F.col("tps.category") == 'actress', F.col("tnb.primaryname")))).alias("actress_lst"),
          F.lit(F.current_date()).alias("last_refresh_date"))
)
# concat_ws returns "" for an empty set; store null in the list columns instead.
tbs = (
     tbs
     .select([
              F.when(F.col(c) == "", None).otherwise(F.col(c)).alias(c)
              if c in ["language_lst", "director_lst", "actor_lst", "actress_lst"]
              else F.col(c)
              for c in tbs.columns
              ])
)
# Overwrite the gold table, replacing its schema if the columns changed.
(
 tbs
 .write
 .format("delta")
 .mode("overwrite")
 .option("overwriteSchema","true")
 .saveAsTable(f"gold.{goldTableName}")
)

# Create Gold Table (SQL approach)
#### The code below is in markdown form to avoid gold table being created twice. To execute it, change it from markdown to sql.

-- SQL alternative to the PySpark cell above: builds gold.t_imdb with one row per title.
CREATE
OR REPLACE TABLE gold.t_imdb AS 
-- tps: actors and actresses from the principals table plus one row per director id.
-- UNION ALL matches columns by position, so both branches emit
-- (tconst, nconst, category) in the same order.
with tps AS (
  SELECT
    tconst,
    nconst,
    category
  FROM
    silver.t_title_principals
  WHERE
    category IN ('actor', 'actress')
  UNION ALL
  SELECT
    tconst,
    explode(split(directors, ',')) AS nconst,
    'director' AS category
  FROM
    silver.t_title_crew
  WHERE
    directors NOT LIKE '_N'
),
-- tbo: box office rows matched to movie titles by name and year. rnk numbers the
-- candidate titles per (title, year) by vote count; rnk = 1 is used in the join below.
tbo AS (
  SELECT
    tbs.tconst,
    bo.box_office_usd,
    row_number() over(
      partition BY tbs.primarytitle,
      tbs.startyear
      ORDER BY
        trt.numvotes DESC nulls last
    ) AS rnk
  FROM
    silver.t_bo bo
    INNER JOIN silver.t_title_basics tbs ON (
      tbs.primarytitle = bo.movie_name
      AND tbs.startyear = bo.movie_year
      AND tbs.titletype = 'movie'
    )
    INNER JOIN silver.t_title_ratings trt ON (trt.tconst = tbs.tconst)
)
SELECT
  tbs.tconst,
  -- Same formatting as the PySpark version, so both approaches produce the same values.
  MAX(regexp_replace(initcap(tbs.titletype), 'Tv', 'TV ')) AS title_type,
  MAX(tbs.primarytitle) AS primary_title,
  MAX(tbs.originaltitle) AS original_title,
  MAX(tbs.startyear) AS yr,
  MAX(
    CASE
      WHEN tbs.isadult = '1' THEN 'Y'
      ELSE 'N'
    end
  ) AS is_adult,
  MAX(tbs.runtimeminutes) AS runtime_min,
  MAX(
    CASE
      WHEN tbs.genres NOT LIKE '_N' THEN tbs.genres
    end
  ) AS genres,
  -- Scraped values take precedence over the dataset values when both exist.
  coalesce(
    MAX(imd.rating),
    MAX(trt.averagerating)
  ) AS avg_rating,
  coalesce(
    MAX(imd.votes),
    MAX(trt.numvotes)
  ) AS num_votes,
  -- abs() mirrors the PySpark version of this table.
  abs(
    coalesce(
      MAX(imd.box_office),
      MAX(tbo.box_office_usd)
    )
  ) AS box_office,
  MAX(
    CASE
      WHEN imd.is_in_top_250 = 'Y' THEN imd.rnk
    end
  ) AS top_250_rnk,
  MAX(
    CASE
      WHEN imd.is_in_top_1000 = 'Y' THEN imd.rnk
    end
  ) AS top_1000_rnk,
  MAX(
    CASE
      WHEN imd.is_popular = 'Y' THEN imd.rnk
    end
  ) AS popularity_rnk,
  MAX(
    CASE
      WHEN imd.is_primary_lang = 'Y'
      AND imd.is_asc = 'Y' THEN imd.rnk
    end
  ) AS language_popularity_rnk,
  MAX(
    CASE
      WHEN imd.is_primary_lang = 'Y'
      AND imd.is_desc = 'Y' THEN imd.rnk
    end
  ) AS language_votes_rnk,
  coalesce(MAX(imd.is_in_top_1000), 'N') AS is_top_1000_movies,
  -- concat_ws returns '' for an empty set; NULLIF stores NULL instead.
  NULLIF(concat_ws('; ', collect_set(imd.lang_name)), '') AS language_lst,
  NULLIF(
    concat_ws(
      '; ',
      collect_set(
        CASE
          WHEN tps.category = 'actor' THEN tnb.primaryname
        end
      )
    ),
    ''
  ) AS actor_lst,
  NULLIF(
    concat_ws(
      '; ',
      collect_set(
        CASE
          WHEN tps.category = 'actress' THEN tnb.primaryname
        end
      )
    ),
    ''
  ) AS actress_lst,
  NULLIF(
    concat_ws(
      '; ',
      collect_set(
        CASE
          WHEN tps.category = 'director' THEN tnb.primaryname
        end
      )
    ),
    ''
  ) AS director_lst,
  current_date() AS last_refresh_date
FROM
  silver.t_title_basics tbs
  LEFT OUTER JOIN silver.t_title_ratings trt ON (trt.tconst = tbs.tconst)
  LEFT OUTER JOIN tps ON (tps.tconst = tbs.tconst)
  LEFT OUTER JOIN silver.t_name_basics tnb ON (tnb.nconst = tps.nconst)
  LEFT OUTER JOIN silver.t_imdb_top imd ON (imd.tconst = tbs.tconst)
  LEFT OUTER JOIN tbo ON (
    tbo.tconst = tbs.tconst
    AND tbo.rnk = 1
  )
WHERE
  tbs.titletype IN (
    'movie',
    'tvMiniSeries',
    'short',
    'tvSeries',
    'tvShort',
    'tvSpecial'
  )
GROUP BY
  tbs.tconst

#Create view for Reporting

In [0]:
%sql
-- Reporting view over gold.t_imdb with display-friendly column names.
-- `Popularity Rank` numbers all rows by the available rank columns in priority
-- order (NULLs last), with vote count as the final tie-breaker.
CREATE OR REPLACE VIEW v_imdb AS
SELECT
  tconst             AS `IMDB ID`,
  title_type         AS `Title Type`,
  primary_title      AS `Primary Title`,
  original_title     AS `Original Title`,
  yr                 AS `Release Year`,
  is_adult           AS `Is Adult`,
  runtime_min        AS `Runtime in Min`,
  genres             AS `Generes`,
  top_250_rnk        AS `Top 250 Rank`,
  ROW_NUMBER() OVER (
    ORDER BY
      popularity_rnk ASC NULLS LAST,
      language_popularity_rnk ASC NULLS LAST,
      top_1000_rnk ASC NULLS LAST,
      language_votes_rnk ASC NULLS LAST,
      num_votes DESC NULLS LAST
  )                  AS `Popularity Rank`,
  is_top_1000_movies AS `Is in Top 1000 Movies`,
  language_lst       AS `Languages`,
  avg_rating,
  num_votes,
  box_office,
  director_lst       AS `Directors`,
  actor_lst          AS `Actors`,
  actress_lst        AS `Actresses`,
  last_refresh_date  AS `Last Refresh Date`
FROM
  gold.t_imdb

In [0]:
%sql
-- Sanity check: number of rows in the gold table.
select count(*) from gold.t_imdb

# Process Power BI dataset

In [0]:
%run "./Trigger Power BI Dataset" $Dataset="IMDB Full"