# Install Packages

In [0]:
pip install lxml html5lib beautifulsoup4

# Initialize URLs

In [0]:
# Root endpoints of the external sources.
IMDBURL = "https://www.imdb.com/"
DatasetURL = "https://datasets.imdbws.com/"
BoxOfficeUrl = "https://www.boxofficemojo.com"
LanguageURL = "https://en.wikipedia.org/wiki/List_of_ISO_639-1_codes"

# IMDb pages, all built on top of IMDBURL.
ChartURL = f"{IMDBURL}chart/"
IndiaURL = f"{IMDBURL}india/"
IMDBSearchURL = f"{IMDBURL}search/title/"
Top1000URL = f"{IMDBSearchURL}?groups=top_1000"
# The language code is appended later as "=<code>".
LangURL = f"{IMDBSearchURL}?primary_language"

# Box Office Mojo pages; the caller appends the paging offset / the year.
BoxOfficeChart = f"{BoxOfficeUrl}/chart/ww_top_lifetime_gross/?area=XWW&offset="
BoxOfficeYear = f"{BoxOfficeUrl}/year/world/"

# Initialize variables

In [0]:
# Storage layout: one sub-folder per layer below a common root.
basePath = "/user/IMDB/"
rawPath = f"{basePath}raw/"
silverPath = f"{basePath}silver/"
goldPath = f"{basePath}gold/"

# Names of the tables produced by this notebook.
topTableName = "t_imdb_top"
boTableName = "t_bo"
goldTableName = "t_imdb"
goldTablePath = goldPath + goldTableName

# Dataset archives to download.
FileList = [
    'name.basics.tsv.gz',
    'title.basics.tsv.gz',
    'title.crew.tsv.gz',
    'title.principals.tsv.gz',
    'title.ratings.tsv.gz',
]

# Raw folder name = file name without its last 7 characters (".tsv.gz").
RawFolderList = [
    file[:-7].replace("-co","")
    for file in FileList
]

# Silver table name = "t_" + folder name with dots turned into underscores.
SilverTableList = [
    "t_"+folderName.replace(".","_").replace("-co","")
    for folderName in RawFolderList
]

# Every table name handled by the notebook, silver tables first.
FullTableList = SilverTableList + [goldTableName] + [topTableName] + [boTableName]

# Import Required Functions

In [0]:
from pyspark.sql import functions as F
from pyspark.sql.window import Window
import pyspark.pandas as ps
import requests
from bs4 import BeautifulSoup
from datetime import date
from urllib.request import urlretrieve

# Drop all tables/folders if they exist

In [0]:
# Remove every file below the project root so the run starts from a clean slate.
dbutils.fs.rm(f"dbfs:{basePath}", recurse = True)

# Drop the metastore entries too. The notebook writes its tables as
# silver.<name> / gold.<name>, so the names must be schema-qualified here;
# an unqualified DROP only looks in the current schema and would leave the
# real tables registered while their files are already gone.
for tbl in FullTableList:
  schemaName = "gold" if tbl == goldTableName else "silver"
  spark.sql(f"DROP TABLE IF EXISTS {schemaName}.{tbl}")
  # Original behaviour, kept: also clear a same-named table in the current schema.
  spark.sql(f"DROP TABLE IF EXISTS {tbl}")

# Create Schema

In [0]:
# Create the silver and gold schemas if they are missing, each with an explicit
# storage location (silverPath / goldPath). The trailing ";" suppresses the
# cell's output.
spark.sql(f"CREATE SCHEMA IF NOT EXISTS silver location '{silverPath}'");
spark.sql(f"CREATE SCHEMA IF NOT EXISTS gold location '{goldPath}'");

# Extract Language Codes and names from Wiki

In [0]:
# read_html returns one DataFrame per HTML table on the page; the table at
# index 1 is used and reduced to its language-name and two-letter-code columns.
# NOTE(review): the index and the column labels depend on the current page layout.
lang_tables = ps.read_html(LanguageURL)
lang_df = lang_tables[1][["ISO language name","639-1"]]
lang_df.columns = ['lang_name','lang_code']

# Extract all Top Rated Movies

In [0]:
# Parallel accumulators shared by all scraping cells: position i in every list
# describes the same scraped row. Negative values (-0.1 / -1) are placeholders
# for fields a page does not provide; the table-building cell turns negative
# values into NULL.
ID = []
Type = []
Rank = []
Votes =[]
Rating = []
BO = []
# Markers used to cut the title id out of an href; later cells reuse them.
start = "/title/"
end = "/"
urlList = [f'{ChartURL}toptv/', f'{ChartURL}top/', f'{IndiaURL}top-rated-indian-movies/', f'{IndiaURL}top-rated-malayalam-movies/', f'{IndiaURL}top-rated-tamil-movies/', f'{IndiaURL}top-rated-telugu-movies/']
for url in urlList:
  # A timeout keeps an unresponsive server from blocking the notebook forever.
  response = requests.get(url, timeout=30)
  soup = BeautifulSoup(response.text, "html.parser")

  # Query the page once instead of re-running the same selector for every list.
  hrefs = [a.attrs.get('href') for a in soup.select('td.titleColumn a')]
  ranks = [int(b.attrs.get('data-value')) for b in soup.select('td.posterColumn span[name=rk]')]
  if len(hrefs) != len(ranks):
    # Mismatched counts would silently misalign the parallel lists.
    raise ValueError(f"{url}: found {len(hrefs)} titles but {len(ranks)} ranks")

  # Last path segment of the chart URL, e.g. "toptv" or "top-rated-tamil-movies".
  # Taken from the URL itself, so it works for both the chart/ and india/ prefixes.
  chartName = url.rstrip(end).rsplit(end, 1)[-1]

  rowCount = len(hrefs)
  # The title id is the text between "/title/" and the last "/" of the href.
  ID.extend(href[href.find(start)+len(start):href.rfind(end)] for href in hrefs)
  Rank.extend(ranks)
  Type.extend([chartName] * rowCount)
  Rating.extend([-0.1] * rowCount)
  Votes.extend([-1] * rowCount)
  BO.extend([-1] * rowCount)

# Extract Top 1000 Movies + Popular Language-wise Movies list

In [0]:
# Search result lists to scrape: top 1000, all releases, and one list per language code.
TopSearchURLs = [f"{IMDBSearchURL}?groups=top_1000", f"{IMDBSearchURL}?release_date=,9999-12-31", f"{LangURL}=ml", f"{LangURL}=ta", f"{LangURL}=hi", f"{LangURL}=te", f"{LangURL}=kn", f"{LangURL}=ko"]
SortSyntax = ["moviemeter,asc","num_votes,desc"]
# Start offsets 1, 51, ..., 951 (20 result pages per list).
rng = range(1,1000,50)
for TopUrl in TopSearchURLs:
  for Sort in SortSyntax:
    # The top-1000 and release-date lists are only fetched in the ascending sort.
    if ('top_1000' in TopUrl or 'release_date' in TopUrl) and 'desc' in Sort:
      continue
    for n in rng:
      url = f'{TopUrl}&sort={Sort}&start={n}'
      # A timeout keeps an unresponsive server from blocking the notebook forever.
      response = requests.get(url, timeout=30)
      soup = BeautifulSoup(response.text, "html.parser")
      movie_div = soup.find_all('div', class_='lister-item mode-advanced')
      for container in movie_div:
        # Type: base URL + sort key; the table-building cell derives its flags from this string.
        Type.append(TopUrl+Sort)

        # Title ID: text between "/title/" and the last "/" of the first link.
        tit = container.find("a")['href']
        ID.append(tit[tit.find(start)+len(start):tit.rfind(end)])

        # Rank: index label without its last character and thousands separators.
        rnk = container.find("span",{"class":"lister-item-index"})
        Rank.append(int(rnk.text[:-1].replace(',','')))

        # IMDb rating (-0.1 = not available)
        if container.strong:
          imdb = float(container.strong.text)
          Rating.append(imdb)
        else:
          Rating.append(-0.1)

        # There are two NV containers; grab both of them as they hold both the votes and the grosses.
        nv = container.find_all('span', attrs={'name': 'nv'})

        # Votes (-1 = not available). An unparsable value is also stored as -1:
        # a 0 would count as a real value downstream and override the vote
        # count from the ratings dataset.
        if nv:
          try:
            vote = int(nv[0].text.replace(',',''))
          except ValueError:
            vote = -1
          Votes.append(vote)
        else:
          Votes.append(-1)

        # Box office is not read from these pages (-1 = not available).
        BO.append(-1)

# Extract All Time Boxoffice Details

In [0]:
# Page offsets 0, 200, 400, 600, 800 of the worldwide lifetime-gross chart.
rng = range(0,1000,200)
for n in rng:
  url = BoxOfficeChart + str(n)
  # A timeout keeps an unresponsive server from blocking the notebook forever.
  response = requests.get(url, timeout=30)
  page_soup = BeautifulSoup(response.text, "lxml")
  # Rows of the first table on the page.
  table = page_soup.find_all('table')[0].find_all("tr")
  for row in table:
    row_cols = row.find_all('td')
    # Rows without <td> cells (e.g. header rows) are skipped.
    if row_cols:
      Type.append(url)

      # Column 0: rank, with thousands separators removed.
      Rank.append(int(row_cols[0].text.replace(",","")))

      # Column 1: link whose href contains the title id.
      tit = row_cols[1].find("a")['href']
      ID.append(tit[tit.find(start)+len(start):tit.rfind(end)])

      # Column 2: gross amount, stripped of "$" and thousands separators.
      BO.append(int(row_cols[2].text.replace("$","").replace(",","")))

      # Rating and votes are not read from this page.
      Rating.append(-0.1)

      Votes.append(-1)

# Create Dataframe and build t_imdb_top table

In [0]:
# Assemble the scraped parallel lists into one frame (one row per scraped entry).
imdb_df = ps.DataFrame({
    "tconst": ID,
    "rnk": Rank,
    "type": Type,
    "rating": Rating,
    "votes": Votes,
    "boxoffice": BO
})

imdb_df = imdb_df.to_spark()
# Use a separate name for the Spark version so lang_df keeps its original type
# and this cell can be re-run without failing on a second .to_spark().
lang_sdf = lang_df.to_spark()

imdb_df = (
  imdb_df
  .filter(~((F.col("type").like("%top_1000%")) & (F.col("type").like("%desc%"))))
  # Language code: first two characters after the last "=" for the
  # primary_language searches; fixed codes for the language-specific charts.
  .withColumn("lang_code", F.when(F.col("type").like("%primary_language%,%"), F.substring(F.substring_index(F.col("type"), '=', -1),1,2))
                            .when(F.col("type").like("%malayalam%"), F.lit('ml'))
                            .when(F.col("type").like("%telugu%"), F.lit('te'))
                            .when(F.col("type").like("%tamil%"), F.lit('ta')))
)

# Left join: rows without a (known) language code keep a NULL lang_name.
imdb_df = imdb_df.join(lang_sdf, "lang_code", 'leftouter')

imdb_df = (
  imdb_df
  .select("tconst",
          "lang_name",
          F.col("rnk").cast("int"),
          F.col("type").alias("url"),
          # Negative placeholders become NULL.
          F.when(F.col("rating") >= 0, F.col("rating")).cast("float").alias("rating"),
          F.when(F.col("votes") >= 0, F.col("votes")).cast("int").alias("votes"),
          # 64-bit: lifetime worldwide grosses can exceed the 32-bit maximum
          # of 2,147,483,647, which a cast to "int" cannot represent.
          F.when(F.col("boxoffice") >= 0, F.col("boxoffice")).cast("long").alias("box_office"),
          F.when(F.col("type").isin(['toptv','top','top-rated-indian-movies','top-rated-malayalam-movies','top-rated-tamil-movies','top-rated-telugu-movies']), F.lit('Y')).otherwise(F.lit('N')).alias("is_in_top_250"),
          F.when(F.col("type").like("%top_1000%"), F.lit('Y')).otherwise(F.lit('N')).alias("is_in_top_1000"),
          F.when(F.col("type").like("%release_date%moviemeter%"), F.lit('Y')).otherwise(F.lit('N')).alias("is_popular"),
          F.when(F.col("type").like("%primary_language%,%"), F.lit('Y')).otherwise(F.lit('N')).alias("is_primary_lang"),
          # The sort direction is the tail of the type string; 'esc' is the
          # last three characters of "desc".
          F.when(F.substring("type",-3,3) == 'asc', F.lit('Y')).otherwise(F.lit('N')).alias("is_asc"),
          F.when(F.substring("type",-3,3) == 'esc', F.lit('Y')).otherwise(F.lit('N')).alias("is_desc"))
)

# Replace the silver table (data and schema) with the freshly scraped rows.
(
  imdb_df
  .write
  .format("delta")
  .mode("overwrite")
  .option("overwriteSchema","true")
  .saveAsTable(f"silver.{topTableName}")
)

# Extract Yearly Box Office Details

In [0]:
# Parallel accumulators: position i in each list describes one box-office row.
MovieName = []
MovieYear = []
BoxOffice = []
current_year = date.today().year
# One page per year, from 1977 up to and including the current year.
rng = range(1977,current_year+1,1)
for n in rng:
  MainUrl = BoxOfficeYear + str(n)
  # A timeout keeps an unresponsive server from blocking the notebook forever.
  response = requests.get(MainUrl, timeout=30)
  page_soup = BeautifulSoup(response.text, "lxml")
  # Rows of the first table on the page.
  table = page_soup.find_all('table')[0].find_all("tr")
  for row in table:
    row_cols = row.find_all('td')
    # Rows without <td> cells (e.g. header rows) are skipped.
    if row_cols:
      MovieName.append(row_cols[1].text)
      MovieYear.append(n)
      # Column 2: amount, stripped of "$" and thousands separators.
      BoxOffice.append(int(row_cols[2].text.replace("$","").replace(",","")))

# Create Dataframe and build t_bo table

In [0]:
# Assemble the yearly box-office lists into one frame (one row per movie and year).
bo_df = ps.DataFrame({
    "movie_name": MovieName,
    "movie_year": MovieYear,
    "box_office_usd": BoxOffice
})
bo_df = bo_df.to_spark()

# Replace the silver table (data and schema) with the freshly scraped rows.
(
  bo_df
 .select("movie_name",
        F.col("movie_year").cast("string"),
        # 64-bit: a worldwide gross can exceed the 32-bit maximum of
        # 2,147,483,647, which a cast to "int" cannot represent.
        F.col("box_office_usd").cast("long"))
 .write
 .format("delta")
 .mode("overwrite")
 .option("overwriteSchema","true")
 .saveAsTable(f"silver.{boTableName}")
)

# Download the datasets to driver and move to Raw storage folders

In [0]:
# Download each dataset archive to the driver's local disk, then move it into
# its own folder below rawPath.
for file in FileList:
  # Folder name = file name without its last 7 characters (".tsv.gz").
  tablename = file[:-7].replace("-co","")
  BaseURL = DatasetURL + file
  # Download to an explicit absolute path: the move below reads from
  # /databricks/driver, so relying on the current working directory would
  # break whenever the notebook runs from a different directory.
  localPath = f"/databricks/driver/{file}"
  DriverPath = f"file:{localPath}"
  dbfsPath = f"dbfs:{rawPath}{tablename}/{file}"
  urlretrieve(BaseURL, localPath)
  dbutils.fs.mv(DriverPath, dbfsPath)

# Create Silver Delta tables

## Create function for reading from raw and writing to silver

In [0]:
def load_table(rawFolderName):
  """Load one raw dataset folder into a Delta table in the silver schema.

  Reads everything under rawPath + rawFolderName as tab-delimited text with a
  header row and no schema inference, converts the known numeric columns with
  try_cast, and overwrites (data and schema) the table
  silver.t_<rawFolderName with "." replaced by "_">.

  Args:
    rawFolderName: name of the folder below rawPath, e.g. "title.basics".
  """
  silverTableName = "t_"+rawFolderName.replace(".","_")
  # Columns to convert from string, mapped to their target type.
  typedColumns = {'averageRating':'float', 'numVotes':'int', 'startYear':'int', 'runtimeMinutes':'int'}

  reader = spark.read.format("csv")
  for optName, optValue in (("inferSchema", "false"), ("header","true"), ("delimiter","\t")):
    reader = reader.option(optName, optValue)
  df = reader.load(rawPath + rawFolderName)

  # Only columns actually present in this dataset are converted.
  for colName in df.columns:
    if colName in typedColumns:
      dataType = typedColumns[colName]
      df = df.withColumn(colName, F.expr(f"try_cast({colName} as {dataType})"))

  writer = df.write.format("delta").mode("overwrite").option("overwriteSchema","true")
  writer.saveAsTable(f"silver.{silverTableName}")

## Execute the load_table function for multiple tables in parallel

In [0]:
from threading import Thread
from queue import Empty, Queue

q = Queue()

worker_count = 5 # Number of tables which will be loaded in parallel

def run_tasks(function, q, errors=None):
    """Drain the queue, calling function(value) for every queued value.

    Args:
        function: callable taking one queued value.
        q: queue.Queue holding the values to process.
        errors: optional list; when given, a failing call is recorded in it as
            (value, exception) and the worker moves on to the next value.
            When omitted, the exception propagates out of the worker.

    task_done() is called for every value taken, including failed ones, so a
    q.join() in the caller cannot block forever because of an exception.
    """
    while True:
        # get_nowait avoids the race where another worker takes the last item
        # between an empty() check and a blocking get().
        try:
            value = q.get_nowait()
        except Empty:
            return
        try:
            function(value)
        except Exception as exc:
            if errors is None:
                raise
            errors.append((value, exc))
        finally:
            q.task_done()

for rawFolderName in RawFolderList:
    q.put(rawFolderName)

# Failures collected from the worker threads as (folder name, exception).
load_errors = []

for i in range(worker_count):
    t=Thread(target=run_tasks, args=(load_table, q, load_errors))
    t.daemon = True
    t.start()

q.join()

# Surface worker failures in the notebook instead of losing them in the threads.
if load_errors:
    failed = ", ".join(name for name, _ in load_errors)
    raise RuntimeError(f"Loading failed for: {failed}") from load_errors[0][1]

# Create Gold Table (PySpark approach)

In [0]:
# Declare Dataframes
# Builds gold.t_imdb: one row per title, combining the silver dataset tables
# with the scraped rankings and box-office figures.

# Declare DataFrames. Each one gets an alias so columns can be referenced as
# "<alias>.<column>" after the joins below.
tbs = spark.table('silver.t_title_basics').alias("tbs")
trt = spark.table('silver.t_title_ratings').alias("trt")
tnb = spark.table('silver.t_name_basics').alias("tnb")
imd = spark.table('silver.t_imdb_top').alias("imd")
tps = spark.table('silver.t_title_principals').alias("tps")
tcr = spark.table('silver.t_title_crew').alias("tcr")
tbo = spark.table('silver.t_bo').alias("tbo")

# Get crew information: actors/actresses from the principals table ...
tps = (
     tps
     .filter(F.col("category").isin(['actor', 'actress']))
     .select("tconst", 
             "category", 
             "nconst")
)
# ... plus one row per director id from the comma-separated "directors" column.
# like("_N") matches any two-character value ending in "N"; presumably this
# targets the dataset's missing-value marker -- TODO confirm.
tcr = (
     tcr
     .filter(~F.col("directors").like("_N"))
     .select("tconst", 
             F.lit("director").alias("category"), 
             F.explode(F.split(F.col("directors"), ',')).alias("nconst"))
)
# union() matches columns by position; both selects use (tconst, category, nconst).
tps = tps.union(tcr)

# Get box office information: match scraped rows to titles of type "movie" by
# title and year, keeping only titles that also have a ratings row.
tbo = (
     tbo
     .join(tbs,(tbo.movie_name == tbs.primaryTitle) & (tbo.movie_year == tbs.startYear) & (tbs.titleType == F.lit("movie")), 'inner')
     .join(trt, trt.tconst == tbs.tconst, 'inner')
)
# Number the matches within each (title, year) group by descending vote count ...
tbo = tbo.withColumn("rnk", F.row_number().over(Window
                                               .partitionBy("tbs.primaryTitle","tbs.startYear")
                                               .orderBy(F.col("trt.numvotes").desc())))
# ... and keep only the first one per group.
tbo = (
     tbo
     .filter(F.col("rnk") == 1)
     .select(F.col("tbs.tconst").alias("tconst"), "tbo.box_office_usd")
)

# Final dataframe which is one row per title
tbs = tbs.filter(F.col("tbs.titletype").isin(['movie','tvMiniSeries','short','tvSeries','tvShort','tvSpecial']))
# Left joins keep every selected title; crew and ranking rows multiply the row
# count, which the groupBy below collapses again.
tbs = (
     tbs
     .join(trt, trt.tconst == tbs.tconst, 'leftouter')
     .join(tps, tps.tconst == tbs.tconst, 'leftouter')
     .join(tnb, tnb.nconst == tps.nconst, 'leftouter')
     .join(imd, imd.tconst == tbs.tconst, 'leftouter')
     .join(tbo, tbo.tconst == tbs.tconst, 'leftouter')
)
# Aggregate to one row per tconst. Scraped values (imd) take precedence over
# dataset values via coalesce; names are collected into "; "-separated lists.
# NOTE(review): abs() is applied to the chosen box-office value -- purpose not
# evident from this cell; confirm whether it is still needed.
tbs = (
     tbs
     .groupBy("tbs.tconst")
     .agg(F.max(F.regexp_replace(F.initcap("tbs.titletype"), 'Tv', 'TV ')).alias("title_type"),
          F.max("tbs.primarytitle").alias("primary_title"),
          F.max("tbs.originaltitle").alias("original_title"),
          F.max("tbs.startyear").alias("yr"),
          F.max(F.when(F.col("tbs.isadult") == 1,"Y").otherwise("N")).alias("is_adult"),
          F.max("tbs.runtimeminutes").alias("runtime_min"),
          F.max(F.when(~F.col("tbs.genres").like("_N"), F.col("genres"))).alias("genres"),
          F.coalesce(F.max("imd.rating"), F.max("trt.averagerating")).alias("avg_rating"),
          F.coalesce(F.max("imd.votes"), F.max("trt.numvotes")).alias("num_votes"),
          F.abs(F.coalesce(F.max("imd.box_office"), F.max("tbo.box_office_usd"))).alias("box_office"),
          F.max(F.when(F.col("imd.is_in_top_250") == 'Y', F.col("imd.rnk"))).alias("top_250_rnk"),
          F.max(F.when(F.col("imd.is_in_top_1000") == 'Y', F.col("imd.rnk"))).alias("top_1000_rnk"),
          F.max(F.when(F.col("imd.is_popular") == 'Y', F.col("imd.rnk"))).alias("popularity_rnk"),
          F.max(F.when((F.col("imd.is_primary_lang") == 'Y') & (F.col("imd.is_asc") == 'Y'), F.col("rnk"))).alias("language_popularity_rnk"),
          F.max(F.when((F.col("imd.is_primary_lang") == 'Y') & (F.col("imd.is_desc") == 'Y'), F.col("rnk"))).alias("language_votes_rnk"),
          F.coalesce(F.max("imd.is_in_top_1000"), F.lit('N')).alias("is_top_1000_movies"),
          F.concat_ws('; ', F.collect_set("imd.lang_name")).alias("language_lst"),
          F.concat_ws('; ', F.collect_set(F.when(F.col("tps.category") == 'director', F.col("tnb.primaryname")))).alias("director_lst"),
          F.concat_ws('; ', F.collect_set(F.when(F.col("tps.category") == 'actor', F.col("tnb.primaryname")))).alias("actor_lst"),
          F.concat_ws('; ', F.collect_set(F.when(F.col("tps.category") == 'actress', F.col("tnb.primaryname")))).alias("actress_lst"),
          F.lit(F.current_date()).alias("last_refresh_date"))
)
# Turn empty list strings into NULL in the four *_lst columns; all other
# columns pass through unchanged.
tbs = (
     tbs
     .select([
              F.when(F.col(c) == "", None).otherwise(F.col(c)).alias(c)
              if c in ["language_lst", "director_lst", "actor_lst", "actress_lst"]
              else F.col(c)
              for c in tbs.columns
              ])
)
# Replace the gold table (data and schema).
(
 tbs
 .write
 .format("delta")
 .mode("overwrite")
 .option("overwriteSchema","true")
 .saveAsTable(f"gold.{goldTableName}")
)

# Create Gold Table (SQL approach)
#### The code below is kept in markdown form to avoid creating the gold table twice. To execute it, change the cell from markdown to SQL.

-- Builds gold.t_imdb with one row per title: dataset attributes, scraped
-- rankings, box office, and "; "-separated name lists.
CREATE
OR REPLACE TABLE gold.t_imdb AS
WITH tps AS (
  -- Actors/actresses from the principals table plus one row per director id
  -- from the comma-separated crew column.
  SELECT tconst, nconst, category
  FROM silver.t_title_principals
  WHERE category IN ('actor', 'actress')
  UNION ALL
  SELECT tconst, explode(split(directors, ',')) AS nconst, 'director' AS category
  FROM silver.t_title_crew
  WHERE directors NOT LIKE '_N'
),
tbo AS (
  -- Box-office rows matched to movies by title and year; rnk = 1 marks the
  -- match with the most votes inside each (title, year) group.
  SELECT
    tbs.tconst,
    bo.box_office_usd,
    row_number() OVER (
      PARTITION BY tbs.primarytitle, tbs.startyear
      ORDER BY trt.numvotes DESC NULLS LAST
    ) AS rnk
  FROM silver.t_bo bo
    INNER JOIN silver.t_title_basics tbs ON (
      tbs.primarytitle = bo.movie_name
      AND tbs.startyear = bo.movie_year
      AND tbs.titletype = 'movie'
    )
    INNER JOIN silver.t_title_ratings trt ON (trt.tconst = tbs.tconst)
)
SELECT
  tbs.tconst,
  -- Same formatting as the PySpark version of this table.
  MAX(regexp_replace(initcap(tbs.titletype), 'Tv', 'TV ')) AS title_type,
  MAX(tbs.primarytitle) AS primary_title,
  MAX(tbs.originaltitle) AS original_title,
  MAX(tbs.startyear) AS yr,
  MAX(CASE WHEN tbs.isadult = '1' THEN 'Y' ELSE 'N' END) AS is_adult,
  MAX(tbs.runtimeminutes) AS runtime_min,
  MAX(CASE WHEN tbs.genres NOT LIKE '_N' THEN tbs.genres END) AS genres,
  -- Scraped values take precedence over the dataset values.
  coalesce(MAX(imd.rating), MAX(trt.averagerating)) AS avg_rating,
  coalesce(MAX(imd.votes), MAX(trt.numvotes)) AS num_votes,
  coalesce(MAX(imd.box_office), MAX(tbo.box_office_usd)) AS box_office,
  MAX(CASE WHEN imd.is_in_top_250 = 'Y' THEN imd.rnk END) AS top_250_rnk,
  MAX(CASE WHEN imd.is_in_top_1000 = 'Y' THEN imd.rnk END) AS top_1000_rnk,
  MAX(CASE WHEN imd.is_popular = 'Y' THEN imd.rnk END) AS popularity_rnk,
  MAX(CASE WHEN imd.is_primary_lang = 'Y' AND imd.is_asc = 'Y' THEN imd.rnk END) AS language_popularity_rnk,
  MAX(CASE WHEN imd.is_primary_lang = 'Y' AND imd.is_desc = 'Y' THEN imd.rnk END) AS language_votes_rnk,
  coalesce(MAX(imd.is_in_top_1000), 'N') AS is_top_1000_movies,
  -- Empty lists become NULL.
  NULLIF(concat_ws('; ', collect_set(imd.lang_name)), '') AS language_lst,
  NULLIF(concat_ws('; ', collect_set(CASE WHEN tps.category = 'actor' THEN tnb.primaryname END)), '') AS actor_lst,
  NULLIF(concat_ws('; ', collect_set(CASE WHEN tps.category = 'actress' THEN tnb.primaryname END)), '') AS actress_lst,
  NULLIF(concat_ws('; ', collect_set(CASE WHEN tps.category = 'director' THEN tnb.primaryname END)), '') AS director_lst,
  current_date() AS last_refresh_date
FROM
  silver.t_title_basics tbs
  LEFT OUTER JOIN silver.t_title_ratings trt ON (trt.tconst = tbs.tconst)
  LEFT OUTER JOIN tps ON (tps.tconst = tbs.tconst)
  LEFT OUTER JOIN silver.t_name_basics tnb ON (tnb.nconst = tps.nconst)
  LEFT OUTER JOIN silver.t_imdb_top imd ON (imd.tconst = tbs.tconst)
  LEFT OUTER JOIN tbo ON (
    tbo.tconst = tbs.tconst
    AND tbo.rnk = 1
  )
WHERE
  tbs.titletype IN ('movie', 'tvMiniSeries', 'short', 'tvSeries', 'tvShort', 'tvSpecial')
GROUP BY
  tbs.tconst

# Create view for Reporting

In [0]:
%sql CREATE
-- Reporting view over gold.t_imdb: report-friendly column labels plus an
-- overall popularity rank computed from the individual ranking columns.
OR REPLACE VIEW v_imdb AS
SELECT
  tconst              AS `IMDB ID`,
  title_type          AS `Title Type`,
  primary_title       AS `Primary Title`,
  original_title      AS `Original Title`,
  yr                  AS `Release Year`,
  is_adult            AS `Is Adult`,
  runtime_min         AS `Runtime in Min`,
  genres              AS `Generes`,
  top_250_rnk         AS `Top 250 Rank`,
  ROW_NUMBER() OVER (
    ORDER BY
      popularity_rnk ASC NULLS LAST,
      language_popularity_rnk ASC NULLS LAST,
      top_1000_rnk ASC NULLS LAST,
      language_votes_rnk ASC NULLS LAST,
      num_votes DESC NULLS LAST
  )                   AS `Popularity Rank`,
  is_top_1000_movies  AS `Is in Top 1000 Movies`,
  language_lst        AS `Languages`,
  avg_rating,
  num_votes,
  box_office,
  director_lst        AS `Directors`,
  actor_lst           AS `Actors`,
  actress_lst         AS `Actresses`,
  last_refresh_date   AS `Last Refresh Date`
FROM
  gold.t_imdb