# Install Packages

In [0]:
pip install lxml html5lib beautifulsoup4

# Initialize URLs

In [0]:
# Root endpoints of the external sources read by this notebook.
IMDBURL = "https://www.imdb.com/"
DatasetURL = "https://datasets.imdbws.com/"
BoxOfficeUrl = "https://www.boxofficemojo.com"
LanguageURL = "https://en.wikipedia.org/wiki/List_of_ISO_639-1_codes"

# IMDb pages, all derived from the site root.
ChartURL = IMDBURL + "chart/"
IndiaURL = IMDBURL + "india/"
IMDBSearchURL = IMDBURL + "search/title/"
Top1000URL = IMDBSearchURL + "?groups=top_1000"
LangURL = IMDBSearchURL + "?primary_language"

# Box Office Mojo pages; the chart URL ends in "offset=" so a row offset can be appended.
BoxOfficeChart = BoxOfficeUrl + "/chart/ww_top_lifetime_gross/?area=XWW&offset="
BoxOfficeYear = BoxOfficeUrl + "/year/world/"

# Initialize variables

In [0]:
# Storage layout: one folder per layer underneath a common base path.
basePath = "/user/IMDB/"
rawPath = basePath + "raw/"
silverPath = basePath + "silver/"
goldPath = basePath + "gold/"

# Table names (unqualified) used throughout the notebook.
topTableName = "t_imdb_top"
boTableName = "t_bo"
goldTableName = "t_imdb"
goldTablePath = f"{goldPath}{goldTableName}"

# Dataset archives to download.
FileList = [
    'name.basics.tsv.gz',
    'title.basics.tsv.gz',
    'title.crew.tsv.gz',
    'title.principals.tsv.gz',
    'title.ratings.tsv.gz',
]

# Raw folder name = archive name without its last 7 characters (".tsv.gz").
RawFolderList = [archive[:-7].replace("-co", "") for archive in FileList]

# Silver table name = "t_" + folder name with dots turned into underscores.
SilverTableList = ["t_" + folder.replace(".", "_").replace("-co", "") for folder in RawFolderList]

FullTableList = [*SilverTableList, goldTableName, topTableName, boTableName]

# Import Required Functions

In [0]:
from pyspark.sql.functions import col,count, when, lit, substring, substring_index, explode, split, regexp_replace, initcap, coalesce, concat_ws, collect_set, abs, min, max, current_date, row_number, expr
from pyspark.sql.window import Window
import pyspark.pandas as ps
import requests
from bs4 import BeautifulSoup
from datetime import date
from urllib.request import urlretrieve

# Drop all tables/folders if they exist

In [0]:
# Reset the environment so the notebook can be re-run from scratch.
#
# The tables are written further down as `silver.<name>` and `gold.<name>`, so
# the schema-qualified names are the ones that have to be dropped. The bare
# names are still dropped too, exactly as before, in case older runs created
# them in the current schema.
for tbl in FullTableList:
  spark.sql(f"DROP TABLE IF EXISTS {tbl}")

schemaTables = {
    "silver": SilverTableList + [topTableName, boTableName],
    "gold": [goldTableName],
}
for schemaName, schemaTableList in schemaTables.items():
  # On a first run the schema does not exist yet; skip it rather than
  # referencing a missing schema in the DROP statement.
  if spark.sql(f"SHOW SCHEMAS LIKE '{schemaName}'").count() == 0:
    continue
  for tbl in schemaTableList:
    spark.sql(f"DROP TABLE IF EXISTS {schemaName}.{tbl}")

# Remove the files last, after the metastore entries that point at them are gone.
dbutils.fs.rm(f"dbfs:{basePath}", recurse = True)

# Create Schema

In [0]:
# One schema per layer, each rooted at that layer's storage folder.
for schemaName, schemaLocation in (("silver", silverPath), ("gold", goldPath)):
  spark.sql(f"CREATE SCHEMA IF NOT EXISTS {schemaName} location '{schemaLocation}'")

# Extract Language Codes from Wiki & create spark dataframe

In [0]:
# Take the first HTML table on the page and keep only the language name and
# two-letter code columns, renamed to the names the later join expects.
wiki_tables = ps.read_html(LanguageURL)
lang_df = wiki_tables[0][["ISO language name","639-1"]].rename(
    columns={"ISO language name": "lang_name", "639-1": "lang_code"}
)
lang_spark_df = lang_df.to_spark()

# Extract all Top Rated Movies

In [0]:
# Parallel accumulators shared by all scraping cells: index i of every list
# describes the same scraped row. -0.1 / -1 are "not available" placeholders.
ID = []
Type = []
Rank = []
Votes =[]
Rating = []
BO = []

# Delimiters around the title id inside an href: the id is the text between
# the "/title/" prefix and the last "/" of the link.
start = "/title/"
end = "/"

urlList = [f'{ChartURL}moviemeter/', f'{ChartURL}tvmeter/', f'{ChartURL}toptv/', f'{ChartURL}top/', f'{IndiaURL}top-rated-indian-movies/', f'{IndiaURL}top-rated-malayalam-movies/', f'{IndiaURL}top-rated-tamil-movies/', f'{IndiaURL}top-rated-telugu-movies/']
for url in urlList:
  response = requests.get(url)
  soup = BeautifulSoup(response.text, "html.parser")

  # Run the title selector once per page and reuse the result for every list.
  hrefs = [a.attrs.get('href') for a in soup.select('td.titleColumn a')]

  # The chart name is the last path segment of the URL ("moviemeter",
  # "top-rated-tamil-movies", ...). Taking it from the URL itself works for any
  # prefix; the previous slice used ChartURL's length even for IndiaURL pages
  # and was only correct because both prefixes happen to be equally long.
  chartName = url.rstrip(end).rsplit(end, 1)[-1]

  ID += [href[href.find(start)+len(start):href.rfind(end)] for href in hrefs]
  Rank += [int(b.attrs.get('data-value')) for b in soup.select('td.posterColumn span[name=rk]')]
  Type += [chartName] * len(hrefs)
  Rating += [-0.1] * len(hrefs)
  Votes += [-1] * len(hrefs)
  BO += [-1] * len(hrefs)

# Extract Top 1000 Movies + Popular Language-wise Movies list

In [0]:
# Search listings to walk: the top-1000 group, then one listing per primary language code.
languageCodes = ["ml", "ta", "hi", "te", "kn", "ko", "yue", "ja", "en"]
TopSearchURLs = [f"{IMDBSearchURL}?groups=top_1000"] + [f"{LangURL}={code}" for code in languageCodes]
SortSyntax = ["moviemeter,asc","num_votes,desc"]
# Result offsets requested per listing: 1, 51, 101, ... 951.
rng = range(1,1000,50)

for TopUrl in TopSearchURLs:
  for Sort in SortSyntax:
    for n in rng:
      url = f'{TopUrl}&sort={Sort}&start={n}'
      response = requests.get(url)
      soup = BeautifulSoup(response.text, "html.parser")

      for container in soup.find_all('div', class_='lister-item mode-advanced'):
        # Listing URL and sort key, concatenated, identify where the row came from.
        Type.append(TopUrl+Sort)

        # Title id: text between "/title/" and the last "/" of the first link.
        href = container.find("a")['href']
        ID.append(href[href.find(start)+len(start):href.rfind(end)])

        # Rank: index label without its last character and without thousands separators.
        indexLabel = container.find("span",{"class":"lister-item-index"}).text
        Rank.append(int(indexLabel[:-1].replace(',','')))

        # Rating: -0.1 placeholder when the entry has no <strong> element.
        Rating.append(float(container.strong.text) if container.strong else -0.1)

        # Votes: first "nv" span; -1 when there is none, 0 when it is not a number.
        nvSpans = container.find_all('span', attrs={'name': 'nv'})
        if not nvSpans:
          Votes.append(-1)
        else:
          try:
            Votes.append(int(nvSpans[0].text.replace(',','')))
          except ValueError:
            Votes.append(0)

        # Box office is not taken from these pages; keep the lists aligned.
        BO.append(-1)

# Extract All Time Boxoffice Details

In [0]:
# Request the lifetime-gross chart at offsets 0, 200, 400, 600 and 800.
for offset in range(0,1000,200):
  url = BoxOfficeChart + str(offset)
  response = requests.get(url)
  page_soup = BeautifulSoup(response.text, "lxml")

  # Only the first table of the page is read; rows without <td> cells are skipped.
  for row in page_soup.find_all('table')[0].find_all("tr"):
    row_cols = row.find_all('td')
    if not row_cols:
      continue

    Type.append(url)

    # First cell: rank, possibly with thousands separators.
    Rank.append(int(row_cols[0].text.replace(",","")))

    # Second cell: link from which the title id is cut out.
    href = row_cols[1].find("a")['href']
    ID.append(href[href.find(start)+len(start):href.rfind(end)])

    # Third cell: amount with "$" and "," removed.
    BO.append(int(row_cols[2].text.replace("$","").replace(",","")))

    # Rating and votes are not read from this page; keep the lists aligned.
    Rating.append(-0.1)
    Votes.append(-1)

# Extract Yearly Box Office Details

In [0]:
# Parallel lists: one entry per table row found on the yearly pages.
MovieName = []
MovieYear = []
BoxOffice = []

current_year = date.today().year
# One page per year, from 1977 up to and including the current year.
for year in range(1977, current_year+1):
  MainUrl = BoxOfficeYear + str(year)
  response = requests.get(MainUrl)
  page_soup = BeautifulSoup(response.text, "lxml")

  # Only the first table of the page is read; rows without <td> cells are skipped.
  for row in page_soup.find_all('table')[0].find_all("tr"):
    row_cols = row.find_all('td')
    if not row_cols:
      continue
    MovieName.append(row_cols[1].text)
    MovieYear.append(year)
    # Third cell: amount with "$" and "," removed.
    BoxOffice.append(int(row_cols[2].text.replace("$","").replace(",","")))

# Create Dataframe and build t_imdb_top table

In [0]:
# Turn the parallel scraping lists into a single Spark frame (one row per scraped entry).
imdb_df = ps.DataFrame({
    "tconst": ID,
    "rnk": Rank,
    "type": Type,
    "rating": Rating,
    "votes": Votes,
    "boxoffice": BO
}).to_spark()

(imdb_df
 # Drop rows whose source string contains both "top_1000" and "desc".
 .filter(~((col("type").like("%top_1000%")) & (col("type").like("%desc%"))))
 # Language code: first two characters after the last "=" for primary_language
 # listings, or a fixed code for the per-language chart pages.
 .withColumn("lang_code", when(col("type").like("%primary_language%,%"), substring(substring_index(col("type"), '=', -1),1,2))
                               .when(col("type").like("%malayalam%"), lit('ml'))
                               .when(col("type").like("%telugu%"), lit('te'))
                               .when(col("type").like("%tamil%"), lit('ta')))
 .join(lang_spark_df, "lang_code", 'leftouter')
 .select("tconst",
         "lang_name",
         col("rnk").cast("int"),
         col("type").alias("url"),
         # Negative placeholders (-0.1 / -1) become NULL.
         when(col("rating") >= 0, col("rating")).cast("float").alias("rating"),
         when(col("votes") >= 0, col("votes")).cast("int").alias("votes"),
         # 64-bit on purpose: lifetime grosses above 2,147,483,647 USD do not fit
         # in a 32-bit int and would overflow when cast to "int".
         when(col("boxoffice") >= 0, col("boxoffice")).cast("long").alias("box_office"),
         when(col("type").isin(['toptv','top','top-rated-indian-movies','top-rated-malayalam-movies','top-rated-tamil-movies','top-rated-telugu-movies']), lit('Y')).otherwise(lit('N')).alias("is_in_top_250"),
         when(col("type").like("%top_1000%"), lit('Y')).otherwise(lit('N')).alias("is_in_top_1000"),
         when(col("type").isin(['tvmeter', 'moviemeter']), lit('Y')).otherwise(lit('N')).alias("is_popular"),
         when(col("type").like("%primary_language%,%"), lit('Y')).otherwise(lit('N')).alias("is_primary_lang"),
         # Sort direction is read from the last three characters: "...,asc" / "...,desc".
         when(substring("type",-3,3) == 'asc', lit('Y')).otherwise(lit('N')).alias("is_asc"),
         when(substring("type",-3,3) == 'esc', lit('Y')).otherwise(lit('N')).alias("is_desc"))
 .write
 .format("delta")
 .mode("overwrite")
 .option("overwriteSchema","true")
 .saveAsTable(f"silver.{topTableName}")
)

# Create Dataframe and build t_bo table

In [0]:
# Turn the yearly box office lists into a Spark frame (one row per scraped table row).
bo_df = ps.DataFrame({
    "movie_name": MovieName,
    "movie_year": MovieYear,
    "box_office_usd": BoxOffice
}).to_spark()

(bo_df
 .select("movie_name",
        # Stored as a string so it can be compared with the string-typed startyear.
        col("movie_year").cast("string"),
        # 64-bit on purpose: amounts above 2,147,483,647 do not fit in a 32-bit
        # int and would overflow when cast to "int".
        col("box_office_usd").cast("long"))
 .write
 .format("delta")
 .mode("overwrite")
 .option("overwriteSchema","true")
 .saveAsTable(f"silver.{boTableName}")
)

# Download the datasets to the driver and move them to the Raw storage folders

In [0]:
# Download every dataset archive to the driver's local disk, then move it into
# its own folder under the raw path on DBFS.
for file in FileList:
  # Folder name = archive name without its last 7 characters (".tsv.gz").
  tablename = file[:-7].replace("-co","")
  BaseURL = DatasetURL + file
  # Download to the same absolute location the move reads from, so the cell
  # does not depend on the current working directory being /databricks/driver.
  localPath = f"/databricks/driver/{file}"
  DriverPath = f"file:{localPath}"
  dbfsPath = f"dbfs:{rawPath}{tablename}/{file}"
  urlretrieve(BaseURL, localPath)
  dbutils.fs.mv(DriverPath, dbfsPath)

# Create Silver Delta tables

## Create function for reading from raw and writing to silver

In [0]:
def load_table(rawFolderName):
  """Load one raw folder into a Delta table in the ``silver`` schema.

  The folder ``rawPath + rawFolderName`` is read as tab-delimited CSV with a
  header row and without schema inference, then written with mode
  ``overwrite`` (schema included) to ``silver.t_<rawFolderName>``, where dots
  in the folder name are replaced by underscores.

  Args:
    rawFolderName: Name of the folder under ``rawPath``, e.g. ``"title.basics"``.

  Returns:
    None. The function is called for its side effect of writing the table.
  """
  rawFilePath = rawPath + rawFolderName
  silverTableName = "t_"+rawFolderName.replace(".","_")
  delim = "\t"
  # saveAsTable returns nothing, so the chain is not bound to a name.
  (spark
   .read
   .format("csv")
   .option("inferSchema", "false")
   .option("header","true")
   .option("delimiter",delim)
   .load(rawFilePath)
   .write
   .format("delta")
   .mode("overwrite")
   .option("overwriteSchema","true")
   .saveAsTable(f"silver.{silverTableName}")
  )

## Execute the load_table function for multiple tables in parallel

In [0]:
from threading import Thread
from queue import Queue

# Number of tables which will be loaded in parallel
worker_count = 5

# Work queue holding the raw folder names still to be loaded.
q = Queue()

def run_tasks(function, q):
    """Drain ``q``, calling ``function`` on every item taken from it.

    Intended as a worker-thread target. Returns once the queue is empty.

    ``task_done()`` is called for every item taken, even when ``function``
    raises, so a ``q.join()`` in the launching thread cannot hang on a failed
    item. A failure is reported with its traceback and the worker moves on to
    the next item.

    Args:
        function: Callable invoked with one queue item at a time.
        q: ``queue.Queue`` holding the items to process.
    """
    from queue import Empty
    import traceback

    while True:
        # get_nowait() avoids the race of `empty()` followed by a blocking
        # `get()`: another worker may take the last item in between.
        try:
            value = q.get_nowait()
        except Empty:
            return
        try:
            function(value)
        except Exception:
            traceback.print_exc()
        finally:
            q.task_done()

# Queue every raw folder name before any worker is started.
for rawFolderName in RawFolderList:
    q.put(rawFolderName)

# Daemon worker threads, each draining the queue through load_table.
workers = [
    Thread(target=run_tasks, args=(load_table, q), daemon=True)
    for _ in range(worker_count)
]
for worker in workers:
    worker.start()

# Block until task_done() has been called once for every queued item.
q.join()

# Create Gold Table (PySpark approach)

In [0]:
# Cast: principals rows whose category is actor or actress.
principals_df = spark.table('silver.t_title_principals')
act_df = (principals_df
          .where(col("category").isin(['actor', 'actress']))
          .select("tconst", "category", "nconst"))

# Directors: the comma-separated "directors" column is split into one row per id.
# The filter drops any two-character value ending in "N"
# (presumably the dataset's "\N" missing-value marker - confirm).
title_crew_df = spark.table('silver.t_title_crew')
has_directors = ~col("directors").like("_N")
director_id = explode(split(col("directors"), ','))
dir_df = (title_crew_df
          .where(has_directors)
          .select("tconst",
                  lit("director").alias("category"),
                  director_id.alias("nconst")))

# union() matches columns by position; both frames are (tconst, category, nconst).
crew_df = act_df.union(dir_df)
# Match scraped box office rows to titles by name and year (movies only).
same_movie = ((col("movie_name") == col("primarytitle"))
              & (col("movie_year") == col("startyear"))
              & (col("titletype") == lit("movie")))

# When several titles share name and year, keep the one with the most votes.
most_votes_first = (Window
                    .partitionBy("primarytitle","startyear")
                    .orderBy(col("numvotes").cast("int").desc()))

bo_matched_df = (spark
                 .table('silver.t_bo')
                 .join(spark.table('silver.t_title_basics'), same_movie, 'inner')
                 .join(spark.table('silver.t_title_ratings'), "tconst", 'inner'))

bo_office_df = (bo_matched_df
                .withColumn("rnk", row_number().over(most_votes_first))
                .filter(col("rnk") == 1)
                .select("tconst", "box_office_usd"))
# Build the gold frame: one row per tconst.
# Title basics (restricted to the listed title types) are left-joined with
# ratings, crew, person names, the scraped t_imdb_top rows and the matched box
# office, then collapsed with groupBy("tconst"). The joins can yield several
# rows per title, which is why every output column is an aggregate.
final_df = (spark
            .table('silver.t_title_basics')
            .filter(col("titletype").isin(['movie','tvMiniSeries','short','tvSeries','tvShort','tvSpecial']))
            .join(spark.table('silver.t_title_ratings'), "tconst", 'leftouter')
            .join(crew_df, "tconst", 'leftouter')
            .join(spark.table('silver.t_name_basics'), "nconst", 'leftouter')
            .join(spark.table('silver.t_imdb_top'), "tconst", 'leftouter')
            .join(bo_office_df, "tconst", 'leftouter')
            .groupBy("tconst")
            # title_type: initcap, then "Tv" -> "TV " (e.g. "tvSeries" -> "TV series").
            .agg(max(regexp_replace(initcap("titletype"), 'Tv', 'TV ')).alias("title_type"),
                 max("primarytitle").alias("primary_title"),
                 max("originaltitle").alias("original_title"),
                 max(col("startyear").cast("int")).alias("yr"),
                 max(when(col("isadult") == 1,"Y").otherwise("N")).alias("is_adult"),
                 # try_cast yields NULL instead of failing on non-numeric runtimes.
                 max(expr("try_cast(runtimeminutes as int)")).alias("runtime_min"),
                 # Two-character values ending in "N" are treated as missing genres.
                 max(when(~col("genres").like("_N"),col("genres"))).alias("genres"),
                 # Scraped rating/votes win over the dataset values when both exist.
                 coalesce(max("rating"), max(col("averagerating").cast("float"))).alias("avg_rating"),
                 coalesce(max("votes"), max(col("numvotes").cast("int"))).alias("num_votes"),
                 # NOTE(review): abs() presumably guards against negative amounts coming
                 # from upstream - confirm whether it is still needed.
                 abs(coalesce(max("box_office"), max("box_office_usd"))).alias("box_office"),
                 # NOTE(review): max() keeps the numerically largest rank when a title
                 # appears in several lists of the same kind - confirm that is intended.
                 max(when(col("is_in_top_250") == 'Y',col("rnk"))).alias("top_250_rnk"),
                 max(when(col("is_in_top_1000") == 'Y',col("rnk"))).alias("top_1000_rnk"),
                 max(when(col("is_popular") == 'Y',col("rnk"))).alias("popularity_rnk"),
                 max(when((col("is_primary_lang") == 'Y') & (col("is_asc") == 'Y'),col("rnk"))).alias("language_popularity_rnk"),
                 max(when((col("is_primary_lang") == 'Y') & (col("is_desc") == 'Y'),col("rnk"))).alias("language_votes_rnk"),
                 # 'Y' sorts after 'N', so max() is 'Y' if any joined row is flagged.
                 coalesce(max("is_in_top_1000"), lit('N')).alias("is_top_1000_movies"),
                 # Distinct values joined with "; "; an empty set gives "".
                 concat_ws('; ', collect_set("lang_name")).alias("language_lst"),
                 concat_ws('; ', collect_set(when(col("category") == 'director', "primaryname"))).alias("director_lst"),
                 concat_ws('; ', collect_set(when(col("category") == 'actor', "primaryname"))).alias("actor_lst"),
                 concat_ws('; ', collect_set(when(col("category") == 'actress',"primaryname"))).alias("actress_lst"),
                 lit(current_date()).alias("last_refresh_date"))
           )
# Replace the empty strings produced by concat_ws on empty sets with NULL in
# the four list columns; all other columns pass through unchanged.
final_df = final_df.select(
    [
        when(col(c) == "", None).otherwise(col(c)).alias(c)
        if c in ["language_lst", "director_lst", "actor_lst", "actress_lst"]
        else col(c)
        for c in final_df.columns
    ]
)
# Overwrite the gold Delta table, schema included.
(final_df
 .write
 .format("delta")
 .mode("overwrite")
 .option("overwriteSchema","true")
 .saveAsTable(f"gold.{goldTableName}")
)

# Create Gold Table (SQL approach)
#### The code below is kept in markdown form to avoid creating the gold table twice. To execute it, change the cell from markdown to SQL.

-- Builds gold.t_imdb with one row per tconst.
-- crew_base: actors/actresses from the principals table plus one row per
--            director id from the comma-separated crew column.
-- bo_base:   scraped yearly box office matched to movies by title and year,
--            ranked by vote count so rnk = 1 is the most-voted match.
CREATE
OR REPLACE TABLE gold.t_imdb AS with crew_base AS (
  SELECT
    tconst,
    nconst,
    category
  FROM
    silver.t_title_principals
  WHERE
    category IN ('actor', 'actress')
  UNION ALL
  SELECT
    tconst,
    explode(split(directors, ',')) AS nconst,
    'director' AS category
  FROM
    silver.t_title_crew
  WHERE
    directors NOT LIKE '_N'
),
bo_base AS (
  SELECT
    tbs.tconst,
    bo.box_office_usd,
    row_number() over(
      partition BY tbs.primarytitle,
      tbs.startyear
      ORDER BY
        try_cast(trt.numvotes AS INT) DESC nulls last
    ) AS rnk
  FROM
    silver.t_bo bo
    INNER JOIN silver.t_title_basics tbs ON (
      tbs.primarytitle = bo.movie_name
      AND try_cast(tbs.startyear AS INT) = bo.movie_year
      AND tbs.titletype = 'movie'
    )
    INNER JOIN silver.t_title_ratings trt ON (trt.tconst = tbs.tconst)
)
SELECT
  tbs.tconst,
  -- Same formatting as the PySpark approach: initcap, then 'Tv' -> 'TV '.
  MAX(regexp_replace(initcap(tbs.titletype), 'Tv', 'TV ')) AS title_type,
  MAX(tbs.primarytitle) AS primary_title,
  MAX(tbs.originaltitle) AS original_title,
  MAX(try_cast(tbs.startyear AS INT)) AS yr,
  MAX(
    CASE
      WHEN tbs.isadult = '1' THEN 'Y'
      ELSE 'N'
    end
  ) AS is_adult,
  MAX(try_cast(tbs.runtimeminutes AS INT)) AS runtime_min,
  MAX(
    CASE
      WHEN tbs.genres NOT LIKE '_N' THEN tbs.genres
    end
  ) AS genres,
  -- FLOAT rather than DECIMAL(2, 1): a rating of 10.0 does not fit in
  -- DECIMAL(2, 1), so try_cast would silently turn it into NULL.
  coalesce(
    MAX(imd.rating),
    MAX(try_cast(trt.averagerating AS FLOAT))
  ) AS avg_rating,
  coalesce(
    MAX(imd.votes),
    MAX(try_cast(trt.numvotes AS INT))
  ) AS num_votes,
  coalesce(
    MAX(imd.box_office),
    MAX(bo.box_office_usd)
  ) AS box_office,
  MAX(
    CASE
      WHEN imd.is_in_top_250 = 'Y' THEN imd.rnk
    end
  ) AS top_250_rnk,
  MAX(
    CASE
      WHEN imd.is_in_top_1000 = 'Y' THEN imd.rnk
    end
  ) AS top_1000_rnk,
  MAX(
    CASE
      WHEN imd.is_popular = 'Y' THEN imd.rnk
    end
  ) AS popularity_rnk,
  MAX(
    CASE
      WHEN imd.is_primary_lang = 'Y'
      AND imd.is_asc = 'Y' THEN imd.rnk
    end
  ) AS language_popularity_rnk,
  MAX(
    CASE
      WHEN imd.is_primary_lang = 'Y'
      AND imd.is_desc = 'Y' THEN imd.rnk
    end
  ) AS language_votes_rnk,
  coalesce(MAX(imd.is_in_top_1000), 'N') AS is_top_1000_movies,
  -- concat_ws over an empty set yields ''; NULLIF turns that into NULL.
  NULLIF(concat_ws('; ', collect_set(imd.lang_name)), '') AS language_lst,
  NULLIF(
    concat_ws(
      '; ',
      collect_set(
        CASE
          WHEN crb.category = 'actor' THEN nmb.primaryname
        end
      )
    ),
    ''
  ) AS actor_lst,
  NULLIF(
    concat_ws(
      '; ',
      collect_set(
        CASE
          WHEN crb.category = 'actress' THEN nmb.primaryname
        end
      )
    ),
    ''
  ) AS actress_lst,
  NULLIF(
    concat_ws(
      '; ',
      collect_set(
        CASE
          WHEN crb.category = 'director' THEN nmb.primaryname
        end
      )
    ),
    ''
  ) AS director_lst,
  current_date() AS last_refresh_date
FROM
  silver.t_title_basics tbs
  LEFT OUTER JOIN silver.t_title_ratings trt ON (trt.tconst = tbs.tconst)
  LEFT OUTER JOIN crew_base crb ON (crb.tconst = tbs.tconst)
  LEFT OUTER JOIN silver.t_name_basics nmb ON (nmb.nconst = crb.nconst)
  LEFT OUTER JOIN silver.t_imdb_top imd ON (imd.tconst = tbs.tconst)
  LEFT OUTER JOIN bo_base bo ON (
    bo.tconst = tbs.tconst
    AND bo.rnk = 1
  )
WHERE
  tbs.titletype IN (
    'movie',
    'tvMiniSeries',
    'short',
    'tvSeries',
    'tvShort',
    'tvSpecial'
  )
GROUP BY
  tbs.tconst

# Create view for Reporting

In [0]:
%sql CREATE OR REPLACE VIEW v_imdb AS
-- Reporting view over gold.t_imdb: columns are exposed under display names,
-- and `Popularity Rank` is a single running number derived from the
-- individual rank columns (ties broken by vote count).
SELECT
  tconst            AS `IMDB ID`,
  title_type        AS `Title Type`,
  primary_title     AS `Primary Title`,
  original_title    AS `Original Title`,
  yr                AS `Release Year`,
  is_adult          AS `Is Adult`,
  runtime_min       AS `Runtime in Min`,
  genres            AS `Generes`,
  top_250_rnk       AS `Top 250 Rank`,
  ROW_NUMBER() OVER (
    ORDER BY
      popularity_rnk          ASC NULLS LAST,
      language_popularity_rnk ASC NULLS LAST,
      top_250_rnk             ASC NULLS LAST,
      top_1000_rnk            ASC NULLS LAST,
      language_votes_rnk      ASC NULLS LAST,
      num_votes               DESC NULLS LAST
  )                 AS `Popularity Rank`,
  is_top_1000_movies AS `Is in Top 1000 Movies`,
  language_lst      AS `Languages`,
  avg_rating,
  num_votes,
  box_office,
  director_lst      AS `Directors`,
  actor_lst         AS `Actors`,
  actress_lst       AS `Actresses`,
  last_refresh_date AS `Last Refresh Date`
FROM
  gold.t_imdb