[![Open in Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/anwala/teaching-web-science/blob/main/fall-2023/week-3/data_440_02_f23_mod_03_web_scraping_imdb.ipynb)

# DATA 440-03 - Scraping IMDB for Director credits

Actors, actresses, and directors are uniquely identified by their [IMDB](https://www.imdb.com/) ID. For example, [Wes Anderson](https://www.imdb.com/name/nm0027572/)'s IMDB ID is `nm0027572`.

Below, we outline the steps to scrape his IMDB director credits page: [https://www.imdb.com/name/nm0027572/fullcredits/](https://www.imdb.com/name/nm0027572/fullcredits/)

#### Requirements:
* Install BeautifulSoup4
* Install my custom HTTP client wrapper library, [NwalaTextUtils](https://github.com/oduwsdl/NwalaTextUtils)

In [None]:
!pip install BeautifulSoup4
!pip install NwalaTextUtils

### Step 1: Import libraries and functions and define helper functions

In [None]:
from bs4 import BeautifulSoup
from NwalaTextUtils.textutils import derefURI
from NwalaTextUtils.textutils import genericErrorInfo
from NwalaTextUtils.textutils import getPgTitleFrmHTML

def get_movies_section(soup):
  '''
    Return the "Director" section of a credits page, or None when there is none.

    The credits page often holds several sections (Writer, Producer, Actor, Director, ...).
    Each candidate is an element with class "filmo-category-section"; the one returned is
    the first whose first <div> has an id beginning with "director-".
  '''
  for section in soup.find_all(class_='filmo-category-section'):

    first_div = section.find('div')
    if first_div is None:
      # Section without any <div>: cannot be identified, move on
      continue

    if first_div.get('id', '').startswith('director-'):
      return section

  return None

def get_movie_link(mov_elm, mov_year):
  '''
    Build a credit record from one credits-row element.

    mov_elm:  row element; its .text and .find_all('a') are used.
    mov_year: element whose .text holds the year, or None when the row has no year.

    Returns a dict with keys 'title', 'uri', 'year' and 'note' for the first anchor whose
    href starts with '/title/tt' (uri is that href prefixed with https://www.imdb.com).
    Returns {} when the row has no such anchor.
  '''
  # Resolve the year text once, before the loop. Rebinding mov_year inside the loop
  # turned it into a str, so a second iteration failed on str.text.
  year = mov_year.text.strip() if mov_year is not None else ''

  for link in mov_elm.find_all('a'):

    # Anchors without an href are skipped rather than raising KeyError
    href = (link.get('href') or '').strip()
    if not href.startswith('/title/tt'):
      continue

    title = link.text.strip()
    # The note is whatever row text is left once the title and the year are removed
    movie_note = mov_elm.text.replace(title, '').replace(year, '')
    movie_note = ' '.join(movie_note.split())
    return {'title': title, 'uri': f'https://www.imdb.com{href}', 'year': year, 'note': movie_note}

  return {}

### Step 2: Dereference URI of IMDB credits page


In [None]:
# IMDB ID of the director whose credits are scraped (Wes Anderson in this walkthrough)
dir_id = 'nm0027572'
# Full-credits page URI built from that ID
uri = f'https://www.imdb.com/name/{dir_id}/fullcredits/'

#dereference URI of IMDB credits page and retrieve HTML representation
# NOTE(review): derefURI is imported from NwalaTextUtils; presumably it returns the page
# HTML as a string - confirm what it returns when the request fails
html_pg = derefURI(uri)
# Defaults: title stays '' and soup stays None unless the parsing cell succeeds,
# and the extraction cell checks soup against None before using it
title = ''
soup = None

print('HTML page:')
# Bare expression: as the last line of a notebook cell its value is shown as the cell output
html_pg

### Step 3: Create BeautifulSoup object

In [None]:
# Parse the downloaded HTML and derive the director's name from the page title.
# On failure the error is reported and soup/title keep whatever values they had before.
try:
  #convert raw html to BeautifulSoup object
  soup = BeautifulSoup(html_pg, 'html.parser')
  title = getPgTitleFrmHTML(html_pg)
  # Keep the text before the first " - " separator. Splitting on a bare '-' cut
  # hyphenated names short (e.g. "Jean-Luc ..." became "Jean").
  # NOTE(review): assumes the page title looks like "<name> - IMDb" - confirm
  title = title.split(' - ')[0].strip()
except Exception:
  # Exception rather than a bare except, so KeyboardInterrupt/SystemExit still propagate
  genericErrorInfo()

### Step 4: Extract "movies" section and load all movies from section

In [None]:
# Always bind movies, so the next cell sees an empty list instead of hitting a NameError
# when the page could not be parsed or has no "Director" section.
movies = []

if( soup is not None ):
  movies_section = get_movies_section(soup)
  print('Movies section')
  print(movies_section)

  # get_movies_section returns None when the page has no director section
  if( movies_section is not None ):
    #find all div elements with class="filmo-row"
    movies = movies_section.find_all('div', class_='filmo-row')

  print('\n\nMovies:', len(movies))
  print(movies)

### Step 5: Create dictionary structure for movies

In [None]:
# Collect one record per credits row; rows for which get_movie_link returns an
# empty dict are left out.
dir_credits = {'director_name': title, 'imdb_uri': uri, 'credits': []}
for row in movies:

  credit = get_movie_link(row, row.find(class_='year_column'))
  if credit:
    dir_credits['credits'].append(credit)

print('Director credits:')
for credit in dir_credits['credits']:
  print(credit)

Director credits:
{'title': 'The Wonderful Story of Henry Sugar', 'uri': 'https://www.imdb.com/title/tt16968450/?ref_=nm_flmg_dr_1', 'year': '2023', 'note': '(Short) (completed)'}
{'title': 'Asteroid City Location Featurette', 'uri': 'https://www.imdb.com/title/tt28259927/?ref_=nm_flmg_dr_2', 'year': '2023', 'note': '(Video short)'}
{'title': 'Asteroid City', 'uri': 'https://www.imdb.com/title/tt14230388/?ref_=nm_flmg_dr_3', 'year': '2023', 'note': '(directed by)'}
{'title': 'Tip-Top: Aline', 'uri': 'https://www.imdb.com/title/tt21071898/?ref_=nm_flmg_dr_4', 'year': '2021', 'note': '(Music Video short)'}
{'title': 'The French Dispatch', 'uri': 'https://www.imdb.com/title/tt8847712/?ref_=nm_flmg_dr_5', 'year': '2021', 'note': ''}
{'title': 'Isle of Dogs', 'uri': 'https://www.imdb.com/title/tt5104604/?ref_=nm_flmg_dr_6', 'year': '2018', 'note': ''}
{'title': 'Come Together: A Fashion Picture in Motion', 'uri': 'https://www.imdb.com/title/tt6282412/?ref_=nm_flmg_dr_7', 'year': '2016', 'no