# Movie Data Scraping

## Part 1. Scraping from IMDB

In [1]:
from bs4 import BeautifulSoup
import numpy as np
import pandas as pd
import requests
from dateutil.parser import parse

In [2]:
def crawl()->list:
    base_url = 'https://www.imdb.com/'
    html = requests.get("https://www.imdb.com/chart/moviemeter?sort=rk,asc&mode=simple&page=1").text
    soup = BeautifulSoup(html, 'lxml')
    movie_table = soup.tbody
    return [base_url + movie_row.td.a['href'] for movie_row in movie_table.findAll('tr')]
    
movie_links = crawl()

In [4]:
ratings = ["PG-13", 'R', "PG", 'G']
genres = ['Action', 'Adventure', 'Sci-Fi', 'Family', 'Drama', 'Romance', 'War', 'Thriller', 'Crime', 'Mystery']

def prep_df()->pd.DataFrame:
    return pd.DataFrame(columns=['title', 'rating', 'genre', 'release_date', 'ranking',
                         'director', 'cast', 'budget', 'runtime', 'cum_worldwide_gross'])

def is_date(string):
    try: 
        parse(string)
        return True
    except ValueError:
        return False

def parse_dollars(x:str)->str:
    return ''.join(x.split('$')[1].split(' ')[0].strip().split(','))
    
def populate_df()->pd.DataFrame:
    df = prep_df()
    for link in movie_links:
        html = requests.get(link).text
        soup = BeautifulSoup(html, 'lxml')
        title=rating=genre=release_date=ranking=director=cast=budget=runtime=cum_worldwide_gross=np.nan
        main_top = soup.find('div', {'id': 'main_top'})
        main_bottom = soup.find('div', {'id': 'main_bottom'})
        title = main_top.h1.text.split('(')[0]
        info_bar = main_top.find('div', {'class': 'subtext'}).text.split('|')
        for element in info_bar:
            if element.strip() in ["PG-13", 'R', "PG", 'G']:
                rating = element.strip()
            elif element.strip().split(',')[0] in genres: 
                genre = ';'.join(element.strip().split(','))
            elif is_date(element.strip().split('(')[0].strip()):
                release_date = element.strip().split('(')[0].strip()
            else:
                continue
#             rating = element.strip() if element.strip() in ["PG-13", 'R', "PG", 'G'] else np.nan
#             genre = ';'.join(element.strip().split(',')) if element.strip().split(',')[0] in ['Action', 'Adventure', 'Sci-Fi', 'Family'] else np.nan 
#             release_date = element.strip().split('(')[0].strip() if is_date(element.strip().split('(')[0].strip()) else np.nan
#         rating = info_bar[0].strip() if info_bar[0].strip() in ['PG', 'R'] else np.nan
#         genre = ';'.join(info_bar[2].strip().split(','))
        ranking_wrapper = main_top.find('div', {'class': 'ratings_wrapper'})
        ranking = ranking_wrapper.find('span', {'itemprop': 'ratingValue'}).text if ranking_wrapper != None else np.nan
        for item in main_top.findAll('div', {'class': 'credit_summary_item'}):
            if item.h4.text == 'Director:':
                director = item.a.text
        cast_members = list()
        for cast_mem in main_bottom.findAll('tr', {'class': ['odd', 'even']}):
            cast_members.append(cast_mem.find('td', {'class': False}).a.text.strip())
        cast = ';'.join(cast_members)
        for x in main_bottom.find('div', {'id':'titleDetails'}).findAll('div', {'class':'txt-block'}):
            try:
                if x.h4.text == 'Budget:':
                    budget = parse_dollars(x.text)
                elif 'Cumulative' in x.h4.text:
                    cum_worldwide_gross = parse_dollars(x.text)
                elif 'Runtime' in x.h4.text:
                    runtime = x.text.split()[1].strip()
            except:
                pass
        newRow = pd.Series({'title': title, 'rating': rating, 'genre': genre, 'release_date': release_date, 'ranking': ranking, 
                   'director': director, 'cast': cast, 'budget': budget, 'runtime': runtime, 
                   'cum_worldwide_gross': cum_worldwide_gross})
        df = df.append(newRow, ignore_index=True)
    return df

df = populate_df()

In [5]:
df.head(30)

Unnamed: 0,title,rating,genre,release_date,ranking,director,cast,budget,runtime,cum_worldwide_gross
0,Aquaman,PG,Action; \nAdventure; \nFantasy,21 December 2018,7.6,James Wan,Jason Momoa;Amber Heard;Willem Dafoe;Patrick W...,160000000.0,143.0,556815000.0
1,Spider-Man: Into the Spider-Verse,PG,,14 December 2018,8.7,,Shameik Moore;Jake Johnson;Hailee Steinfeld;Ma...,90000000.0,117.0,138049916.0
2,Bumblebee,PG,Action; \nAdventure; \nSci-Fi,21 December 2018,7.3,Travis Knight,Hailee Steinfeld;Jorge Lendeborg Jr.;John Cena...,135000000.0,114.0,65353863.0
3,Bird Box,,Drama; \nHorror; \nSci-Fi,21 December 2018,6.8,Susanne Bier,Sandra Bullock;Trevante Rhodes;John Malkovich;...,,124.0,
4,Mary Poppins Returns,G,,19 December 2018,7.3,Rob Marshall,Emily Blunt;Lin-Manuel Miranda;Ben Whishaw;Emi...,130000000.0,130.0,72040263.0
5,Hellboy,,Action; \nAdventure; \nFantasy,12 April 2019,,Neil Marshall,David Harbour;Milla Jovovich;Sasha Lane;Penelo...,,,
6,Roma,PG,Drama,21 November 2018,8.3,Alfonso Cuarón,Yalitza Aparicio;Marina de Tavira;Diego Cortin...,,135.0,
7,Mortal Engines,PG,Action; \nAdventure; \nFantasy,14 December 2018,6.4,Christian Rivers,Hera Hilmar;Robert Sheehan;Hugo Weaving;Jihae;...,100000000.0,128.0,55215000.0
8,Creed II,PG,Drama; \nSport,21 November 2018,7.8,Steven Caple Jr.,Michael B. Jordan;Sylvester Stallone;Tessa Tho...,50000000.0,130.0,137944327.0
9,The Mule,,Crime; \nDrama; \nMystery,14 December 2018,7.2,Clint Eastwood,Clint Eastwood;Bradley Cooper;Taissa Farmiga;A...,50000000.0,116.0,
