In [1]:
import pandas as pd
import numpy as np
import requests
from bs4 import BeautifulSoup
import json

# NOTE(review): creates an empty BeautifulSoup object; `soup` is not referenced
# by any of the cells visible in this part of the notebook - confirm whether it is still needed.
soup = BeautifulSoup()
from urllib.request import urlopen


In [2]:
def individual_url(year,search_number):
    """
    Build the IMDb feature-film search URL for one year and one result offset.

    year          -- value substituted into both ends of the date range
                     (``{year}-01-01`` to ``{year}-12-31``)
    search_number -- value substituted into the ``start=`` query parameter

    Returns the formatted URL as a string, e.g.
    https://www.imdb.com/search/title/?title_type=feature&year=2006-01-01,2006-12-31&start=1&ref_=adv_nxt
    """
    url_template = 'https://www.imdb.com/search/title/?title_type=feature&year={}-01-01,{}-12-31&start={}&ref_=adv_nxt'
    return url_template.format(year, year, search_number)

In [3]:
def search_list(list_of_years,list_of_searches):
    """
    Pair every year with every search offset and build the matching search URL.

    list_of_years    -- iterable of years
    list_of_searches -- iterable of values for the ``start=`` query parameter

    Returns a list of ``(url, year)`` tuples, ordered by year first and by
    search offset second; each url comes from ``individual_url(year, offset)``.
    """
    return [
        (individual_url(year, offset), year)
        for year in list_of_years
        for offset in list_of_searches
    ]
    
    

In [4]:
# Quick check: build the (url, year) list for a single year (2004) and a single offset (1).
time_search = search_list([2004],[1])
time_search

[('https://www.imdb.com/search/title/?title_type=feature&year=2004-01-01,2004-12-31&start=1&ref_=adv_nxt',
  2004)]

In [5]:
def list_individual_urls(list_of_years,list_of_searches):
    '''
    Download every search page and collect one record per listed title.

    list_of_years    -- iterable of years to search
    list_of_searches -- iterable of ``start=`` offsets for each year
                        ATTENTION! list_of_searches MUST start at 1 and be increments
                        of 50 (ex. 1,51,101...) or else DUPLICATES

    Returns a list of (title page url, title text, year) tuples, in page order.
    Network errors raised by urlopen are not caught here.
    '''
    records = []

    for link, year in search_list(list_of_years, list_of_searches):
        # The context manager closes the HTTP response even if read() raises.
        with urlopen(link) as response:
            page_html = response.read()

        page_soup = BeautifulSoup(page_html, 'html.parser')
        for header in page_soup.find_all("h3", {"class": "lister-item-header"}):
            anchor = header.a
            # A header without a link cannot be followed to a title page; skip it
            # instead of failing on the missing 'href'.
            if anchor is None or not anchor.get('href'):
                continue
            records.append(('https://www.imdb.com' + anchor['href'], anchor.text, year))

    return records

        

    
    

In [6]:
import re
from tqdm import tqdm

In [7]:
def scraped_df(list_of_years,list_of_searches):
    '''
    Scrape the search pages and every listed title page into one dataframe.

    list_of_years    -- iterable of years, passed to list_individual_urls
    list_of_searches -- iterable of ``start=`` offsets, passed to list_individual_urls

    Returns the outer merge (on 'url') of two frames:
      * url, title, year                                -- from the search pages
      * url, genre, gross_usa, budget, worldwide_gross  -- from each title page
    genre is a list of strings ([] when not found); the three money fields are
    the raw scraped strings ('' when not found).

    One HTTP request is made per title; network errors from urlopen propagate.
    '''
    # Passing columns= to the constructor keeps this working when nothing was
    # scraped (assigning .columns on an empty frame raises a length mismatch).
    _list_url = pd.DataFrame(
        list_individual_urls(list_of_years, list_of_searches),
        columns=['url','title','year'],
    )

    _individual = []
    for link in tqdm(_list_url['url']):
        page_soup = _fetch_soup(link)

        # Flatten the txt-block divs into one line so the regexes can look
        # for the value that directly follows each "<label></h4>".
        con_info = page_soup.find_all("div",{"class":"txt-block"})
        _str_con_info = str(con_info).replace('\n',' ')

        _individual.append((
            link,
            _extract_genres(page_soup),
            _first_match(r'(?<=Gross USA:<\/h4>).\S*', _str_con_info),
            _first_match(r'(?<=Budget:<\/h4>).\S*', _str_con_info),
            _first_match(r'(?<=Worldwide Gross:<\/h4>).\S*', _str_con_info),
        ))

    _individual_df = pd.DataFrame(
        _individual,
        columns=['url','genre','gross_usa','budget','worldwide_gross'],
    )

    return pd.merge(_list_url,_individual_df,on='url',how='outer')


def _fetch_soup(url):
    '''Download url and return its content parsed with html.parser.'''
    # The context manager closes the HTTP response even if read() raises.
    with urlopen(url) as response:
        page_html = response.read()
    return BeautifulSoup(page_html, 'html.parser')


def _extract_genres(page_soup):
    '''Return the link texts of the last "see-more inline canwrap" div, or [] if there is none.'''
    blocks = page_soup.find_all("div",{"class":"see-more inline canwrap"})
    if not blocks:
        return []
    return [a.text.strip(' ') for a in blocks[-1].find_all('a')]


def _first_match(pattern, text):
    '''Return the first regex match of pattern in text with spaces stripped, or '' if no match.'''
    match = re.search(pattern, text)
    return match.group(0).strip(' ') if match else ''
        
        
        
    

In [8]:
# Scrape the 2004 search page at offset 1; scraped_df issues one HTTP request per listed title.
sample_albert = scraped_df([2004],[1])
sample_albert

100%|██████████| 50/50 [00:40<00:00,  1.23it/s]


Unnamed: 0,url,title,year,genre,gross_usa,budget,worldwide_gross
0,https://www.imdb.com/title/tt0377092/,Mean Girls,2004,[Comedy],"$86,058,055","$17,000,000","$130,125,829"
1,https://www.imdb.com/title/tt0332280/,The Notebook,2004,"[Drama, Romance]","$81,001,787","$29,000,000","$115,882,795"
2,https://www.imdb.com/title/tt0304141/,Harry Potter and the Prisoner of Azkaban,2004,"[Adventure, Family, Fantasy, Mystery]","$249,975,996","$130,000,000","$799,972,094"
3,https://www.imdb.com/title/tt0347149/,Howl's Moving Castle,2004,"[Animation, Adventure, Family, Fantasy]","$5,576,743","$24,000,000","$236,212,992"
4,https://www.imdb.com/title/tt0364725/,Dodgeball,2004,"[Comedy, Sport]","$114,326,736","$20,000,000","$168,423,227"
5,https://www.imdb.com/title/tt0332452/,Troy,2004,"[Drama, History]","$133,378,256","$175,000,000","$497,409,852"
6,https://www.imdb.com/title/tt0338013/,Eternal Sunshine of the Spotless Mind,2004,"[Drama, Romance, Sci-Fi]","$34,400,301","$20,000,000","$74,036,715"
7,https://www.imdb.com/title/tt0349903/,Ocean's Twelve,2004,"[Crime, Thriller]","$125,544,280","$110,000,000","$362,744,280"
8,https://www.imdb.com/title/tt0265208/,The Girl Next Door,2004,"[Comedy, Drama, Romance]","$14,589,444","$25,000,000","$30,381,722"
9,https://www.imdb.com/title/tt0381707/,White Chicks,2004,"[Comedy, Crime]","$70,831,760","$37,000,000","$113,100,873"


In [9]:
# Inspect one genre cell: it holds a Python list of strings.
sample_albert['genre'][0]

['Comedy']

In [10]:
# Inspect one budget cell: it is the raw scraped string, not a number.
sample_albert['budget'][0]

'$17,000,000'

In [11]:
# Build only the url/title/year frame from the search page, without visiting each title page.
df = pd.DataFrame(list_individual_urls([2004],[1]))
# The tuples carry no field names, so label the three positional columns explicitly.
df.columns = ['url','title','year']
df

Unnamed: 0,url,title,year
0,https://www.imdb.com/title/tt0377092/,Mean Girls,2004
1,https://www.imdb.com/title/tt0332280/,The Notebook,2004
2,https://www.imdb.com/title/tt0304141/,Harry Potter and the Prisoner of Azkaban,2004
3,https://www.imdb.com/title/tt0347149/,Howl's Moving Castle,2004
4,https://www.imdb.com/title/tt0364725/,Dodgeball,2004
5,https://www.imdb.com/title/tt0332452/,Troy,2004
6,https://www.imdb.com/title/tt0338013/,Eternal Sunshine of the Spotless Mind,2004
7,https://www.imdb.com/title/tt0349903/,Ocean's Twelve,2004
8,https://www.imdb.com/title/tt0265208/,The Girl Next Door,2004
9,https://www.imdb.com/title/tt0381707/,White Chicks,2004
