In [1]:
import pandas as pd
import numpy as np
import requests
from bs4 import BeautifulSoup
import json

# Empty placeholder soup. None of the cells visible here read `soup` afterwards;
# every parse below builds its own BeautifulSoup(page_html, 'html.parser').
soup = BeautifulSoup()
from urllib.request import urlopen


In [2]:
def individual_url(year,search_number):
    '''
    builds the IMDb feature-film search url for a single calendar year

    year          : placed at both ends of the date range (YEAR-01-01,YEAR-12-31)
    search_number : value of the `start` query parameter, i.e. which result the page begins at
                    (the rest of this notebook steps it as 1, 51, 101, ...)

    example: https://www.imdb.com/search/title/?title_type=feature&year=2006-01-01,2006-12-31&start=1&ref_=adv_nxt
    '''
    template = 'https://www.imdb.com/search/title/?title_type=feature&year={}-01-01,{}-12-31&start={}&ref_=adv_nxt'
    return template.format(year, year, search_number)

In [3]:
def search_list(list_of_years,list_of_searches):
    '''
    returns a list of (url, year) tuples, one per (year, search offset) combination

    urls are produced by individual_url; the result is ordered year by year,
    and within a year in the order given by list_of_searches

    example: search_list([2004],[1]) ->
        [('https://www.imdb.com/search/title/?title_type=feature&year=2004-01-01,2004-12-31&start=1&ref_=adv_nxt', 2004)]
    '''
    return [
        (individual_url(year, start), year)
        for year in list_of_years
        for start in list_of_searches
    ]
    
    

In [4]:
# Quick check of search_list: one year (2004) and one start offset (1) should give
# a single (url, year) tuple. The bare name on the last line displays the result.
time_search = search_list([2004],[1])
time_search

[('https://www.imdb.com/search/title/?title_type=feature&year=2004-01-01,2004-12-31&start=1&ref_=adv_nxt',
  2004)]

In [5]:
def list_individual_urls(list_of_years,list_of_searches):
    '''
    returns list of (title page url, title, year) tuples scraped from the IMDb search result pages

    list_of_years    : years to search, passed on to search_list
    list_of_searches : start offsets of the result pages, passed on to search_list
    ATTENTION! list_of_searches MUST start at 1 and be increments of 50(ex. 1,51,101...) or else DUPLICATES

    Network errors raised by urlopen (URLError / HTTPError) are not caught and propagate to the caller.
    Result headers that carry no link are skipped instead of aborting the whole scrape.
    '''

    _list = []

    for link, year in search_list(list_of_years,list_of_searches):
        # `with` guarantees the HTTP response is closed even when read() raises
        with urlopen(link) as response:
            page_html = response.read()

        page_soup = BeautifulSoup(page_html, 'html.parser')
        for container in page_soup.find_all("h3",{"class":"lister-item-header"}):
            anchor = container.a
            # a header without an <a href=...> cannot be turned into a title url
            if anchor is None or not anchor.get('href'):
                continue
            _list.append(('https://www.imdb.com' + anchor['href'], anchor.text, year))

    return _list

        

    
    

In [6]:
import re
from tqdm import tqdm

In [7]:
def _first_match(pattern, text, strip_spaces=True):
    '''
    returns the first match of regex `pattern` in `text`, or np.nan when nothing matches
    strip_spaces=True trims leading/trailing spaces from the matched text
    '''
    found = re.search(pattern, text)
    if found is None:
        return np.nan
    matched = found.group(0)
    return matched.strip(' ') if strip_spaces else matched


def _scrape_title_page(link):
    '''
    downloads one title page and returns the tuple
    (link, genre, gross_usa, budget, worldwide_gross, pg_rated, rating, rating_pop)
    every field that cannot be located on the page is np.nan
    '''
    # `with` guarantees the HTTP response is closed even when read() raises
    with urlopen(link) as response:
        page_html = response.read()
    page_soup = BeautifulSoup(page_html, 'html.parser')

    # genre links are taken from the LAST "see-more inline canwrap" block on the page
    con_genre = page_soup.find_all("div",{"class":"see-more inline canwrap"})
    if con_genre:
        _genre = [a.text.strip(' ') for a in con_genre[-1].find_all('a')]
    else:
        _genre = np.nan

    # the money / certificate fields are pulled with regexes from the raw html of
    # the "txt-block" divs; newlines are flattened so each value sits on one line
    con_info = page_soup.find_all("div",{"class":"txt-block"})
    _str_con_info = str(con_info).replace('\n',' ')

    _gross_USA = _first_match(r'(?<=Gross USA:<\/h4>).\S*', _str_con_info)
    _budget = _first_match(r'(?<=Budget:<\/h4>).\S*', _str_con_info)
    _worldwide_gross = _first_match(r'(?<=Worldwide Gross:<\/h4>).\S*', _str_con_info)
    _pg_rated = _first_match(r'(?<=<span>Rated).\S*', _str_con_info)

    # rating value is the text of <strong> in the first "ratingValue" div;
    # the vote count is read from that tag's title attribute (text after "based on ")
    con_rating = page_soup.find_all("div",{"class":"ratingValue"})
    strong = con_rating[0].strong if con_rating else None
    if strong is None:
        _rating = np.nan
        _rating_pop = np.nan
    else:
        _rating = strong.text
        title_attr = strong.get('title')
        if title_attr:
            _rating_pop = _first_match(r'(?<=based on ).\S*', title_attr, strip_spaces=False)
        else:
            _rating_pop = np.nan

    return (link,_genre,_gross_USA,_budget,_worldwide_gross,_pg_rated,_rating,_rating_pop)


def scraped_df(list_of_years,list_of_searches):
    '''
    returns a merged dataframe from list of individual urls AND items from EACH individual URL

    list_of_years    : years to scrape, passed on to list_individual_urls
    list_of_searches : start offsets of the search result pages, passed on to list_individual_urls

    columns: url, title, year, genre, gross_usa, budget, worldwide_gross, pg_rated, rating, rating_pop
    Rows keep the order in which the titles were listed. A field missing from a title page is np.nan.
    Each distinct url is downloaded once, even when it was listed more than once.
    Network errors raised by urlopen are not caught and propagate to the caller.
    '''
    #creating first dataframe of urls, title, year
    # columns are passed to the constructor so an empty listing yields an empty frame instead of a ValueError
    _list_url = pd.DataFrame(list_individual_urls(list_of_years, list_of_searches),
                             columns=['url','title','year'])

    #creating second dataframe of urls, genre, gross_usa, budget, worldwide_gross, ...
    # de-duplicate first: repeated urls would be fetched again and multiply rows in the merge
    _unique_urls = _list_url['url'].drop_duplicates()
    _individual = [_scrape_title_page(link) for link in tqdm(_unique_urls)]
    _individual_df = pd.DataFrame(
        _individual,
        columns=['url','genre','gross_usa','budget','worldwide_gross','pg_rated','rating','rating_pop'])

    # every scraped url comes from _list_url, so a left join returns the same rows as the
    # former outer join while keeping the listing order (outer joins may sort by key)
    _merged = pd.merge(_list_url,_individual_df,on='url',how='left')

    return _merged
        
        
        
    

In [21]:
# Sample scrape: one result page of 2004 (start offset 1). The recorded run fetched
# 50 title pages in about 43 seconds. The bare name on the last line displays the frame.
sample_albert = scraped_df([2004],[1])
sample_albert

100%|██████████| 50/50 [00:43<00:00,  1.16it/s]


Unnamed: 0,url,title,year,genre,gross_usa,budget,worldwide_gross,pg_rated,rating,rating_pop
0,https://www.imdb.com/title/tt0377092/,Mean Girls,2004,[Comedy],"$86,058,055","$17,000,000","$130,125,829",PG-13,7.0,321651
1,https://www.imdb.com/title/tt0332280/,The Notebook,2004,"[Drama, Romance]","$81,001,787","$29,000,000","$115,882,795",PG-13,7.8,503796
2,https://www.imdb.com/title/tt0304141/,Harry Potter and the Prisoner of Azkaban,2004,"[Adventure, Family, Fantasy, Mystery]","$249,975,996","$130,000,000","$799,972,094",PG,7.9,528417
3,https://www.imdb.com/title/tt0347149/,Howl's Moving Castle,2004,"[Animation, Adventure, Family, Fantasy]","$5,576,743","$24,000,000","$236,212,992",PG,8.2,313884
4,https://www.imdb.com/title/tt0364725/,Dodgeball,2004,"[Comedy, Sport]","$114,326,736","$20,000,000","$168,423,227",PG-13,6.7,224226
5,https://www.imdb.com/title/tt0332452/,Troy,2004,"[Drama, History]","$133,378,256","$175,000,000","$497,409,852",R,7.2,476927
6,https://www.imdb.com/title/tt0338013/,Eternal Sunshine of the Spotless Mind,2004,"[Drama, Romance, Sci-Fi]","$34,400,301","$20,000,000","$74,036,715",R,8.3,882383
7,https://www.imdb.com/title/tt0349903/,Ocean's Twelve,2004,"[Crime, Thriller]","$125,544,280","$110,000,000","$362,744,280",PG-13,6.5,349265
8,https://www.imdb.com/title/tt0265208/,The Girl Next Door,2004,"[Comedy, Drama, Romance]","$14,589,444","$25,000,000","$30,381,722",R,6.7,197437
9,https://www.imdb.com/title/tt0381707/,White Chicks,2004,"[Comedy, Crime]","$70,831,760","$37,000,000","$113,100,873",PG-13,5.6,126898


In [11]:
# Years to scrape: 2011 through 2020 inclusive (the stop value of range is exclusive).
_2011TO2020 = list(range(2011,2021))

In [12]:
# Start offsets of six consecutive result pages, stepping by 50 from 1 as
# list_individual_urls requires (6 pages per year).
_top300 = [1,51,101,151,201,251]

In [13]:
# Full scrape: 10 years x 6 result pages. The recorded progress bar shows 3000 title
# pages at roughly one page per second, i.e. a run of about 50 minutes.
_2011_2020 = scraped_df(_2011TO2020,_top300)


  0%|          | 0/3000 [00:00<?, ?it/s]
  0%|          | 1/3000 [00:00<44:32,  1.12it/s]
  0%|          | 2/3000 [00:01<45:24,  1.10it/s]
  0%|          | 3/3000 [00:03<50:22,  1.01s/it]
  0%|          | 4/3000 [00:04<51:15,  1.03s/it]
  0%|          | 5/3000 [00:05<50:20,  1.01s/it]
  0%|          | 6/3000 [00:06<52:22,  1.05s/it]
  0%|          | 7/3000 [00:07<48:15,  1.03it/s]
  0%|          | 8/3000 [00:08<48:59,  1.02it/s]
  0%|          | 9/3000 [00:09<59:45,  1.20s/it]
  0%|          | 10/3000 [00:10<56:20,  1.13s/it]
  0%|          | 11/3000 [00:11<56:08,  1.13s/it]
  0%|          | 12/3000 [00:12<52:29,  1.05s/it]
  0%|          | 13/3000 [00:13<50:14,  1.01s/it]
  0%|          | 14/3000 [00:14<53:40,  1.08s/it]
  0%|          | 15/3000 [00:15<50:02,  1.01s/it]
  1%|          | 16/3000 [00:16<51:49,  1.04s/it]
  1%|          | 17/3000 [00:17<51:28,  1.04s/it]
  1%|          | 18/3000 [00:18<50:00,  1.01s/it]
  1%|          | 19/3000 [00:19<48:19,  1.03it/s]
  1%|          | 

In [14]:
# Save the scraped frame to '2011_2020.csv' with a header row and without the index column.
_2011_2020.to_csv('2011_2020.csv',header=True,index=False)