### Import packages

In [1]:
import concurrent.futures
import time
from random import randint
from urllib.parse import urljoin

import numpy as np
import pandas as pd
import requests as req
from bs4 import BeautifulSoup as bs

In [2]:
# Request headers sent with every page download: ask for US English text
# (falling back to generic English) so scraped labels come back in one language.
headers = {
    "Accept-Language": "en-US, en;q=0.5",
}

## Create the list of result pages' URLs

In [3]:
# Start offsets (as strings) of the 40 result pages; each page lists 250 movies,
# so together they cover 10,000 movies.
pages = list(map(str, range(1, 10000, 250)))

In [4]:
# One search-result URL per start offset; only the `start` query parameter varies.
urls = []
for page in pages:
    p_url = (
        'https://www.imdb.com/search/title/'
        '?at=0&num_votes=5000,&sort=user_rating,desc&count=250'
        f'&start={page}&title_type=feature'
    )
    urls.append(p_url)

In [5]:
# Total number of result pages, knowing that each page lists 250 movies.
len(urls)

40

## Create a function to get movies' url

In [6]:
def fetchmovies_link(url):
    """Collect the movie links listed on one IMDb search-result page.

    The links are added to the module-level ``movies_link`` list, which must
    already exist when this function runs (the worker threads share it).

    Parameters
    ----------
    url : str
        Address of one search-result page.

    Returns
    -------
    list
        The shared ``movies_link`` list, i.e. every link gathered so far by
        all calls, not only the links found on this page.

    Raises
    ------
    requests.RequestException
        If the page cannot be downloaded (connection error, timeout, or an
        HTTP error status).
    """
    # Without a timeout a stalled connection would block its worker thread forever.
    response = req.get(url, headers=headers, timeout=30)
    # Fail loudly on an error response instead of parsing it into zero links.
    response.raise_for_status()

    html = bs(response.text, 'html.parser')
    movies = html.find_all('div', {'class': 'lister-item mode-advanced'})

    baseurl = 'https://www.imdb.com'
    # The first anchor of each list item carries the path appended to the base URL.
    page_links = [baseurl + movie.find('a')['href'] for movie in movies]

    # Add the whole page at once rather than link by link.
    movies_link.extend(page_links)

    return movies_link

#### Use of concurrency to speed up my fetchmovies_link function

In [7]:
# t1 is the start time and t2 the end time of the whole threaded run;
# their difference is the total wall-clock time.

# Create the shared result list BEFORE any worker is started: fetchmovies_link
# adds its links to this global, so it must exist when the first page returns.
movies_link = []

t1 = time.perf_counter()

with concurrent.futures.ThreadPoolExecutor() as executor:
    futures = [executor.submit(fetchmovies_link, url) for url in urls]

t2 = time.perf_counter()

# Leaving the `with` block waits for every worker, so all futures are done here.
# Report the pages whose download or parsing raised instead of losing them silently.
failed_pages = [url for url, future in zip(urls, futures) if future.exception() is not None]
if failed_pages:
    print(f'{len(failed_pages)} page(s) could not be scraped: {failed_pages}')

print(f'finished in {t2 - t1} seconds')

finished in 54.6239767 seconds


In [8]:
# Check that we collected 10,000 movie links across all 40 pages.
len(movies_link)

10000

In [9]:
# Check that an entry really is a link, and compare it with the IMDb website.
movies_link[0]

'https://www.imdb.com/title/tt0070511/'

In [10]:
# Duplicate check: the number of distinct links should equal len(movies_link).
list_movieslink = pd.DataFrame({'links': movies_link})
list_movieslink.nunique()

links    10000
dtype: int64

## Create a function to get our data

In [64]:
def data(movies_link):
    """Scrape one IMDb title page and record its details as a dict.

    The dict is appended to the module-level ``imdb`` list, which must already
    exist when this function runs (the worker threads share it).

    Fields with no matching element on the page keep their initial value
    (``None`` or an empty list). ``Score``, ``Oscars`` and ``Critic_review``
    fall back to the placeholder string ``'xxx'``.

    Parameters
    ----------
    movies_link : str
        Address of a single movie page (despite the plural name).

    Returns
    -------
    list
        The shared ``imdb`` list, i.e. the records gathered so far by all calls.

    Raises
    ------
    requests.RequestException
        If the title page or the filming-locations page cannot be downloaded.
    AttributeError, ValueError
        If the page lacks an element the parser requires (page body, summary,
        title, rating, vote count, review bar, details section, ...) or a
        numeric field cannot be converted.
    """
    # 'Oscars' is added further down, so it ends up as the last key of the record.
    imdb_data = {
        'Name': None,
        'Directors': [],
        'Writers': [],
        'Imdb_rating': None,
        'Score': None,
        'Genres': [],
        'Votes': None,
        'Runtime': None,
        'Country': None,
        'Release_date': None,
        'Budget': None,
        'Opening_weekend': None,
        'Gross_USA': None,
        'Worldwide_gross': None,
        'Sound_mix': None,
        'Color': None,
        'Aspect_ratio': None,
        'Film_location': [],
        'Summary': None,
        'User_review': None,
        'Critic_review': None,
    }

    # Detail headings whose value is the plain text node right after the heading.
    text_fields = {
        'Release Date:': 'Release_date',
        'Budget:': 'Budget',
        'Opening Weekend USA:': 'Opening_weekend',
        'Gross USA:': 'Gross_USA',
        'Cumulative Worldwide Gross:': 'Worldwide_gross',
        'Aspect Ratio:': 'Aspect_ratio',
    }

    # Without a timeout a stalled connection would block its worker thread forever.
    response = req.get(url=movies_link, headers=headers, timeout=30)
    response.raise_for_status()
    html = bs(response.text, 'html.parser')

    body = html.find('div', class_='pagecontent')

    imdb_data['Summary'] = body.find('div', class_='summary_text').text.strip()

    for block in body.find_all('div', {'class': 'see-more inline canwrap'}):
        if block.h4.text == 'Genres:':
            imdb_data['Genres'].extend(anchor.text for anchor in block.find_all('a'))

    imdb_data['Name'] = body.find('div', class_='title_wrapper').h1.text

    # NOTE(review): the class string contains 'score_favorable', so a score
    # rendered with a different class would be stored as 'xxx' — confirm.
    scores = body.find('div', class_='metacriticScore score_favorable titleReviewBarSubItem')
    imdb_data['Score'] = float(scores.span.text) if scores else 'xxx'

    imdb_data['Imdb_rating'] = float(body.find('span', {'itemprop': 'ratingValue'}).text)
    imdb_data['Votes'] = int(body.find('span', {'class': 'small'}).text.replace(',', ''))

    oscar = body.find('span', {'class': 'awards-blurb'})
    if oscar and 'Oscars' in oscar.text and 'Won' in oscar.text:
        # The count is read from a fixed character window of the blurb text.
        # NOTE(review): this depends on the blurb's exact whitespace layout — confirm.
        imdb_data['Oscars'] = int(oscar.text[25:31])
    else:
        imdb_data['Oscars'] = 'xxx'

    review = body.find('div', class_='titleReviewBarItem titleReviewbarItemBorder')
    review_links = review.find_all('a')
    # The slices drop a fixed-length suffix after each count
    # (presumably the ' user' / ' critic' label — TODO confirm).
    imdb_data['User_review'] = review.a.text[:-4].replace(',', '')
    # A page with no second link has no critic count: use the placeholder
    # instead of failing with IndexError and losing the whole movie.
    critic_count = review_links[1].text[:-6].replace(',', '') if len(review_links) > 1 else ''
    imdb_data['Critic_review'] = critic_count or 'xxx'

    for team in body.find_all('div', class_='credit_summary_item'):
        if not team.h4:
            continue
        crew = team.h4.text.strip()
        # NOTE(review): the singular headings are accepted as well, since a film
        # with a single director/writer presumably uses them — confirm.
        if crew in ('Directors:', 'Director:'):
            imdb_data['Directors'].extend(anchor.text for anchor in team.find_all('a'))
        elif crew in ('Writers:', 'Writer:'):
            imdb_data['Writers'].extend(anchor.text for anchor in team.find_all('a'))

    details = body.find('div', {'class': 'article', 'id': 'titleDetails'})

    for inf in details.find_all('div', class_='txt-block'):
        if not inf.h4:
            continue
        det = inf.h4.text.strip()

        if det in text_fields:
            # contents[2] is the third child node of the block; it holds the value text.
            imdb_data[text_fields[det]] = inf.contents[2].strip()
        elif det == 'Country:':
            # Only the first linked country is kept.
            imdb_data['Country'] = inf.a.text
        elif det == 'Runtime:':
            imdb_data['Runtime'] = inf.time.text.replace('min', '').strip()
        elif det == 'Sound Mix:':
            imdb_data['Sound_mix'] = inf.a.text.strip()
        elif det == 'Color:':
            imdb_data['Color'] = inf.a.text.strip()
        elif det == 'Filming Locations:':
            see_more = inf.find('span', class_='see-more inline')
            if not see_more:
                # Single location shown inline: store it in the list so the
                # field has the same type as in the multi-location case.
                imdb_data['Film_location'].append(inf.a.text)
            else:
                # urljoin resolves the "see more" href against the movie URL,
                # whether the href is relative or an absolute path.
                loc_url = urljoin(movies_link, see_more.find('a')['href'])
                # Fetching and parsing this extra page slows the scrape by about a second.
                response2 = req.get(loc_url, headers=headers, timeout=30)
                response2.raise_for_status()
                html2 = bs(response2.text, 'html.parser')
                tab_loc = html2.find('section', {'id': 'filming_locations'})
                for loc in tab_loc.find_all('dt'):
                    # [:-1] drops the last character of the link text.
                    imdb_data['Film_location'].append(loc.a.text[:-1])

    imdb.append(imdb_data)
    return imdb

#### Use of concurrency to speed up my data function

In [67]:
# Create the shared result list BEFORE any worker is started: data() appends
# each record to this global, so it must exist when the first page returns.
imdb = []

t1 = time.perf_counter()

with concurrent.futures.ThreadPoolExecutor() as executor:
    futures1 = [executor.submit(data, link) for link in movies_link]

t2 = time.perf_counter()

# Leaving the `with` block waits for every worker, so all futures are done here.
# Count the movies whose scrape raised, so that any gap between
# len(movies_link) and len(imdb) is reported instead of passing unnoticed.
failed_movies = [link for link, future in zip(movies_link, futures1)
                 if future.exception() is not None]
if failed_movies:
    print(f'{len(failed_movies)} movie(s) could not be scraped')

In [84]:
# Summarise the run: total time, average time per movie, and movies per second.
elapsed = t2 - t1
scraped = len(imdb)
print(f'''finished scrapping {scraped} movie(s) in {elapsed} seconds({elapsed/60} minute(s))
that is {elapsed/scraped} second to scrap one movie and 
this means {scraped/elapsed} movie(s) is scrapped every second''')

finished scrapping 9942 movie(s) in 2250.8200889 seconds(37.51366814833333 minute(s))
that is 0.2263951004727419 second to scrap one movie and 
this means 4.417056720361316 movie(s) is scrapped every second


In [68]:
# Check how many of the 10,000 movies ended up with a scraped record.
len(imdb)

9942

In [70]:
# Build a table from the list of per-movie dicts: one row per record in imdb.
Data_imdb = pd.DataFrame(imdb)

In [98]:
# (number of rows, number of columns) of the scraped table.
Data_imdb.shape

(9942, 22)

In [72]:
# Save the table to Data_imdb.csv in the working directory.
# By default pandas also writes the row index as the first (unnamed) column.
Data_imdb.to_csv('Data_imdb.csv')