# Web Scrapping Top 1000 movies IMDB

Web Scrapping Imports and code

In [1]:
#packages needed
import numpy as np
import pandas as pd
import requests
from requests import get
from bs4 import BeautifulSoup

#imported to add more human behavior
from time import sleep
from random import randint

headers = {'Accept-Language': 'en-US,en;q=0.5'}

#empty list containers 
metascores = []
gross_total = []
years = []
titles = []
imdb_ratings = []
actors = []
genres = []
directors = []
movie_ratings = []
votes = []


#Setting range
pages = np.arange(1, 1001, 50)

#Initial for loop to go through webpages set up with time delays
for page in pages:
    #setting each loop to be the url
    page = requests.get("https://www.imdb.com/search/title/?groups=top_1000&start=" + str(page) + "&ref_=adv_nxt", headers=headers)
    #using soup to parse through each new url
    soup = BeautifulSoup(page.text, 'html.parser')
    #initial container for each movie
    movie_div = soup.find_all('div', class_='lister-item mode-advanced')
    #created random intervals of waiting before next url is loaded
    sleep(randint(2,10))
    #loops through each container in each page
    for cont in movie_div:
        #setting each variable to push into the empty list
        movie_title = cont.h3.a.text
        titles.append(movie_title)
        
        movie_year = cont.h3.find('span', class_='lister-item-year').text
        years.append(movie_year)
        
        movie_rating = cont.p.find('span', class_='certificate').text if cont.p.find('span', class_='certificate') else  ''
        movie_ratings.append(movie_rating)
        
        movie_genre = cont.p.find('span', class_='genre').text if cont.p.find('span', class_='genre') else ''
        genres.append(movie_genre)
        
        imdb_rating = float(cont.strong.text)
        imdb_ratings.append(imdb_rating)
        
        metascore = cont.find('span', class_= 'metascore').text if cont.find('span', class_='metascore') else ''
        metascores.append(metascore)
        
        cast_and_crew = cont.findAll('p', class_='')[0].findAll('a')
        directors.append((cast_and_crew[0]).text)
        for x in cast_and_crew:
            actor_list = []
            if cast_and_crew.index(x) > 0:
                actor_list.append(x.text)
            actors.append(actor_list)
        
        val = cont.find_all('span', attrs={'name':'nv'})
        
        vote = val[0].text
        votes.append(vote)
        
        grosses = val[1].text if len(val) > 1 else ''
        gross_total.append(grosses)
#creating dataframes by turning the list into series
movies = pd.DataFrame({
    'Movie_name' : titles,
    'Year' : years,
    'Metascore': metascores,
    'IMDB_score' : imdb_ratings,
    'Movie_rating' : movie_ratings,
    'Votes' : votes,
    'Domestic_gross' : gross_total,
#     'Actors' : actors,
    'Director' : directors,
    'Genres' : genres
})

In [2]:
movies.head()

Unnamed: 0,Movie_name,Year,Metascore,IMDB_score,Movie_rating,Votes,Domestic_gross,Director,Genres
0,The Trial of the Chicago 7,(2020),76,7.9,R,52481,,Aaron Sorkin,"\nDrama, History, Thriller"
1,Halloween,(1978),87,7.8,R,229905,$47.00M,John Carpenter,"\nHorror, Thriller"
2,Tenet,(2020),69,7.8,PG-13,143680,$53.80M,Christopher Nolan,"\nAction, Sci-Fi"
3,The Untouchables,(1987),79,7.9,R,276672,$76.27M,Brian De Palma,"\nCrime, Drama, Thriller"
4,The Nightmare Before Christmas,(1993),82,8.0,PG,293203,$75.08M,Henry Selick,"\nAnimation, Family, Fantasy"


In [3]:
movies['Votes'] = movies['Votes'].str.replace(',', '').astype(int) # take out commas  and make into integers
#taking out commas and turning into integers

In [4]:
movies.loc[:, 'Year'] = movies['Year'].str[-5:-1].astype(int)
#taking out parenthesis and turning into integers

In [6]:
movies.loc[movies['Movie_rating'] == 'GP', 'Movie_rating'] = 'G'

In [7]:
movies['Domestic_gross'] = movies['Domestic_gross'].map(lambda x: x.lstrip('$').rstrip('M')) 
# strip the characters
movies['Domestic_gross'] = pd.to_numeric(movies['Domestic_gross'], errors='coerce')
# turn into a number 


In [8]:
movies["Genres"] = movies["Genres"].map(lambda x: x.lstrip('\n'))
#removing the begining characters in Genres

In [9]:
print(movies.dtypes)

Movie_name         object
Year                int64
Metascore          object
IMDB_score        float64
Movie_rating       object
Votes               int64
Domestic_gross    float64
Director           object
Genres             object
dtype: object


In [10]:
movies['Metascore'] = movies['Metascore'].str.extract('(\d+)') 
#grabbing the value of each metascore 
movies['Metascore'] = pd.to_numeric(movies['Metascore'], errors='coerce')
#and changing the value to be a float

In [11]:
movies.to_csv('IMDB_Top_1000.csv')

In [12]:
imdb_movies = pd.read_csv("IMDB_TOP_1000.csv")

In [13]:
imdb_movies

Unnamed: 0.1,Unnamed: 0,Movie_name,Year,Metascore,IMDB_score,Movie_rating,Votes,Domestic_gross,Director,Genres
0,0,The Trial of the Chicago 7,2020,76.0,7.9,R,52481,,Aaron Sorkin,"Drama, History, Thriller"
1,1,Halloween,1978,87.0,7.8,R,229905,47.00,John Carpenter,"Horror, Thriller"
2,2,Tenet,2020,69.0,7.8,PG-13,143680,53.80,Christopher Nolan,"Action, Sci-Fi"
3,3,The Untouchables,1987,79.0,7.9,R,276672,76.27,Brian De Palma,"Crime, Drama, Thriller"
4,4,The Nightmare Before Christmas,1993,82.0,8.0,PG,293203,75.08,Henry Selick,"Animation, Family, Fantasy"
...,...,...,...,...,...,...,...,...,...,...
995,995,Vizontele,2001,,8.0,,32723,,Yilmaz Erdogan,"Comedy, Drama"
996,996,The Breath,2009,,8.0,,31460,,Levent Semerci,"Action, Drama, Thriller"
997,997,Andaz Apna Apna,1994,,8.2,PG,48727,,Rajkumar Santoshi,"Comedy, Romance"
998,998,Drishyam,2013,,8.3,Not Rated,30131,,Jeethu Joseph,"Crime, Drama, Thriller"
