In [1]:
from bs4 import BeautifulSoup as bs
import pandas as pd
import numpy as np
from urllib.request import Request, urlopen
import requests

In [2]:
# HTTP headers sent with the page request: the Accept-Language entry asks
# the server for English (US first, generic English as a fallback).

headers = dict([('Accept-Language', 'en-US, en; q=0.5')])

In [3]:
# IMDb advanced-search URL restricted to the "top_1000" title group.

url = "https://www.imdb.com/search/title/?groups=top_1000&ref_=adv_prv"

# A timeout keeps this cell from hanging indefinitely when the server
# never answers (requests waits forever by default).
results = requests.get(url, headers=headers, timeout=30)

# Stop here on an HTTP error status (4xx/5xx) instead of letting the
# following cells parse an error page and silently produce no rows.
results.raise_for_status()

In [4]:
# Parse the raw response body with Python's built-in 'html.parser' backend
# so the page can be navigated as a tree of tags instead of one long string.
soup = bs(markup=results.content, features='html.parser')

In [None]:
# Preview only the beginning of the parsed document. Printing a slice shows
# real line breaks (instead of an escaped string repr) and keeps the whole
# page from being dumped into the notebook output.
print(soup.prettify()[:2000])

In [6]:
# One independent, empty accumulator list per output column; the scraping
# loop appends one value per movie to each of them.
# NOTE(review): `time` is also the name of a standard-library module; it is
# not imported in the visible cells, so nothing is shadowed here.
titles, years, time, imdb_ratings, metascores, votes, us_gross = (
    [] for _ in range(7)
)

In [7]:
# Collect every <div> whose class attribute is 'lister-item mode-advanced';
# the scraping loop treats each of these as one movie card.

movie_div = soup.find_all('div', attrs={'class': 'lister-item mode-advanced'})

In [8]:
# Walk every movie card and collect one value per output column.

def _text_or(tag, default):
    """Return ``tag.text``, or ``default`` when the lookup returned ``None``."""
    return default if tag is None else tag.text


# Empty the accumulators first so that re-running this cell replaces the
# rows instead of appending a second copy of every movie.
for _column in (titles, years, time, imdb_ratings, metascores, votes, us_gross):
    _column.clear()

for table in movie_div:

    # titles
    name = table.h3.a.text
    titles.append(name)

    # years (kept as raw text here, e.g. "(2019)"; cleaned in a later cell)
    year = table.h3.find('span', class_='lister-item-year').text
    years.append(year)

    # runtime: one lookup on the card serves as both the existence check and
    # the value read; '' when the card has no runtime span
    runtime = _text_or(table.find('span', class_='runtime'), '')
    time.append(runtime)

    # imdb rating
    imdb = float(table.strong.text)
    imdb_ratings.append(imdb)

    # metascore: '-' when the card has no metascore span
    m_score = _text_or(table.find('span', class_='metascore'), '-')
    metascores.append(m_score)

    # spans with name="nv": the first is read as the vote count and the
    # second, when present, as the gross
    nv = table.find_all('span', attrs={'name': 'nv'})

    # votes (an IndexError here means the card has no name="nv" span at all)
    vote = nv[0].text
    votes.append(vote)

    # grosses: '-' when only the vote span is present
    grosses = nv[1].text if len(nv) > 1 else '-'
    us_gross.append(grosses)

In [9]:
# Assemble the scraped columns into a single table, one row per movie.
# Column order follows the keyword order below.
movies = pd.DataFrame(
    data=dict(
        movie=titles,
        year=years,
        timeMin=time,
        imdb=imdb_ratings,
        metascore=metascores,
        votes=votes,
        us_grossMillions=us_gross,
    )
)

# Data Cleaning

In [10]:
# Show the first five rows as a quick sanity check of the table.
movies.head(n=5)

Unnamed: 0,movie,year,timeMin,imdb,metascore,votes,us_grossMillions
0,The Gentlemen,(2019),113 min,8.0,51,79312,-
1,Knives Out,(2019),131 min,8.0,82,252859,$165.36M
2,1917,(2019),119 min,8.4,78,259363,$159.18M
3,Parasite,(2019),132 min,8.6,96,343830,$53.37M
4,Once Upon a Time... in Hollywood,(2019),161 min,7.7,83,424490,$142.50M


In [11]:
# Keep only the digits of the scraped year text, e.g. "(2019)" -> 2019.
# The pattern is a raw string because "\d" inside a plain string literal is
# an invalid escape sequence; expand=False makes extract() return a Series
# rather than a one-column DataFrame.
movies['year'] = movies['year'].str.extract(r'(\d+)', expand=False).astype(int)

In [12]:
# Keep only the number of minutes from the runtime text, e.g. "113 min" -> 113.
# The scraping loop stores '' when a movie has no runtime; extract() turns
# that into NaN, which a plain .astype(int) cannot convert. The nullable
# 'Int64' dtype keeps whole-number values and represents the gaps as <NA>.
# Raw-string pattern: "\d" in a plain literal is an invalid escape sequence.
movies['timeMin'] = pd.to_numeric(
    movies['timeMin'].str.extract(r'(\d+)', expand=False)
).astype('Int64')

In [13]:
# Drop the thousands separators from the vote counts (literal comma, no
# regex needed) and store them as integers.
movies['votes'] = movies['votes'].str.replace(',', '', regex=False).astype(int)

In [14]:
# Strip the leading '$' and trailing 'M' from the gross text, e.g.
# "$165.36M" -> "165.36". The vectorised .str methods replace the per-row
# lambda and pass missing values through instead of raising on them.
movies['us_grossMillions'] = (
    movies['us_grossMillions'].str.lstrip('$').str.rstrip('M')
)


In [16]:
# Convert the gross text to numbers; errors='coerce' turns anything that
# does not parse (such as the '-' placeholder used for a missing gross)
# into NaN instead of raising.
movies['us_grossMillions'] = pd.to_numeric(
    arg=movies['us_grossMillions'],
    errors='coerce',
)

In [17]:
# Write the cleaned table to disk as CSV (the row index is written too).
# NOTE(review): this is an absolute, user-specific path, so the cell only
# works on a machine where that directory exists — consider a configurable
# or relative output location.
movies.to_csv(path_or_buf='/users/lyanalexandr/movies.csv')