# TESTE DE WEB SCRAPPING 

Endereço web: https://www.imdb.com/search/title/?release_date=2019&sort=num_votes,desc&page=1

Capturar os seguintes campos em uma primeira rodada:

- Nome do filme
- Ano do filme
- Nota do filme
- Rating do filme
- Número de votantes

Depois desta primeira ação filtrar por ano e paginar por um determinado número de páginas (dados pelo usuário).

In [28]:
import pandas as pd
from bs4 import BeautifulSoup
from requests import get
from time import sleep
from random import randint

# Determinando a url

In [2]:
url = "https://www.imdb.com/search/title/?release_date=2019&sort=num_votes,desc&page=1"

In [3]:
get(url)

<Response [200]>

### Capturando a Url com GET

In [4]:
imdb = get(url).content

### Transformando o conteúdo em HTML

In [5]:
conteudo = BeautifulSoup(imdb, 'html.parser')

# Identificando a posição inicial dos filmes 

In [6]:
movie_containers = conteudo.find_all('div', class_ ='lister-item mode-advanced')

### Verificando quantos itens foram coletados

In [7]:
print(len(movie_containers))

50


# Testando as posições dos itens a serem coletados

In [8]:
first_movie=movie_containers[0]

In [9]:
first_movie.h3.a.text.strip() #nome do filme

'Vingadores: Ultimato'

In [10]:
first_movie.h3.find('span', class_='lister-item-year text-muted unbold').text.strip() #ano do filme

'(2019)'

In [11]:
float(first_movie.strong.text.strip()) #nota do filme

8.7

In [12]:
int(first_movie.find('span', class_ = 'metascore favorable').text.strip()) #rating do filme

78

In [13]:
int(first_movie.find('span', attrs={'name':'nv'})['data-value']) #número de votantes

463711

# Função para coletar os dados - sem paginação

In [40]:
names = []
years = []
imdb_ratings = []
metascores = []
votes = []

for container in movie_containers:
    name = container.h3.a.text.strip()
    names.append(name)
    
    year = container.h3.find('span', class_='lister-item-year text-muted unbold').text.strip().replace("(","").replace(")","")
    years.append(year)
  
    imdb_rating = float(container.strong.text.strip())
    imdb_ratings.append(imdb_rating)
    
    vote = int(container.find('span', attrs={'name':'nv'})['data-value'])
    votes.append(vote)
        
    if container.find('div', class_ = 'ratings-metascore') is not None: 
        metascore = int(container.find('span', class_ = 'metascore').text.strip())
        metascores.append(metascore)
    else:
        metascore = 0
        metascores.append(metascore)
    
        

# Criando um DataFrame com os dados coletados

In [41]:
df = pd.DataFrame({'nome':names, 
                   'ano':years,
                   'score': imdb_ratings,
                   'metascore': metascores,
                   'votos':votes })

In [42]:
df.head(5)

Unnamed: 0,nome,ano,score,metascore,votos
0,Vingadores: Ultimato,2019,8.7,78,463711
1,Capitã Marvel,2019,7.0,64,315738
2,Chernobyl,2019,9.6,0,288233
3,Game of Thrones,2011–2019,4.2,0,207502
4,Game of Thrones,2011–2019,7.6,0,193703


# Paginando

### Determinando as quantidades de páginas e o anos

In [17]:
paginas = [str(i) for i in range(1,6)]
paginas

['1', '2', '3', '4', '5']

In [20]:
anos = [str(i) for i in range(2017, 2020)]
anos

['2017', '2018', '2019']

### Testando as variáveis com a URL

In [24]:
for y in anos:
    for page in paginas:
        response = get('https://www.imdb.com/search/title/?release_date=' + y + '&sort=num_votes,desc&page='+page)
        print(response, page, 'https://www.imdb.com/search/title/?release_date=' + y + '&sort=num_votes,desc&page='+page)

<Response [200]> 1 https://www.imdb.com/search/title/?release_date=2017&sort=num_votes,desc&page=1
<Response [200]> 2 https://www.imdb.com/search/title/?release_date=2017&sort=num_votes,desc&page=2
<Response [200]> 3 https://www.imdb.com/search/title/?release_date=2017&sort=num_votes,desc&page=3
<Response [200]> 4 https://www.imdb.com/search/title/?release_date=2017&sort=num_votes,desc&page=4
<Response [200]> 5 https://www.imdb.com/search/title/?release_date=2017&sort=num_votes,desc&page=5
<Response [200]> 1 https://www.imdb.com/search/title/?release_date=2018&sort=num_votes,desc&page=1
<Response [200]> 2 https://www.imdb.com/search/title/?release_date=2018&sort=num_votes,desc&page=2
<Response [200]> 3 https://www.imdb.com/search/title/?release_date=2018&sort=num_votes,desc&page=3
<Response [200]> 4 https://www.imdb.com/search/title/?release_date=2018&sort=num_votes,desc&page=4
<Response [200]> 5 https://www.imdb.com/search/title/?release_date=2018&sort=num_votes,desc&page=5
<Response 

# Coletando com as variáveis de páginas e anos

In [43]:
names = []
years = []
imdb_ratings = []
metascores = []
votes = []

for y in anos:
    for page in paginas:
        
        response = get('https://www.imdb.com/search/title/?release_date=' + y + '&sort=num_votes,desc&page='+page)
        sleep(randint(10,30))
        page_html = BeautifulSoup(response.content, 'html.parser')
        movie_containers = conteudo.find_all('div', class_ ='lister-item mode-advanced')
        
        for container in movie_containers:
            name = container.h3.a.text.strip()
            names.append(name)

            year = container.h3.find('span', class_='lister-item-year text-muted unbold').text.strip().replace("(","").replace(")","")
            years.append(year)

            imdb_rating = float(container.strong.text.strip())
            imdb_ratings.append(imdb_rating)
            
            vote = int(container.find('span', attrs={'name':'nv'})['data-value'])
            votes.append(vote)
            
            if container.find('div', class_ = 'ratings-metascore') is not None:
                metascore = int(container.find('span', class_ = 'metascore').text.strip())
                metascores.append(metascore)
            else:
                metascore = 0
                metascores.append(metascore)
            

# Criando o DataFrame Paginado

In [44]:
df_paginado = pd.DataFrame({'nome':names,
                   'ano':years, 
                   'imdb_ratings': imdb_ratings, 
                   'metascore': metascores,
                   'qtd_votos':votes
                  })

In [45]:
df_paginado.head()

Unnamed: 0,nome,ano,imdb_ratings,metascore,qtd_votos
0,Vingadores: Ultimato,2019,8.7,78,463711
1,Capitã Marvel,2019,7.0,64,315738
2,Chernobyl,2019,9.6,0,288233
3,Game of Thrones,2011–2019,4.2,0,207502
4,Game of Thrones,2011–2019,7.6,0,193703


In [46]:
df_paginado.size

3750