In [None]:
import pandas as pd
import requests
from bs4 import BeautifulSoup
import time
import re

In [None]:
# Pages to scrape, one per review site (the URL slugs all point at Wonder Woman, 2017).
# NOTE: `url_filmow` is defined here but no visible cell fetches it yet.
url_imdb = 'http://www.imdb.com/title/tt0451279/?ref_=nv_sr_1'
url_tomates = 'https://www.rottentomatoes.com/m/wonder_woman_2017'
url_filmow = 'https://filmow.com/mulher-maravilha-t48706/'

# IMDb

In [None]:
# Download the IMDb page and set up a new parser
response = requests.get(url_imdb)
content = response.content
soup = BeautifulSoup(content, 'html.parser')

# Get title
title = soup.find_all("h1", itemprop="name")[0].text
print("Title: " + title)

# Get rating
rating = soup.find_all("span", itemprop="ratingValue")[0].text
userRating = soup.find_all("span", itemprop="ratingCount")[0].text
print("Rating: " + rating + "/10 (" + userRating + " users rating)")

# Get running time.
# Deliberately NOT stored in a variable called `time`: that would rebind the
# imported `time` module at notebook scope and break the later
# `time.strftime(...)` call inside `generate_cinepolis_soup`.
duration = soup.find_all("time", itemprop="duration")[0].text
print("Time: " + duration.strip())

# Get genres (comma-separated; prints an empty list as "Genres: ")
genres = soup.find_all("span", itemprop="genre", class_="itemprop")
print("Genres: " + ", ".join(genre.text for genre in genres))

# Rotten Tomatoes

In [None]:
# Download the Rotten Tomatoes page and build the parser
response = requests.get(url_tomates)
content = response.content
soup = BeautifulSoup(content, 'html.parser')

# Movie title
title = soup.find_all("h1", class_="title hidden-xs")[0].text
print("Title: " + title.strip())

### TOMATOMETER
print("\n----- TOMATOMETER -----")
# Critics' score
rating = soup.find_all("span", class_="meter-value superPageFontColor")[0].text
print("Rating: " + rating)

# Score statistics: a row holding exactly two <span>s is printed as the two
# span texts concatenated; any other row is printed as its stripped text.
score_stats = soup.find_all("div", id="scoreStats")[0]
divs = score_stats.find_all("div", class_="superPageFontColor")
for div in divs:
    spans = div.find_all("span")
    if len(spans) != 2:
        print(div.text.strip())
    else:
        print(spans[0].text + spans[1].text)

# Critics' consensus paragraph
critic = soup.find_all("p", class_="critic_consensus superPageFontColor")[0]
print(critic.text)

### AUDIENCE SCORE
print("\n----- AUDIENCE SCORE -----")
# NOTE(review): this picks the first <span> on the whole page that carries the
# `superPageFontColor` class, which may not be the audience score element —
# confirm against the page markup.
audience = soup.find_all("span", class_="superPageFontColor")[0].text
print("Rating: " + audience)

audience_info = soup.find_all("div", class_="audience-info hidden-xs superPageFontColor")[0]
divs = audience_info.find_all("div")
for div in divs:
    print(div.text.strip())

print("\n----- Movie Info -----")
# Synopsis
movie_synopsis = soup.find_all("div", id="movieSynopsis")[0]
print("Synopsis: " + movie_synopsis.text)
# Box office / metadata: still exploratory — prints the raw <ul> result set and
# then each <ul> element; the individual <li> items are not extracted yet.
media_body = soup.find_all("div", class_="media-body")[0]
meta_row = media_body.find_all("ul", class_="content-meta info")
print(meta_row)
for meta in meta_row:
    print(meta)

# Filmow (TODO: scraping not implemented yet)

# Getting movie theater schedule data

## Cinépolis

In [None]:
def generate_cinepolis_soup(cod_cinema, cod_claquete):
    """Request today's Cinépolis schedule and return it as a parsed soup.

    Posts a form to the Cinépolis schedule AJAX endpoint and parses the
    response body with the 'html.parser' backend.

    Args:
        cod_cinema: Value sent as the `cod_cinema` form field.
        cod_claquete: Value sent as the `cod_claquete` form field.

    Returns:
        A BeautifulSoup object built from the response content.
    """
    # Imported locally on purpose: other cells in this notebook rebind the
    # global name `time` (to a string / to a loop variable), which would make
    # `time.strftime` fail here. The local import always yields the module.
    import time

    body = {
        'cod_cinema'   : cod_cinema,
        'cod_claquete' : cod_claquete,
        'cod_horario'  : time.strftime("%Y-%m-%d"),  # current local date
        'cod_filme'    : '0'
    }
    page = requests.post('http://www.cinepolis.com.br/programacao/ajax/ajax.conteudo_horarios.php', data = body)
    return BeautifulSoup(page.content, 'html.parser')

In [None]:
# Helper
def has_movie(row):
    """Return the first element under `row` that has a matching `data-order` attribute.

    The attribute value is matched against the regex `\\w+`, i.e. it must
    contain at least one word character.

    Args:
        row: An object exposing a BeautifulSoup-style `find(attrs=...)` method.

    Returns:
        Whatever `row.find` returns: the matching element, or None when there
        is no match (so the result can be used directly as a truth value).
    """
    # Raw string: '\w' in a normal string literal is an invalid escape sequence.
    return row.find(attrs={'data-order': re.compile(r'\w+')})

def get_cinepolis_schedule_data(soup, movie_theater):
    """Turn a parsed Cinépolis schedule page into a DataFrame, one row per listing.

    Args:
        soup: Parsed schedule page (as returned by `generate_cinepolis_soup`).
        movie_theater: Label stored in the `movie_theater` column of every row.

    Returns:
        A pandas DataFrame with the columns `room`, `title`, `tags`,
        `content_rating`, `category`, `schedules` and `movie_theater`.
        The frame is empty (but keeps those columns) when the page has no
        `tabelahorarios` table or no row that `has_movie` accepts.
    """
    columns = ['room', 'title', 'tags', 'content_rating',
               'category', 'schedules', 'movie_theater']

    table = soup.find("table", { "class" : "tabelahorarios" })
    if table is None:
        # No schedule table in the response: return an empty, well-formed
        # frame instead of failing with AttributeError on `None.find_all`.
        return pd.DataFrame(columns=columns)

    # Escaped so the dots in the URL match literally rather than "any character".
    movie_link = re.compile(re.escape('http://www.cinepolis.com.br/filmes/filme.php'))

    # CSS class of the marker <a> icon -> label written to the `tags` column.
    tag_labels = (
        ('icovip', 'VIP'),
        ('icomacroxe', 'MacroXE'),
        ('ico3d', '3D'),
    )

    rows_list = []

    for entry in table.find_all('tr'):
        if not has_movie(entry):
            continue

        cells = entry.find_all('td')
        schedule_cell = entry.find("td", { "class" : "horarios" })

        title = entry.find_all(href=movie_link)[0].text
        room = cells[0].text

        tags = ' / '.join(
            label
            for css_class, label in tag_labels
            if entry.find("a", { "class" : css_class })
        )

        # The rating is read from the alt text of the image in the third cell.
        content_rating = cells[2].find('img').get('alt')

        # The first <span> of the schedule cell carries the category in its
        # aria-label; the remaining spans/links are the individual showtimes.
        category = schedule_cell.find('span').get('aria-label')
        schedules_tags = schedule_cell.select("span + span, a")
        schedules = ' / '.join([s.text for s in schedules_tags])

        rows_list.append({
            'room': room,
            'title': title,
            'tags': tags,
            'content_rating': content_rating,
            'category': category,
            'schedules': schedules,
            'movie_theater': movie_theater
        })

    return pd.DataFrame(rows_list, columns=columns)

### Cinépolis Natal Shopping

In [None]:
# Fetch the current-date schedule for cod_cinema '31' / cod_claquete '769'
# and tabulate it, labelling every row with this theater's name.
cinepolis_natal_shopping_soup = generate_cinepolis_soup('31', '769')
cinepolis_natal_shopping_df = get_cinepolis_schedule_data(cinepolis_natal_shopping_soup, 'Cinépolis Natal Shopping')
# Bare expression: shown as the cell's output.
cinepolis_natal_shopping_df

### Cinépolis Partage Norte Shopping Natal

In [None]:
# Fetch the current-date schedule for cod_cinema '33' / cod_claquete '770'
# and tabulate it, labelling every row with this theater's name.
cinepolis_norte_shopping_soup = generate_cinepolis_soup('33', '770')
cinepolis_norte_shopping_df = get_cinepolis_schedule_data(cinepolis_norte_shopping_soup, 'Cinépolis Partage Norte Shopping Natal')
# Bare expression: shown as the cell's output.
cinepolis_norte_shopping_df

## Cinemark

In [None]:
# Download the Cinemark Natal cinemas page and parse it.
# Note: this rebinds the notebook-level `response` and `soup` used by earlier cells.
response = requests.get('https://www.cinemark.com.br/natal/cinemas')
soup = BeautifulSoup(response.content, 'html.parser')

In [None]:
# Take the first active tab of the "times" section; the [0] raises IndexError
# if the selector matches nothing.
table = soup.select('div.section.times div.tabs-content > div.active')[0]
# Each `div.theater` inside that tab is iterated as one entry below.
rows = table.find_all('div', { 'class': 'theater' })

# Alias of `rows`; the scraping loop iterates over `entries`.
entries = rows

# Accumulates one dict per listing; turned into a DataFrame after the loop.
rows_list = []

def has_content_rating_data(tag):
    """Tell whether `tag` is the detail item that holds the content rating.

    Args:
        tag: An element exposing a BeautifulSoup-style `select` method.

    Returns:
        True when the first `<strong>` inside `tag` contains the text
        'Classificação'; False otherwise, including when `tag` has no
        `<strong>` at all (previously that case raised IndexError).
    """
    strong_tags = tag.select('strong')
    return bool(strong_tags) and 'Classificação' in strong_tags[0].text

# Build one row per (movie, auditorium) listing on the Cinemark page.
for entry in entries:
    title_tag = entry.select('h3.title > a')[0]
    title = title_tag.text
    movie_url = 'https://www.cinemark.com.br' + title_tag.get('href')

    # The content rating is read from the movie's own detail page, so this
    # issues one extra HTTP request per movie.
    movie_response = requests.get(movie_url)
    movie_soup = BeautifulSoup(movie_response.content, 'html.parser')
    detail_items = movie_soup.find_all('div', { 'class' : 'detail-title' })
    rating_items = [item for item in detail_items if has_content_rating_data(item)]
    if rating_items:
        content_rating = rating_items[0].get_text().split('Classificação: ')[1].strip()
    else:
        # A movie page without a rating item no longer aborts the whole scrape.
        content_rating = ''

    times = entry.select('ul.theater-times > li')

    # The loop variable is named `session`, not `time`: using `time` would
    # rebind the imported `time` module at notebook scope.
    for session in times:
        room = session.find('span', { 'class' : 'times-auditorium' }).text

        schedules_tags = session.find('ul', { 'class' : 'times-options' }).select('li > span')
        schedules = ' / '.join([s.text for s in schedules_tags])

        tags = []
        if(session.find('span', { 'class' : 'label-dbox' })):
            tags.append('DBOX')
        if(session.find('span', { 'class' : 'label-3d' })):
            tags.append('3D')
        tags = ' / '.join(tags)

        # Later checks override earlier ones when several labels are present.
        category = ''
        if(session.find('span', { 'class' : 'label-leg' })):
            category = 'Legendado'
        if(session.find('span', { 'class' : 'label-dub' })):
            category = 'Dublado'
        if(session.find('span', { 'class' : 'label-orig' })):
            # NOTE(review): this branch used to assign 'Dublado', an apparent
            # copy-paste of the branch above — confirm 'Original' is the
            # intended label for `label-orig`.
            category = 'Original'

        row_dict = {
            'room': room,
            'title': title,
            'tags': tags,
            'content_rating': content_rating,
            'category': category,
            'schedules': schedules,
            'movie_theater': 'Cinemark Midway Mall Natal'
        }
        rows_list.append(row_dict)

cinemark_midway_df = pd.DataFrame(rows_list)

In [None]:
# Bare expression: shows the Cinemark schedule DataFrame as the cell's output.
cinemark_midway_df