# Web Scraping Cinema

Extracting movie schedules from movie theaters in Natal and gathering information about those movies from reference sites.

In [None]:
import pandas as pd
import requests
from bs4 import BeautifulSoup
import time
import re

In [None]:
# Pages for the sample movie (Wonder Woman, 2017), one per reference site
url_imdb = "http://www.imdb.com/title/tt0451279/?ref_=nv_sr_1"
url_tomates = "https://www.rottentomatoes.com/m/wonder_woman_2017"
url_filmow = "https://filmow.com/mulher-maravilha-t48706/"

# IMDb

In [None]:
# Download the IMDb title page and parse its HTML
response = requests.get(url_imdb)
content = response.content
soup = BeautifulSoup(content, 'html.parser')

# Movie title: text of the first <h1> carrying itemprop="name"
title_imd = soup.find_all("h1", attrs={"itemprop": "name"})[0].text

# Rating value and number of user ratings
rating_imdb = soup.find_all("span", attrs={"itemprop": "ratingValue"})[0].text
userRating_imdb = soup.find_all("span", attrs={"itemprop": "ratingCount"})[0].text

# Running time
time_imdb = soup.find_all("time", attrs={"itemprop": "duration"})[0].text

# Genres, flattened into one string in which every name (the last one
# included) is followed by ", "
genres = soup.find_all("span", attrs={"itemprop": "genre", "class": "itemprop"})
g_imdb = "".join(genre.text + ", " for genre in genres)

# Rotten Tomatoes

In [None]:
# Download the Rotten Tomatoes movie page and parse its HTML
response = requests.get(url_tomates)
content = response.content
soup = BeautifulSoup(content, 'html.parser')

### TOMATOMETER
# Score text, the stat rows inside the #scoreStats container, and the
# critics-consensus paragraph (kept as a tag, not as text)
rating_tmt = soup.find_all("span", attrs={"class": "meter-value superPageFontColor"})[0].text
score_stats = soup.find_all("div", attrs={"id": "scoreStats"})[0]
divs_tmt = score_stats.find_all("div", attrs={"class": "superPageFontColor"})
critic_tmt = soup.find_all("p", attrs={"class": "critic_consensus superPageFontColor"})[0]

### AUDIENCE SCORE
# NOTE(review): this takes the first <span> in the document that has the
# class "superPageFontColor"; confirm that it is the audience score and not
# the Tomatometer span selected above.
audience_tmt = soup.find_all("span", attrs={"class": "superPageFontColor"})[0].text
audience_info = soup.find_all("div", attrs={"class": "audience-info hidden-xs superPageFontColor"})[0]
divs_tmt_audience = audience_info.find_all("div")

# Synopsis container (kept as a tag, not as text)
movie_synopsis_tmt = soup.find_all("div", attrs={"id": "movieSynopsis"})[0]

# Filmow

In [None]:
# Download the Filmow movie page and parse its HTML
response = requests.get(url_filmow)
content = response.content
soup = BeautifulSoup(content, 'html.parser')

# Rating value and number of votes
rating_filmow = soup.find_all("span", attrs={"itemprop": "ratingValue"})[0].text
ratingCount_filmow = soup.find_all("span", attrs={"itemprop": "ratingCount"})[0].text

# Cast: every <li> inside the <ul id="casting"> list
casting_list = soup.find_all("ul", attrs={"id": "casting"})[0]
lis_filmow = casting_list.find_all("li")

In [None]:
# Print a text report with the values scraped from the three sites.

# IMDB
print("Title: " + title_imd.strip())
print("Rating (IMDB): " + rating_imdb + "/10 (" + userRating_imdb + " users rating)")
print("Time: " + time_imdb.strip())
# g_imdb ends with a trailing ", " separator; drop those two characters
print("Genres: " + g_imdb[0:-2])

# Rotten Tomatoes
print("\n----- TOMATOMETER -----")
print("Rating: " + rating_tmt)
for div in divs_tmt:
    spans = div.find_all("span")
    if len(spans) == 2:
        # Row made of two spans: print their texts back to back
        print(spans[0].text + spans[1].text)
    else:
        print(div.text.strip())
# Use `.text` for the paragraph content. Any other attribute name on a
# BeautifulSoup tag is treated as a child-tag lookup and yields None.
print(critic_tmt.text.strip())
print("\n----- AUDIENCE SCORE -----")
print("Audience rating: " + audience_tmt)
for div in divs_tmt_audience:
    print(div.text.strip())
print("\n----- Movie Info -----")
print("Synopsis: " + movie_synopsis_tmt.text)

# Filmow
print("Rating (Filmow): " + rating_filmow + " (Based on " + ratingCount_filmow + " votes)")
print("\n----- Casting -----")
for li in lis_filmow:
    actor = li.find_all("span", itemprop="name")[0].text
    # The role is optional: it is read from an <em> tag when the entry has one
    role = li.find_all("em")
    if role:
        print(actor + " - " + role[0].text)
    else:
        print(actor)

# Getting movie theater schedule data

In [None]:
# Put the current working directory on sys.path so that the local `lib`
# package can be imported below.

import os
import sys

module_path = os.path.abspath('./')

if module_path not in sys.path:
    sys.path.append(module_path)

from lib.movie_theater_schedule.cinepolis_schedule import CinepolisSchedule
from lib.movie_theater_schedule.cinemark_schedule import CinemarkSchedule

## Cinépolis

In [None]:
def generate_cinepolis_soup(cod_cinema, cod_claquete):
    """Request a Cinépolis schedule page and return it parsed.

    Sends a POST to the Cinépolis schedule AJAX endpoint with the given codes,
    the current local date formatted as ``YYYY-MM-DD`` and ``cod_filme='0'``.

    Args:
        cod_cinema: value sent as the ``cod_cinema`` form field.
        cod_claquete: value sent as the ``cod_claquete`` form field.

    Returns:
        BeautifulSoup: the response body parsed with ``html.parser``.
    """
    endpoint = 'http://www.cinepolis.com.br/programacao/ajax/ajax.conteudo_horarios.php'
    today = time.strftime("%Y-%m-%d")
    form_fields = dict(
        cod_cinema=cod_cinema,
        cod_claquete=cod_claquete,
        cod_horario=today,
        cod_filme='0',
    )
    reply = requests.post(endpoint, data=form_fields)
    return BeautifulSoup(reply.content, 'html.parser')

In [None]:
def get_cinepolis_schedule_data(soup, movie_theater):
    """Build a DataFrame with one row per movie entry of a Cinépolis schedule.

    Only table rows that contain an element with a non-empty ``data-order``
    attribute are treated as movie entries.

    Args:
        soup: parsed schedule page (BeautifulSoup object) that may contain a
            ``<table class="tabelahorarios">``.
        movie_theater: value copied into the ``movie_theater`` column of
            every row.

    Returns:
        pandas.DataFrame with the columns ``room`` (text of the first cell),
        ``title`` (text of the first link to a film page), ``tags`` (any of
        'VIP', 'MacroXE', '3D' joined with ' / '), ``content_rating`` (``alt``
        of the image in the third cell), ``category`` (``aria-label`` of the
        first span in the ``horarios`` cell), ``schedules`` (texts matched by
        ``span + span, a`` in that cell, joined with ' / ') and
        ``movie_theater``. The frame is empty, with the same columns, when
        the page has no schedule table or no movie rows.
    """
    columns = ['room', 'title', 'tags', 'content_rating', 'category',
               'schedules', 'movie_theater']

    table = soup.find("table", {"class": "tabelahorarios"})
    if table is None:
        # No schedule table in the response: nothing to extract
        return pd.DataFrame(columns=columns)

    # Compiled once instead of once per row
    data_order = re.compile(r'\w+')
    movie_link = re.compile('http://www.cinepolis.com.br/filmes/filme.php')
    # (CSS class of the icon anchor, label stored in the `tags` column)
    icon_labels = (('icovip', 'VIP'), ('icomacroxe', 'MacroXE'), ('ico3d', '3D'))

    rows_list = []
    for entry in table.find_all('tr'):
        if not entry.find(attrs={'data-order': data_order}):
            continue

        cells = entry.find_all('td')
        times_cell = entry.find("td", {"class": "horarios"})
        tags = [label for css_class, label in icon_labels
                if entry.find("a", {"class": css_class})]

        rows_list.append({
            'room': cells[0].text,
            'title': entry.find_all(href=movie_link)[0].text,
            'tags': ' / '.join(tags),
            'content_rating': cells[2].find('img').get('alt'),
            'category': times_cell.find('span').get('aria-label'),
            'schedules': ' / '.join(s.text for s in times_cell.select("span + span, a")),
            'movie_theater': movie_theater,
        })

    return pd.DataFrame(rows_list, columns=columns)

### Cinépolis Natal Shopping

In [None]:
# Fetch the schedule page for cinema code '31' / claquete code '769' and turn
# it into a DataFrame labelled 'Cinépolis Natal Shopping'
cinepolis_natal_soup = generate_cinepolis_soup(cod_cinema='31', cod_claquete='769')
cinepolis_natal_schedule = CinepolisSchedule(
    cinepolis_natal_soup,
    'Cinépolis Natal Shopping',
)
cinepolis_natal_schedule_df = cinepolis_natal_schedule.get_dataframe()
cinepolis_natal_schedule_df

### Cinépolis Partage Norte Shopping Natal

In [None]:
# Fetch the schedule page for cinema code '33' / claquete code '770' and turn
# it into a DataFrame labelled 'Cinépolis Partage Norte Shopping Natal'
cinepolis_norte_soup = generate_cinepolis_soup(cod_cinema='33', cod_claquete='770')
cinepolis_norte_schedule = CinepolisSchedule(
    cinepolis_norte_soup,
    'Cinépolis Partage Norte Shopping Natal',
)
cinepolis_norte_schedule_df = cinepolis_norte_schedule.get_dataframe()
cinepolis_norte_schedule_df

## Cinemark

In [None]:
# Download the Cinemark Natal page, parse it and turn it into a DataFrame
# labelled 'Cinemark Midway Mall Natal'
response = requests.get('https://www.cinemark.com.br/natal/cinemas')
cinemark_soup = BeautifulSoup(response.content, 'html.parser')
cinemark_schedule = CinemarkSchedule(
    cinemark_soup,
    'Cinemark Midway Mall Natal',
)
cinemark_schedule_df = cinemark_schedule.get_dataframe()
cinemark_schedule_df

## Movie theaters schedule data

In [None]:
# Stack the three per-theater schedules into a single DataFrame
# (each frame keeps its own index values)
frames = [
    cinepolis_natal_schedule_df,
    cinepolis_norte_schedule_df,
    cinemark_schedule_df,
]
movie_theaters_schedule_data = pd.concat(frames)

In [None]:
# Display one randomly selected row of the combined schedule
movie_theaters_schedule_data.sample()

# Movies data on IMDB

In [None]:
# One row per movie title, holding the array of distinct `movie_url` values
# found for that title in the combined schedule
movies_data = movie_theaters_schedule_data.groupby('title')['movie_url'].unique().reset_index()

# Placeholder column for the IMDb URL of each movie. dtype=object because the
# column is filled with strings later; without an explicit dtype the
# placeholder may be created as float64.
imdb_url_series = pd.Series(None, index=movies_data.index, name='imdb_url', dtype=object)
movies_data = movies_data.join(imdb_url_series)
movies_data

## Obtain the movie cast from the original theater site

In [None]:
def get_cinemark_movie_cast(soup):
    """Extract the cast listed on a Cinemark movie page.

    Looks for the first ``<div class="detail-title">`` whose first
    ``<strong>`` contains 'Elenco' and reads the comma-separated names that
    follow 'Elenco: ' in that div's text.

    Args:
        soup: parsed Cinemark movie page (BeautifulSoup object).

    Returns:
        list of str with at most the first two words of each actor's name, or
        None when the page has no usable 'Elenco' entry.
    """
    for item in soup.find_all('div', {'class': 'detail-title'}):
        labels = item.select('strong')
        # Skip detail rows without a label or with a different label
        if not labels or 'Elenco' not in labels[0].text:
            continue

        parts = item.get_text().split('Elenco: ')
        if len(parts) < 2:
            return None

        actors = parts[1].strip().split(', ')
        # Keep only the first two words of every name
        return [' '.join(actor.split(' ')[:2]) for actor in actors]

    return None

def get_cinepolis_movie_cast(soup):
    """Extract up to three cast names from a Cinépolis movie page.

    Takes the text of the first element matching ``.conteudo > .direita``,
    removes line breaks and runs of indentation, and reads the
    comma-separated text found between 'Elenco' and 'Roteiro'.

    Args:
        soup: parsed Cinépolis movie page (BeautifulSoup object).

    Returns:
        list of str with the first three comma-separated entries, stripped,
        or None when the details section or the 'Elenco ... Roteiro' span is
        missing.
    """
    sections = soup.select('.conteudo > .direita')
    if not sections:
        return None

    content = sections[0].get_text()
    # Drop any whitespace character that is followed by one or more spaces
    content = re.sub(r'[\s][ ]+', '', content)
    # Remove the remaining line breaks so that '.' in the search below can
    # span the whole text
    content = content.replace('\r\n', '').replace('\n', '')

    m = re.search('Elenco(.*)Roteiro', content)
    if m is None:
        return None

    return [name.strip() for name in m.group(1).split(',')[:3]]

## Find the movie's IMDb URL using title and cast as reference

In [None]:
# For every movie: read its cast from the theater's own page, search IMDb by
# title, and keep the first search result whose page lists all of those cast
# names. Movies without a match get an empty string.
for index, row in movies_data.iterrows():
    url = row.movie_url[0]
    title = row.title

    theater_response = requests.get(url)
    theater_soup = BeautifulSoup(theater_response.content, 'html.parser')
    print('Searching IMDB url for: ' + title)

    # The cast parser depends on which theater chain the URL belongs to
    if 'cinepolis' in url:
        cast = get_cinepolis_movie_cast(theater_soup)
    else:
        cast = get_cinemark_movie_cast(theater_soup)

    imdb_url = ''

    # Without a cast there is nothing to compare the search results against
    if cast:
        # Let requests build the query string so that spaces, accents and
        # characters such as '&' or '#' in the title are properly escaped
        page = requests.get('http://www.imdb.com/find', params={'q': title, 's': 'tt'})
        soup = BeautifulSoup(page.content, 'html.parser')

        for link in soup.select('.findResult .result_text > a'):
            movie_url = 'http://www.imdb.com' + link.get('href')
            movie_page = requests.get(movie_url)
            movie_soup = BeautifulSoup(movie_page.content, 'html.parser')

            actors_tags = movie_soup.find_all('span', {'itemprop': 'name'})
            cast_imdb = {actor.text for actor in actors_tags}
            if all(actor in cast_imdb for actor in cast):
                imdb_url = movie_url
                # First matching result wins; skip the remaining candidates
                break

    movies_data.loc[index, 'imdb_url'] = imdb_url

In [None]:
# Display movies_data as the cell output
movies_data

## Add movie data using IMDB

In [None]:
# Add one placeholder column per IMDb field. dtype=object because the columns
# are filled with strings later; without an explicit dtype the placeholders
# may be created as float64.
for serie_name in ['original_title', 'rating', 'time', 'genres']:
    series = pd.Series(None, index=movies_data.index, name=serie_name, dtype=object)
    movies_data = movies_data.join(series)

In [None]:
def _first_text(soup, tag_name, itemprop):
    """Return the text of the first `tag_name` tag with this itemprop, or ''."""
    matches = soup.find_all(tag_name, itemprop=itemprop)
    return matches[0].text if matches else ''


# Fill original title, rating, running time and genres from the IMDb page of
# every movie that has an IMDb URL.
for index, row in movies_data.iterrows():
    url = row.imdb_url

    # Rows without a resolved IMDb URL hold '' (or a missing value): skip them
    if not isinstance(url, str) or 'imdb' not in url:
        continue

    movie_response = requests.get(url)
    movie_soup = BeautifulSoup(movie_response.content, 'html.parser')

    original_title = _first_text(movie_soup, "h1", "name")
    rating = _first_text(movie_soup, "span", "ratingValue")
    # Deliberately not called `time`: a variable with that name would replace
    # the `time` module that generate_cinepolis_soup relies on
    duration = _first_text(movie_soup, "time", "duration")

    genres = movie_soup.find_all("span", itemprop="genre", class_="itemprop")
    # Comma-separated genre names, without a trailing separator
    genres_str = ", ".join(genre.text for genre in genres)

    movies_data.loc[index, 'original_title'] = original_title
    movies_data.loc[index, 'rating'] = rating
    movies_data.loc[index, 'time'] = duration
    movies_data.loc[index, 'genres'] = genres_str

In [None]:
# Display movies_data as the cell output
movies_data

# Developers

- Álvaro Ferreira - [github.com/alvarofpp](https://github.com/alvarofpp)
- Gabriel Ribeiro - [github.com/Bib7/](https://github.com/Bib7/)
- Kaio Max - [github.com/kaiomax](https://github.com/kaiomax)

# References


- [https://www.crummy.com/software/BeautifulSoup/bs4/doc/][1]
- [https://imasters.com.br/desenvolvimento/aprendendo-sobre-web-scraping-em-python-utilizando-beautifulsoup/?trace=1519021197&source=single][2]
- [http://docs.python-requests.org/en/master/user/quickstart/#more-complicated-post-requests][3]
- [http://akul.me/blog/2016/beautifulsoup-cheatsheet/][4]
- [https://pymotw.com/2/abc/][5]

[1]: https://www.crummy.com/software/BeautifulSoup/bs4/doc/
[2]: https://imasters.com.br/desenvolvimento/aprendendo-sobre-web-scraping-em-python-utilizando-beautifulsoup/?trace=1519021197&source=single
[3]: http://docs.python-requests.org/en/master/user/quickstart/#more-complicated-post-requests
[4]: http://akul.me/blog/2016/beautifulsoup-cheatsheet/
[5]: https://pymotw.com/2/abc/