In [15]:
import pandas as pd
from bs4 import BeautifulSoup
import os

In [185]:
#read in 2017 Rotten Tomatoes top 100 movie rankings, this file was provided
df_critic = pd.read_csv('bestofrt.tsv', sep='\t')

#display the first few rows
df_critic.head()

Unnamed: 0,ranking,critic_score,title,number_of_critic_ratings
0,1,99,The Wizard of Oz (1939),110
1,2,100,Citizen Kane (1941),75
2,3,100,The Third Man (1949),77
3,4,99,Get Out (2017),282
4,5,97,Mad Max: Fury Road (2015),370


The data set containing movie reviews was provided, but could have been accessed using web scraping. This would be difficult in this instance because the movie rankings change in time. 

To access HTML data and use it for web scraping:
```import requests

#url for desired HTML
url = 'https://www.rottentomatoes.com/m/et_the_extraterrestrial'
response = requests.get(url)

#now save HTML to a file
with open('et_the_extraterrestrial.html', mode = 'wb) as file:
    file.write(response.content)

#OR work with HTML in memory, using Beautiful Soup
from bs4 import BeautifulSoup
soup = BeautifulSoup(response.content, 'lxml')
```

In [173]:
#pass the file of interest (in this case the ET HTML file) as a file handle (in this case file)
with open('rt_html/the_cabinet_of_dr_caligari.html') as file:
    soup = BeautifulSoup(file, 'lxml')  #then pass that file handle (file) into the BeautifulSoup constructor, 'lxml' is the parser used for processing HTML in Python

#soup #display the HTML "soup" of our file (called soup)

In [14]:
#find the title of the page within the contents of the tags, 
#the contents become a list of length 1, so the string at index [0] is our data of interest, 
#then slice the string to yield only the title
soup.find('title').contents[0][:-len(' - Rotten Tomatoes')] 

#\xao is unicode for non-breaking space

'E.T. The Extra-Terrestrial\xa0(1982)'

In [183]:
#now, create a list of dictionaries of all the data desired from the HTML files
#this is the user data - I already have the critic info
df_list = [] #initialize the list - this will be a list of dictionaries
folder = 'rt_html' #filepath for folder containing HTML files

for movie in os.listdir(folder):
    with open(os.path.join(folder, movie)) as file:
        soup = BeautifulSoup(file, 'lxml')
        title = soup.find('title').contents[0][:-len(' - Rotten Tomatoes')]
        audience_score = int(soup.find_all('span', 'superPageFontColor', 'vertical-align:top')[-4].contents[0][:-1])
        users = str(soup.find('div', 'audience-info hidden-xs superPageFontColor').contents[-2])[len('<div>\n<span class="subtle superPageFontColor">User Ratings:</span>\n        '):-len('</div>')]
        audience_n = int(users.replace(',',''))
        df_list.append({'title': title, 'audience_score': audience_score, 'number_of_audience_ratings':audience_n})

In [186]:
#now make my list of dictionaries into a pandas dataframe
df_audience = pd.DataFrame(df_list, columns = ['title', 'audience_score', 'number_of_audience_ratings'])
df_audience

Unnamed: 0,title,audience_score,number_of_audience_ratings
0,12 Angry Men (Twelve Angry Men) (1957),97,103672
1,The 39 Steps (1935),86,23647
2,The Adventures of Robin Hood (1938),89,33584
3,All About Eve (1950),94,44564
4,All Quiet on the Western Front (1930),89,17768
...,...,...,...
95,Up (2009),90,1201878
96,Vertigo (1958),93,101454
97,The Wages of Fear (1953),95,8536
98,Wonder Woman (2017),90,112955


In [221]:
for i in range(0, len(df_audience)):
    df_audience.title[i] = df_audience.title[i].replace('\xa0',' ')

df_audience.title[0]

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_audience.title[i] = df_audience.title[i].replace('\xa0',' ')


'12 Angry Men (Twelve Angry Men) (1957)'

In [224]:
df = df_critic.set_index('title').join(df_audience.set_index('title'), how='outer')
df

Unnamed: 0_level_0,ranking,critic_score,number_of_critic_ratings,audience_score,number_of_audience_ratings
title,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
12 Angry Men (Twelve Angry Men) (1957),53.0,100.0,49.0,97.0,103672.0
12 Years a Slave (2013),29.0,96.0,316.0,90.0,138789.0
A Hard Day's Night (1964),22.0,98.0,104.0,89.0,50067.0
A Streetcar Named Desire (1951),60.0,98.0,54.0,90.0,54761.0
Alien (1979),48.0,97.0,104.0,94.0,457186.0
...,...,...,...,...,...
Toy Story 3 (2010),39.0,99.0,291.0,89.0,605098.0
Up (2009),52.0,98.0,286.0,90.0,1201878.0
Vertigo (1958),66.0,97.0,64.0,93.0,101454.0
Wonder Woman (2017),46.0,92.0,333.0,90.0,112955.0


In [233]:
df[df.isna().any(axis=1)]

Unnamed: 0_level_0,ranking,critic_score,number_of_critic_ratings,audience_score,number_of_audience_ratings
title,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
Army of Shadows (L'ArmÃ©e des ombres) (1969),,,,94.0,7011.0
Army of Shadows (L'Armée des ombres) (1969),57.0,97.0,73.0,,
RashÃ´mon (1951),,,,93.0,47657.0
Rashômon (1951),35.0,100.0,50.0,,
Tokyo Story (TÃ´kyÃ´ monogatari) (1953),,,,93.0,11325.0
Tokyo Story (Tôkyô monogatari) (1953),82.0,100.0,42.0,,
