## Gathering data for Rotten Tomatoes top 100 Movies

In [2]:
import pandas as pd
import os
from bs4 import BeautifulSoup as BS

#### Reading in the file with all movies with critic ratings 

In [3]:
# Load the critic-ratings table. The file is tab-separated, so the
# delimiter is passed explicitly.
critic_tsv = 'bestofrt.tsv'
df = pd.read_csv(critic_tsv, delimiter='\t')
# Last expression of the cell: preview the first rows of the frame.
df.head()

Unnamed: 0,ranking,critic_score,title,number_of_critic_ratings
0,1,99,The Wizard of Oz (1939),110
1,2,100,Citizen Kane (1941),75
2,3,100,The Third Man (1949),77
3,4,99,Get Out (2017),282
4,5,97,Mad Max: Fury Road (2015),370


#### Reading the files acquired by web scraping to get all the viewers' ratings:

In [5]:
# Parse every saved movie page in `folder` into one record (title, audience
# score, number of audience ratings) and build the DataFrame once at the end.
df_list = []
folder = 'rt_html'
# os.listdir returns *every* directory entry in arbitrary order, so:
#   * sort the names to get a reproducible row order, and
#   * skip hidden entries and anything that is not a regular file
#     (e.g. .ipynb_checkpoints, .DS_Store), which would otherwise make
#     open()/parsing fail.
for movie_html in sorted(os.listdir(folder)):
    page_path = os.path.join(folder, movie_html)
    if movie_html.startswith('.') or not os.path.isfile(page_path):
        continue
    with open(page_path, encoding="utf8") as file:
        soup = BS(file, 'lxml')
        # extract title: first child of <title>, minus the last
        # len(' - Rotten Tomatoes') characters (presumably the site suffix)
        title = soup.find('title').contents[0][:-len(' - Rotten Tomatoes')]
        # extract audience score: text of the first <span> in the score
        # meter, minus its last character (presumably the '%' sign)
        audience_score = soup.find('div', class_ = 'audience-score meter').find('span').contents[0][:-1]
        # extract number of ratings: third child node of the second nested
        # <div>, with surrounding whitespace and thousands separators removed
        audience_info = soup.find('div', class_ =
             'audience-info hidden-xs superPageFontColor')
        num_audience_ratings = audience_info.find_all('div')[1].contents[2].strip().replace(',', '')
        # make a dictionary and append to the list
        df_list.append({'title': title,
                      'audience_score': int(audience_score),
                      'number_of_audience_ratings': int(num_audience_ratings)})
# Build the frame in one go; `columns` pins the column order.
df = pd.DataFrame(df_list, columns = ['title', 'audience_score', 'number_of_audience_ratings'])

#### Test the solution

In [None]:
# Load the reference answer. NOTE: read_pickle should only be used on
# trusted files, since unpickling can execute arbitrary code.
df_solution = pd.read_pickle('df_solution.pkl')

# Put both frames in the same row order (by title) with a fresh 0..n-1 index,
# so the comparison does not depend on the order the files were read in.
df = df.sort_values('title').reset_index(drop = True)
df_solution = df_solution.sort_values('title').reset_index(drop = True)

# Raises AssertionError if the scraped frame differs from the reference.
pd.testing.assert_frame_equal(df, df_solution)