### Import the necessary modules for web scraping

In [1]:
import os

import pandas as pd
import requests
from bs4 import BeautifulSoup

### Get the movie paths from the 'Top 250 Movies' page

In [2]:
# URL pieces: a per-movie ratings URL is built later as
# base_url + <movie path> + rating_path.
base_url = 'https://www.imdb.com/'
top_250_url = 'https://www.imdb.com/chart/top'
rating_path = 'ratings?ref_=tt_ov_rt'

# Without a timeout requests can wait forever on a stalled connection, and
# without a status check an error page would be parsed as if it were the
# chart and silently produce an empty movie list.
response = requests.get(top_250_url, timeout=30)
response.raise_for_status()
soup = BeautifulSoup(markup=response.content, features='html.parser')

# One (link text, href) pair per anchor directly inside a '.titleColumn' cell.
movies = [
    (link.get_text(), link.get('href'))
    for link in soup.select('.titleColumn > a')
]

### Construct two dataframes that hold the rating means and rating counts

In [3]:
# Column labels for both dataframes. Scraped cell values are assigned to these
# labels positionally, so the order here must match the order of the cells in
# the page's rating table.
cols = [
    'All', 'All(<18)', 'All(18-29)', 'All(30-44)', 'All(45+)',
    'Male', 'Male(<18)', 'Male(18-29)', 'Male(30-44)', 'Male(45+)',
    'Female', 'Female(<18)', 'Female(18-29)', 'Female(30-44)', 'Female(45+)'
]

# Collect one named Series per movie and build each dataframe once at the end.
# DataFrame.append was removed in pandas 2.0, and it copied the whole frame on
# every call.
rating_rows = []
dist_rows = []

for movie_name, movie_path in movies:

    # Get the rating table in the page. Slashes are stripped at the seam so
    # the URL has exactly one '/' between the host and the movie path.
    ratings_url = base_url.rstrip('/') + '/' + movie_path.lstrip('/') + rating_path
    response = requests.get(ratings_url, timeout=30)
    response.raise_for_status()  # fail here rather than parse an error page
    soup = BeautifulSoup(markup=response.content, features='html.parser')
    rating_table = soup.find_all('table')[1]  # second <table> on the page

    # Add rating means to the rating rows

    row = [e.get_text().strip() for e in rating_table.select('.bigcell')]
    rating_rows.append(pd.Series(data=row, index=cols, name=movie_name))

    # Add rating counts to the distribution rows

    row = [e.get_text().strip() for e in rating_table.select('.smallcell > a')]
    dist_rows.append(pd.Series(data=row, index=cols, name=movie_name))

# Each Series name (the movie name) becomes that row's index label.
df_rating = pd.DataFrame(rating_rows, columns=cols)
df_dist = pd.DataFrame(dist_rows, columns=cols)

### Convert dataframe data to numeric values and save them to disk

In [4]:
# Cast the scraped text to numbers. Commas are stripped from the counts before
# the integer cast (presumably thousands separators - confirm against the page).
df_rating = df_rating.astype('float32')
df_dist = df_dist.replace(regex={',': ''}).astype('int32')

# Label the index so the first CSV column gets the header 'Movie'.
df_rating = df_rating.rename_axis('Movie')
df_dist = df_dist.rename_axis('Movie')

# to_csv does not create missing directories; make sure the output directory
# exists so the scraped data is not lost to a failed write.
os.makedirs('dataset', exist_ok=True)

df_rating.to_csv('dataset/rating.csv')
df_dist.to_csv('dataset/distribution.csv')