### Import the necessary modules for web scraping

In [1]:
import os

import pandas as pd
import requests
from bs4 import BeautifulSoup

### Get the movie paths from the 'Top 250 Movies' page

In [2]:
# Site root that the scraped movie hrefs are appended to when fetching
# each movie page.
base_url = 'https://www.imdb.com/'
top_250_url = 'https://www.imdb.com/chart/top'

# A timeout keeps the cell from hanging forever on a stalled connection, and
# raise_for_status() prevents an HTTP error page from being silently parsed
# into an empty movie list.
response = requests.get(top_250_url, timeout=30)
response.raise_for_status()
soup = BeautifulSoup(markup=response.content, features='html.parser')

# One (link text, href) tuple per anchor that is a direct child of a
# `.titleColumn` element.
# NOTE(review): this selector depends on the page markup -- confirm it still
# matches the live page.
movies = [(e.get_text(), e.get('href')) for e in soup.select('.titleColumn > a')]

### Construct a dataframe that holds the categories

In [3]:
# Category names used as the dataframe columns; each movie gets one boolean
# flag per entry.
cols = [
    'Action', 'Drama', 'Adventure', 'Comedy', 'Animation', 'Sci-Fi', 'Fantasy', 'Crime',
    'Thriller', 'Family', 'Romance', 'Short', 'Mystery', 'Sport', 'Horror',
    'War', 'History', 'Reality-TV', 'Western', 'Game-Show', 'Documentary', 'Music',
    'Biography', 'Talk-Show', 'Musical', 'News', 'Film-Noir', 'Adult'
]

# Rows are collected in plain lists and the dataframe is built once at the
# end: DataFrame.append() was removed in pandas 2.0, and calling it per row
# copied the whole frame on every iteration.
rows = []
names = []

for movie_name, movie_path in movies:
    # Get the categories

    # Join with exactly one slash so a root-relative href ('/title/...') does
    # not produce 'https://www.imdb.com//title/...'.
    movie_url = base_url.rstrip('/') + '/' + movie_path.lstrip('/')

    # Fail loudly on a stalled or rejected request instead of recording an
    # all-False row for the movie.
    response = requests.get(movie_url, timeout=30)
    response.raise_for_status()
    soup = BeautifulSoup(markup=response.content, features='html.parser')

    # Text of every anchor directly under `.subtext`, minus the last one.
    # NOTE(review): presumably the last link is not a category (e.g. a
    # release-date link) -- confirm against the page markup.
    category = [e.get_text() for e in soup.select('.subtext > a')][:-1]

    # Binarize: one flag per known category, True when the movie has it
    row = [col in category for col in cols]
    rows.append(row)
    names.append(movie_name)

# One row per movie, indexed by movie title, one boolean column per category.
df_category = pd.DataFrame(rows, index=names, columns=cols)

### Convert dataframe data to boolean values and save them to disk

In [4]:
# Make sure the output directory exists: to_csv() does not create missing
# parent directories, and failing here would discard the scraped data.
os.makedirs('dataset', exist_ok=True)

# Force every category column to a real boolean dtype before writing.
df_category = df_category.astype('bool')
# Label the index so the first CSV column gets the header 'Movie'.
df_category = df_category.rename_axis('Movie')
df_category.to_csv('dataset/category.csv')