### Data Source
https://developer.imdb.com/non-commercial-datasets/

In [129]:
# Download commands for the IMDb dump files that the cells below read.
# They are disabled (commented out); uncomment a line to fetch that archive.
# !python3 -m wget https://datasets.imdbws.com/title.basics.tsv.gz
# !python3 -m wget https://datasets.imdbws.com/title.akas.tsv.gz
# !python3 -m wget https://datasets.imdbws.com/title.ratings.tsv.gz
# !python3 -m wget https://datasets.imdbws.com/title.crew.tsv.gz
# !python3 -m wget https://datasets.imdbws.com/name.basics.tsv.gz

In [130]:
import pandas

In [131]:
# Title table, read from the tab-separated dump; low_memory=False makes
# pandas parse the file in a single pass instead of in chunks.
title_basics = pandas.read_csv(
    'title.basics.tsv.gz',
    sep='\t',
    low_memory=False,
)
title_basics.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10125398 entries, 0 to 10125397
Data columns (total 9 columns):
 #   Column          Dtype 
---  ------          ----- 
 0   tconst          object
 1   titleType       object
 2   primaryTitle    object
 3   originalTitle   object
 4   isAdult         object
 5   startYear       object
 6   endYear         object
 7   runtimeMinutes  object
 8   genres          object
dtypes: object(9)
memory usage: 695.3+ MB


In [132]:
# Alternative-title table. Only the three columns used later are parsed, so the
# unused columns of this very large file are never materialised in memory.
title_akas = pandas.read_csv(
    'title.akas.tsv.gz',
    delimiter='\t',
    low_memory=False,
    usecols=['titleId', 'region', 'attributes'],
)
# Rename the key to 'tconst' so it can be merged with the other tables, and fix
# the column order explicitly (rename returns a new frame; nothing is mutated).
title_akas = title_akas.rename(columns={'titleId': 'tconst'})[['tconst', 'region', 'attributes']]
title_akas.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 37044374 entries, 0 to 37044373
Data columns (total 3 columns):
 #   Column      Dtype 
---  ------      ----- 
 0   tconst      object
 1   region      object
 2   attributes  object
dtypes: object(3)
memory usage: 847.9+ MB


In [133]:
# Ratings table (tab-separated dump).
title_ratings = pandas.read_csv(
    'title.ratings.tsv.gz',
    sep='\t',
)
title_ratings.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1344083 entries, 0 to 1344082
Data columns (total 3 columns):
 #   Column         Non-Null Count    Dtype  
---  ------         --------------    -----  
 0   tconst         1344083 non-null  object 
 1   averageRating  1344083 non-null  float64
 2   numVotes       1344083 non-null  int64  
dtypes: float64(1), int64(1), object(1)
memory usage: 30.8+ MB


In [134]:
# Crew table (tab-separated dump) holding the directors and writers columns.
title_crew = pandas.read_csv(
    'title.crew.tsv.gz',
    sep='\t',
)
title_crew.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10125398 entries, 0 to 10125397
Data columns (total 3 columns):
 #   Column     Dtype 
---  ------     ----- 
 0   tconst     object
 1   directors  object
 2   writers    object
dtypes: object(3)
memory usage: 231.8+ MB


In [135]:
# People table, read from the tab-separated dump; low_memory=False makes
# pandas parse the file in a single pass instead of in chunks.
name_basics = pandas.read_csv(
    'name.basics.tsv.gz',
    sep='\t',
    low_memory=False,
)
name_basics.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 12777301 entries, 0 to 12777300
Data columns (total 6 columns):
 #   Column             Dtype 
---  ------             ----- 
 0   nconst             object
 1   primaryName        object
 2   birthYear          object
 3   deathYear          object
 4   primaryProfession  object
 5   knownForTitles     object
dtypes: object(6)
memory usage: 584.9+ MB


In [136]:
# Start from the rows whose titleType is 'movie', then join ratings, crew and
# alternative-title rows onto them by tconst (default inner joins).
movies = (
    title_basics.loc[title_basics['titleType'] == 'movie']
    .merge(title_ratings, on='tconst')
    .merge(title_crew, on='tconst')
    .merge(title_akas, on='tconst')
)
movies.head(20)

Unnamed: 0,tconst,titleType,primaryTitle,originalTitle,isAdult,startYear,endYear,runtimeMinutes,genres,averageRating,numVotes,directors,writers,region,attributes
0,tt0000009,movie,Miss Jerry,Miss Jerry,0,1894,\N,45,Romance,5.3,205,nm0085156,nm0085156,\N,\N
1,tt0000009,movie,Miss Jerry,Miss Jerry,0,1894,\N,45,Romance,5.3,205,nm0085156,nm0085156,AU,\N
2,tt0000009,movie,Miss Jerry,Miss Jerry,0,1894,\N,45,Romance,5.3,205,nm0085156,nm0085156,DE,literal title
3,tt0000009,movie,Miss Jerry,Miss Jerry,0,1894,\N,45,Romance,5.3,205,nm0085156,nm0085156,HU,\N
4,tt0000009,movie,Miss Jerry,Miss Jerry,0,1894,\N,45,Romance,5.3,205,nm0085156,nm0085156,US,\N
5,tt0000147,movie,The Corbett-Fitzsimmons Fight,The Corbett-Fitzsimmons Fight,0,1897,\N,100,"Documentary,News,Sport",5.3,480,nm0714557,\N,US,\N
6,tt0000147,movie,The Corbett-Fitzsimmons Fight,The Corbett-Fitzsimmons Fight,0,1897,\N,100,"Documentary,News,Sport",5.3,480,nm0714557,\N,RU,\N
7,tt0000147,movie,The Corbett-Fitzsimmons Fight,The Corbett-Fitzsimmons Fight,0,1897,\N,100,"Documentary,News,Sport",5.3,480,nm0714557,\N,\N,\N
8,tt0000502,movie,Bohemios,Bohemios,0,1905,\N,100,\N,4.1,15,nm0063413,"nm0063413,nm0657268,nm0675388",\N,\N
9,tt0000502,movie,Bohemios,Bohemios,0,1905,\N,100,\N,4.1,15,nm0063413,"nm0063413,nm0657268,nm0675388",ES,\N


In [137]:
# Build an nconst -> primaryName lookup table once, so resolving an id is a
# plain dictionary access.
name_dict = dict(zip(name_basics['nconst'], name_basics['primaryName']))


def map_ids_to_names(ids: str):
    if pandas.isna(ids):
        return None

    id_list = ids.split(',')
    names = [name_dict.get(id_) for id_ in id_list if id_ in name_dict]
    return ', '.join(names)


# Resolve each id column into a matching human-readable name column.
for id_column, name_column in (('directors', 'directorNames'), ('writers', 'writerNames')):
    movies[name_column] = movies[id_column].apply(map_ids_to_names)

In [138]:
# Replace every cell whose value is the \N placeholder with None.
# replace() does not treat the pattern as a regex by default, so only cells
# equal to the whole string \N are affected.
movies = movies.replace(r'\N', None)

# Preview the rows that have a non-null attributes value.
# This is display only: the filtered frame is not assigned back to `movies`.
movies[movies['attributes'].notnull()].head(20)

Unnamed: 0,tconst,titleType,primaryTitle,originalTitle,isAdult,startYear,endYear,runtimeMinutes,genres,averageRating,numVotes,directors,writers,region,attributes,directorNames,writerNames
2,tt0000009,movie,Miss Jerry,Miss Jerry,0,1894,,45.0,Romance,5.3,205,nm0085156,nm0085156,DE,literal title,Alexander Black,Alexander Black
41,tt0000941,movie,Locura de amor,Locura de amor,0,1909,,45.0,Drama,4.5,27,"nm0063413,nm0550220","nm0063413,nm0550220,nm0848502",XWW,informal literal title,"Ricardo de Baños, Alberto Marro","Ricardo de Baños, Alberto Marro, Manuel Tamayo..."
80,tt0001175,movie,Camille,La dame aux camélias,0,1912,,,"Drama,Romance",5.1,42,"nm0130633,nm0580197,nm0693516","nm0241414,nm0693516",DE,complete title,"André Calmettes, Louis Mercanton, Henri Pouctal","Alexandre Dumas fils, Henri Pouctal"
90,tt0001285,movie,The Life of Moses,The Life of Moses,0,1909,,50.0,"Biography,Drama,Family",5.4,59,nm0085865,"nm0676645,nm0836316",US,second part title,J. Stuart Blackton,"Madison C. Peters, Rollin S. Sturgeon"
93,tt0001285,movie,The Life of Moses,The Life of Moses,0,1909,,50.0,"Biography,Drama,Family",5.4,59,nm0085865,"nm0676645,nm0836316",US,fourth season title,J. Stuart Blackton,"Madison C. Peters, Rollin S. Sturgeon"
96,tt0001285,movie,The Life of Moses,The Life of Moses,0,1909,,50.0,"Biography,Drama,Family",5.4,59,nm0085865,"nm0676645,nm0836316",US,third part title,J. Stuart Blackton,"Madison C. Peters, Rollin S. Sturgeon"
97,tt0001285,movie,The Life of Moses,The Life of Moses,0,1909,,50.0,"Biography,Drama,Family",5.4,59,nm0085865,"nm0676645,nm0836316",US,fifth season title,J. Stuart Blackton,"Madison C. Peters, Rollin S. Sturgeon"
130,tt0001531,movie,"Captain Starlight, or Gentleman of the Road","Captain Starlight, or Gentleman of the Road",0,1911,,,,4.6,15,nm0738202,"nm0092809,nm1010943",AU,short title,Alfred Rolfe,"Rolf Boldrewood, Alfred Dampier"
140,tt0001592,movie,In the Prime of Life,Ekspeditricen,0,1911,,52.0,Drama,6.0,25,nm0088881,nm0491503,SE,premiere title,August Blom,Lau Lauritzen
157,tt0001630,movie,Der fremde Vogel,Der fremde Vogel,0,1911,,45.0,,6.2,36,nm0300487,nm0300487,DK,premiere title,Urban Gad,Urban Gad


In [139]:
# Print the column dtypes and memory usage of the merged frame.
movies.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2212120 entries, 0 to 2212119
Data columns (total 17 columns):
 #   Column          Dtype  
---  ------          -----  
 0   tconst          object 
 1   titleType       object 
 2   primaryTitle    object 
 3   originalTitle   object 
 4   isAdult         object 
 5   startYear       object 
 6   endYear         object 
 7   runtimeMinutes  object 
 8   genres          object 
 9   averageRating   float64
 10  numVotes        int64  
 11  directors       object 
 12  writers         object 
 13  region          object 
 14  attributes      object 
 15  directorNames   object 
 16  writerNames     object 
dtypes: float64(1), int64(1), object(15)
memory usage: 286.9+ MB


In [140]:
# Keep only the non-adult titles, i.e. drop rows where isAdult is different from 0.
# The previous `!= 0` test selected the opposite rows. isAdult also has object
# dtype, so its values may be strings such as '0' that never equal the integer 0.
# Coercing to numbers makes the comparison work for both 0 and '0'; values that
# cannot be parsed become NaN and are dropped as well.
movies = movies[pandas.to_numeric(movies['isAdult'], errors='coerce') == 0]

In [141]:
# Keep only the first row of every (originalTitle, startYear) combination.
movies = movies[~movies.duplicated(subset=['originalTitle', 'startYear'], keep='first')]

In [142]:
# Discard the rows that have no genres value.
movies = movies.dropna(subset=['genres'])

In [143]:
# Keep only rows with a usable primaryTitle: present, non-empty and not the
# '<unset>' marker.
movies = movies[
    movies['primaryTitle'].notnull()
    & (movies['primaryTitle'] != '')
    & (movies['primaryTitle'] != '<unset>')
]

In [144]:
# Remove the listed columns from the frame.
movies = movies.drop(
    [
        'isAdult', 'originalTitle', 'endYear', 'titleType',
        'runtimeMinutes', 'attributes', 'directors', 'writers',
    ],
    axis='columns',
)

# Web Scraping

In [145]:
import requests
from bs4 import BeautifulSoup

In [146]:
def movie_url(movie_id):
    """Return the IMDb title-page address for ``movie_id``."""
    return 'https://www.imdb.com/title/{}'.format(movie_id)

In [147]:
def get_html(url: str, timeout: float = 10.0) -> str | None:
    """Download a page and return its HTML source.

    Args:
        url: Address of the page to fetch.
        timeout: Seconds to wait for the server before giving up, so a
            stalled connection cannot block the scraping loop indefinitely.

    Returns:
        The response body as text, or ``None`` when the request fails or the
        server answers with a status code other than 200.
    """
    # Sent as the User-agent header in place of the requests default.
    user_agent = (
        "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_14_5) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/12.1.1 Safari/605.1.15"
    )

    try:
        response = requests.get(url, headers={"User-agent": user_agent}, timeout=timeout)
    except requests.RequestException:
        # Connection errors and timeouts are reported like any other failed fetch.
        return None

    if response.status_code != 200:
        return None

    return response.text

In [148]:
def get_movie_description(movie_id) -> str:
    """Fetch the plot summary shown on a title's IMDb page.

    Args:
        movie_id: Title id handed to ``movie_url`` to build the page address.

    Returns:
        The text of the ``plot-xs_to_m`` span with every ``;`` replaced by
        ``,``. An empty string is returned when the page could not be
        fetched, the span is absent, or the page cannot be parsed.
    """
    html = get_html(movie_url(movie_id))
    if html is None:
        return ''

    try:
        soup = BeautifulSoup(html, 'html.parser')
        plot = soup.find('span', {'data-testid': 'plot-xs_to_m'})
    except Exception:
        # Best effort: an unparseable page is treated as having no description.
        return ''

    if plot is None:
        return ''

    # str.replace returns a new string. The previous code discarded that
    # result, so the semicolons were never actually replaced.
    return plot.text.replace(';', ',')

In [149]:
# Order the movies by numVotes first and averageRating second, both descending.
movies = movies.sort_values(['numVotes', 'averageRating'], ascending=[False, False])

In [151]:
if 'description' not in movies.columns:
    try:
        # Descriptions saved by earlier scraping runs.
        local_movies = pandas.read_csv('movies.csv')
    except FileNotFoundError:
        # Nothing has been saved yet, so every description is still missing.
        movies['description'] = None
    else:
        # Keep one non-empty description per tconst. The file is appended to
        # batch by batch, so an id may occur more than once, and duplicate keys
        # would multiply the movie rows in the left join below.
        cached = (
            local_movies[['tconst', 'description']]
            .dropna(subset=['description'])
            .drop_duplicates(subset='tconst', keep='last')
        )
        # `movies` must not already have a 'description' column at this point:
        # pandas would then emit 'description_x'/'description_y' instead of a
        # single 'description' column.
        movies = pandas.merge(movies, cached, on='tconst', how='left')


In [None]:
import os
import time

BATCH_SIZE = 50
CACHE_PATH = 'movies.csv'

missing_movies = movies[movies['description'].isnull()]

# Iterate over batch start offsets so that the final, possibly shorter, batch is
# processed too (stepping over batch *ends* skipped the last batch entirely).
for start in range(0, len(missing_movies), BATCH_SIZE):
    # Copy the slice so the assignment below writes to an independent frame
    # rather than to a view of `missing_movies`.
    batch = missing_movies.iloc[start:start + BATCH_SIZE].copy()
    batch['description'] = batch['tconst'].apply(get_movie_description)

    # Append after every batch so progress survives an interruption. The header
    # is written only when the file is being created, which keeps it readable
    # with pandas.read_csv on the next run.
    # NOTE(review): assumes an existing movies.csv has the same column layout — confirm.
    batch.to_csv(CACHE_PATH, mode='a', header=not os.path.exists(CACHE_PATH), index=False)

    # Pause between batches to avoid sending requests back to back.
    time.sleep(1)