In [100]:
import pandas as pd
import ast
import pycountry
import numpy as np

In [101]:
df = pd.read_csv('data/titles.csv')
df.tail()

Unnamed: 0,id,title,type,description,release_year,age_certification,runtime,genres,production_countries,seasons,imdb_id,imdb_score,imdb_votes,tmdb_popularity,tmdb_score
5845,tm1014599,Fine Wine,MOVIE,A beautiful love story that can happen between...,2021,,100,"['romance', 'drama']",['NG'],,tt13857480,6.8,45.0,1.466,
5846,tm898842,C/O Kaadhal,MOVIE,A heart warming film that explores the concept...,2021,,134,['drama'],[],,tt11803618,7.7,348.0,,
5847,tm1059008,Lokillo,MOVIE,A controversial TV host and comedian who has b...,2021,,90,['comedy'],['CO'],,tt14585902,3.8,68.0,26.005,6.3
5848,tm1035612,Dad Stop Embarrassing Me - The Afterparty,MOVIE,"Jamie Foxx, David Alan Grier and more from the...",2021,PG-13,37,[],['US'],,,,,1.296,10.0
5849,ts271048,Mighty Little Bheem: Kite Festival,SHOW,"With winter behind them, Bheem and his townspe...",2021,,7,"['family', 'animation', 'comedy']",[],1.0,tt13711094,7.8,18.0,2.289,10.0


In [102]:
def code_to_name(code):
    try:
        country = pycountry.countries.get(alpha_2=code.upper())
        if country:
            return country.name
        return code
    except:
        return code

region_map = {
    "United States": "North America",
    "Canada": "North America",
    "Mexico": "North America",
    "Puerto Rico": "North America",
    "Brazil": "South America",
    "Argentina": "South America",
    "Chile": "South America",
    "Colombia": "South America",
    "Peru": "South America",
    "United Kingdom": "Europe",
    "France": "Europe",
    "Germany": "Europe",
    "Italy": "Europe",
    "Spain": "Europe",
    "Netherlands": "Europe",
    "Sweden": "Europe",
    "Norway": "Europe",
    "Poland": "Europe",
    "Czechia": "Europe",
    "Greece": "Europe",
    "Austria": "Europe",
    "Denmark": "Europe",
    "Switzerland": "Europe",
    "Belgium": "Europe",
    "Romania": "Europe",
    "Portugal": "Europe",
    "Luxembourg": "Europe",
    "Hungary": "Europe",
    "Iceland": "Europe",
    "Ireland": "Europe",
    "Malta": "Europe",
    "Bulgaria": "Europe",
    "Russia": "Europe",
    "Finland": "Europe",
    "Ukraine": "Europe",
    "Belarus": "Europe",
    "Croatia": "Europe",
    "Vatican City": "Europe",
    "Monaco": "Europe",
    "Cyprus": "Europe",
    "Lithuania": "Europe",
    "Albania": "Europe",
    "Serbia": "Europe",
    "China": "Asia",
    "Japan": "Asia",
    "South Korea": "Asia",
    "India": "Asia",
    "Singapore": "Asia",
    "Thailand": "Asia",
    "Taiwan": "Asia",
    "Philippines": "Asia",
    "Hong Kong": "Asia",
    "Indonesia": "Asia",
    "Malaysia": "Asia",
    "Cambodia": "Asia",
    "Pakistan": "Asia",
    "United Arab Emirates": "Middle East",
    "Saudi Arabia": "Middle East",
    "Lebanon": "Middle East",
    "Iran": "Middle East",
    "Israel": "Middle East",
    "Turkey": "Middle East",
    "Jordan": "Middle East",
    "Qatar": "Middle East",
    "Kuwait": "Middle East",
    "Syria": "Middle East",
    "Palestine": "Middle East",
    "South Africa": "Africa",
    "Nigeria": "Africa",
    "Egypt": "Africa",
    "Kenya": "Africa",
    "Senegal": "Africa",
    "Morocco": "Africa",
    "Tunisia": "Africa",
    "Ghana": "Africa",
    "Zimbabwe": "Africa",
    "Australia": "Oceania",
    "New Zealand": "Oceania"
}

def get_regions_from_names(country_names):
    regions = {region_map[c] for c in country_names if c in region_map}
    return list(regions) if regions else np.nan

def ensure_list_of_strings(x):
    if isinstance(x, list):
        return x
    elif isinstance(x, str):
        try:
            return ast.literal_eval(x)
        except:
            return [x]
    else:
        return []

df['genres'] = df['genres'].apply(ensure_list_of_strings)
df['production_countries'] = df['production_countries'].apply(ensure_list_of_strings)

df['production_countries'] = df['production_countries'].apply(
    lambda countries: [code_to_name(c) for c in countries]
)

df['regions'] = df['production_countries'].apply(get_regions_from_names)

df['production_countries'] = df['production_countries'].apply(lambda x: np.nan if not x else x)
df['genres'] = df['genres'].apply(lambda x: np.nan if not x else x)

df.head()


Unnamed: 0,id,title,type,description,release_year,age_certification,runtime,genres,production_countries,seasons,imdb_id,imdb_score,imdb_votes,tmdb_popularity,tmdb_score,regions
0,ts300399,Five Came Back: The Reference Films,SHOW,This collection includes 12 World War II-era p...,1945,TV-MA,51,[documentation],[United States],1.0,,,,0.6,,[North America]
1,tm84618,Taxi Driver,MOVIE,A mentally unstable Vietnam War veteran works ...,1976,R,114,"[drama, crime]",[United States],,tt0075314,8.2,808582.0,40.965,8.179,[North America]
2,tm154986,Deliverance,MOVIE,Intent on seeing the Cahulawassee River before...,1972,R,109,"[drama, action, thriller, european]",[United States],,tt0068473,7.7,107673.0,10.01,7.3,[North America]
3,tm127384,Monty Python and the Holy Grail,MOVIE,"King Arthur, accompanied by his squire, recrui...",1975,PG,91,"[fantasy, action, comedy]",[United Kingdom],,tt0071853,8.2,534486.0,15.461,7.811,[Europe]
4,tm120801,The Dirty Dozen,MOVIE,12 American military prisoners in World War II...,1967,,150,"[war, action]","[United Kingdom, United States]",,tt0061578,7.7,72662.0,20.398,7.6,"[North America, Europe]"


In [103]:
df = df.drop(columns=['imdb_id'])
df.head()

Unnamed: 0,id,title,type,description,release_year,age_certification,runtime,genres,production_countries,seasons,imdb_score,imdb_votes,tmdb_popularity,tmdb_score,regions
0,ts300399,Five Came Back: The Reference Films,SHOW,This collection includes 12 World War II-era p...,1945,TV-MA,51,[documentation],[United States],1.0,,,0.6,,[North America]
1,tm84618,Taxi Driver,MOVIE,A mentally unstable Vietnam War veteran works ...,1976,R,114,"[drama, crime]",[United States],,8.2,808582.0,40.965,8.179,[North America]
2,tm154986,Deliverance,MOVIE,Intent on seeing the Cahulawassee River before...,1972,R,109,"[drama, action, thriller, european]",[United States],,7.7,107673.0,10.01,7.3,[North America]
3,tm127384,Monty Python and the Holy Grail,MOVIE,"King Arthur, accompanied by his squire, recrui...",1975,PG,91,"[fantasy, action, comedy]",[United Kingdom],,8.2,534486.0,15.461,7.811,[Europe]
4,tm120801,The Dirty Dozen,MOVIE,12 American military prisoners in World War II...,1967,,150,"[war, action]","[United Kingdom, United States]",,7.7,72662.0,20.398,7.6,"[North America, Europe]"


In [104]:
df.to_csv('data/preprocessed.csv', index=False)