In [75]:
import pandas as pd
import ast
import pycountry
import numpy as np

In [76]:
# Read the titles catalogue from the project-relative CSV, then show the last
# rows as a quick sanity check of the load.
df = pd.read_csv(filepath_or_buffer='data/titles.csv')
df.tail()

Unnamed: 0,id,title,type,description,release_year,age_certification,runtime,genres,production_countries,seasons,imdb_id,imdb_score,imdb_votes,tmdb_popularity,tmdb_score
5845,tm1014599,Fine Wine,MOVIE,A beautiful love story that can happen between...,2021,,100,"['romance', 'drama']",['NG'],,tt13857480,6.8,45.0,1.466,
5846,tm898842,C/O Kaadhal,MOVIE,A heart warming film that explores the concept...,2021,,134,['drama'],[],,tt11803618,7.7,348.0,,
5847,tm1059008,Lokillo,MOVIE,A controversial TV host and comedian who has b...,2021,,90,['comedy'],['CO'],,tt14585902,3.8,68.0,26.005,6.3
5848,tm1035612,Dad Stop Embarrassing Me - The Afterparty,MOVIE,"Jamie Foxx, David Alan Grier and more from the...",2021,PG-13,37,[],['US'],,,,,1.296,10.0
5849,ts271048,Mighty Little Bheem: Kite Festival,SHOW,"With winter behind them, Bheem and his townspe...",2021,,7,"['family', 'animation', 'comedy']",[],1.0,tt13711094,7.8,18.0,2.289,10.0


In [77]:
def code_to_name(code):
    """Translate an alpha-2 country code into the name pycountry holds for it.

    Parameters
    ----------
    code : str
        Two-letter country code such as ``'US'``. It is upper-cased before
        the lookup, so case does not matter.

    Returns
    -------
    The ``name`` attribute of the matching pycountry record. When ``code`` is
    not a string, or no record can be found, ``code`` is returned unchanged.
    """
    # Non-string values (None, NaN, numbers) cannot be looked up; hand them
    # back untouched, which is also what the failure path below does.
    if not isinstance(code, str):
        return code
    try:
        return pycountry.countries.get(alpha_2=code.upper()).name
    except (AttributeError, LookupError):
        # AttributeError: the lookup produced None, which has no ``name``.
        # LookupError: covers a KeyError raised by the lookup itself.
        # Anything else (e.g. KeyboardInterrupt) is deliberately not swallowed.
        return code

# Decode the stringified lists from the CSV (e.g. "['US', 'GB']") into real
# Python lists, then replace every country code with its country name.
# Cells that are not lists after decoding (for example NaN for a missing
# value, or the NaN placeholders this cell writes for empty lists when it is
# executed a second time) are passed through untouched instead of raising
# "TypeError: 'float' object is not iterable".
df['production_countries'] = (
    df['production_countries']
    .apply(lambda raw: ast.literal_eval(raw) if isinstance(raw, str) else raw)
    .apply(
        lambda countries: [code_to_name(c) for c in countries]
        if isinstance(countries, list)
        else countries
    )
)

# Country name -> coarse world region.
#
# The keys are matched against the names produced by ``code_to_name``, i.e.
# pycountry's ``name`` attribute. For several countries that attribute is the
# formal ISO 3166 spelling rather than the everyday name (for example
# "Russian Federation" or "Korea, Republic of"), so both spellings are listed;
# otherwise those countries would silently get no region at all.
# NOTE(review): the exact ISO spellings depend on the installed pycountry /
# iso-codes release -- confirm against `pycountry.countries` if a country still
# ends up without a region.
region_map = {
    # --- North America ---
    "United States": "North America",
    "Canada": "North America",
    "Mexico": "North America",
    "Puerto Rico": "North America",
    # --- South America ---
    "Brazil": "South America",
    "Argentina": "South America",
    "Chile": "South America",
    "Colombia": "South America",
    "Peru": "South America",
    # --- Europe ---
    "United Kingdom": "Europe",
    "France": "Europe",
    "Germany": "Europe",
    "Italy": "Europe",
    "Spain": "Europe",
    "Netherlands": "Europe",
    "Sweden": "Europe",
    "Norway": "Europe",
    "Poland": "Europe",
    "Czechia": "Europe",
    "Greece": "Europe",
    "Austria": "Europe",
    "Denmark": "Europe",
    "Switzerland": "Europe",
    "Belgium": "Europe",
    "Romania": "Europe",
    "Portugal": "Europe",
    "Luxembourg": "Europe",
    "Hungary": "Europe",
    "Iceland": "Europe",
    "Ireland": "Europe",
    "Malta": "Europe",
    "Bulgaria": "Europe",
    "Russia": "Europe",
    "Russian Federation": "Europe",  # ISO 3166 spelling of Russia
    "Finland": "Europe",
    "Ukraine": "Europe",
    "Belarus": "Europe",
    "Croatia": "Europe",
    "Vatican City": "Europe",
    "Holy See (Vatican City State)": "Europe",  # ISO 3166 spelling of Vatican City
    "Monaco": "Europe",
    "Cyprus": "Europe",
    "Lithuania": "Europe",
    "Albania": "Europe",
    "Serbia": "Europe",
    # --- Asia ---
    "China": "Asia",
    "Japan": "Asia",
    "South Korea": "Asia",
    "Korea, Republic of": "Asia",  # ISO 3166 spelling of South Korea
    "India": "Asia",
    "Singapore": "Asia",
    "Thailand": "Asia",
    "Taiwan": "Asia",
    "Taiwan, Province of China": "Asia",  # ISO 3166 spelling of Taiwan
    "Philippines": "Asia",
    "Hong Kong": "Asia",
    "Indonesia": "Asia",
    "Malaysia": "Asia",
    "Cambodia": "Asia",
    "Pakistan": "Asia",
    # --- Middle East ---
    "United Arab Emirates": "Middle East",
    "Saudi Arabia": "Middle East",
    "Lebanon": "Middle East",
    "Iran": "Middle East",
    "Iran, Islamic Republic of": "Middle East",  # ISO 3166 spelling of Iran
    "Israel": "Middle East",
    "Turkey": "Middle East",
    "Türkiye": "Middle East",  # spelling used by newer ISO 3166 data
    "Jordan": "Middle East",
    "Qatar": "Middle East",
    "Kuwait": "Middle East",
    "Syria": "Middle East",
    "Syrian Arab Republic": "Middle East",  # ISO 3166 spelling of Syria
    "Palestine": "Middle East",
    "Palestine, State of": "Middle East",  # ISO 3166 spelling of Palestine
    # --- Africa ---
    "South Africa": "Africa",
    "Nigeria": "Africa",
    "Egypt": "Africa",
    "Kenya": "Africa",
    "Senegal": "Africa",
    "Morocco": "Africa",
    "Tunisia": "Africa",
    "Ghana": "Africa",
    "Zimbabwe": "Africa",
    # --- Oceania ---
    "Australia": "Oceania",
    "New Zealand": "Oceania"
}

def get_regions_from_names(country_names, mapping=None):
    """Collect the distinct regions for a collection of country names.

    Parameters
    ----------
    country_names : iterable of str
        Country names to look up. Names missing from the mapping are skipped.
        ``None`` or a float (e.g. NaN for a missing cell) is treated as
        "no countries".
    mapping : dict, optional
        Country name -> region lookup. Defaults to the module-level
        ``region_map``.

    Returns
    -------
    list of str or float
        The unique regions in alphabetical order, or ``np.nan`` when no name
        could be mapped.
    """
    if mapping is None:
        mapping = region_map
    # A float is never iterable; NaN is how pandas represents a missing cell.
    if country_names is None or isinstance(country_names, float):
        return np.nan
    regions = {mapping[c] for c in country_names if c in mapping}
    if not regions:
        return np.nan
    # Sets of strings have no stable iteration order between interpreter
    # runs, so sort to keep the notebook output reproducible.
    return sorted(regions)

# Derive a per-title region value from its (already translated) country names.
df['regions'] = df['production_countries'].map(get_regions_from_names)

def ensure_list_of_strings(x):
    """Coerce a cell value into a list.

    Parameters
    ----------
    x : object
        Either an actual list, a string (ideally the ``repr`` of a list such
        as ``"['drama', 'comedy']"``), or anything else (None, NaN, ...).

    Returns
    -------
    list
        * ``x`` itself when it already is a list;
        * the parsed list (or tuple converted to a list) when ``x`` is a
          string holding such a literal;
        * ``[x]`` when ``x`` is a string that is not a list/tuple literal;
        * ``[]`` for every non-string, non-list value.

    Element types are not validated or converted.
    """
    if isinstance(x, list):
        return x
    if not isinstance(x, str):
        return []
    try:
        parsed = ast.literal_eval(x)
    except (ValueError, TypeError, SyntaxError, MemoryError, RecursionError):
        # These are the errors ast.literal_eval is documented to raise on
        # malformed input; treat the text as a single plain value.
        return [x]
    if isinstance(parsed, list):
        return parsed
    if isinstance(parsed, tuple):
        return list(parsed)
    # The text was a valid literal but not a sequence (e.g. "2021"); keep the
    # promise of always returning a list by wrapping the original string.
    return [x]
    
# Normalise both list-valued columns the same way: coerce every cell to a
# list, then replace empty lists with NaN.
for list_column in ('genres', 'production_countries'):
    df[list_column] = (
        df[list_column]
        .apply(ensure_list_of_strings)
        .apply(
            lambda cell: cell
            if not (isinstance(cell, list) and len(cell) == 0)
            else np.nan
        )
    )

# Preview the translated countries next to their derived regions.
df[['production_countries', 'regions']].head()

Unnamed: 0,production_countries,regions
0,[United States],[North America]
1,[United States],[North America]
2,[United States],[North America]
3,[United Kingdom],[Europe]
4,"[United Kingdom, United States]","[North America, Europe]"


In [78]:
# Remove the IMDb identifier column. errors='ignore' keeps this cell
# re-runnable: without it, executing the cell a second time raises KeyError
# because the column has already been dropped.
df = df.drop(columns=['imdb_id'], errors='ignore')
df.tail()

Unnamed: 0,id,title,type,description,release_year,age_certification,runtime,genres,production_countries,seasons,imdb_score,imdb_votes,tmdb_popularity,tmdb_score,regions
5845,tm1014599,Fine Wine,MOVIE,A beautiful love story that can happen between...,2021,,100,"[romance, drama]",[Nigeria],,6.8,45.0,1.466,,[Africa]
5846,tm898842,C/O Kaadhal,MOVIE,A heart warming film that explores the concept...,2021,,134,[drama],,,7.7,348.0,,,
5847,tm1059008,Lokillo,MOVIE,A controversial TV host and comedian who has b...,2021,,90,[comedy],[Colombia],,3.8,68.0,26.005,6.3,[South America]
5848,tm1035612,Dad Stop Embarrassing Me - The Afterparty,MOVIE,"Jamie Foxx, David Alan Grier and more from the...",2021,PG-13,37,,[United States],,,,1.296,10.0,[North America]
5849,ts271048,Mighty Little Bheem: Kite Festival,SHOW,"With winter behind them, Bheem and his townspe...",2021,,7,"[family, animation, comedy]",,1.0,7.8,18.0,2.289,10.0,
