# ETL: actors city and country

In [1]:
# Import the API key
from config import geoapify_key

# Dependencies for map
import requests
import json

# Dependencies and Setup
import pandas as pd
from pathlib import Path
import matplotlib.pyplot as plt
import scipy.stats as st
import numpy as np
import hvplot.pandas
import sqlite3

# Turn off warning messages
import warnings
warnings.filterwarnings("ignore")

In [2]:
# Creating db connection and loading the full `actor` table into a DataFrame.
# The connection is closed in a `finally` block so the database file handle is
# released even when the query raises.
cnx = sqlite3.connect('../Server/movies_db.sqlite')
try:
    # Reading data from sqlite database table
    actor_df = pd.read_sql_query("SELECT * FROM actor", cnx)
finally:
    # The data now lives in memory; nothing below needs the open connection
    cnx.close()

In [3]:
# Preview the first five rows of the raw actor table loaded from SQLite
actor_df.head()

Unnamed: 0,actorid,name,date_of_birth,birth_city,birth_country,height_inches,biography,gender,ethnicity,networth
0,1,John Travolta,1954-02-18,Englewood,USA,74.0,"John Joseph Travolta was born in Englewood, Ne...",Male,White,250000000.0
1,2,Kirstie Alley,1951-01-12,Wichita,USA,67.0,,Female,White,40000000.0
2,3,Olympia Dukakis,1931-06-20,Lowell,USA,63.0,"Long a vital, respected lady of the classic an...",Female,White,6000000.0
3,4,George Segal,1934-02-13,Great Neck,USA,71.0,,Male,White,10000000.0
4,5,Abe Vigoda,1921-02-24,Brooklyn,USA,73.0,Abraham Charles Vigoda was an American actor k...,Male,White,10000000.0


## Data Preparation

In [4]:
# Normalise historical / regional labels in `birth_country` to the country
# names used for the rest of the analysis.
# NOTE(review): 'Scotland' is not remapped although 'Wales' and 'England' map
# to 'UK' — confirm this is intended.
country_name_fixes = {
    'Wales':'UK',
    'West Germany':'Germany',
    'Federal Republic of Germany':'Germany',
    'British Guiana':'Guyana',
    'Maharashtra': 'India',
    'British West Indies': 'Trinidad and Tobago',
    'Dolnoslaskie': 'Poland',
    'USSR': 'Russia',
    'French Protectorate of Morocco [now Morocco]': 'Morocco',
    'U.S. Virgin Islands':'Virgin Islands of the United States',
    'California':'USA',
    'British Malaya': 'Malaysia',
    'United Kingdom': 'UK',
    'England': 'UK',
    'Union of South Africa':'South Africa',
}

# Only the `birth_country` column is touched; `actor_df` itself is left unchanged
actor_cleaned_df = actor_df.replace({'birth_country': country_name_fixes})

# Show the distinct country labels that remain after cleaning
actor_cleaned_df['birth_country'].unique()

array(['USA', 'Germany', None, 'UK', 'Canada', 'Scotland', 'Lebanon',
       'Ireland', 'Belgium', 'Malaysia', 'Puerto Rico', 'Austria',
       'Italy', 'Venezuela', 'Cuba', 'Netherlands', 'Argentina',
       'Jamaica', 'Trinidad and Tobago', 'France', 'Swaziland', 'Poland',
       'Israel', 'Ukraine', 'Sweden', 'Japan', 'French Polynesia',
       'Australia', 'Portugal', 'India', 'Pakistan', 'South Africa',
       'Guyana', 'Honduras', 'Turkey', 'Morocco', 'Russia', 'Hong Kong',
       'China', 'Mexico', 'Nigeria', 'New Zealand',
       'Virgin Islands of the United States', 'Ghana', 'Taiwan', 'Cyprus',
       'Denmark', 'Benin', 'Yugoslavia', 'Spain', 'Panama', 'Slovenia',
       'Colombia', 'Czechoslovakia', 'Oman', 'Ecuador', 'Indonesia',
       'Switzerland', 'Romania', 'Bulgaria', 'Kenya', 'Ivory Coast',
       'Brazil', 'Iran', 'South Korea', 'Singapore', 'Finland', 'Yemen',
       'Channel Islands'], dtype=object)

In [5]:
# Count the non-null values in each column to see where data is missing

actor_cleaned_df.count()

actorid          2599
name             2599
date_of_birth    2387
birth_city       2330
birth_country    2317
height_inches    2094
biography        2550
gender           1365
ethnicity         659
networth         1408
dtype: int64

In [6]:
# Keeping only rows which have networth data.
# `.copy()` makes the filtered frame independent, so the column assignment
# below is not a write into a slice (no SettingWithCopyWarning).

actor_cleaned_df = actor_cleaned_df.loc[actor_cleaned_df['networth'].notna()].copy()

# Rows with a missing country but an available birth city: insert the birth
# city as the country value. Rows where both are missing stay null.
# NOTE(review): this copies a *city* name into the country column — presumably
# the affected rows hold country-like city values; verify against the data.

actor_cleaned_df['birth_country'] = actor_cleaned_df['birth_country'].fillna(
    actor_cleaned_df['birth_city']
)

# Removing any remaining rows with no country information

actor_cleaned_df = actor_cleaned_df.dropna(subset=['birth_country'])
actor_cleaned_df

Unnamed: 0,actorid,name,date_of_birth,birth_city,birth_country,height_inches,biography,gender,ethnicity,networth
0,1,John Travolta,1954-02-18,Englewood,USA,74.0,"John Joseph Travolta was born in Englewood, Ne...",Male,White,250000000.0
1,2,Kirstie Alley,1951-01-12,Wichita,USA,67.0,,Female,White,40000000.0
2,3,Olympia Dukakis,1931-06-20,Lowell,USA,63.0,"Long a vital, respected lady of the classic an...",Female,White,6000000.0
3,4,George Segal,1934-02-13,Great Neck,USA,71.0,,Male,White,10000000.0
4,5,Abe Vigoda,1921-02-24,Brooklyn,USA,73.0,Abraham Charles Vigoda was an American actor k...,Male,White,10000000.0
...,...,...,...,...,...,...,...,...,...,...
2591,2593,Toby Kebbell,1982-07-09,Pontefract,UK,72.0,"Toby Kebbell was born in 1982 in Pontefract, Y...",Male,,6000000.0
2593,2595,Kirk Acevedo,1971-11-27,Brooklyn,USA,68.0,"Born and raised in the Bronx, and spent most o...",Male,,4000000.0
2595,2597,Chris Pratt,1979-06-21,Virginia,USA,74.0,Christopher Michael Pratt is an American film ...,Male,White,60000000.0
2597,2599,Lee Pace,1979-03-25,Chickasha,USA,77.0,"In 2003, Lee Grinner Pace starred in the Sunda...",Male,White,5000000.0


In [7]:
# Report how many distinct birth countries survive the cleaning steps

unique_country_total = actor_cleaned_df['birth_country'].nunique()
print(f"Total unique countries in cleaned dataset: {unique_country_total}")

Total unique countries in cleaned dataset: 50


In [8]:
# Determine top city in each country for map representation.
# `.assign` returns a new frame, so the helper column is not written into a
# slice of `actor_cleaned_df` (the original `.loc[:, 'counter'] = 1` was).
# `counter` is 1 on every row so that summing it later yields a row count.

country_city_df = actor_cleaned_df[['birth_country','birth_city']].assign(counter=1)
country_city_df

Unnamed: 0,birth_country,birth_city,counter
0,USA,Englewood,1
1,USA,Wichita,1
2,USA,Lowell,1
3,USA,Great Neck,1
4,USA,Brooklyn,1
...,...,...,...
2591,UK,Pontefract,1
2593,USA,Brooklyn,1
2595,USA,Virginia,1
2597,USA,Chickasha,1


In [9]:
# Steps to determine the most frequently occurring city per country for the map.
# The (country, city)-indexed frame gets its own name, so `country_city_df` is
# left untouched and this cell can be re-run without `set_index` failing on
# columns that were already moved into the index.

country_city_indexed_df = country_city_df.set_index(["birth_country", "birth_city"])

# Sum of the `counter` helper column per (country, city) pair
country_city_count = country_city_indexed_df.groupby(["birth_country", "birth_city"])["counter"].sum()

# The assignment aligns on the (birth_country, birth_city) index, so each row
# receives the total for its own pair
country_city_indexed_df['city_count'] = country_city_count

# Removing duplicates from dataframe: flatten the index back into columns,
# drop the helper column, and keep one row per (country, city, city_count)

country_city_df2 = (
    country_city_indexed_df
    .reset_index()
    .drop(['counter'], axis=1)
    .drop_duplicates()
)

In [10]:
# Sorting df to keep only most popular birth_city for each country.
# Step 1: order rows by country name.
# Step 2: move birth_country into the index.
# Step 3: sort descending by the birth_country index level and by city_count,
#         so within each country the highest city_count comes first.
# NOTE(review): when several cities in a country share the top city_count, the
# one kept depends on the row order these sorts leave behind — confirm the
# choice is stable before relying on it.
# NOTE(review): country_city_df2 is reassigned with birth_country moved into
# the index, so re-running this cell without re-running the previous one would
# fail at set_index — consider a separate name.
country_city_df2 = country_city_df2.sort_values(['birth_country'],ascending=True)
country_city_df2 = country_city_df2.set_index(['birth_country'])
country_city_df2 = country_city_df2.sort_values(['birth_country','city_count'],ascending=False)

# Retaining only most popular city per country: take the first row of each
# birth_country group after the sort above.
# The empty DataFrame assigned first is a placeholder that is overwritten on
# the very next line.
# NOTE(review): the index/row order of groupby(...).nth(0) differs between
# pandas major versions — confirm against the installed version.
country_city_df3 = pd.DataFrame([])
country_city_df3 = country_city_df2.groupby(['birth_country',]).nth(0)

In [11]:
# Display the selected city (and its count) for each country
country_city_df3

Unnamed: 0_level_0,birth_city,city_count
birth_country,Unnamed: 1_level_1,Unnamed: 2_level_1
Australia,Melbourne,8.0
Austria,Klagenfurt,1.0
Belgium,Bree,1.0
Benin,Cotonou,1.0
Brazil,Petr�polis,1.0
Canada,Vancouver,8.0
Channel Islands,Jersey,1.0
China,Beijing,1.0
Cuba,Santiago de Cuba,1.0
Denmark,�sterbro,1.0


In [12]:
# Swap selected city names for ones more likely to return coordinates from
# geoapify, then drop the helper count column.

# City-name substitutions, matched on the exact `birth_city` value
city_name_fixes = {
     'Bree':'Brussels',
     'Elling':'Copenhagen',
     'Rosenheim':'Berlin',
     'Victoria Peak':'Hong Kong',
     'Bombay':'Mumbai',
     'Panama City':'Panama',
     'Singapore City':'Singapore',
     'Worblaufen BE':'Bern',
     'New York City':'New York',
     'Chernovtsy':'Chernivtsi',
}

# Countries whose city is overwritten outright, keyed by the country index label
country_city_overrides = {
    "Brazil": "Rio de Janeiro",
    "Portugal": "Lisbon",
    "Poland": "Warsaw",
    "Denmark": "Copenhagen",
}

country_city_cleand_df = country_city_df3.replace({'birth_city': city_name_fixes})

for override_country, override_city in country_city_overrides.items():
    is_country_row = country_city_cleand_df.index == override_country
    country_city_cleand_df.loc[is_country_row, "birth_city"] = override_city

# city_count was only needed to pick the top city
country_city_cleand_df = country_city_cleand_df.drop(columns=['city_count'])

country_city_cleand_df


Unnamed: 0_level_0,birth_city
birth_country,Unnamed: 1_level_1
Australia,Melbourne
Austria,Klagenfurt
Belgium,Brussels
Benin,Cotonou
Brazil,Rio de Janeiro
Canada,Vancouver
Channel Islands,Jersey
China,Beijing
Cuba,Santiago de Cuba
Denmark,Copenhagen


### Retrieving location coordinates from geoapify

In [13]:
# Adding Lat/Lon columns to the data frame from the previous step.
# reset_index() is guarded so that re-running this cell does not add a stray
# "index" column once birth_country is already a regular column.
if "birth_country" not in country_city_cleand_df.columns:
    country_city_cleand_df = country_city_cleand_df.reset_index()
country_city_cleand_df[["Lat",'Lon']] = ""

# Defining the API parameters
params = {
    "apiKey":geoapify_key,
    "format":"json"
}

# Setting the base URL
base_url = "https://api.geoapify.com/v1/geocode/search"

# Seconds to wait for each request; without a timeout a stalled connection
# would hang the notebook indefinitely
REQUEST_TIMEOUT_SECONDS = 10

# Looping through the country_city_cleand_df DataFrame and searching coordinates
# for each city. Every failure is logged and the loop carries on, so one bad
# lookup leaves that row's Lat/Lon empty instead of aborting the whole run.
# NOTE(review): only the city name is sent, so same-named places in other
# countries can be returned — consider adding the country to the query.
for index, row in country_city_cleand_df.iterrows():

    city = row["birth_city"]

    # A missing city would otherwise be sent to the API as the literal text "nan"
    if pd.isna(city):
        print(f"No birth_city available for {row['birth_country']}; skipping lookup")
        continue

    # Add the current city to the parameters
    params["text"] = f"{city}"

    try:
        # Making the API request and converting the response to JSON
        response = requests.get(base_url, params=params, timeout=REQUEST_TIMEOUT_SECONDS)
        payload = response.json()

        # Extracting latitude and longitude of the first match; both are read
        # before either is written so a row is never left half-filled
        top_match = payload["results"][0]
        lat, lon = top_match["lat"], top_match["lon"]
        country_city_cleand_df.loc[index, "Lat"] = lat
        country_city_cleand_df.loc[index, "Lon"] = lon

    except KeyError as e:
        # Expected key missing from the payload (e.g. an error response)
        print(f"{e.args[0]} not found for {city}")
    except IndexError:
        # The API answered but returned an empty result list
        print(f"No results returned for {city}")
    except (requests.RequestException, ValueError) as e:
        # Network failure, timeout, or a body that is not valid JSON
        print(f"Request failed for {city}: {e}")

country_city_cleand_df

Unnamed: 0,birth_country,birth_city,Lat,Lon
0,Australia,Melbourne,-37.814245,144.963173
1,Austria,Klagenfurt,46.623943,14.307598
2,Belgium,Brussels,50.846557,4.351697
3,Benin,Cotonou,6.367695,2.425251
4,Brazil,Rio de Janeiro,-22.911014,-43.209373
5,Canada,Vancouver,49.260872,-123.113952
6,Channel Islands,Jersey,49.221456,-2.135839
7,China,Beijing,39.905714,116.391297
8,Cuba,Santiago de Cuba,20.021464,-75.829493
9,Denmark,Copenhagen,55.686724,12.570072


In [14]:
# Data correction for two coordinates.
# The geocoded values for these countries are replaced by hand-picked ones.
# The previous Trinidad and Tobago override (-14.8349438, -64.9044936) lies in
# the Southern Hemisphere, in central Bolivia, not in the Caribbean; it is
# replaced with the approximate position of Port of Spain.
coordinate_overrides = {
    "Trinidad and Tobago": (10.6667, -61.5167),
    "Virgin Islands of the United States": (18.338097, -64.894095),
}

for override_country, (override_lat, override_lon) in coordinate_overrides.items():
    # Match on the country column; a country that is absent simply matches no rows
    country_mask = country_city_cleand_df['birth_country'] == override_country
    country_city_cleand_df.loc[country_mask, "Lat"] = override_lat
    country_city_cleand_df.loc[country_mask, "Lon"] = override_lon


### Save to CSV

In [16]:
# Saving the country / city / coordinate table to csv.
# The DataFrame index is written as the first (unnamed) column because
# index=False is not passed.
# NOTE(review): the relative path assumes the notebook runs from its own
# folder and that ../Datasets already exists — confirm.
country_city_cleand_df.to_csv("../Datasets/country_coordinates.csv")