<h1>Table of Contents<span class="tocSkip"></span></h1>
<div class="toc"><ul class="toc-item"></ul></div>

In [22]:
import geocoder
from scipy.stats import norm
import requests, zipfile, io, os
import datetime as dt
import pandas as pd
import numpy as np
import seaborn as sns
import sys, typing
from bs4 import BeautifulSoup
import requests
from matplotlib import pyplot as plt
import matplotlib.image as mpimg
import matplotlib.dates as mdates
from typing import List, Tuple
r = requests.get('https://www.ssa.gov/oact/babynames/names.zip')
z = zipfile.ZipFile(io.BytesIO(r.content))
z.extractall("names")

In [8]:
# Import dataset of users with age, profession, height, location, and gender 
placeholder_dict = {"Name": "Sofia", "Age": "24", "Profession":"Teacher", "Height":"5'2", "Location":"Las Vegas", "Gender":"F"}

In [33]:
import qwikidata
import qwikidata.sparql

def get_city_wikidata(city, country):
    query = """
    SELECT ?city ?cityLabel ?country ?countryLabel ?population
    WHERE
    {
      ?city rdfs:label '%s'@en.
      ?city wdt:P1082 ?population.
      ?city wdt:P17 ?country.
      ?city rdfs:label ?cityLabel.
      ?country rdfs:label ?countryLabel.
      FILTER(LANG(?cityLabel) = "en").
      FILTER(LANG(?countryLabel) = "en").
      FILTER(CONTAINS(?countryLabel, "%s")).
    }
    """ % (city, country)

    res = qwikidata.sparql.return_sparql_query_results(query)
    out = res['results']['bindings'][0]
    return out

In [49]:
# Location
g = get_city_wikidata(placeholder_dict["Location"], 'America')
# Obtain the number of people in that location as a baseline for uniqueness 
num_unique = int(g['population']['value'])
print(num_unique)

648224


In [51]:
# First Name, Age
# Assumes that videos are posted this year for age
# Assumes location is in the US and area follows normal distribution for ages in the US 
# Use problem set one to set the probability that someone with their name and gender would be there age 


def create_name_popularity_df(name_list: List[str], sex: str) -> pd.DataFrame:
    # Formats and extracts for each year the amount of people born with a particular name.
    # INPUT
    #   name_list : list of names. They must be of the same gender
    #   sex : 'F' for Female, 'M' for Male. 
    # OUTPUT
    #   names_df: pandas DataFrame indexed by year with names for columns
    names_df = pd.DataFrame()
    for year in range(1900, 2021):
        year_table = pd.read_csv("names/yob%s.txt" % str(year),sep=',',header=None,names=['name','sex','n'])
        year_table = year_table[year_table['name'].isin(name_list)]
        year_table = year_table[year_table['sex']==sex]
        if year_table.shape[0]==0:
            year_table['name']=name_list
        year_table['year'] = year
        names_df = names_df.append(year_table)
    names_df = names_df.pivot(index='year', columns='name', values='n')
    names_df = names_df.fillna(0)
    return names_df

def get_actuarial_table(year: int, sex: str) -> List[float]:
    # Returns a list of probabilities where entry x means the probability of being alive at time 'year+x' conditional
    # on being born at time 'year'. The list has 121 entries.
    # INPUT
    #   year : year of birth
    #   sex : 'F' for Female, 'M' for Male.
    # OUTPUT
    #   survival_probability : probability of surviving x years at entry x.
    if sex == "M":
        sex_col = 2
    elif sex == "F":
        sex_col = 10
    else:
        print("sex should be either M or F")
        sys.exit()
    table_page = requests.get("https://www.ssa.gov/oact/NOTES/as120/LifeTables_Tbl_7_%s.html" % year)
    soup = BeautifulSoup(table_page.content, 'html.parser')
    table = soup.find_all("tr")[6:]
    survival_probability = []
    for row in table:
        col = row.find_all('td')
        if len(col)>6:
            cell = col[sex_col].find('div')
        if cell is not None:
            survival_probability.append(float(cell.get_text().strip('\n').replace(',',''))/100000.0)
    return survival_probability

df = create_name_popularity_df([placeholder_dict["Name"]], placeholder_dict["Gender"])
year_born = 2020-int(placeholder_dict["Age"])
number_of_person_with_name_age = df.loc[df.index == year_born]["Sofia"].iloc[0]
US_pop = 329500000

# Assumes age and name distribution are consistent everywhere 
p = number_of_person_with_name/US_pop
num_unique = p*num_unique
print(num_unique)


0.00031736633363768993


In [None]:
# Profession
# Height