In [1]:
import os
from urllib.parse import quote

import pandas as pd
import psycopg2
from sqlalchemy import create_engine

# Location of the Forbes billionaire CSV, relative to the notebook's working directory
billionaire_metadata_path = "data/forbes_billionaires_geo.csv"

# Load the CSV, then re-wrap the result in a new DataFrame object that serves
# as the "bronze" frame used by the following cells
billionaire_metadata = pd.read_csv(filepath_or_buffer=billionaire_metadata_path)
bronze_billionaire_df = pd.DataFrame(data=billionaire_metadata)

# Preview the loaded rows; the table is already sorted by net-worth
display(bronze_billionaire_df)

Unnamed: 0,Name,NetWorth,Country,Source,Rank,Age,Residence,Citizenship,Status,Children,Education,Self_made,geometry
0,Jeff Bezos,177.0,United States,Amazon,1,57.0,"Seattle, Washington",United States,In Relationship,4.0,"Bachelor of Arts/Science, Princeton University",True,POINT (-122.3300624 47.6038321)
1,Elon Musk,151.0,United States,"Tesla, SpaceX",2,49.0,"Austin, Texas",United States,In Relationship,7.0,"Bachelor of Arts/Science, University of Pennsy...",True,POINT (-97.74369950000001 30.2711286)
2,Bernard Arnault & family,150.0,France,LVMH,3,72.0,"Paris, France",France,Married,5.0,"Bachelor of Arts/Science, Ecole Polytechnique ...",False,POINT (2.3514616 48.8566969)
3,Bill Gates,124.0,United States,Microsoft,4,65.0,"Medina, Washington",United States,Divorced,3.0,"Drop Out, Harvard University",True,POINT (-122.2264453 47.620548)
4,Mark Zuckerberg,97.0,United States,Facebook,5,36.0,"Palo Alto, California",United States,Married,2.0,"Drop Out, Harvard University",True,POINT (-122.1598465 37.4443293)
...,...,...,...,...,...,...,...,...,...,...,...,...,...
2750,Daniel Yong Zhang,1.0,China,e-commerce,2674,49.0,"Hangzhou, China",China,,,,True,POINT (120.2052342 30.2489634)
2751,Zhang Yuqiang,1.0,China,Fiberglass,2674,65.0,"Tongxiang, China",China,,,,True,POINT (120.5610365 30.6316971)
2752,Zhao Meiguang,1.0,China,gold mining,2674,58.0,"Jilin, China",China,,,,True,POINT (125.9816054 42.9995032)
2753,Zhong Naixiong,1.0,China,conglomerate,2674,58.0,"Foshan, China",China,,,,True,POINT (113.1146335 23.0247687)


In [2]:
# Number of missing values in each column of the bronze frame
bronze_billionaire_df.isna().sum()

Name              0
NetWorth          0
Country           0
Source            0
Rank              0
Age             125
Residence        40
Citizenship      16
Status          665
Children       1203
Education      1346
Self_made        18
geometry          0
dtype: int64

In [3]:
#persist the raw-unmodified data to the database

# Read the password from the standard libpq PGPASSWORD environment variable so the
# credential does not have to live in the notebook; the previous local default is
# kept as a fallback so existing setups keep working.
password = os.environ.get("PGPASSWORD", "postgres")

# Percent-encode the password so characters such as "@" or "/" cannot break the URL
encoded_password = quote(password, safe="")
engine = create_engine(f'postgresql://postgres:{encoded_password}@localhost:5432/Billionaire')

# engine.begin() commits on success, rolls back on error, and releases the
# connection when the block exits. Previously a connection was opened with
# engine.connect() but never used or closed.
# NOTE(review): if_exists='append' inserts the rows again each time this cell is re-run.
with engine.begin() as connection:
    bronze_billionaire_df.to_sql(name='bronze_billionaire', con=connection, if_exists='append', index=False)

In [4]:
#Clone the bronze table so work on the silver version does not touch the bronze frame

silver_billionaire_df = bronze_billionaire_df.copy(deep=True)

In [5]:
#Keep only billionaires that have a value for both Residence and Age

required_columns = ['Residence', 'Age']
silver_billionaire_df.dropna(axis="index", subset=required_columns, inplace=True)

# Remaining missing-value count per column (shown as the cell output)
silver_billionaire_df.isna().sum()

Name              0
NetWorth          0
Country           0
Source            0
Rank              0
Age               0
Residence         0
Citizenship       0
Status          566
Children       1080
Education      1229
Self_made         1
geometry          0
dtype: int64

In [6]:
#Rename columns so they follow the same Title_Case style as the other column names
column_renames = {
    "NetWorth": "Net_Worth",
    "Self_made": "Self_Made",
    "geometry": "Geometry",
}
silver_billionaire_df.rename(columns=column_renames, inplace=True)

In [7]:
#Clean the Name column and remove reference for "& family" and "family"
#Source: https://stackoverflow.com/questions/37593550/replace-method-not-working-on-pandas-dataframe

# Assign the result back to the column instead of calling
# replace(..., inplace=True) on silver_billionaire_df["Name"]. With pandas
# Copy-on-Write, the in-place call only modifies a temporary Series and the
# DataFrame keeps the uncleaned names.
cleaned_names = silver_billionaire_df["Name"].replace(
    {"& family": '', "family": ''}, regex=True
)

# Removing the suffix can leave trailing whitespace, so strip it afterwards
silver_billionaire_df["Name"] = cleaned_names.str.strip()

In [8]:
#Add empty First_Name, Middle_Name and Last_Name columns by outer-joining a
#frame that has no rows and carries only those column labels
name_part_columns = ['First_Name', 'Middle_Name', 'Last_Name']
df1 = pd.DataFrame(columns=name_part_columns)
silver_billionaire_df = silver_billionaire_df.join(other=df1, how="outer")

In [9]:
#Explode the name column to populate first, middle, and last where appropriate

# Split every name on whitespace once and fill the name-part columns with
# vectorized assignments instead of a row-by-row iterrows()/.at loop.
name_tokens = silver_billionaire_df["Name"].str.split()
token_counts = name_tokens.str.len()

# Only two- and three-token names are split. Names with any other number of
# tokens keep their First/Middle/Last values untouched, as before.
has_two_or_three_tokens = token_counts.isin([2, 3])
has_three_tokens = token_counts == 3

# First token -> First_Name, last token -> Last_Name; the middle token is only
# available for three-token names.
silver_billionaire_df.loc[has_two_or_three_tokens, 'First_Name'] = name_tokens[has_two_or_three_tokens].str[0]
silver_billionaire_df.loc[has_three_tokens, 'Middle_Name'] = name_tokens[has_three_tokens].str[1]
silver_billionaire_df.loc[has_two_or_three_tokens, 'Last_Name'] = name_tokens[has_two_or_three_tokens].str[-1]
        

In [10]:
#Add empty Longitude and Latitude columns by outer-joining a frame that has
#no rows and carries only those column labels
coordinate_columns = ['Longitude', 'Latitude']
df1 = pd.DataFrame(columns=coordinate_columns)
silver_billionaire_df = silver_billionaire_df.join(other=df1, how="outer")

In [11]:

# Parse the "POINT (<longitude> <latitude>)" strings of the Geometry column in a
# single vectorized pass instead of a row-by-row iterrows()/.at loop. The pattern
# tolerates extra whitespace, which the previous split(" ") did not.
point_coordinates = silver_billionaire_df["Geometry"].str.extract(
    r"\(\s*(?P<Longitude>[^\s()]+)\s+(?P<Latitude>[^\s()]+)\s*\)"
)

# Fail loudly (as the loop did) rather than silently storing missing coordinates
unparsed_rows = point_coordinates.isna().any(axis=1)
if unparsed_rows.any():
    raise ValueError(
        f"Could not parse {int(unparsed_rows.sum())} Geometry value(s) as 'POINT (<longitude> <latitude>)'"
    )

# astype(float) raises ValueError on non-numeric text and yields real float64
# columns (the loop left Python floats inside object-dtype columns).
silver_billionaire_df["Longitude"] = point_coordinates["Longitude"].astype(float)
silver_billionaire_df["Latitude"] = point_coordinates["Latitude"].astype(float)


In [12]:
# Preview the silver frame, including the name-part and coordinate columns
display(silver_billionaire_df)

Unnamed: 0,Name,Net_Worth,Country,Source,Rank,Age,Residence,Citizenship,Status,Children,Education,Self_Made,Geometry,First_Name,Middle_Name,Last_Name,Longitude,Latitude
0,Jeff Bezos,177.0,United States,Amazon,1,57.0,"Seattle, Washington",United States,In Relationship,4.0,"Bachelor of Arts/Science, Princeton University",True,POINT (-122.3300624 47.6038321),Jeff,,Bezos,-122.330062,47.603832
1,Elon Musk,151.0,United States,"Tesla, SpaceX",2,49.0,"Austin, Texas",United States,In Relationship,7.0,"Bachelor of Arts/Science, University of Pennsy...",True,POINT (-97.74369950000001 30.2711286),Elon,,Musk,-97.7437,30.271129
2,Bernard Arnault,150.0,France,LVMH,3,72.0,"Paris, France",France,Married,5.0,"Bachelor of Arts/Science, Ecole Polytechnique ...",False,POINT (2.3514616 48.8566969),Bernard,,Arnault,2.351462,48.856697
3,Bill Gates,124.0,United States,Microsoft,4,65.0,"Medina, Washington",United States,Divorced,3.0,"Drop Out, Harvard University",True,POINT (-122.2264453 47.620548),Bill,,Gates,-122.226445,47.620548
4,Mark Zuckerberg,97.0,United States,Facebook,5,36.0,"Palo Alto, California",United States,Married,2.0,"Drop Out, Harvard University",True,POINT (-122.1598465 37.4443293),Mark,,Zuckerberg,-122.159847,37.444329
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2750,Daniel Yong Zhang,1.0,China,e-commerce,2674,49.0,"Hangzhou, China",China,,,,True,POINT (120.2052342 30.2489634),Daniel,Yong,Zhang,120.205234,30.248963
2751,Zhang Yuqiang,1.0,China,Fiberglass,2674,65.0,"Tongxiang, China",China,,,,True,POINT (120.5610365 30.6316971),Zhang,,Yuqiang,120.561037,30.631697
2752,Zhao Meiguang,1.0,China,gold mining,2674,58.0,"Jilin, China",China,,,,True,POINT (125.9816054 42.9995032),Zhao,,Meiguang,125.981605,42.999503
2753,Zhong Naixiong,1.0,China,conglomerate,2674,58.0,"Foshan, China",China,,,,True,POINT (113.1146335 23.0247687),Zhong,,Naixiong,113.114633,23.024769


In [13]:
#reorder dataframe columns

# Target layout: full name and its parts first, then the descriptive fields,
# then the raw geometry followed by the parsed coordinates
silver_column_order = [
    'Name', 'First_Name', 'Middle_Name', 'Last_Name',
    'Net_Worth', 'Country', 'Source', 'Rank',
    'Age', 'Residence', 'Citizenship', 'Status',
    'Children', 'Education', 'Self_Made',
    'Geometry', 'Longitude', 'Latitude',
]
silver_billionaire_df = silver_billionaire_df.reindex(columns=silver_column_order)

In [14]:
# Preview the silver frame to check its column order
display(silver_billionaire_df)

Unnamed: 0,Name,First_Name,Middle_Name,Last_Name,Net_Worth,Country,Source,Rank,Age,Residence,Citizenship,Status,Children,Education,Self_Made,Geometry,Longitude,Latitude
0,Jeff Bezos,Jeff,,Bezos,177.0,United States,Amazon,1,57.0,"Seattle, Washington",United States,In Relationship,4.0,"Bachelor of Arts/Science, Princeton University",True,POINT (-122.3300624 47.6038321),-122.330062,47.603832
1,Elon Musk,Elon,,Musk,151.0,United States,"Tesla, SpaceX",2,49.0,"Austin, Texas",United States,In Relationship,7.0,"Bachelor of Arts/Science, University of Pennsy...",True,POINT (-97.74369950000001 30.2711286),-97.7437,30.271129
2,Bernard Arnault,Bernard,,Arnault,150.0,France,LVMH,3,72.0,"Paris, France",France,Married,5.0,"Bachelor of Arts/Science, Ecole Polytechnique ...",False,POINT (2.3514616 48.8566969),2.351462,48.856697
3,Bill Gates,Bill,,Gates,124.0,United States,Microsoft,4,65.0,"Medina, Washington",United States,Divorced,3.0,"Drop Out, Harvard University",True,POINT (-122.2264453 47.620548),-122.226445,47.620548
4,Mark Zuckerberg,Mark,,Zuckerberg,97.0,United States,Facebook,5,36.0,"Palo Alto, California",United States,Married,2.0,"Drop Out, Harvard University",True,POINT (-122.1598465 37.4443293),-122.159847,37.444329
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2750,Daniel Yong Zhang,Daniel,Yong,Zhang,1.0,China,e-commerce,2674,49.0,"Hangzhou, China",China,,,,True,POINT (120.2052342 30.2489634),120.205234,30.248963
2751,Zhang Yuqiang,Zhang,,Yuqiang,1.0,China,Fiberglass,2674,65.0,"Tongxiang, China",China,,,,True,POINT (120.5610365 30.6316971),120.561037,30.631697
2752,Zhao Meiguang,Zhao,,Meiguang,1.0,China,gold mining,2674,58.0,"Jilin, China",China,,,,True,POINT (125.9816054 42.9995032),125.981605,42.999503
2753,Zhong Naixiong,Zhong,,Naixiong,1.0,China,conglomerate,2674,58.0,"Foshan, China",China,,,,True,POINT (113.1146335 23.0247687),113.114633,23.024769


In [15]:
# Persist the cleaned silver data to the database

# Read the password from the standard libpq PGPASSWORD environment variable so the
# credential does not have to live in the notebook; the previous local default is
# kept as a fallback so existing setups keep working.
password = os.environ.get("PGPASSWORD", "postgres")

# Percent-encode the password so characters such as "@" or "/" cannot break the URL
encoded_password = quote(password, safe="")
engine = create_engine(f'postgresql://postgres:{encoded_password}@localhost:5432/Billionaire')

# engine.begin() commits on success, rolls back on error, and releases the
# connection when the block exits. Previously a connection was opened with
# engine.connect() but never used or closed.
# NOTE(review): if_exists='append' inserts the rows again each time this cell is re-run.
with engine.begin() as connection:
    silver_billionaire_df.to_sql(name='silver_billionaire', con=connection, if_exists='append', index=False)