In [20]:
import psycopg2
import pandas as pd
from sqlalchemy import create_engine

billionaire_metadata_path = "data/forbes_billionaires_geo.csv"

# Read the billionaire data
billionaire_metadata = pd.read_csv(billionaire_metadata_path)
bronze_billionaire_df = pd.DataFrame(billionaire_metadata) 
bronze_billionaire_df.head(2) #table is already sorted by net-worth

Unnamed: 0,Name,NetWorth,Country,Source,Rank,Age,Residence,Citizenship,Status,Children,Education,Self_made,geometry
0,Jeff Bezos,177.0,United States,Amazon,1,57.0,"Seattle, Washington",United States,In Relationship,4.0,"Bachelor of Arts/Science, Princeton University",True,POINT (-122.3300624 47.6038321)
1,Elon Musk,151.0,United States,"Tesla, SpaceX",2,49.0,"Austin, Texas",United States,In Relationship,7.0,"Bachelor of Arts/Science, University of Pennsy...",True,POINT (-97.74369950000001 30.2711286)


In [21]:
bronze_billionaire_df.isnull().sum()

Name              0
NetWorth          0
Country           0
Source            0
Rank              0
Age             125
Residence        40
Citizenship      16
Status          665
Children       1203
Education      1346
Self_made        18
geometry          0
dtype: int64

In [22]:
#persist the raw-unmodified data to the database

password="postgres"
engine = create_engine(f'postgresql://postgres:{password}@localhost:5432/Billionaire')
connection = engine.connect()
bronze_billionaire_df.to_sql(name='bronze_billionaire', con=engine, if_exists='replace', index=False)

In [23]:
#Clone to bronze table to proceed with work on the silver version of the data

silver_billionaire_df = bronze_billionaire_df.copy()

In [24]:
#Rename columns
silver_billionaire_df.rename(columns={"Name"      : "Display_Name", 
                                      "Source"    : "Source_Of_Wealth",
                                      "Rank"      : "Wealth_Rank",
                                      "Status"    : "Relationship_Status",
                                      "Self_Made" : "Is_Self_Made",
                                      "NetWorth"  : "Net_Worth",
                                      "Self_made" : "Is_Self_Made",
                                      "geometry"  : "Geometry"},inplace=True)
silver_billionaire_df.head(2)

Unnamed: 0,Display_Name,Net_Worth,Country,Source_Of_Wealth,Wealth_Rank,Age,Residence,Citizenship,Relationship_Status,Children,Education,Is_Self_Made,Geometry
0,Jeff Bezos,177.0,United States,Amazon,1,57.0,"Seattle, Washington",United States,In Relationship,4.0,"Bachelor of Arts/Science, Princeton University",True,POINT (-122.3300624 47.6038321)
1,Elon Musk,151.0,United States,"Tesla, SpaceX",2,49.0,"Austin, Texas",United States,In Relationship,7.0,"Bachelor of Arts/Science, University of Pennsy...",True,POINT (-97.74369950000001 30.2711286)


In [25]:
#Check for duplicates in silver data

dupes_df = silver_billionaire_df.groupby('Display_Name').filter(lambda x: len(x) > 1).sort_values(by=['Display_Name'])
display(dupes_df)

Unnamed: 0,Display_Name,Net_Worth,Country,Source_Of_Wealth,Wealth_Rank,Age,Residence,Citizenship,Relationship_Status,Children,Education,Is_Self_Made,Geometry
693,Li Li,4.2,China,healthcare,680,57.0,"Shenzhen, China",China,Married,,"Bachelor of Arts/Science, Sichuan University",True,POINT (114.0543297 22.555454)
1976,Li Li,1.6,China,pharmaceuticals,1931,57.0,"Shenzhen, China",China,Married,,"Bachelor of Arts/Science, Sichuan University",True,POINT (114.0543297 22.555454)
1331,Robert Miller,2.4,United Kingdom,retail,1299,75.0,"Montreal, Canada",Canada,Divorced,2.0,"Bachelor of Arts/Science, Rider University",True,POINT (-73.61036420000001 45.4972159)
1625,Robert Miller,2.0,Canada,electronics components,1580,75.0,"Montreal, Canada",Canada,Divorced,2.0,"Bachelor of Arts/Science, Rider University",True,POINT (-73.61036420000001 45.4972159)
700,Wang Yanqing & family,4.2,China,electrical equipment,680,55.0,"Wuxi, China",China,Married,,,True,POINT (120.3074357 31.4933074)
1919,Wang Yanqing & family,1.7,China,carbon fiber products,1833,55.0,"Wuxi, China",China,Married,,,True,POINT (120.3074357 31.4933074)


In [26]:
#Remove duplidates from Silver

silver_billionaire_df.drop_duplicates(subset="Display_Name",keep=False,inplace=True)

In [27]:
#Drop billionaires from Silver without a value for Residence and Age

silver_billionaire_df.dropna(subset = ['Residence','Age'],inplace=True)
silver_billionaire_df.isnull().sum()

Display_Name              0
Net_Worth                 0
Country                   0
Source_Of_Wealth          0
Wealth_Rank               0
Age                       0
Residence                 0
Citizenship               0
Relationship_Status     566
Children               1076
Education              1227
Is_Self_Made              1
Geometry                  0
dtype: int64

In [28]:
#Clean the Name column and remove reference for "& family" and "family"
#Source: https://stackoverflow.com/questions/37593550/replace-method-not-working-on-pandas-dataframe

silver_billionaire_df["Display_Name"].replace({"& family": '', "family": ''}, inplace=True, regex=True)
silver_billionaire_df["Display_Name"] = silver_billionaire_df["Display_Name"].str.strip()
silver_billionaire_df.head(2)

Unnamed: 0,Display_Name,Net_Worth,Country,Source_Of_Wealth,Wealth_Rank,Age,Residence,Citizenship,Relationship_Status,Children,Education,Is_Self_Made,Geometry
0,Jeff Bezos,177.0,United States,Amazon,1,57.0,"Seattle, Washington",United States,In Relationship,4.0,"Bachelor of Arts/Science, Princeton University",True,POINT (-122.3300624 47.6038321)
1,Elon Musk,151.0,United States,"Tesla, SpaceX",2,49.0,"Austin, Texas",United States,In Relationship,7.0,"Bachelor of Arts/Science, University of Pennsy...",True,POINT (-97.74369950000001 30.2711286)


In [29]:
#Create new columns for first name, middle name, and last name
df1 = pd.DataFrame(columns=['First_Name','Middle_Name','Last_Name','Suffix'])
silver_billionaire_df = silver_billionaire_df.join(df1, how="outer")

In [30]:
#Explode the name column to populate first name, middle name, last name, and suffix where appropriate

for (idx, data_row) in silver_billionaire_df.iterrows():
    
    name_parts = data_row["Display_Name"].split()
    
    if len(name_parts) == 2:
        
        silver_billionaire_df.at[idx,'First_Name'] = name_parts[0]
        silver_billionaire_df.at[idx,'Last_Name']  = name_parts[1]       

    elif len(name_parts) == 3:
        
        silver_billionaire_df.at[idx,'First_Name']  = name_parts[0]
        
        if "," in name_parts[1]:
            
            cleaned_name = name_parts[1].replace(",",'')
            suffix = name_parts[2]
            
            silver_billionaire_df.at[idx,'First_Name'] = name_parts[0]
            silver_billionaire_df.at[idx,'Last_Name']  = cleaned_name
            silver_billionaire_df.at[idx,'Suffix']     = suffix         
            
        else:
            silver_billionaire_df.at[idx,'First_Name']  = name_parts[0]
            silver_billionaire_df.at[idx,'Middle_Name'] = name_parts[1]
            silver_billionaire_df.at[idx,'Last_Name']   = name_parts[2]   

In [31]:
#Create new columns for longitude and latitude
df1 = pd.DataFrame(columns=['Longitude','Latitude'])
silver_billionaire_df = silver_billionaire_df.join(df1, how="outer")

In [32]:
#Pull out latitude and longitude from geometry column to aide future accessibility to those values

for (idx, data_row) in silver_billionaire_df.iterrows():

    point_parts = data_row["Geometry"].split(" ")

    longitude = str(point_parts[1]).replace("(",'') #longitude
    latitude  = str(point_parts[2]).replace(")",'') #latitude
    
    silver_billionaire_df.at[idx,'Longitude'] = longitude
    silver_billionaire_df.at[idx,'Latitude']  = latitude     


In [33]:
silver_billionaire_df.head(2)

Unnamed: 0,Display_Name,Net_Worth,Country,Source_Of_Wealth,Wealth_Rank,Age,Residence,Citizenship,Relationship_Status,Children,Education,Is_Self_Made,Geometry,First_Name,Middle_Name,Last_Name,Suffix,Longitude,Latitude
0,Jeff Bezos,177.0,United States,Amazon,1,57.0,"Seattle, Washington",United States,In Relationship,4.0,"Bachelor of Arts/Science, Princeton University",True,POINT (-122.3300624 47.6038321),Jeff,,Bezos,,-122.3300624,47.6038321
1,Elon Musk,151.0,United States,"Tesla, SpaceX",2,49.0,"Austin, Texas",United States,In Relationship,7.0,"Bachelor of Arts/Science, University of Pennsy...",True,POINT (-97.74369950000001 30.2711286),Elon,,Musk,,-97.7436995,30.2711286


In [34]:
#reorder dataframe columns

#silver_billionaire_df = silver_billionaire_df.reindex(columns=['Display_Name',
                                                               #'First_Name',
                                                               #'Middle_Name',
                                                               #'Last_Name',
                                                               #'Suffix',
                                                               #'Net_Worth',
                                                               #'Country',
                                                               #'Residence',
                                                               #'Citizenship',
                                                               #'Source_Of_Wealth',
                                                               #'Wealth_Rank',
                                                               #'Age',
                                                               #'Relationship_Status',
                                                               #'Children',
                                                               #'Education',
                                                               #'Is_Self_Made',
                                                               #'Geometry',
                                                               #'Longitude',
                                                               #'Latitude'])


In [37]:
password="postgres"
engine = create_engine(f'postgresql://postgres:{password}@localhost:5432/Billionaire')
connection = engine.connect()
silver_billionaire_df.to_sql(name='silver_billionaire', con=engine, if_exists='append',index='Display_Name')

ValueError: duplicate name in index/columns: cannot insert Display_Name, already exists