In [1]:
import psycopg2
import pandas as pd
from sqlalchemy import create_engine

billionaire_metadata_path = "data/forbes_billionaires_geo.csv"

# Read the billionaire data
billionaire_metadata = pd.read_csv(billionaire_metadata_path)
bronze_billionaire_df = pd.DataFrame(billionaire_metadata) 
bronze_billionaire_df.head(2) #table is already sorted by net-worth

Unnamed: 0,Name,NetWorth,Country,Source,Rank,Age,Residence,Citizenship,Status,Children,Education,Self_made,geometry
0,Jeff Bezos,177.0,United States,Amazon,1,57.0,"Seattle, Washington",United States,In Relationship,4.0,"Bachelor of Arts/Science, Princeton University",True,POINT (-122.3300624 47.6038321)
1,Elon Musk,151.0,United States,"Tesla, SpaceX",2,49.0,"Austin, Texas",United States,In Relationship,7.0,"Bachelor of Arts/Science, University of Pennsy...",True,POINT (-97.74369950000001 30.2711286)


In [2]:
bronze_billionaire_df.isnull().sum()

Name              0
NetWorth          0
Country           0
Source            0
Rank              0
Age             125
Residence        40
Citizenship      16
Status          665
Children       1203
Education      1346
Self_made        18
geometry          0
dtype: int64

In [3]:
#lower case all column names for postgreSQL
#Source: https://cmdlinetips.com/2020/07/cleaning_up_pandas-column-names/

bronze_billionaire_df.rename(columns=str.lower,inplace=True)

In [4]:
#persist the raw-unmodified data to the database

password="postgres"
engine = create_engine(f'postgresql://postgres:{password}@localhost:5432/Billionaire')
connection = engine.connect()
bronze_billionaire_df.to_sql(name='bronze_billionaire', con=engine, if_exists='replace', index=False)

In [5]:
#Clone to bronze table to proceed with work on the silver version of the data

silver_billionaire_df = bronze_billionaire_df.copy()

In [6]:
#Rename columns
silver_billionaire_df.rename(columns={"name"      : "display_name", 
                                      "source"    : "source_of_wealth",
                                      "rank"      : "wealth_rank",
                                      "status"    : "relationship_status",
                                      "self_Made" : "is_self_made",
                                      "networth"  : "net_worth",
                                      "self_made" : "is_self_made",
                                      "geometry"  : "geometry"}, inplace=True)
silver_billionaire_df.head(2)

Unnamed: 0,display_name,net_worth,country,source_of_wealth,wealth_rank,age,residence,citizenship,relationship_status,children,education,is_self_made,geometry
0,Jeff Bezos,177.0,United States,Amazon,1,57.0,"Seattle, Washington",United States,In Relationship,4.0,"Bachelor of Arts/Science, Princeton University",True,POINT (-122.3300624 47.6038321)
1,Elon Musk,151.0,United States,"Tesla, SpaceX",2,49.0,"Austin, Texas",United States,In Relationship,7.0,"Bachelor of Arts/Science, University of Pennsy...",True,POINT (-97.74369950000001 30.2711286)


In [7]:
#Check for duplicates in silver data and filter them out
#Source: https://stackoverflow.com/questions/18172851/deleting-dataframe-row-in-pandas-based-on-column-value
#dupes_df = pd.DataFrame(silver_billionaire_df.groupby('Display_Name').size().loc[lambda x: x>1].reset_index())

#dupes_list = dupes_df["Display_Name"]

#silver_billionaire_df = silver_billionaire_df.loc[~silver_billionaire_df["Display_Name"].isin(dupes_list)]


In [8]:
#Remove duplidates from Silver

#silver_billionaire_df.drop_duplicates(subset="Display_Name",keep=False,inplace=True)


In [9]:
#Drop billionaires from Silver without a value for Residence and Age

silver_billionaire_df.dropna(subset = ['residence','age'],inplace=True)
silver_billionaire_df.isnull().sum()

display_name              0
net_worth                 0
country                   0
source_of_wealth          0
wealth_rank               0
age                       0
residence                 0
citizenship               0
relationship_status     566
children               1080
education              1229
is_self_made              1
geometry                  0
dtype: int64

In [10]:
#Clean the Name column and remove reference for "& family" and "family"
#Source: https://stackoverflow.com/questions/37593550/replace-method-not-working-on-pandas-dataframe

silver_billionaire_df["display_name"].replace({"& family": '', "family": ''}, inplace=True, regex=True)
silver_billionaire_df["display_name"] = silver_billionaire_df["display_name"].str.strip()
silver_billionaire_df.head(2)

Unnamed: 0,display_name,net_worth,country,source_of_wealth,wealth_rank,age,residence,citizenship,relationship_status,children,education,is_self_made,geometry
0,Jeff Bezos,177.0,United States,Amazon,1,57.0,"Seattle, Washington",United States,In Relationship,4.0,"Bachelor of Arts/Science, Princeton University",True,POINT (-122.3300624 47.6038321)
1,Elon Musk,151.0,United States,"Tesla, SpaceX",2,49.0,"Austin, Texas",United States,In Relationship,7.0,"Bachelor of Arts/Science, University of Pennsy...",True,POINT (-97.74369950000001 30.2711286)


In [11]:
#Create new columns for first name, middle name, and last name
df1 = pd.DataFrame(columns=['first_name','middle_name','last_Name','suffix'])
silver_billionaire_df = silver_billionaire_df.join(df1, how="outer")

In [12]:
#Explode the name column to populate first name, middle name, last name, and suffix where appropriate

for (idx, data_row) in silver_billionaire_df.iterrows():
    
    name_parts = data_row["display_name"].split()
    
    if len(name_parts) == 2:
        
        silver_billionaire_df.at[idx,'first_name'] = name_parts[0]
        silver_billionaire_df.at[idx,'last_name']  = name_parts[1]       

    elif len(name_parts) == 3:
        
        silver_billionaire_df.at[idx,'first_name']  = name_parts[0]
        
        if "," in name_parts[1]:
            
            cleaned_name = name_parts[1].replace(",",'')
            suffix = name_parts[2]
            
            silver_billionaire_df.at[idx,'first_name'] = name_parts[0]
            silver_billionaire_df.at[idx,'last_name']  = cleaned_name
            silver_billionaire_df.at[idx,'suffix']     = suffix         
            
        else:
            silver_billionaire_df.at[idx,'first_name']  = name_parts[0]
            silver_billionaire_df.at[idx,'middle_name'] = name_parts[1]
            silver_billionaire_df.at[idx,'last_name']   = name_parts[2]   

In [13]:
#Create new columns for longitude and latitude
df1 = pd.DataFrame(columns=['longitude','latitude'])
silver_billionaire_df = silver_billionaire_df.join(df1, how="outer")

In [14]:
#Pull out latitude and longitude from geometry column to aide future accessibility to those values

for (idx, data_row) in silver_billionaire_df.iterrows():

    point_parts = data_row["geometry"].split(" ")

    longitude = str(point_parts[1]).replace("(",'') #longitude
    latitude  = str(point_parts[2]).replace(")",'') #latitude
    
    silver_billionaire_df.at[idx,'longitude'] = longitude
    silver_billionaire_df.at[idx,'latitude']  = latitude     


In [15]:
silver_billionaire_df.head(2)

Unnamed: 0,display_name,net_worth,country,source_of_wealth,wealth_rank,age,residence,citizenship,relationship_status,children,education,is_self_made,geometry,first_name,middle_name,last_Name,suffix,last_name,longitude,latitude
0,Jeff Bezos,177.0,United States,Amazon,1,57.0,"Seattle, Washington",United States,In Relationship,4.0,"Bachelor of Arts/Science, Princeton University",True,POINT (-122.3300624 47.6038321),Jeff,,,,Bezos,-122.3300624,47.6038321
1,Elon Musk,151.0,United States,"Tesla, SpaceX",2,49.0,"Austin, Texas",United States,In Relationship,7.0,"Bachelor of Arts/Science, University of Pennsy...",True,POINT (-97.74369950000001 30.2711286),Elon,,,,Musk,-97.7436995,30.2711286


In [16]:
#reorder dataframe columns

#silver_billionaire_df = silver_billionaire_df.reindex(columns=['Display_Name',
                                                               #'First_Name',
                                                               #'Middle_Name',
                                                               #'Last_Name',
                                                               #'Suffix',
                                                               #'Net_Worth',
                                                               #'Country',
                                                               #'Residence',
                                                               #'Citizenship',
                                                               #'Source_Of_Wealth',
                                                               #'Wealth_Rank',
                                                               #'Age',
                                                               #'Relationship_Status',
                                                               #'Children',
                                                               #'Education',
                                                               #'Is_Self_Made',
                                                               #'Geometry',
                                                               #'Longitude',
                                                               #'Latitude'])


In [17]:
dupes_df = silver_billionaire_df.groupby('display_name').filter(lambda x: len(x) > 1).sort_values(by=['display_name'])
display(dupes_df)

Unnamed: 0,display_name,net_worth,country,source_of_wealth,wealth_rank,age,residence,citizenship,relationship_status,children,education,is_self_made,geometry,first_name,middle_name,last_Name,suffix,last_name,longitude,latitude
380,Jim Davis,6.9,United States,New Balance,380,77.0,"Newton, Massachusetts",United States,Married,2.0,"Bachelor of Arts/Science, Middlebury College",True,POINT (-71.2092214 42.3370414),Jim,,,,Davis,-71.2092214,42.3370414
894,Jim Davis,3.4,United States,staffing & recruiting,891,77.0,"Newton, Massachusetts",United States,Married,2.0,"Bachelor of Arts/Science, Middlebury College",True,POINT (-71.2092214 42.3370414),Jim,,,,Davis,-71.2092214,42.3370414
996,Jin Lei,3.1,China,pharmaceuticals,986,55.0,"Changchun, China",China,Married,,,True,POINT (125.3171216 43.8130735),Jin,,,,Lei,125.3171216,43.8130735
2571,Jin Lei,1.1,China,medical equipment,2524,55.0,"Changchun, China",China,Married,,,True,POINT (125.3171216 43.8130735),Jin,,,,Lei,125.3171216,43.8130735
693,Li Li,4.2,China,healthcare,680,57.0,"Shenzhen, China",China,Married,,"Bachelor of Arts/Science, Sichuan University",True,POINT (114.0543297 22.555454),Li,,,,Li,114.0543297,22.555454
1976,Li Li,1.6,China,pharmaceuticals,1931,57.0,"Shenzhen, China",China,Married,,"Bachelor of Arts/Science, Sichuan University",True,POINT (114.0543297 22.555454),Li,,,,Li,114.0543297,22.555454
1331,Robert Miller,2.4,United Kingdom,retail,1299,75.0,"Montreal, Canada",Canada,Divorced,2.0,"Bachelor of Arts/Science, Rider University",True,POINT (-73.61036420000001 45.4972159),Robert,,,,Miller,-73.6103642,45.4972159
1625,Robert Miller,2.0,Canada,electronics components,1580,75.0,"Montreal, Canada",Canada,Divorced,2.0,"Bachelor of Arts/Science, Rider University",True,POINT (-73.61036420000001 45.4972159),Robert,,,,Miller,-73.6103642,45.4972159
700,Wang Yanqing,4.2,China,electrical equipment,680,55.0,"Wuxi, China",China,Married,,,True,POINT (120.3074357 31.4933074),Wang,,,,Yanqing,120.3074357,31.4933074
1919,Wang Yanqing,1.7,China,carbon fiber products,1833,55.0,"Wuxi, China",China,Married,,,True,POINT (120.3074357 31.4933074),Wang,,,,Yanqing,120.3074357,31.4933074


In [18]:
# Remove all duplicated records by display name by filtering them out

dupes_df = pd.DataFrame(silver_billionaire_df.groupby('display_name').size().loc[lambda x: x>1].reset_index())
dupes_list = dupes_df["display_name"]
silver_billionaire_df = silver_billionaire_df.loc[~silver_billionaire_df["display_name"].isin(dupes_list)]

In [19]:
#lower case all column names for postgreSQL
#Source: https://cmdlinetips.com/2020/07/cleaning_up_pandas-column-names/

silver_billionaire_df.rename(columns=str.lower,inplace=True)

In [20]:
password="postgres"
engine = create_engine(f'postgresql://postgres:{password}@localhost:5432/Billionaire')
connection = engine.connect()
silver_billionaire_df.to_sql(name='silver_billionaire', con=engine, if_exists='append',index=False)