In [335]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

import re

In [336]:
df = pd.read_csv('./billionaires.csv')

pd.set_option('display.max_columns', None)
df

Unnamed: 0,name,rank,year,company.founded,company.name,company.relationship,company.sector,company.type,demographics.age,demographics.gender,location.citizenship,location.country code,location.gdp,location.region,wealth.type,wealth.worth in billions,wealth.how.category,wealth.how.from emerging,wealth.how.industry,wealth.how.inherited,wealth.how.was founder,wealth.how.was political
0,Bill Gates,1,1996,1975,Microsoft,founder,Software,new,40,male,United States,USA,8.100000e+12,North America,founder non-finance,18.5,New Sectors,True,Technology-Computer,not inherited,True,True
1,Bill Gates,1,2001,1975,Microsoft,founder,Software,new,45,male,United States,USA,1.060000e+13,North America,founder non-finance,58.7,New Sectors,True,Technology-Computer,not inherited,True,True
2,Bill Gates,1,2014,1975,Microsoft,founder,Software,new,58,male,United States,USA,0.000000e+00,North America,founder non-finance,76.0,New Sectors,True,Technology-Computer,not inherited,True,True
3,Warren Buffett,2,1996,1962,Berkshire Hathaway,founder,Finance,new,65,male,United States,USA,8.100000e+12,North America,founder non-finance,15.0,Traded Sectors,True,Consumer,not inherited,True,True
4,Warren Buffett,2,2001,1962,Berkshire Hathaway,founder,Finance,new,70,male,United States,USA,1.060000e+13,North America,founder non-finance,32.3,Traded Sectors,True,Consumer,not inherited,True,True
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2609,Wu Chung-Yi,1565,2014,1991,Tingyi,investor,beverages and food,new,55,male,Taiwan,Taiwan,0.000000e+00,East Asia,executive,1.0,Traded Sectors,True,Non-consumer industrial,not inherited,True,True
2610,Wu Xiong,1565,2014,1999,Biostime International Holdings,owner,infant formula,new,0,male,China,CHN,0.000000e+00,East Asia,executive,1.0,Traded Sectors,True,Consumer,not inherited,True,True
2611,Yang Keng,1565,2014,0,Blue Ray Corp,chairman,real estate,new,53,male,China,CHN,0.000000e+00,East Asia,self-made finance,1.0,Financial,True,Real Estate,not inherited,True,True
2612,Zdenek Bakala,1565,2014,1994,Patria Finance,founder,coal,new,53,male,Czech Republic,CZE,0.000000e+00,Europe,privatized and resources,1.0,Resource Related,True,Mining and metals,not inherited,True,True


In [337]:
print("The dataset consists of %d features and %d instances." %df.shape)

The dataset consists of 2614 features and 22 instances.


In [338]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2614 entries, 0 to 2613
Data columns (total 22 columns):
 #   Column                    Non-Null Count  Dtype  
---  ------                    --------------  -----  
 0   name                      2614 non-null   object 
 1   rank                      2614 non-null   int64  
 2   year                      2614 non-null   int64  
 3   company.founded           2614 non-null   int64  
 4   company.name              2576 non-null   object 
 5   company.relationship      2568 non-null   object 
 6   company.sector            2591 non-null   object 
 7   company.type              2578 non-null   object 
 8   demographics.age          2614 non-null   int64  
 9   demographics.gender       2580 non-null   object 
 10  location.citizenship      2614 non-null   object 
 11  location.country code     2614 non-null   object 
 12  location.gdp              2614 non-null   float64
 13  location.region           2614 non-null   object 
 14  wealth.t

# Preprocessing
This includes data cleanse, filling missing data, and removing duplicates.

## Cleaning up features

### Common touches

In [None]:
from difflib import SequenceMatcher

def similar(word_list):
    for i in range(len(word_list)):
        first_word = word_list[i]
        if pd.isna(first_word):
            continue
        print(first_word, '\n', '-'*30)
        res = []
        for j in range(i+1, len(word_list)):
            second_word = word_list[j]
            if pd.isna(second_word):
                continue
            if SequenceMatcher(None, first_word, second_word).ratio() > .7:
                res = res + [second_word]
        if len(res) > 0:
            print(res, '\n\n')

In [339]:
# Removing trailing and leading spaces from names
# Removing all consecutive spaces in the middle of the names
# Removing all extra dots

for col in df.select_dtypes(include=['object']).columns:
    df[col] = df[col].apply(lambda x: x.strip().replace('..', '.').replace('  ', ' ') if isinstance(x, str) else x)

### wealth.how.was political

In [340]:
df['wealth.how.was political'].value_counts()

True    2614
Name: wealth.how.was political, dtype: int64

### wealth.how.was founder

In [341]:
df['wealth.how.was founder'].value_counts()

True    2614
Name: wealth.how.was founder, dtype: int64

### wealth.how.inherited

In [342]:
df['wealth.how.inherited'].value_counts()

not inherited               1688
father                       558
3rd generation               210
4th generation                68
spouse/widow                  59
5th generation or longer      31
Name: wealth.how.inherited, dtype: int64

### wealth.how.industry

In [343]:
df['wealth.how.industry'].value_counts()

Consumer                           471
Retail, Restaurant                 281
Real Estate                        280
Money Management                   249
Media                              219
Technology-Computer                208
Diversified financial              167
Energy                             132
Technology-Medical                 111
Non-consumer industrial            107
Constrution                         97
Mining and metals                   90
Other                               83
Hedge funds                         67
Private equity/leveraged buyout     25
0                                   16
Venture Capital                      8
banking                              1
services                             1
Name: wealth.how.industry, dtype: int64

In [344]:
df['wealth.how.industry'] = df['wealth.how.industry'].replace(
    {
        '0': 'Unspecified',
        'Retail, Restaurant': 'Retail/Restaurant',
        'Technology-Computer': 'Technology/Computer',
        'Technology-Medical': 'Technology/Medical',
    }
)

In [345]:
df['wealth.how.industry'].isna().sum()

1

In [346]:
df['wealth.how.industry'] = df['wealth.how.industry'].fillna('Unspecified')

### wealth.how.from emerging

In [347]:
df['wealth.how.from emerging'].value_counts()

True    2614
Name: wealth.how.from emerging, dtype: int64

### wealth.how.category

In [348]:
df['wealth.how.category'].value_counts()

Financial             800
Non-Traded Sectors    597
Traded Sectors        564
New Sectors           319
Resource Related      245
0                      85
Finance                 1
Trucking                1
energy                  1
Name: wealth.how.category, dtype: int64

In [349]:
df['wealth.how.category'] = df['wealth.how.category'].replace(
    {
        '0': 'Unspecified'
    }
)

In [350]:
df['wealth.how.category'].isna().sum()

1

In [351]:
df['wealth.how.category'] = df['wealth.how.category'].fillna('Unspecified')

### wealth.type

In [352]:
df['wealth.type'].value_counts()

inherited                   953
founder non-finance         713
self-made finance           500
privatized and resources    236
executive                   190
Name: wealth.type, dtype: int64

In [353]:
df['wealth.type'].isna().sum()

22

In [354]:
df['wealth.type'] = df['wealth.type'].fillna('Unspecified')

### location.region

In [355]:
df['location.region'].value_counts()

North America               992
Europe                      698
East Asia                   535
Latin America               182
Middle East/North Africa    117
South Asia                   69
Sub-Saharan Africa           20
0                             1
Name: location.region, dtype: int64

In [356]:
df['location.region'] = df['location.region'].replace(
    {
        '0': 'Unspecified'
    }
)

In [357]:
df['location.region'].isna().sum()

0

### location.country code

In [376]:
df['location.country code'].sort_values().unique()

array(['AGO', 'ARE', 'ARG', 'AUS', 'AUT', 'BEL', 'BHR', 'BMU', 'BRA',
       'CAN', 'CHE', 'CHL', 'CHN', 'COL', 'CYP', 'CZE', 'DEN', 'DEU',
       'DNK', 'DZA', 'ECU', 'EGY', 'ESP', 'FIN', 'FRA', 'GBR', 'GEO',
       'GGY', 'GRC', 'HKG', 'IDN', 'IND', 'IRL', 'ISR', 'ITA', 'JPN',
       'KAZ', 'KNA', 'KOR', 'KWT', 'LBN', 'LIE', 'LTU', 'MAC', 'MAR',
       'MCO', 'MEX', 'MYS', 'NGA', 'NLD', 'NOR', 'NPL', 'NZL', 'OMN',
       'PER', 'PHL', 'POL', 'PRT', 'ROU', 'RUS', 'SAU', 'SGP', 'SWE',
       'SWZ', 'THA', 'TUR', 'TZA', 'Taiwan', 'UGA', 'UKR', 'USA', 'VEN',
       'VNM', 'ZAF'], dtype=object)

In [359]:
df['location.country code'].isna().sum()

0

### location.citizenship

In [377]:
df['location.citizenship'].sort_values().unique()

array(['Algeria', 'Angola', 'Argentina', 'Australia', 'Austria',
       'Bahrain', 'Belgium', 'Bermuda', 'Brazil', 'Canada', 'Chile',
       'China', 'Colombia', 'Cyprus', 'Czech Republic', 'Denmark',
       'Ecuador', 'Egypt', 'Finland', 'France', 'Georgia', 'Germany',
       'Greece', 'Guernsey', 'Hong Kong', 'India', 'Indonesia', 'Ireland',
       'Israel', 'Italy', 'Japan', 'Kazakhstan', 'Kuwait', 'Lebanon',
       'Liechtenstein', 'Lithuania', 'Macau', 'Malaysia', 'Mexico',
       'Monaco', 'Morocco', 'Nepal', 'Netherlands', 'New Zealand',
       'Nigeria', 'Norway', 'Oman', 'Peru', 'Philippines', 'Poland',
       'Portugal', 'Romania', 'Russia', 'Saudi Arabia', 'Singapore',
       'South Africa', 'South Korea', 'Spain', 'St. Kitts and Nevis',
       'Swaziland', 'Sweden', 'Switzerland', 'Taiwan', 'Tanzania',
       'Thailand', 'Turkey', 'Uganda', 'Ukraine', 'United Arab Emirates',
       'United Kingdom', 'United States', 'Venezuela', 'Vietnam'],
      dtype=object)

In [361]:
df['location.citizenship'].isna().sum()

0

### demographics.gender

In [362]:
df['demographics.gender'].value_counts()

male              2328
female             249
married couple       3
Name: demographics.gender, dtype: int64

In [363]:
df['demographics.gender'].isna().sum()

34

In [364]:
df['demographics.gender'] = df['demographics.gender'].fillna('Unspecified')

### company.type

In [378]:
df['company.type'].sort_values().unique()

array(['Unspecified', 'acquired', 'franchise', 'joint venture', 'merger',
       'new', 'new/aquired', 'new/privatized', 'privatized',
       'state owned enterprise', 'subsidiary'], dtype=object)

In [366]:
df['company.type'] = df['company.type'].replace(
    {
        'aquired': 'acquired',
        'franchise rights': 'franchise',
        'neew': 'new',
        'new division': 'new',
        'new, privitization': 'new/privatized',
        'privatization': 'privatized'
    }
)

In [367]:
df['company.type'].isna().sum()

36

In [368]:
df['company.type'] = df['company.type'].fillna('Unspecified')

### company.sector

In [379]:
df['company.sector'].sort_values().unique()

array(['GPS technology', 'HR consulting', 'IT Consulting', 'Star Wars',
       'advertising', 'aerospace and defense', 'agribusiness',
       'agricultural', 'agricultural products', 'air compressors',
       'aircraft leasing', 'airline', 'airplanes', 'airport',
       'airport maintenance', 'alcohol', 'aluminum', 'aluminum/oil',
       'animal feed', 'animation', 'antennas', 'apparel',
       'apparel retail', 'apparel/eyewear', 'appliances',
       'architectural products', 'armaments', 'art dealing',
       'asset management', 'auto components', 'auto dealerships',
       'auto engines', 'auto glass', 'auto parts', 'auto repair',
       'auto retail', 'auto sales, energy', 'automobiles', 'autos',
       'aviation', 'banannas', 'banking', 'banking/insurance',
       'banking/media', 'banking/oil/aluminum', 'banking/publishing',
       'banking/real estate', 'beauty and health care products', 'beer',
       'beer and food distribution', 'beverages', 'beverages and food',
       'bicy

In [370]:

df['company.sector'] = df['company.sector'].replace(
    {
        'Banking': 'banking',
        'Communications': 'communications',
        'Fashion': 'fashion',
        'Finance': 'finance',
        'Oil refining': 'oil refining',
        'Software': 'software',
        'agriculture': 'agricultural',
        'agriculteral': 'agricultural',
        'aigriculture': 'agricultural',
        'airports': 'airport',
        'aluminum, oil': 'aluminum/oil',
        'aplliances': 'appliances',
        'automobile dealers': 'auto dealerships',
        'automotive parts': 'auto parts',
        'automotive components': 'auto components',
        'automotive': 'autos',
        'automotives': 'autos',
        'bank': 'banking',
        'banking and insurance': 'banking/insurance',
        'banking, real estate': 'banking/real estate',
        'banking and real estate': 'banking/real estate',
        'banking, media': 'banking/media',
        'banking, oil, aluminum': 'banking/oil/aluminum',
        'braking systems': 'brake systems',
        'cars, finance': 'cars/finance',
        'cigarette': 'cigarettes',
        'comodities': 'commodities',
        'commodity trading': 'commodities trading',
        'construcion': 'construction',
        'construction, hotels, gaming': 'construction/hotels/gaming',
        'construction, utilities': 'construction/utilities',
        'construction,banking': 'construction/banking',
        'copper, poultry': 'copper/poultry',
        'e-commerce, venture capital': 'e-commerce/venture capital',
        'education, language schools': 'education/language schools',
        'electonics': 'electronics',
        'electonics components': 'electronic components',
        'energy, retail, manufacturing': 'energy/retail/manufacturing',
        'property, textile manufacturing': 'property/textile manufacturing',
        'eyeglasses': 'glasses',
        'fertalizers': 'fertilizers',
        'fertilizer': 'fertilizers',
        'finace': 'finance',
        'forestry and mining': 'forestry/mining',
        'metals and mining': 'metals/mining',
        'gambing': 'gambling',
        'media, pipelines': 'media/pipelines',
        'gas, petrochemicals': 'gas/petrochemicals',
        'internet companies': 'internet company',
        'internet provider': 'internet service provider',
        'investments': 'investment',
        'invetsments': 'investment',
        'investments/real estate, sports teams': 'investments/real estate/sports teams',
        'iron and steel': 'iron/steel',
        'leveraged buyout': 'leveraged buyouts',
        'lumber, real estate': 'lumber/real estate',
        'manufacturing, real estate, aviation': 'manufacturing/real estate/aviation',
        'mining, investments': 'mining/investments',
        'mining, steel': 'mining/steel',
        'mutal funds': 'mutual funds',
        'oil and gas': 'oil/gas',
        'oil and natural gas': 'oil/gas',
        'oil, railroad, investments': 'oil/railroad/investments',
        'oil field services': 'oilfield services',
        'petrochemicals and textiles': 'petrochemicals/textiles',
        'petrochemicals, textiles': 'petrochemicals/textiles',
        'plastic': 'plastics',
        'ports, gas': 'ports/gas',
        'postal service, beverages': 'postal service/beverages',
        'real estate and metals': 'real estate/metals',
        'real estate, financial services and computers': 'real estate/financial services/computers',
        'real estate, retail': 'real estate/retail',
        'restaurant': 'restaurants',
        'retail, financial services, real estate': 'retail/financial services/real estate',
        'retail, home appliances': 'retail/home appliances',
        'rice trading, mining, infrastructure': 'rice trading/mining/infrastructure',
        'self storage': 'self-storage',
        'stock brokerage': 'stock broker',
        'sugar, flour, cement': 'sugar/flour/cement',
        'telecom, finance': 'telecom/finance',
        'telecom, oil, beer': 'telecom/oil/beer',
        'telecomm': 'telecom',
        'textiles, industrial goods, media': 'textiles/industrial goods/media',
        'timber and paper': 'timber/paper',
        'timber, plantations': 'timber/plantations',
        'trading company': 'trading',
        'truck stop': 'truck stops',
        'trucking and logistics': 'trucking/logistics',
        'apparel and eyewear': 'apparel/eyewear',
        'brokerage/banking': 'broker/banking',
        'chocolate and coffee': 'chocolates/coffee',
        'electronic componants': 'electronic components',
        'elevators, escalators, engineering': 'elevators/escalators/engineering',
        'food and energy processess': 'food and energy processes',
        'food, entertainment': 'food/entertainment',
        'gas and propane': 'gas/propane',
        'hospitality and food service': 'hospitality/food service',
        'hotels and supermarkets': 'hotels/supermarkets',
        'industrial manufacturing and banking': 'industrial manufacturing/banking',
        'insurancei, power': 'insurance/power',
        'media, exports': 'media/exports',
        'metals, paper, cement': 'metals/paper/cement',
        'mining, banking': 'mining/banking',
        'oleochemicals and palm oil': 'oleochemicals/palm oil',
        'palm oil, mining': 'palm oil/mining',
        'pet supplies, real estate': 'pet supplies/real estate',
        'printing and electronics': 'printing/electronics'
    }
)

### company.relationship

In [380]:
df['company.relationship'].sort_values().unique()

array(['CEO', 'COO', 'Chairman', 'Chairman and Chief Executive Officer',
       'Chairman, CEO', 'Chairman/founder', 'Chairman/shareholder',
       'Chief Executive', 'Exectuitve Director',
       'Global Head of Real Estate', 'Head of Board of Directors',
       'Honorary President for Life', 'Relation', 'Vice Chairman',
       'Vice President', 'Vice President of Infrastructure Software',
       'ceo', 'chairman', 'chairman and ceo',
       'chairman of management committee', 'chairman of the board',
       'chairwomen', 'chariman', 'co-chairman',
       'co-director of zinc, copper and lead', 'deputy chairman',
       'director', 'employee', 'executive chairman', 'former CEO',
       'former chairman and CEO', 'founder', 'founder CEO owner',
       'founder and CEO', 'founder and ceo', 'founder and chairman',
       'founder and chairwoman', 'founder and executive chairman',
       'founder and executive vice chairman', 'founder, chairman',
       'founder, chairman, ceo', 'founder,

In [381]:
similar(df["company.relationship"].sort_values().unique())

CEO 
 ------------------------------
COO 
 ------------------------------
Chairman 
 ------------------------------
['Chairman, CEO', 'Vice Chairman', 'chairman', 'chariman', 'co-chairman'] 


Chairman and Chief Executive Officer 
 ------------------------------
Chairman, CEO 
 ------------------------------
Chairman/founder 
 ------------------------------
['Chairman/shareholder'] 


Chairman/shareholder 
 ------------------------------
['shareholder'] 


Chief Executive 
 ------------------------------
Exectuitve Director 
 ------------------------------
Global Head of Real Estate 
 ------------------------------
Head of Board of Directors 
 ------------------------------
Honorary President for Life 
 ------------------------------
Relation 
 ------------------------------
['relation'] 


Vice Chairman 
 ------------------------------
['chairman', 'vice chairman', 'vice-chairman'] 


Vice President 
 ------------------------------
Vice President of Infrastructure Software 
 ---------

### name

Listing possible inconsistancies where the names are different, but the following features are the same:
<ul>
    <li>company.founded</li>
    <li>company.name</li>
    <li>company.relationship</li>
    <li>company.sector</li>
    <li>company.type</li>
    <li>demographics.gender</li>
    <li>location.citizenship</li>
    <li>location.country code</li>
    <li>location.region</li>
    <li>wealth.type</li>
    <li>wealth.how.category</li>
    <li>wealth.how.from emerging</li>
    <li>wealth.how.industry</li>
    <li>wealth.how.inherited</li>
</ul>

In [64]:
dup_cols = ['company.founded',
            'company.name',
            'company.relationship',
            'company.sector',
            'company.type',
            'demographics.gender',
            'location.citizenship',
            'location.country code',
            'location.region',
            'wealth.type',
            'wealth.how.category',
            'wealth.how.from emerging',
            'wealth.how.industry',
            'wealth.how.inherited']

dup_rows = df[df.duplicated(subset=dup_cols)]
dup_rows[duprows[]]

SyntaxError: invalid syntax (1649346416.py, line 17)

In [None]:
import re

df[df['name'].apply(lambda x: (re.search('abraham', x, re.IGNORECASE) is not None))]

In [None]:
import re

df[df['name'].apply(lambda x: (re.search('abraham', x, re.IGNORECASE) is not None))]