In [36]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

In [37]:
df = pd.read_csv('/kaggle/input/billioners-dataset/billionaires.csv')

pd.set_option('display.max_columns', None)
df

Unnamed: 0,name,rank,year,company.founded,company.name,company.relationship,company.sector,company.type,demographics.age,demographics.gender,location.citizenship,location.country code,location.gdp,location.region,wealth.type,wealth.worth in billions,wealth.how.category,wealth.how.from emerging,wealth.how.industry,wealth.how.inherited,wealth.how.was founder,wealth.how.was political
0,Bill Gates,1,1996,1975,Microsoft,founder,Software,new,40,male,United States,USA,8.100000e+12,North America,founder non-finance,18.5,New Sectors,True,Technology-Computer,not inherited,True,True
1,Bill Gates,1,2001,1975,Microsoft,founder,Software,new,45,male,United States,USA,1.060000e+13,North America,founder non-finance,58.7,New Sectors,True,Technology-Computer,not inherited,True,True
2,Bill Gates,1,2014,1975,Microsoft,founder,Software,new,58,male,United States,USA,0.000000e+00,North America,founder non-finance,76.0,New Sectors,True,Technology-Computer,not inherited,True,True
3,Warren Buffett,2,1996,1962,Berkshire Hathaway,founder,Finance,new,65,male,United States,USA,8.100000e+12,North America,founder non-finance,15.0,Traded Sectors,True,Consumer,not inherited,True,True
4,Warren Buffett,2,2001,1962,Berkshire Hathaway,founder,Finance,new,70,male,United States,USA,1.060000e+13,North America,founder non-finance,32.3,Traded Sectors,True,Consumer,not inherited,True,True
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2609,Wu Chung-Yi,1565,2014,1991,Tingyi,investor,beverages and food,new,55,male,Taiwan,Taiwan,0.000000e+00,East Asia,executive,1.0,Traded Sectors,True,Non-consumer industrial,not inherited,True,True
2610,Wu Xiong,1565,2014,1999,Biostime International Holdings,owner,infant formula,new,0,male,China,CHN,0.000000e+00,East Asia,executive,1.0,Traded Sectors,True,Consumer,not inherited,True,True
2611,Yang Keng,1565,2014,0,Blue Ray Corp,chairman,real estate,new,53,male,China,CHN,0.000000e+00,East Asia,self-made finance,1.0,Financial,True,Real Estate,not inherited,True,True
2612,Zdenek Bakala,1565,2014,1994,Patria Finance,founder,coal,new,53,male,Czech Republic,CZE,0.000000e+00,Europe,privatized and resources,1.0,Resource Related,True,Mining and metals,not inherited,True,True


In [38]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2614 entries, 0 to 2613
Data columns (total 22 columns):
 #   Column                    Non-Null Count  Dtype  
---  ------                    --------------  -----  
 0   name                      2614 non-null   object 
 1   rank                      2614 non-null   int64  
 2   year                      2614 non-null   int64  
 3   company.founded           2614 non-null   int64  
 4   company.name              2576 non-null   object 
 5   company.relationship      2568 non-null   object 
 6   company.sector            2591 non-null   object 
 7   company.type              2578 non-null   object 
 8   demographics.age          2614 non-null   int64  
 9   demographics.gender       2580 non-null   object 
 10  location.citizenship      2614 non-null   object 
 11  location.country code     2614 non-null   object 
 12  location.gdp              2614 non-null   float64
 13  location.region           2614 non-null   object 
 14  wealth.t

# Preprocessing
This includes data cleanse, filling missing data, and removing duplicates.

## Cleaning up the name feature

### Todo:
<ul>
    <li>Remove trailing and leading spaces from names</li>
    <li>Remove all extra spaces in the middle of the names</li>
    <li>Remove all extra dots ...</li>
    <li>Make certain words consistent: junior, jr, senior, sr, jr., sr.</li>
</ul>

In [39]:
string_cat_cols = ['name', 
                   'company.name',
                   'company.relationship',
                   'company.sector',
                   'company.type', 
                   'demographics.gender',
                   'location.citizenship',
                   'location.country code', 
                   'location.region',
                   'wealth.type',
                   'wealth.how.category',
                   'wealth.how.industry',
                   'wealth.how.inherited'
                  ]

for col in string_cat_cols:
    df[col] = df[col].apply(lambda x: str(x).strip().replace('..', '.').replace('  ', ' '))

### 'company.type'

In [40]:
df['company.type'].value_counts().reset_index().sort_values(by="index")

Unnamed: 0,index,company.type
5,acquired,7
1,aquired,196
9,franchise,2
14,franchise rights,1
15,joint venture,1
8,merger,3
3,,36
13,neew,1
0,new,2302
11,new division,1


In [41]:
def reform_company_type(x):
    return str(x).replace('aquired', 'acquired') \
                 .replace('franchise rights', 'franchise') \
                 .replace('neew', 'new') \
                 .replace('new division', 'new') \
                 .replace('privatized', 'privatization') \
                 .replace('new, privitization', 'new/privatization') \
                 .replace('nan', 'unspecified')

df['company.type'] = df['company.type'].apply(lambda x: reform_company_type(x))

In [42]:
df['company.type'].isna().sum()

0

### 'demographics.gender'

In [46]:
df['demographics.gender'].value_counts()

male              2328
female             249
unspecified         34
married couple       3
Name: demographics.gender, dtype: int64

In [44]:
df['demographics.gender'] = df['demographics.gender'].apply(lambda x: str(x).replace('nan', 'unspecified'))

In [45]:
df['demographics.gender'].isna().sum()

0

### 'location.citizenship'

In [51]:
citizenships = df['location.citizenship'].unique()
citizenships.sort()
citizenships

array(['Algeria', 'Angola', 'Argentina', 'Australia', 'Austria',
       'Bahrain', 'Belgium', 'Bermuda', 'Brazil', 'Canada', 'Chile',
       'China', 'Colombia', 'Cyprus', 'Czech Republic', 'Denmark',
       'Ecuador', 'Egypt', 'Finland', 'France', 'Georgia', 'Germany',
       'Greece', 'Guernsey', 'Hong Kong', 'India', 'Indonesia', 'Ireland',
       'Israel', 'Italy', 'Japan', 'Kazakhstan', 'Kuwait', 'Lebanon',
       'Liechtenstein', 'Lithuania', 'Macau', 'Malaysia', 'Mexico',
       'Monaco', 'Morocco', 'Nepal', 'Netherlands', 'New Zealand',
       'Nigeria', 'Norway', 'Oman', 'Peru', 'Philippines', 'Poland',
       'Portugal', 'Romania', 'Russia', 'Saudi Arabia', 'Singapore',
       'South Africa', 'South Korea', 'Spain', 'St. Kitts and Nevis',
       'Swaziland', 'Sweden', 'Switzerland', 'Taiwan', 'Tanzania',
       'Thailand', 'Turkey', 'Uganda', 'Ukraine', 'United Arab Emirates',
       'United Kingdom', 'United States', 'Venezuela', 'Vietnam'],
      dtype=object)

In [52]:
df['location.citizenship'].isna().sum()

0

### 'company.relationship'

In [65]:
list(df['company.relationship'].unique())

['founder',
 'nan',
 'founder, chairman',
 'relation',
 'ceo',
 'chairman',
 'investor',
 'founder and ceo',
 'founder, ceo',
 'owner',
 'chairman of management committee',
 'founder and chairman',
 'chairman and chief executive officer',
 'general director',
 'executive chairman',
 'chairman, founder',
 'founder, chairman, ceo',
 'former chairman and ceo',
 'relation and chairman',
 'investor, founder',
 'partner',
 'president',
 'investor and ceo',
 'founder ceo owner',
 'chairman, shareholder',
 'vice president',
 'chairman of the board',
 'founder, relation',
 'founder, vice chairman',
 'honorary president for life',
 'former ceo',
 'inherited',
 'vice-chairman',
 'owner and vice chair',
 'founder, president',
 'co-chairman',
 'lawer',
 'relation, vice chairman',
 'chairman, ceo',
 'employee',
 "head of microsoft's application software group",
 'chariman',
 'exectuitve director',
 'vice chairman',
 'shareholder',
 'relation and ceo',
 'founder, chairwoman, ceo',
 'founder and execu

In [63]:
df['company.relationship'] = df['company.relationship'].apply(
    lambda x: str(x).lower().replace('/', ', ')
)

### 'name' feature

Listing possible inconsistancies where the names are different, but the following features are the same:
<ul>
    <li>company.founded</li>
    <li>company.name</li>
    <li>company.relationship</li>
    <li>company.sector</li>
    <li>company.type</li>
    <li>demographics.gender</li>
    <li>location.citizenship</li>
    <li>location.country code</li>
    <li>location.region</li>
    <li>wealth.type</li>
    <li>wealth.how.category</li>
    <li>wealth.how.from emerging</li>
    <li>wealth.how.industry</li>
    <li>wealth.how.inherited</li>
</ul>

In [64]:
dup_cols = ['company.founded',
            'company.name',
            'company.relationship',
            'company.sector',
            'company.type',
            'demographics.gender',
            'location.citizenship',
            'location.country code',
            'location.region',
            'wealth.type',
            'wealth.how.category',
            'wealth.how.from emerging',
            'wealth.how.industry',
            'wealth.how.inherited']

dup_rows = df[df.duplicated(subset=dup_cols)]
dup_rows[duprows[]]

SyntaxError: invalid syntax (1649346416.py, line 17)

In [None]:
import re

df[df['name'].apply(lambda x: (re.search('abraham', x, re.IGNORECASE) is not None))]

In [None]:
import re

df[df['name'].apply(lambda x: (re.search('abraham', x, re.IGNORECASE) is not None))]