In [1322]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

In [1323]:
# Defining some text formatting constants:

# Start making the text bold
F_BOLD = '\033[1m'

# Start underlining the text
F_UNDERLINE = '\033[4m'

# End text formatting
F_END = '\033[0m'

# Divider line beneath text
F_DIV = '─'*60

# Loading the dataset

In [1324]:
df = pd.read_csv('./billionaires.csv')

In [1325]:
print((f"\nThe dataset consists of {F_BOLD}%d instances{F_END}" +
       f" and {F_BOLD}%d features{F_END}:\n{F_DIV}\n")
      %df.shape)

# Setting this option enables the presentation of all features instead of
# omitting the middle ones:
from IPython.display import display
with pd.option_context('display.max_columns', None):
    display(df.sample(5)) # Retrieves random instances from the dataset


The dataset consists of [1m2614 instances[0m and [1m22 features[0m:
────────────────────────────────────────────────────────────



Unnamed: 0,name,rank,year,company.founded,company.name,company.relationship,company.sector,company.type,demographics.age,demographics.gender,location.citizenship,location.country code,location.gdp,location.region,wealth.type,wealth.worth in billions,wealth.how.category,wealth.how.from emerging,wealth.how.industry,wealth.how.inherited,wealth.how.was founder,wealth.how.was political
1271,Dirk Ziff,421,2001,1928,Ziff Davis Inc,relation,publishing,new,36,male,United States,USA,10600000000000.0,North America,inherited,1.2,Financial,True,Hedge funds,3rd generation,True,True
1364,Thomas Flatley,452,2001,1958,The Flatley Co,founder,real estate,new,68,male,United States,USA,10600000000000.0,North America,self-made finance,1.1,Financial,True,Real Estate,not inherited,True,True
520,R Budi Hartono,173,2014,1951,Djarum,relation,tobacco,new,73,male,Indonesia,IDN,0.0,East Asia,self-made finance,7.6,Financial,True,Money Management,not inherited,True,True
1141,Lars Larsen,375,2014,1979,JYSK,founder and CEO,retail,new,65,male,Denmark,DEN,0.0,Europe,founder non-finance,4.0,Non-Traded Sectors,True,"Retail, Restaurant",not inherited,True,True
103,Charles Ergen,35,2001,1980,dish networtk,founder,media,new,48,male,United States,USA,10600000000000.0,North America,founder non-finance,8.8,Non-Traded Sectors,True,Media,not inherited,True,True


## Features and their initial types

In [1326]:
print(f"The {F_BOLD}string{F_END} features include:\n{F_DIV}\n" +
      f"{df.select_dtypes(include=['object']).dtypes}\n")

print(f"The {F_BOLD}numerical{F_END} features include:\n{F_DIV}\n" +
      f"{df.select_dtypes(include=['int64', 'float64']).dtypes}\n")

print(f"There are also several {F_BOLD}boolean{F_END} features, including:\n" +
      f"{F_DIV}\n{df.select_dtypes(include=['bool']).dtypes}\n")

The [1mstring[0m features include:
────────────────────────────────────────────────────────────
name                     object
company.name             object
company.relationship     object
company.sector           object
company.type             object
demographics.gender      object
location.citizenship     object
location.country code    object
location.region          object
wealth.type              object
wealth.how.category      object
wealth.how.industry      object
wealth.how.inherited     object
dtype: object

The [1mnumerical[0m features include:
────────────────────────────────────────────────────────────
rank                          int64
year                          int64
company.founded               int64
demographics.age              int64
location.gdp                float64
wealth.worth in billions    float64
dtype: object

There are also several [1mboolean[0m features, including:
────────────────────────────────────────────────────────────
wealth.how.from em

# Data cleaning up
This includes all data cleanse measures, including:
<ul>
    <li>Identification and correction of spelling errors</li>
    <li>Making common categories (values in categorical features) consistent</li>
    <li>Identifying the missing data (sometimes denoted by value '0')</li>
    <li>Inspecting outliers</li>
    <li>Casting features to the suitable types according to the data that they contain</li>
</ul>

## Processing the Boolean features
All of these features contain only a single value (True), and as a result, they do not mandate any data cleanse tasks.

### wealth.how.from emerging

In [1327]:
df['wealth.how.from emerging'].value_counts(dropna=False)

True    2614
Name: wealth.how.from emerging, dtype: int64

### wealth.how.was political

In [1328]:
df['wealth.how.was political'].value_counts(dropna=False)

True    2614
Name: wealth.how.was political, dtype: int64

### wealth.how.was founder

In [1329]:
df['wealth.how.was founder'].value_counts(dropna=False)

True    2614
Name: wealth.how.was founder, dtype: int64

## Processing the categorical features

### Common pre-processings
These features generally need the following <b>pre-processings</b>:
<ul>
    <li>Removing trailing and leading spaces from names</li>
    <li>Removing all consecutive spaces in the middle of the names</li>
    <li>Removing all extra dots</li>
    <li>Capitalizing the first letter of each word</li>
    <li>Transforming 'and' and '/' to ','
    <li>Correcting the spelling errors and typos</li>
</ul>

In [164]:
# The piece of code below performs all mentioned refactorings except for
# Spelling errors correction which needs to be done case by case

for col in df.select_dtypes(include=['object']).columns:
    df[col] = df[col].apply(
        lambda x: ' '.join(
            x.replace('..', '.')
             .title()
             .replace(' And ', ', ')
             .replace('/', ', ')
             .split())
            if isinstance(x, str) else x
    )

### Checking and correcting spelling errors

In [1331]:
# The following function receives a lists of words and maps the similar ones
# together; it is used to clean up categorical features by pinpointing the
# 'almost' similar words and consequently helping in the identification of the
# spelling errors.

from difflib import SequenceMatcher

def similar(word_list, min_similarity = .9):
    for i in range(len(word_list)):
        first_word = word_list[i]
        if pd.isna(first_word):
            continue
        res = set()
        for j in range(i+1, len(word_list)):
            second_word = word_list[j]
            if pd.isna(second_word):
                continue
            similarity = SequenceMatcher(None, first_word, second_word).ratio()
            if min_similarity <= similarity < 1:
                res.add(second_word)
        if len(res) > 0:
            print(first_word, '\n', '-'*30)
            print(res, '\n\n')

#### wealth.how.inherited

In [1332]:
df['wealth.how.inherited'].value_counts(dropna=False)

Not Inherited               1688
Father                       558
3Rd Generation               210
4Th Generation                68
Spouse, Widow                 59
5Th Generation Or Longer      31
Name: wealth.how.inherited, dtype: int64

#### wealth.how.industry

In [1333]:
df['wealth.how.industry'].value_counts(dropna=False)

Consumer                            471
Retail, Restaurant                  281
Real Estate                         280
Money Management                    249
Media                               219
Technology-Computer                 208
Diversified Financial               167
Energy                              132
Technology-Medical                  111
Non-Consumer Industrial             107
Constrution                          97
Mining, Metals                       90
Other                                83
Hedge Funds                          67
Private Equity, Leveraged Buyout     25
0                                    16
Venture Capital                       8
Banking                               1
Services                              1
NaN                                   1
Name: wealth.how.industry, dtype: int64

#### wealth.how.category

In [1334]:
df['wealth.how.category'].value_counts(dropna=False)

Financial             800
Non-Traded Sectors    597
Traded Sectors        564
New Sectors           319
Resource Related      245
0                      85
NaN                     1
Finance                 1
Trucking                1
Energy                  1
Name: wealth.how.category, dtype: int64

#### wealth.type

In [1335]:
df['wealth.type'].value_counts(dropna=False)

Inherited                953
Founder Non-Finance      713
Self-Made Finance        500
Privatized, Resources    236
Executive                190
NaN                       22
Name: wealth.type, dtype: int64

#### location.region

In [1336]:
df['location.region'].value_counts(dropna=False)

North America                992
Europe                       698
East Asia                    535
Latin America                182
Middle East, North Africa    117
South Asia                    69
Sub-Saharan Africa            20
0                              1
Name: location.region, dtype: int64

#### location.country code

In [1337]:
df['location.country code'].sort_values().unique()

array(['Ago', 'Are', 'Arg', 'Aus', 'Aut', 'Bel', 'Bhr', 'Bmu', 'Bra',
       'Can', 'Che', 'Chl', 'Chn', 'Col', 'Cyp', 'Cze', 'Den', 'Deu',
       'Dnk', 'Dza', 'Ecu', 'Egy', 'Esp', 'Fin', 'Fra', 'Gbr', 'Geo',
       'Ggy', 'Grc', 'Hkg', 'Idn', 'Ind', 'Irl', 'Isr', 'Ita', 'Jpn',
       'Kaz', 'Kna', 'Kor', 'Kwt', 'Lbn', 'Lie', 'Ltu', 'Mac', 'Mar',
       'Mco', 'Mex', 'Mys', 'Nga', 'Nld', 'Nor', 'Npl', 'Nzl', 'Omn',
       'Per', 'Phl', 'Pol', 'Prt', 'Rou', 'Rus', 'Sau', 'Sgp', 'Swe',
       'Swz', 'Taiwan', 'Tha', 'Tur', 'Tza', 'Uga', 'Ukr', 'Usa', 'Ven',
       'Vnm', 'Zaf'], dtype=object)

#### location.citizenship

In [1338]:
df['location.citizenship'].value_counts(dropna=False)

United States    903
Germany          160
China            153
Russia           119
Japan             96
                ... 
Bahrain            1
Ecuador            1
Georgia            1
Bermuda            1
Tanzania           1
Name: location.citizenship, Length: 73, dtype: int64

#### demographics.gender

In [1339]:
df['demographics.gender'].value_counts(dropna=False)

Male              2328
Female             249
NaN                 34
Married Couple       3
Name: demographics.gender, dtype: int64

#### company.type

In [1340]:
df['company.type'].value_counts(dropna=False)

New                       2302
Aquired                    196
Privatization               42
NaN                         36
Subsidiary                   9
Acquired                     7
State Owned Enterprise       7
New, Aquired                 3
Merger                       3
Franchise                    2
New, Privitization           2
New Division                 1
Privatized                   1
Neew                         1
Franchise Rights             1
Joint Venture                1
Name: company.type, dtype: int64

In [1341]:
df['company.type'] = df['company.type'].replace(
    {
        'Aquired': 'Acquired',
        'Neew': 'New',
        'New, Privitization': 'New, Privatization',
    }
)

#### company.sector

In [1342]:
df['company.sector'].sort_values().unique()

array(['Advertising', 'Aerospace, Defense', 'Agribusiness',
       'Agriculteral', 'Agricultural Products', 'Agriculture',
       'Aigriculture', 'Air Compressors', 'Aircraft Leasing', 'Airline',
       'Airplanes', 'Airport', 'Airport Maintenance', 'Airports',
       'Alcohol', 'Aluminum', 'Aluminum, Oil', 'Animal Feed', 'Animation',
       'Antennas', 'Aplliances', 'Apparel', 'Apparel Retail',
       'Apparel, Eyewear', 'Appliances', 'Architectural Products',
       'Armaments', 'Art Dealing', 'Asset Management', 'Auto Components',
       'Auto Dealerships', 'Auto Engines', 'Auto Glass', 'Auto Parts',
       'Auto Repair', 'Auto Retail', 'Auto Sales, Energy',
       'Automobile Dealers', 'Automobiles', 'Automotive',
       'Automotive Components', 'Automotive Parts', 'Automotives',
       'Autos', 'Aviation', 'Banannas', 'Bank', 'Banking',
       'Banking, Insurance', 'Banking, Media', 'Banking, Oil, Aluminum',
       'Banking, Publishing', 'Banking, Real Estate',
       'Beauty, Hea

In [1343]:
similar(df["company.sector"].sort_values().unique())

Agriculture 
 ------------------------------
{'Aigriculture'} 


Airport 
 ------------------------------
{'Airports'} 


Aplliances 
 ------------------------------
{'Appliances'} 


Automotive 
 ------------------------------
{'Automotives'} 


Cigarette 
 ------------------------------
{'Cigarettes'} 


Commodities 
 ------------------------------
{'Comodities'} 


Construcion 
 ------------------------------
{'Construction'} 


Electonics 
 ------------------------------
{'Electronics'} 


Electonics Components 
 ------------------------------
{'Electronic Componants'} 


Fertalizers 
 ------------------------------
{'Fertilizers'} 


Fertilizer 
 ------------------------------
{'Fertilizers'} 


Finace 
 ------------------------------
{'Finance'} 


Gambing 
 ------------------------------
{'Gaming'} 


Investment 
 ------------------------------
{'Investments'} 


Investments 
 ------------------------------
{'Invetsments'} 


Leveraged Buyout 
 ------------------------------
{'L

In [1344]:
df['company.sector'] = df['company.sector'].replace(
    {
        'Agriculteral': 'Agricultural',
        'Aigriculture': 'Agriculture',
        'Aplliances': 'Appliances',
        'Comodities': 'Commodities',
        'Construcion': 'Construction',
        'Electonics': 'Electronics',
        'Electonics Components': 'Electronic Components',
        'Fertalizers': 'Fertilizers',
        'Finace': 'Finance',
        'Gambing': 'Gambling',
        'Invetsments': 'Investments',
        'Mutal Funds': 'Mutual Funds',
        'Telecomm': 'Telecom',
        'Electronic Componants': 'Electronic Components',
        'Food And Energy Processess': 'Food And Energy Processes',
        'Insurancei, Power': 'Insurance, Power',
    }
)

#### company.relationship

In [1345]:
df['company.relationship'].sort_values().unique()

array(['Ceo', 'Chairman', 'Chairman Of Management Committee',
       'Chairman Of The Board', 'Chairman, Ceo',
       'Chairman, Chief Executive Officer', 'Chairman, Founder',
       'Chairman, Shareholder', 'Chairwomen', 'Chariman',
       'Chief Executive', 'Co-Chairman',
       'Co-Director Of Zinc, Copper, Lead', 'Coo', 'Deputy Chairman',
       'Director', 'Employee', 'Exectuitve Director',
       'Executive Chairman', 'Former Ceo', 'Former Chairman, Ceo',
       'Founder', 'Founder Ceo Owner', 'Founder, Ceo',
       'Founder, Chairman', 'Founder, Chairman, Ceo',
       'Founder, Chairwoman', 'Founder, Chairwoman, Ceo',
       'Founder, Executive Chairman', 'Founder, Executive Vice Chairman',
       'Founder, President', 'Founder, Relation',
       'Founder, Vice Chairman', 'General Director',
       'Global Head Of Real Estate', 'Head Of Board Of Directors',
       'Head Of High-Yield Bond Trading Dept',
       "Head Of Microsoft'S Application Software Group",
       'Honorary Pr

In [1346]:
similar(df["company.relationship"].sort_values().unique())

Founder, Chairman 
 ------------------------------
{'Founder, Chairwoman'} 


Founder, Chairman, Ceo 
 ------------------------------
{'Founder, Chairwoman, Ceo'} 


Founder, Executive Chairman 
 ------------------------------
{'Founder, Executive Vice Chairman'} 


Lawer 
 ------------------------------
{'Lawyer'} 


Vice Chairman 
 ------------------------------
{'Vice-Chairman'} 




In [1347]:
df['company.relationship'] = df['company.relationship'].replace(
    {
        'Chariman': 'Chairman',
        'Lawer': 'Lawyer',
        'Vice-Chairman': 'Vice Chairman',
    }
)

### Common post-processings
The features require the following <b>post-processings</b>:
<ul>
    <li>Replacing the value '0' with Null</li>
    <li>Casting the variable from string (object) to categorical</li>
</ul>

In [1348]:
# The piece of code below performs all mentioned post-processings for all the
# string features except for 'name' which is not supposed to be transformed
# to categorical


for col in df.select_dtypes(include=['object']).columns:
    if col == 'name':
        continue
    
    # Replacing the value of '0' with Null
    df[col] = df[col].replace(
        {
            '0': None,
        }
    )

    # Casting the feature into categorical type
    df[col] = df[col].astype('category')

### Cleaning the 'name' feature
This task includes:
<ul>
    <li>Converting every 'Jr' and 'Sr' to 'Jr.' and 'Sr.'</li>
    <li>Converting every [initial] to [initial + '.']s</li>
    <li>Converting every 'And' to ','</li>
    <li>In some names 'Jr' and 'Sr' are separated by ',' and in some they are not; they should be unified</li>
</ul>

In [1349]:
import re

df['name'] = (df['name'].replace(to_replace=r'\bSr$', regex=True, value='Sr.')
                        .replace(to_replace=r'\bJr$', regex=True, value='Jr.')
                        .replace(to_replace=r'\b\w\b(?!\.)', regex=True, value=r'\g<0>.')
                        .replace(to_replace=r', Jr\.$', regex=True, value=' Jr.')
                        .replace(to_replace=r', Sr\.$', regex=True, value=' Sr.')
             )

In [1350]:
# similar(df['name'])

## Processing the numerical features

In [1351]:
# rank                          int64
# year                          int64
# company.founded               int64
# demographics.age              int64
# location.gdp                float64
# wealth.worth in billions    float64



In [1352]:
num_cols = df.select_dtypes(include=['int64', 'float64']).columns

num_cols

Index(['rank', 'year', 'company.founded', 'demographics.age', 'location.gdp',
       'wealth.worth in billions'],
      dtype='object')

### Missing values
None of the numberic features by definition can contain 0; hence, all 0s must be transformed to Null

In [1354]:
total_na = df[num_cols].isna().sum().sum()

print(f"Numerical columns contain a total of {total_na} NA values")

Numerical columns contain a total of 0 NA values


In [1355]:
df[num_cols] = df[num_cols].replace(0, np.nan)

### Identifying outliers

In [1358]:
df[num_cols].describe()

Unnamed: 0,rank,year,company.founded,demographics.age,location.gdp,wealth.worth in billions
count,2614.0,2614.0,2574.0,2231.0,949.0,2614.0
mean,599.672533,2008.41163,1954.621989,62.498431,4872956000000.0,3.531943
std,467.885695,7.483598,43.189003,13.39533,4420257000000.0,5.088813
min,1.0,1996.0,1610.0,-42.0,2491801000.0,1.0
25%,215.0,2001.0,1938.25,53.0,397000000000.0,1.4
50%,430.0,2014.0,1963.0,62.0,2500000000000.0,2.0
75%,988.0,2014.0,1985.0,72.0,10600000000000.0,3.5
max,1565.0,2014.0,2012.0,98.0,10600000000000.0,76.0


In [1357]:
# The following function recerves a columns name and returns the index and
# values of all the instances outside the boundary of 2 standard deviations
# away from the mean.
def get_outliers(col):
    return col[np.abs(col-col.mean()) > (1*col.std())]

cols = [
    'company.founded',
    'demographics.age',
]
for col in cols:
    outliers = get_outliers(df[col])
    outliers_count = outliers.shape[0]
    print(f"The {F_BOLD}{col}{F_END} feature contains" +
          f" {outliers_count} outliers.")
    if outliers_count > 0:
        print(f"The outliers include:\n{F_DIV}\n{outliers.sort_values()}")
    print("\n")

The [1mcompany.founded[0m feature contains 512 outliers.
The outliers include:
────────────────────────────────────────────────────────────
1081    1610.0
1454    1610.0
2058    1615.0
1092    1615.0
788     1615.0
         ...  
1510    2009.0
2542    2010.0
1142    2010.0
2243    2010.0
1913    2012.0
Name: company.founded, Length: 512, dtype: float64


The [1mdemographics.age[0m feature contains 763 outliers.
The outliers include:
────────────────────────────────────────────────────────────
748    -42.0
1065    -7.0
788     12.0
1262    21.0
1263    24.0
        ... 
1663    95.0
2127    95.0
993     96.0
924     96.0
1539    98.0
Name: demographics.age, Length: 763, dtype: float64




#### demographics.age

In [1281]:
df.loc[[748, 1065]]

Unnamed: 0,name,rank,year,company.founded,company.name,company.relationship,company.sector,company.type,demographics.age,demographics.gender,...,location.gdp,location.region,wealth.type,wealth.worth in billions,wealth.how.category,wealth.how.from emerging,wealth.how.industry,wealth.how.inherited,wealth.how.was founder,wealth.how.was political
748,Kunio Busujima,249,1996,1950.0,Sankyo,Founder,Pachinko Gaming Machines,New,-42.0,Male,...,4710000000000.0,East Asia,Founder Non-Finance,1.6,Non-Traded Sectors,True,"Retail, Restaurant",Not Inherited,True,True
1065,Kenichi Mabuchi,354,1996,1946.0,Mabuchi Motor Company,Founder,Electric Motors,New,-7.0,Male,...,4710000000000.0,East Asia,Founder Non-Finance,1.1,Traded Sectors,True,Non-Consumer Industrial,Not Inherited,True,True


In [1282]:
df.loc[748, ['demographics.age']] = 71
df.loc[1065, ['demographics.age']] = 63