In [87]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

import re # The RegEx module used for string operations

In [88]:
# Defining some text formatting constants:

# Start making the text bold
F_BOLD = '\033[1m'

# Start underlining the text
F_UNDERLINE = '\033[4m'

# End text formatting
F_END = '\033[0m'

# Divider line beneath text
F_DIV = '─'*60

# Loading the dataset

In [89]:
df = pd.read_csv('./billionaires.csv')

In [90]:
print((f"\nThe dataset consists of {F_BOLD}%d instances{F_END}" +
       f" and {F_BOLD}%d features{F_END}:\n{F_DIV}\n")
      %df.shape)

# Setting this option enables the presentation of all features instead of
# omitting the middle ones:
from IPython.display import display
with pd.option_context('display.max_columns', None):
    display(df.sample(5)) # Retrieves random instances from the dataset


The dataset consists of [1m2614 instances[0m and [1m22 features[0m:
────────────────────────────────────────────────────────────



Unnamed: 0,name,rank,year,company.founded,company.name,company.relationship,company.sector,company.type,demographics.age,demographics.gender,location.citizenship,location.country code,location.gdp,location.region,wealth.type,wealth.worth in billions,wealth.how.category,wealth.how.from emerging,wealth.how.industry,wealth.how.inherited,wealth.how.was founder,wealth.how.was political
366,Carrie Perrodo,122,2014,1975,Perenco,relation,oil,new,63,female,France,FRA,0.0,Europe,inherited,10.0,0,True,Other,spouse/widow,True,True
155,Lakshmi Mittal,52,2014,1989,ArcelorMittal,founder,steel,new,63,male,India,IND,0.0,South Asia,inherited,16.7,Resource Related,True,Non-consumer industrial,father,True,True
269,Rudolf Oetker,90,2001,1891,Oetker-Gruppe,relation,food,new,0,male,Germany,DEU,1950000000000.0,Europe,inherited,4.4,Traded Sectors,True,Consumer,3rd generation,True,True
2443,Aydin Dogan,1465,2014,1961,Dogan Holding,founder,media,new,77,male,Turkey,TUR,0.0,Middle East/North Africa,founder non-finance,1.1,Non-Traded Sectors,True,Media,not inherited,True,True
1139,Gustavo Cisneros,375,2014,1929,Cisneros Group,relation,media,new,68,male,Venezuela,VEN,0.0,Latin America,inherited,4.0,Non-Traded Sectors,True,Media,father,True,True


## Features and their initial types

In [91]:
print((f"The {F_BOLD}string{F_END} and/or {F_BOLD}categorical{F_END} features" +
       f" include:\n{F_DIV}\n{df.select_dtypes(include=['object']).dtypes}\n"))

print((f"The {F_BOLD}numerical{F_END} features include:\n{F_DIV}\n" +
       f"{df.select_dtypes(include=['int64', 'float64']).dtypes}\n"))

print((f"There are also several {F_BOLD}boolean{F_END} features, including:\n" +
       f"{F_DIV}\n{df.select_dtypes(include=['bool']).dtypes}\n"))

The [1mstring[0m and/or [1mcategorical[0m features include:
────────────────────────────────────────────────────────────
name                     object
company.name             object
company.relationship     object
company.sector           object
company.type             object
demographics.gender      object
location.citizenship     object
location.country code    object
location.region          object
wealth.type              object
wealth.how.category      object
wealth.how.industry      object
wealth.how.inherited     object
dtype: object

The [1mnumerical[0m features include:
────────────────────────────────────────────────────────────
rank                          int64
year                          int64
company.founded               int64
demographics.age              int64
location.gdp                float64
wealth.worth in billions    float64
dtype: object

There are also several [1mboolean[0m features, including:
────────────────────────────────────────────────────

# Data cleaning up
This includes all data cleanse measures, including:
<ul>
    <li>Identification and correction of spelling errors</li>
    <li>Making common categories (values in categorical features) consistent</li>
    <li>Filling in missing data</li>
    <li>Removing duplicates</li>
    <li>Inspecting outliers</li>
    <li>Casting features to the suitable types according to the data that they contain</li>
</ul>

## Processing the Boolean features
All of these features contain only a single value (True), and as a result, they do not mandate any data cleanse tasks.

### wealth.how.from emerging

In [92]:
df['wealth.how.from emerging'].unique()

array([ True])

### wealth.how.was political

In [93]:
df['wealth.how.was political'].unique()

array([ True])

### wealth.how.was founder

In [94]:
df['wealth.how.was founder'].unique()

array([ True])

## Processing the categorical features

### Common pre-processings
These features generally need the following <b>pre-processings</b>:
<ul>
    <li>Removing trailing and leading spaces from names</li>
    <li>Removing all consecutive spaces in the middle of the names</li>
    <li>Removing all extra dots</li>
    <li>Capitalizing the first letter of each word</li>
    <li>Transforming 'and' and '/' to ','
    <li>Correcting the spelling errors and typos</li>
</ul>

In [95]:
# The piece of code below performs all mentioned pre-processings except for
# Spelling errors correction which needs to be done case by case

for col in df.select_dtypes(include=['object']).columns:
    df[col] = df[col].apply(
        lambda x: ' '.join(
            x.replace('..', '.')
             .title()
             .replace(' And ', ', ')
             .replace('/', ', ')
             .split())
            if isinstance(x, str) else x
    )

In [96]:
# The following function receives a lists of words and maps the similar ones
# together; it is used to clean up categorical features by pinpointing the
# 'almost' similar words and consequently helping in the identification of the
# spelling errors.

from difflib import SequenceMatcher

def similar(word_list, similarity_degree = .8):
    for i in range(len(word_list)):
        first_word = word_list[i]
        if pd.isna(first_word):
            continue
        res = []
        for j in range(i+1, len(word_list)):
            second_word = word_list[j]
            if pd.isna(second_word):
                continue
            if SequenceMatcher(
                None, 
                first_word, second_word).ratio() > similarity_degree:
                res = res + [second_word]
        if len(res) > 0:
            print(first_word, '\n', '-'*30)
            print(res, '\n\n')

### Checking and correcting spelling errors

#### wealth.how.inherited

In [97]:
df['wealth.how.inherited'].value_counts(dropna=False)

Not Inherited               1688
Father                       558
3Rd Generation               210
4Th Generation                68
Spouse, Widow                 59
5Th Generation Or Longer      31
Name: wealth.how.inherited, dtype: int64

#### wealth.how.industry

In [98]:
df['wealth.how.industry'].value_counts(dropna=False)

Consumer                            471
Retail, Restaurant                  281
Real Estate                         280
Money Management                    249
Media                               219
Technology-Computer                 208
Diversified Financial               167
Energy                              132
Technology-Medical                  111
Non-Consumer Industrial             107
Constrution                          97
Mining, Metals                       90
Other                                83
Hedge Funds                          67
Private Equity, Leveraged Buyout     25
0                                    16
Venture Capital                       8
Banking                               1
Services                              1
NaN                                   1
Name: wealth.how.industry, dtype: int64

#### wealth.how.category

In [99]:
df['wealth.how.category'].value_counts(dropna=False)

Financial             800
Non-Traded Sectors    597
Traded Sectors        564
New Sectors           319
Resource Related      245
0                      85
NaN                     1
Finance                 1
Trucking                1
Energy                  1
Name: wealth.how.category, dtype: int64

#### wealth.type

In [100]:
df['wealth.type'].value_counts(dropna=False)

Inherited                953
Founder Non-Finance      713
Self-Made Finance        500
Privatized, Resources    236
Executive                190
NaN                       22
Name: wealth.type, dtype: int64

#### location.region

In [101]:
df['location.region'].value_counts(dropna=False)

North America                992
Europe                       698
East Asia                    535
Latin America                182
Middle East, North Africa    117
South Asia                    69
Sub-Saharan Africa            20
0                              1
Name: location.region, dtype: int64

#### location.country code

In [102]:
df['location.country code'].value_counts(dropna=False)

Usa    903
Deu    160
Chn    153
Rus    119
Jpn     96
      ... 
Bmu      1
Swz      1
Ago      1
Bhr      1
Tza      1
Name: location.country code, Length: 74, dtype: int64

#### location.citizenship

In [103]:
df['location.citizenship'].value_counts(dropna=False)

United States    903
Germany          160
China            153
Russia           119
Japan             96
                ... 
Bahrain            1
Ecuador            1
Georgia            1
Bermuda            1
Tanzania           1
Name: location.citizenship, Length: 73, dtype: int64

#### demographics.gender

In [104]:
df['demographics.gender'].value_counts(dropna=False)

Male              2328
Female             249
NaN                 34
Married Couple       3
Name: demographics.gender, dtype: int64

#### company.type

In [105]:
df['company.type'].value_counts(dropna=False)

New                       2302
Aquired                    196
Privatization               42
NaN                         36
Subsidiary                   9
Acquired                     7
State Owned Enterprise       7
New, Aquired                 3
Merger                       3
Franchise                    2
New, Privitization           2
New Division                 1
Privatized                   1
Neew                         1
Franchise Rights             1
Joint Venture                1
Name: company.type, dtype: int64

In [106]:
df['company.type'] = df['company.type'].replace(
    {
        'Aquired': 'Acquired',
        'Neew': 'New',
        'New, Privitization': 'New, Privatization',
    }
)

#### company.sector

In [107]:
df['company.sector'].sort_values().unique()

array(['Advertising', 'Aerospace, Defense', 'Agribusiness',
       'Agriculteral', 'Agricultural Products', 'Agriculture',
       'Aigriculture', 'Air Compressors', 'Aircraft Leasing', 'Airline',
       'Airplanes', 'Airport', 'Airport Maintenance', 'Airports',
       'Alcohol', 'Aluminum', 'Aluminum, Oil', 'Animal Feed', 'Animation',
       'Antennas', 'Aplliances', 'Apparel', 'Apparel Retail',
       'Apparel, Eyewear', 'Appliances', 'Architectural Products',
       'Armaments', 'Art Dealing', 'Asset Management', 'Auto Components',
       'Auto Dealerships', 'Auto Engines', 'Auto Glass', 'Auto Parts',
       'Auto Repair', 'Auto Retail', 'Auto Sales, Energy',
       'Automobile Dealers', 'Automobiles', 'Automotive',
       'Automotive Components', 'Automotive Parts', 'Automotives',
       'Autos', 'Aviation', 'Banannas', 'Bank', 'Banking',
       'Banking, Insurance', 'Banking, Media', 'Banking, Oil, Aluminum',
       'Banking, Publishing', 'Banking, Real Estate',
       'Beauty, Hea

In [108]:
similar(df["company.sector"].sort_values().unique())

Agricultural Products 
 ------------------------------
['Architectural Products'] 


Agriculture 
 ------------------------------
['Aigriculture'] 


Airport 
 ------------------------------
['Airports'] 


Animation 
 ------------------------------
['Aviation'] 


Aplliances 
 ------------------------------
['Appliances'] 


Asset Management 
 ------------------------------
['Waste Management'] 


Auto Components 
 ------------------------------
['Automotive Components'] 


Auto Repair 
 ------------------------------
['Auto Retail'] 


Automobiles 
 ------------------------------
['Automotives'] 


Automotive 
 ------------------------------
['Automotives'] 


Automotive Parts 
 ------------------------------
['Automotives'] 


Beer, Food Distribution 
 ------------------------------
['Food Distribution'] 


Biopharmaceutical 
 ------------------------------
['Pharmaceuticals'] 


Brake Systems 
 ------------------------------
['Braking Systems'] 


Broadcasting 
 -------------------

In [109]:
df['company.sector'] = df['company.sector'].replace(
    {
        'Agriculteral': 'Agricultural',
        'Aigriculture': 'Agriculture',
        'Aplliances': 'Appliances',
        'Comodities': 'Commodities',
        'Construcion': 'Construction',
        'Electonics': 'Electronics',
        'Electonics Components': 'Electronic Components',
        'Fertalizers': 'Fertilizers',
        'Finace': 'Finance',
        'Gambing': 'Gambling',
        'Invetsments': 'Investments',
        'Mutal Funds': 'Mutual Funds',
        'Telecomm': 'Telecom',
        'Electronic Componants': 'Electronic Components',
        'Food And Energy Processess': 'Food And Energy Processes',
        'Insurancei, Power': 'Insurance, Power',
    }
)

#### company.relationship

In [110]:
df['company.relationship'].sort_values().unique()

array(['Ceo', 'Chairman', 'Chairman Of Management Committee',
       'Chairman Of The Board', 'Chairman, Ceo',
       'Chairman, Chief Executive Officer', 'Chairman, Founder',
       'Chairman, Shareholder', 'Chairwomen', 'Chariman',
       'Chief Executive', 'Co-Chairman',
       'Co-Director Of Zinc, Copper, Lead', 'Coo', 'Deputy Chairman',
       'Director', 'Employee', 'Exectuitve Director',
       'Executive Chairman', 'Former Ceo', 'Former Chairman, Ceo',
       'Founder', 'Founder Ceo Owner', 'Founder, Ceo',
       'Founder, Chairman', 'Founder, Chairman, Ceo',
       'Founder, Chairwoman', 'Founder, Chairwoman, Ceo',
       'Founder, Executive Chairman', 'Founder, Executive Vice Chairman',
       'Founder, President', 'Founder, Relation',
       'Founder, Vice Chairman', 'General Director',
       'Global Head Of Real Estate', 'Head Of Board Of Directors',
       'Head Of High-Yield Bond Trading Dept',
       "Head Of Microsoft'S Application Software Group",
       'Honorary Pr

In [111]:
similar(df["company.relationship"].sort_values().unique())

Chairman 
 ------------------------------
['Chariman', 'Co-Chairman'] 


Former Chairman, Ceo 
 ------------------------------
['Founder, Chairman, Ceo', 'Founder, Chairwoman, Ceo'] 


Founder, Chairman 
 ------------------------------
['Founder, Chairman, Ceo', 'Founder, Chairwoman', 'Founder, Chairwoman, Ceo', 'Founder, Vice Chairman'] 


Founder, Chairman, Ceo 
 ------------------------------
['Founder, Chairwoman', 'Founder, Chairwoman, Ceo'] 


Founder, Chairwoman 
 ------------------------------
['Founder, Chairwoman, Ceo', 'Founder, Vice Chairman'] 


Founder, Executive Chairman 
 ------------------------------
['Founder, Executive Vice Chairman', 'Founder, Vice Chairman'] 


Founder, Executive Vice Chairman 
 ------------------------------
['Founder, Vice Chairman'] 


Inventor 
 ------------------------------
['Investor'] 


Lawer 
 ------------------------------
['Lawyer'] 


Relation, Chairman 
 ------------------------------
['Relation, Vice Chairman'] 


Vice Chairman 
 --

In [112]:
df['company.relationship'] = df['company.relationship'].replace(
    {
        'Chariman': 'Chairman',
        'Lawer': 'Lawyer',
        'Vice-Chairman': 'Vice Chairman',
    }
)

### Common post-processings
The features require the following <b>post-processings</b>:
<ul>
    <li>Filling in the missing values (NAs) with 'Unspecified'</li>
    <li>Replacing the value '0' with 'Unspecified'</li>
    <li>Casting the variable from string (object) to categorical</li>
</ul>

In [113]:
# The piece of code below performs all mentioned post-processings for all the
# string features except for 'name' which is not supposed to be transformed
# to categorical


for col in df.select_dtypes(include=['object']).columns:
    if col == 'name':
        continue
    
    # Replacing the value of '0' with Null
    df[col] = df[col].replace(
        {
            '0': None,
        }
    )

    # Filling Null values with the value 'Unspecified'
    df[col] = df[col].fillna('Unspecified')

    # Casting the feature into categorical type
    df[col] = df[col].astype('category')

### name

Listing possible inconsistancies where the names are different, but the following features are the same:
<ul>
    <li>company.founded</li>
    <li>company.name</li>
    <li>company.relationship</li>
    <li>company.sector</li>
    <li>company.type</li>
    <li>demographics.gender</li>
    <li>location.citizenship</li>
    <li>location.country code</li>
    <li>location.region</li>
    <li>wealth.type</li>
    <li>wealth.how.category</li>
    <li>wealth.how.from emerging</li>
    <li>wealth.how.industry</li>
    <li>wealth.how.inherited</li>
</ul>

In [None]:
dup_cols = ['company.founded',
            'company.name',
            'company.relationship',
            'company.sector',
            'company.type',
            'demographics.gender',
            'location.citizenship',
            'location.country code',
            'location.region',
            'wealth.type',
            'wealth.how.category',
            'wealth.how.from emerging',
            'wealth.how.industry',
            'wealth.how.inherited']

dup_rows = df[df.duplicated(subset=dup_cols)]
dup_rows[duprows[]]

In [None]:
import re

df[df['name'].apply(lambda x: (re.search('abraham', x, re.IGNORECASE) is not None))]

In [None]:
import re

df[df['name'].apply(lambda x: (re.search('abraham', x, re.IGNORECASE) is not None))]