In [122]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

import re # The RegEx module used for string operations

In [123]:
# Defining some text formatting constants:

# Start making the text bold
F_BOLD = '\033[1m'

# Start underlining the text
F_UNDERLINE = '\033[4m'

# End text formatting
F_END = '\033[0m'

# Divider line beneath text
F_DIV = '─'*60

# Loading the dataset

In [124]:
df = pd.read_csv('./billionaires.csv')

In [125]:
print((f"\nThe dataset consists of {F_BOLD}%d instances{F_END}" +
       f" and {F_BOLD}%d features{F_END}:\n{F_DIV}\n")
      %df.shape)

# Setting this option enables the presentation of all features instead of
# omitting the middle ones:
from IPython.display import display
with pd.option_context('display.max_columns', None):
    display(df.sample(5)) # Retrieves random instances from the dataset


The dataset consists of [1m2614 instances[0m and [1m22 features[0m:
────────────────────────────────────────────────────────────



Unnamed: 0,name,rank,year,company.founded,company.name,company.relationship,company.sector,company.type,demographics.age,demographics.gender,location.citizenship,location.country code,location.gdp,location.region,wealth.type,wealth.worth in billions,wealth.how.category,wealth.how.from emerging,wealth.how.industry,wealth.how.inherited,wealth.how.was founder,wealth.how.was political
1475,Victor Pinchuk,506,2014,1983,Interpipe,founder,pipelines,new,53,male,Ukraine,UKR,0.0,Europe,privatized and resources,3.2,Resource Related,True,Non-consumer industrial,not inherited,True,True
461,Saleh Kamel,151,2001,1969,Dallah al Baraka Group,founder,airport maintenance,new,59,male,Saudi Arabia,SAU,183000000000.0,Middle East/North Africa,self-made finance,3.0,Financial,True,Money Management,not inherited,True,True
1246,Murat Ulker,408,2014,1944,Yildiz Holding,relation,food products,new,54,male,Turkey,TUR,0.0,Middle East/North Africa,inherited,3.7,Traded Sectors,True,Consumer,father,True,True
2392,Rakesh Jhunjhunwala,1372,2014,1968,Lupin,investor,pharmaceuticals,new,53,male,India,IND,0.0,South Asia,self-made finance,1.2,Financial,True,Other,not inherited,True,True
2399,Tu Jianhua,1372,2014,0,Lancing Motor,chairman,motorcycles,new,50,male,China,CHN,0.0,East Asia,executive,1.2,Traded Sectors,True,Consumer,not inherited,True,True


## Features and their initial types

In [126]:
print((f"The {F_BOLD}string{F_END} and/or {F_BOLD}categorical{F_END} features" +
       f" include:\n{F_DIV}\n{df.select_dtypes(include=['object']).dtypes}\n"))

print((f"The {F_BOLD}numerical{F_END} features include:\n{F_DIV}\n" +
       f"{df.select_dtypes(include=['int64', 'float64']).dtypes}\n"))

print((f"There are also several {F_BOLD}boolean{F_END} features, including:\n" +
       f"{F_DIV}\n{df.select_dtypes(include=['bool']).dtypes}\n"))

The [1mstring[0m and/or [1mcategorical[0m features include:
────────────────────────────────────────────────────────────
name                     object
company.name             object
company.relationship     object
company.sector           object
company.type             object
demographics.gender      object
location.citizenship     object
location.country code    object
location.region          object
wealth.type              object
wealth.how.category      object
wealth.how.industry      object
wealth.how.inherited     object
dtype: object

The [1mnumerical[0m features include:
────────────────────────────────────────────────────────────
rank                          int64
year                          int64
company.founded               int64
demographics.age              int64
location.gdp                float64
wealth.worth in billions    float64
dtype: object

There are also several [1mboolean[0m features, including:
────────────────────────────────────────────────────

# Data cleaning up
This includes all data cleanse measures, including:
<ul>
    <li>Identification and correction of spelling errors</li>
    <li>Making common categories (values in categorical features) consistent</li>
    <li>Filling in missing data</li>
    <li>Removing duplicates</li>
    <li>Inspecting outliers</li>
    <li>Casting features to the suitable types according to the data that they contain</li>
</ul>

## Processing the Boolean features
All of these features contain only a single value (True), and as a result, they do not mandate any data cleanse tasks.

### wealth.how.from emerging

In [127]:
df['wealth.how.from emerging'].unique()

array([ True])

### wealth.how.was political

In [128]:
df['wealth.how.was political'].unique()

array([ True])

### wealth.how.was founder

In [129]:
df['wealth.how.was founder'].unique()

array([ True])

## Processing the categorical features
These features generally need the following retouches:
<ul>
    <li>Correcting the spelling errors and typos</li>
    <li>Removing trailing and leading spaces from names</li>
    <li>Removing all consecutive spaces in the middle of the names</li>
    <li>Removing all extra dots</li>
    <li>Capitalizing the first letter of each word</li>
    <li>Transforming 'and' and '/' to ','
    <li>Filling in the missing values (NAs) with 'Unspecified'</li>
    <li>Filling in the value '0' with 'Unspecified'</li>
    <li>Casting the variable from string (object) to categorical</li>
</ul>

In [130]:
# The piece of code below performs all mentioned refactorings except for
# Spelling errors correction which needs to be done case by case

for col in df.select_dtypes(include=['object']).columns:
    df[col] = df[col].apply(
        lambda x: ' '.join(
            x.replace('..', '.')
             .title()
             .replace(' And ', ', ')
             .replace('/', ', ')
             .split())
            if isinstance(x, str) else x
    )

In [131]:
from difflib import SequenceMatcher

# The following function receives a lists of words and maps the similar ones
# together; it is used to clean up categorical features by pinpointing the
# 'almost' similar words and consequently helping in the identification of the
# spelling errors.

def similar(word_list, similarity_degree = .8):
    for i in range(len(word_list)):
        first_word = word_list[i]
        if pd.isna(first_word):
            continue
        res = []
        for j in range(i+1, len(word_list)):
            second_word = word_list[j]
            if pd.isna(second_word):
                continue
            if SequenceMatcher(
                None, 
                first_word, second_word).ratio() > similarity_degree:
                res = res + [second_word]
        if len(res) > 0:
            print(first_word, '\n', '-'*30)
            print(res, '\n\n')

### wealth.how.inherited

In [132]:
df['wealth.how.inherited'].value_counts(dropna=False)

Not Inherited               1688
Father                       558
3Rd Generation               210
4Th Generation                68
Spouse, Widow                 59
5Th Generation Or Longer      31
Name: wealth.how.inherited, dtype: int64

In [133]:
# Casting the feature into categorical type
df['wealth.how.inherited'] = df['wealth.how.inherited'].astype('category')

df['wealth.how.inherited'].dtype

CategoricalDtype(categories=['3Rd Generation', '4Th Generation',
                  '5Th Generation Or Longer', 'Father', 'Not Inherited',
                  'Spouse, Widow'],
, ordered=False)

### wealth.how.industry

In [134]:
df['wealth.how.industry'].value_counts(dropna=False)

Consumer                            471
Retail, Restaurant                  281
Real Estate                         280
Money Management                    249
Media                               219
Technology-Computer                 208
Diversified Financial               167
Energy                              132
Technology-Medical                  111
Non-Consumer Industrial             107
Constrution                          97
Mining, Metals                       90
Other                                83
Hedge Funds                          67
Private Equity, Leveraged Buyout     25
0                                    16
Venture Capital                       8
Banking                               1
Services                              1
NaN                                   1
Name: wealth.how.industry, dtype: int64

In [135]:
# Replacing the value of '0' with 'Unspecified'
df['wealth.how.industry'] = df['wealth.how.industry'].replace(
    {
        '0': 'Unspecified',
    }
)

# Filling Null values with the value 'Unspecified'
df['wealth.how.industry'] = df['wealth.how.industry'].fillna('Unspecified')

# Casting the feature into categorical type
df['wealth.how.industry'] = df['wealth.how.industry'].astype('category')

df['wealth.how.industry'].dtype

CategoricalDtype(categories=['Banking', 'Constrution', 'Consumer',
                  'Diversified Financial', 'Energy', 'Hedge Funds', 'Media',
                  'Mining, Metals', 'Money Management',
                  'Non-Consumer Industrial', 'Other',
                  'Private Equity, Leveraged Buyout', 'Real Estate',
                  'Retail, Restaurant', 'Services', 'Technology-Computer',
                  'Technology-Medical', 'Unspecified', 'Venture Capital'],
, ordered=False)

### wealth.how.category

In [136]:
df['wealth.how.category'].value_counts(dropna=False)

Financial             800
Non-Traded Sectors    597
Traded Sectors        564
New Sectors           319
Resource Related      245
0                      85
NaN                     1
Finance                 1
Trucking                1
Energy                  1
Name: wealth.how.category, dtype: int64

In [137]:
# Replacing the value of '0' with 'Unspecified'
df['wealth.how.category'] = df['wealth.how.category'].replace(
    {
        '0': 'Unspecified',
    }
)

# Filling Null values with the value 'Unspecified'
df['wealth.how.category'] = df['wealth.how.category'].fillna('Unspecified')

# Casting the feature into categorical type
df['wealth.how.category'] = df['wealth.how.category'].astype('category')

df['wealth.how.category'].dtype

CategoricalDtype(categories=['Energy', 'Finance', 'Financial', 'New Sectors',
                  'Non-Traded Sectors', 'Resource Related', 'Traded Sectors',
                  'Trucking', 'Unspecified'],
, ordered=False)

### wealth.type

In [138]:
df['wealth.type'].value_counts(dropna=False)

Inherited                953
Founder Non-Finance      713
Self-Made Finance        500
Privatized, Resources    236
Executive                190
NaN                       22
Name: wealth.type, dtype: int64

In [139]:
# Filling Null values with the value 'Unspecified'
df['wealth.type'] = df['wealth.type'].fillna('Unspecified')

# Casting the feature into categorical type
df['wealth.type'] = df['wealth.type'].astype('category')

df['wealth.type'].dtype

CategoricalDtype(categories=['Executive', 'Founder Non-Finance', 'Inherited',
                  'Privatized, Resources', 'Self-Made Finance', 'Unspecified'],
, ordered=False)

### location.region

In [140]:
df['location.region'].value_counts(dropna=False)

North America                992
Europe                       698
East Asia                    535
Latin America                182
Middle East, North Africa    117
South Asia                    69
Sub-Saharan Africa            20
0                              1
Name: location.region, dtype: int64

In [141]:
df['location.region'] = df['location.region'].replace(
    {
        '0': 'Unspecified'
    }
)

# Casting the feature into categorical type
df['location.region'] = df['location.region'].astype('category')

df['location.region'].dtype

CategoricalDtype(categories=['East Asia', 'Europe', 'Latin America',
                  'Middle East, North Africa', 'North America', 'South Asia',
                  'Sub-Saharan Africa', 'Unspecified'],
, ordered=False)

### location.country code

In [142]:
df['location.country code'].value_counts(dropna=False)

Usa    903
Deu    160
Chn    153
Rus    119
Jpn     96
      ... 
Bmu      1
Swz      1
Ago      1
Bhr      1
Tza      1
Name: location.country code, Length: 74, dtype: int64

In [143]:
# Casting the feature into categorical type
df['location.country code'] = df['location.country code'].astype('category')

df['location.country code'].dtype

CategoricalDtype(categories=['Ago', 'Are', 'Arg', 'Aus', 'Aut', 'Bel', 'Bhr', 'Bmu',
                  'Bra', 'Can', 'Che', 'Chl', 'Chn', 'Col', 'Cyp', 'Cze',
                  'Den', 'Deu', 'Dnk', 'Dza', 'Ecu', 'Egy', 'Esp', 'Fin',
                  'Fra', 'Gbr', 'Geo', 'Ggy', 'Grc', 'Hkg', 'Idn', 'Ind',
                  'Irl', 'Isr', 'Ita', 'Jpn', 'Kaz', 'Kna', 'Kor', 'Kwt',
                  'Lbn', 'Lie', 'Ltu', 'Mac', 'Mar', 'Mco', 'Mex', 'Mys',
                  'Nga', 'Nld', 'Nor', 'Npl', 'Nzl', 'Omn', 'Per', 'Phl',
                  'Pol', 'Prt', 'Rou', 'Rus', 'Sau', 'Sgp', 'Swe', 'Swz',
                  'Taiwan', 'Tha', 'Tur', 'Tza', 'Uga', 'Ukr', 'Usa', 'Ven',
                  'Vnm', 'Zaf'],
, ordered=False)

### location.citizenship

In [144]:
df['location.citizenship'].value_counts(dropna=False)

United States    903
Germany          160
China            153
Russia           119
Japan             96
                ... 
Bahrain            1
Ecuador            1
Georgia            1
Bermuda            1
Tanzania           1
Name: location.citizenship, Length: 73, dtype: int64

In [145]:
# Casting the feature into categorical type
df['location.citizenship'] = df['location.citizenship'].astype('category')

df['location.citizenship'].dtype

CategoricalDtype(categories=['Algeria', 'Angola', 'Argentina', 'Australia', 'Austria',
                  'Bahrain', 'Belgium', 'Bermuda', 'Brazil', 'Canada', 'Chile',
                  'China', 'Colombia', 'Cyprus', 'Czech Republic', 'Denmark',
                  'Ecuador', 'Egypt', 'Finland', 'France', 'Georgia',
                  'Germany', 'Greece', 'Guernsey', 'Hong Kong', 'India',
                  'Indonesia', 'Ireland', 'Israel', 'Italy', 'Japan',
                  'Kazakhstan', 'Kuwait', 'Lebanon', 'Liechtenstein',
                  'Lithuania', 'Macau', 'Malaysia', 'Mexico', 'Monaco',
                  'Morocco', 'Nepal', 'Netherlands', 'New Zealand', 'Nigeria',
                  'Norway', 'Oman', 'Peru', 'Philippines', 'Poland',
                  'Portugal', 'Romania', 'Russia', 'Saudi Arabia', 'Singapore',
                  'South Africa', 'South Korea', 'Spain', 'St. Kitts, Nevis',
                  'Swaziland', 'Sweden', 'Switzerland', 'Taiwan', 'Tanzania',
                

### demographics.gender

In [146]:
df['demographics.gender'].value_counts(dropna=False)

Male              2328
Female             249
NaN                 34
Married Couple       3
Name: demographics.gender, dtype: int64

In [147]:
# Filling Null values with the value 'Unspecified'
df['demographics.gender'] = df['demographics.gender'].fillna('Unspecified')

# Casting the feature into categorical type
df['demographics.gender'] = df['demographics.gender'].astype('category')

df['demographics.gender'].dtype

CategoricalDtype(categories=['Female', 'Male', 'Married Couple', 'Unspecified'], ordered=False)

### company.type

In [148]:
df['company.type'].value_counts(dropna=False)

New                       2302
Aquired                    196
Privatization               42
NaN                         36
Subsidiary                   9
Acquired                     7
State Owned Enterprise       7
New, Aquired                 3
Merger                       3
Franchise                    2
New, Privitization           2
New Division                 1
Privatized                   1
Neew                         1
Franchise Rights             1
Joint Venture                1
Name: company.type, dtype: int64

In [149]:
df['company.type'] = df['company.type'].replace(
    {
        'Aquired': 'Acquired',
        'Neew': 'New',
        'New, Privitization': 'New, Privatization',
    }
)

In [150]:
df['company.type'].isna().sum()

36

In [151]:
df['company.type'] = df['company.type'].fillna('Unspecified')

### company.sector

In [152]:
df['company.sector'].sort_values().unique()

array(['Advertising', 'Aerospace, Defense', 'Agribusiness',
       'Agriculteral', 'Agricultural Products', 'Agriculture',
       'Aigriculture', 'Air Compressors', 'Aircraft Leasing', 'Airline',
       'Airplanes', 'Airport', 'Airport Maintenance', 'Airports',
       'Alcohol', 'Aluminum', 'Aluminum, Oil', 'Animal Feed', 'Animation',
       'Antennas', 'Aplliances', 'Apparel', 'Apparel Retail',
       'Apparel, Eyewear', 'Appliances', 'Architectural Products',
       'Armaments', 'Art Dealing', 'Asset Management', 'Auto Components',
       'Auto Dealerships', 'Auto Engines', 'Auto Glass', 'Auto Parts',
       'Auto Repair', 'Auto Retail', 'Auto Sales, Energy',
       'Automobile Dealers', 'Automobiles', 'Automotive',
       'Automotive Components', 'Automotive Parts', 'Automotives',
       'Autos', 'Aviation', 'Banannas', 'Bank', 'Banking',
       'Banking, Insurance', 'Banking, Media', 'Banking, Oil, Aluminum',
       'Banking, Publishing', 'Banking, Real Estate',
       'Beauty, Hea

In [153]:
similar(df["company.sector"].sort_values().unique())

Agricultural Products 
 ------------------------------
['Architectural Products'] 


Agriculture 
 ------------------------------
['Aigriculture'] 


Airport 
 ------------------------------
['Airports'] 


Animation 
 ------------------------------
['Aviation'] 


Aplliances 
 ------------------------------
['Appliances'] 


Asset Management 
 ------------------------------
['Waste Management'] 


Auto Components 
 ------------------------------
['Automotive Components'] 


Auto Repair 
 ------------------------------
['Auto Retail'] 


Automobiles 
 ------------------------------
['Automotives'] 


Automotive 
 ------------------------------
['Automotives'] 


Automotive Parts 
 ------------------------------
['Automotives'] 


Beer, Food Distribution 
 ------------------------------
['Food Distribution'] 


Biopharmaceutical 
 ------------------------------
['Pharmaceuticals'] 


Brake Systems 
 ------------------------------
['Braking Systems'] 


Broadcasting 
 -------------------

In [154]:
df['company.sector'] = df['company.sector'].replace(
    {
        'Agriculteral': 'Agricultural',
        'Aigriculture': 'Agriculture',
        'Aplliances': 'Appliances',
        'Comodities': 'Commodities',
        'Construcion': 'Construction',
        'Electonics': 'Electronics',
        'Electonics Components': 'Electronic Components',
        'Fertalizers': 'Fertilizers',
        'Finace': 'Finance',
        'Gambing': 'Gambling',
        'Invetsments': 'Investments',
        'Mutal Funds': 'Mutual Funds',
        'Telecomm': 'Telecom',
        'Electronic Componants': 'Electronic Components',
        'Food And Energy Processess': 'Food And Energy Processes',
        'Insurancei, Power': 'Insurance, Power',
    }
)

### company.relationship

In [163]:
print(df['company.relationship'].value_counts().sort_values())

Director                                             1
Coo                                                  1
Head Of High-Yield Bond Trading Dept                 1
Owner, Former Ceo                                    1
Deputy Chairman                                      1
Chairwomen                                           1
Founder, Executive Vice Chairman                     1
Inventor                                             1
Founder, Chairwoman                                  1
Vice President Of Infrastructure Software            1
Co-Director Of Zinc, Copper, Lead                    1
Global Head Of Real Estate                           1
Supervisory Board Or Directors                       1
Head Of Board Of Directors                           1
Founder, President                                   1
Founder, Chairwoman, Ceo                             1
Relation, Ceo                                        1
Shareholder                                          1
Exectuitve

In [156]:
similar(df["company.relationship"].sort_values().unique())

Chairman 
 ------------------------------
['Chariman', 'Co-Chairman'] 


Former Chairman, Ceo 
 ------------------------------
['Founder, Chairman, Ceo', 'Founder, Chairwoman, Ceo'] 


Founder, Chairman 
 ------------------------------
['Founder, Chairman, Ceo', 'Founder, Chairwoman', 'Founder, Chairwoman, Ceo', 'Founder, Vice Chairman'] 


Founder, Chairman, Ceo 
 ------------------------------
['Founder, Chairwoman', 'Founder, Chairwoman, Ceo'] 


Founder, Chairwoman 
 ------------------------------
['Founder, Chairwoman, Ceo', 'Founder, Vice Chairman'] 


Founder, Executive Chairman 
 ------------------------------
['Founder, Executive Vice Chairman', 'Founder, Vice Chairman'] 


Founder, Executive Vice Chairman 
 ------------------------------
['Founder, Vice Chairman'] 


Inventor 
 ------------------------------
['Investor'] 


Lawer 
 ------------------------------
['Lawyer'] 


Relation, Chairman 
 ------------------------------
['Relation, Vice Chairman'] 


Vice Chairman 
 --

In [157]:
df['company.relationship'] = df['company.relationship'].replace(
    {
        'Chariman': 'Chairman',
        'Lawer': 'Lawyer',
        'Vice-Chairman': 'Vice Chairman',
    }
)

### name

Listing possible inconsistancies where the names are different, but the following features are the same:
<ul>
    <li>company.founded</li>
    <li>company.name</li>
    <li>company.relationship</li>
    <li>company.sector</li>
    <li>company.type</li>
    <li>demographics.gender</li>
    <li>location.citizenship</li>
    <li>location.country code</li>
    <li>location.region</li>
    <li>wealth.type</li>
    <li>wealth.how.category</li>
    <li>wealth.how.from emerging</li>
    <li>wealth.how.industry</li>
    <li>wealth.how.inherited</li>
</ul>

In [None]:
dup_cols = ['company.founded',
            'company.name',
            'company.relationship',
            'company.sector',
            'company.type',
            'demographics.gender',
            'location.citizenship',
            'location.country code',
            'location.region',
            'wealth.type',
            'wealth.how.category',
            'wealth.how.from emerging',
            'wealth.how.industry',
            'wealth.how.inherited']

dup_rows = df[df.duplicated(subset=dup_cols)]
dup_rows[duprows[]]

In [None]:
import re

df[df['name'].apply(lambda x: (re.search('abraham', x, re.IGNORECASE) is not None))]

In [None]:
import re

df[df['name'].apply(lambda x: (re.search('abraham', x, re.IGNORECASE) is not None))]