# Notebook to look into the Forbes list of billionaires to see what it tells us

In [None]:
import numpy as np
import pandas as pd
pd.plotting.register_matplotlib_converters()
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns
print('Setup Complete')

In [None]:
# Load the Forbes CSV; the first column of the file is used as the row index
data_path = 'forbes_2022.csv'
df = pd.read_csv(data_path, index_col=[0])

In [None]:
# Derive helper columns used by the later sections:
#   multi_people - True when the name contains '&' or ' and ' (more than one person)
#   multi_source - True when the source contains a comma (several sources listed)
#   age_bins     - ten-year age brackets (left-open, right-closed) spanning 10 to 100
# na=False keeps both flags strictly boolean when a name/source is missing (a missing
# value is treated as "not multiple"); regex=False matches the separators literally.
df['multi_people'] = (
    df['name'].str.contains('&', regex=False, na=False)
    | df['name'].str.contains(' and ', regex=False, na=False)
)
df['multi_source'] = df['source'].str.contains(',', regex=False, na=False)
df['age_bins'] = pd.cut(df['age'], bins=list(range(10, 101, 10)))
df.head()

# Which industry has the most billionaires

In [None]:
# Count of billionaires per industry, bars ordered from most to least common
industry_order = df['industry'].value_counts().index
fig, ax = plt.subplots(figsize=(24, 8))
sns.countplot(data=df, x='industry', order=industry_order, ax=ax)
plt.xticks(rotation=45)

In [None]:
# Report the most common industry and how many billionaires it contains.
industry_counts = df['industry'].value_counts()  # computed once, reused below
max_ind = industry_counts.idxmax()
max_num = industry_counts.max()
# strip() removes any stray whitespace around the label so that exactly one space
# separates the industry name from "with" in the sentence.
print(f'The industry with the most billionaires is {str(max_ind).strip()} with {max_num} billionaires in this area.')

# What is the age split of billionaires

In [None]:
# Count of billionaires in each age bracket
fig, ax = plt.subplots(figsize=(24, 8))
sns.countplot(data=df, x='age_bins', ax=ax)

In [None]:
# Summarise the age extremes and the most populated age bracket.
ages = df['age']
max_age = ages.max()
min_age = ages.min()
# idxmax() on the bracket counts yields the Interval with the most rows
age_range = df['age_bins'].value_counts().idxmax()
print(f'The youngest billionaire is {min_age} years old and the oldest is {max_age} years old.')
print(f'Most billionaires are in the range {age_range.left} to {age_range.right} years old.')

# Where are the billionaires

In [None]:
# Build a per-country lookup and attach it to every row of df:
#   area   - the country itself, or 'Rest of world' for countries with fewer than 20 billionaires
#   counts - number of billionaires in that area
country_count = df['country'].value_counts().rename_axis('country').reset_index(name='counts')
is_rest = country_count['counts'] < 20
country_count['area'] = np.where(is_rest, 'Rest of world', country_count['country'])
# The 'Rest of world' bucket must hold the total number of billionaires across all of
# its countries (not the number of countries in the bucket), so that it is comparable
# with the per-country counts when plotted.
rest_total = country_count.loc[is_rest, 'counts'].sum()
country_count['counts'] = np.where(is_rest, rest_total, country_count['counts'])
df = df.merge(country_count).sort_values(by=['counts'], ascending=False)

In [None]:
# Bar chart of the 'counts' column for each area
fig, ax = plt.subplots(figsize=(24, 8))
sns.barplot(data=df, x='area', y='counts', ax=ax)
plt.xticks(rotation=45)

In [None]:
# Share of rows per country; report the country with the largest share as a percentage
country_share = df['country'].value_counts(normalize=True)
co_max = country_share.idxmax()
co_max_perc = (country_share.max() * 100).round(1)
print(f'{co_max_perc}% of billionaires are from {co_max}.')

# Are most billionaires individuals

In [None]:
# Number of entries flagged as multiple people versus not
fig, ax = plt.subplots(figsize=(14, 7))
sns.countplot(data=df, x='multi_people', ax=ax)

In [None]:
# True when more than half of the entries are NOT flagged as multiple people.
# Computed from the share directly: picking the rarer value with idxmin() gives the
# wrong answer when every row has the same flag (only one value is present).
indiv = bool(df['multi_people'].eq(False).mean() > 0.5)
print(f'It is {indiv} that most billionaires are individuals.')

# Do most billionaires have a single source

In [None]:
# Number of entries flagged as having multiple sources versus not
fig, ax = plt.subplots(figsize=(14, 7))
sns.countplot(data=df, x='multi_source', ax=ax)

In [None]:
# True when more than half of the entries are NOT flagged as multi-source.
# Computed from the share directly: picking the rarer value with idxmin() gives the
# wrong answer when every row has the same flag (only one value is present).
single = bool(df['multi_source'].eq(False).mean() > 0.5)
# The f prefix is required so that {single} is interpolated rather than printed literally.
print(f'It is {single} that most billionaires have a single source.')