# Notebook to look at Nobel prize winners

In [None]:
#standard imports
import numpy as np
import pandas as pd
pd.plotting.register_matplotlib_converters()
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns
print('Setup Complete')
import pandas_profiling
from pandas_profiling import ProfileReport

In [None]:
# Load the Nobel prize data from the CSV next to the notebook and preview the first rows.
data_path = 'nobel_final.csv'
df = pd.read_csv(data_path)
df.head()

In [None]:
# Bucket the award age and the award year into eight equal-width intervals each;
# the year intervals are labelled with whole-number edges (precision=0).
age_bins = pd.cut(df['age_get_prize'], bins=8)
year_bins = pd.cut(df['year'], bins=8, precision=0)
df['age_range'] = age_bins
df['year_range'] = year_bins

In [None]:
#Where are the majority of winners born
# Limit the x axis to the ten most frequent birth-country codes, most common first.
top_born_codes = df['born_country_code'].value_counts().iloc[:10].index
plt.figure(figsize=(28,6))
sns.countplot(data=df, x='born_country_code', order=top_born_codes)

In [None]:
#Where did the majority of winners go to university
# Limit the x axis to the ten most frequent university countries, most common first.
top_uni_countries = df['country_of_university'].value_counts().iloc[:10].index
plt.figure(figsize=(28,6))
sns.countplot(data=df, x='country_of_university', order=top_uni_countries)
# Country names are long, so tilt the tick labels to keep them readable.
plt.xticks(rotation=45)

In [None]:
# Keep laureates tied to the USA, United Kingdom, Germany or France either by
# where they studied or by where they were born.
studied_in_top = df['country_of_university'].isin(['USA', 'United Kingdom','Germany', 'France'])
born_in_top = df['born_country_code'].isin(['US', 'GB', 'DE', 'FR'])
top_countries = df[studied_in_top | born_in_top]

In [None]:
# Birth-country code against university country for the top_countries subset.
plt.figure(figsize=(28,16))
sns.stripplot(data=top_countries, x='country_of_university', y='born_country_code')
plt.title("Country of birth v Country of uni")
plt.xticks(rotation=45)
plt.xlabel("University")
plt.ylabel("Birth")

In [None]:
#what is the male to female ratio
# One bar per gender value, bar height = number of rows.
sns.countplot(x='gender', data=df)

In [None]:
#what is the category split by gender
# Row counts per prize category, with one bar per gender inside each category.
plt.figure(figsize=(12,6))
sns.countplot(x='category', hue='gender', data=df)

In [None]:
#what is the age distribution for winners, split by gender
# Histogram of the age at which the prize was received, coloured by gender.
sns.histplot(x='age_get_prize', hue='gender', data=df)

In [None]:
#how has the male to female ratio changed over the years
# Row counts per award-year interval, with one bar per gender inside each interval.
plt.figure(figsize=(18,6))
sns.countplot(x='year_range', hue='gender', data=df)

In [None]:
#how has the male to female count changed over the years
# Same counts as the previous view, but grouped by gender with one bar per award-year interval.
plt.figure(figsize=(18,6))
sns.countplot(x='gender', hue='year_range', data=df)

In [None]:
#how has the category changed over the years
# Row counts per prize category, with one bar per award-year interval.
plt.figure(figsize=(18,6))
sns.countplot(x='category', hue='year_range', data=df)

In [None]:
#how does the average age change over the years and is this different per gender
age_gender_year = df[['year_range', 'age_get_prize', 'gender']]
# year_range is categorical (built with pd.cut). observed=False is spelled out so
# every interval stays in the result, empty ones as NaN, regardless of the pandas
# version's default for categorical groupers.
age_year_group = (
    age_gender_year
    .groupby(['year_range', 'gender'], observed=False)['age_get_prize']
    .mean()
    .reset_index()
)
plt.figure(figsize=(18,6))
# One line per gender: mean award age for each award-year interval.
sns.pointplot(x="year_range", y="age_get_prize", data=age_year_group, hue='gender')

In [None]:
#what is the gender split across the country born (top 4 countries by count)
# The four most frequent birth-country codes, most common first.
top4_born_codes = df['born_country_code'].value_counts().iloc[:4].index
plt.figure(figsize=(28,6))
sns.countplot(data=df, x='born_country_code', hue='gender', order=top4_born_codes)

In [None]:
#what is the gender split across the country of uni (top 4 countries by count)
# The four most frequent university countries, most common first.
top4_uni_countries = df['country_of_university'].value_counts().iloc[:4].index
plt.figure(figsize=(28,6))
sns.countplot(data=df, x='country_of_university', hue='gender', order=top4_uni_countries)

In [None]:
#what is the spread of age of award for each category
plt.figure(figsize=(18,6))
# Use age_get_prize (the age when the prize was received), the same column every
# other award-age cell in this notebook uses, rather than the separate 'age' column.
sns.swarmplot(x=df['category'], y=df['age_get_prize'])

In [None]:
#what is the spread of category for each age of award
# One point per row: award-age interval on x, prize category on y.
plt.figure(figsize=(16,6))
sns.stripplot(data=df, x='age_range', y='category')

In [None]:
#age of award and country born (using top countries)
# One point per row of top_countries: award-age interval on x, birth-country code on y.
plt.figure(figsize=(16,16))
sns.stripplot(data=top_countries, x='age_range', y='born_country_code')

In [None]:
#country born and category
# One point per row of top_countries: prize category on x, birth-country code on y.
plt.figure(figsize=(16,16))
sns.stripplot(data=top_countries, x='category', y='born_country_code')

In [None]:
#introducing continents
# Lookup table: two-letter birth-country code -> continent of birth.
continent_list1 = pd.DataFrame({'born_country_code': ['DZ','EG','ET','GH','KE','LR','MA','MG','NG','ZA','ZW','AR','BR',
                                            'CA','CL','CO','CR','GP','GT','LC','MX','PE','TT','US','VE','AZ','BD',
                                            'CN','CY','ID','IL','IN','IQ','IR','JP','KR','MM','PK','RU','TR','TW',
                                            'VN','YE','AT','BA','BE','BG','BY','CH','CZ','DE','DK','ES','FI','FR',
                                            'GB','GR','HR','HU','IE','IS','IT','LT','LU','LV',
                                            'MK','NL','NO','PL','PT','RO','SE','SI','SK','UA','AU','NZ','CD','TL'],
                    'born_continent': ['Africa','Africa','Africa','Africa','Africa','Africa','Africa','Africa','Africa',
                             'Africa','Africa','Americas','Americas','Americas','Americas','Americas','Americas',
                             'Americas','Americas','Americas','Americas','Americas','Americas','Americas','Americas',
                             'Asia','Asia','Asia','Asia','Asia','Asia','Asia','Asia','Asia','Asia','Asia','Asia',
                             'Asia','Asia','Asia','Asia','Asia','Asia','Europe','Europe','Europe','Europe','Europe',
                             'Europe','Europe','Europe','Europe','Europe','Europe','Europe','Europe','Europe','Europe'
                             ,'Europe','Europe','Europe','Europe','Europe','Europe','Europe','Europe','Europe',
                             'Europe','Europe','Europe','Europe','Europe','Europe','Europe','Europe','Oceania',
                             'Oceania','Africa','Asia']})

# Lookup table: university country name (written without spaces) -> continent of the university.
continent_list2 = pd.DataFrame({'country_of_university': ['Germany','Canada','theNetherlands','USSR(nowRussia)',
                                                          'India','Finland','Portugal','Germany(nowFrance)',
                                                          'Austria','France','Belgium','Japan',
                                                          'Czechoslovakia(nowCzechRepublic)','USA','UnitedKingdom',
                                                          'Argentina','Russia','Ireland','Spain','Sweden','Israel',
                                                          'Italy','Switzerland','Tunisia','Australia','China',
                                                          'Denmark','Hungary','Norway'],
                    'uni_continent': ['Europe','Americas','Europe','Asia','Asia','Europe','Europe','Europe','Europe',
                                      'Europe','Europe','Asia','Europe','Americas','Europe','Americas','Asia','Europe'
                                      ,'Europe','Europe','Asia','Europe','Europe','Africa','Oceania','Asia','Europe',
                                      'Europe','Europe']})

# Attach the continents with a lookup instead of the default inner merge. An inner
# merge silently drops every laureate whose key is missing from a lookup table
# (for example rows with no university recorded); mapping keeps all rows and
# leaves the continent as NaN where no match exists.
born_lookup = dict(zip(continent_list1['born_country_code'], continent_list1['born_continent']))

# The university lookup keys contain no spaces, while an earlier cell filters this
# column on 'United Kingdom', so whitespace is stripped from both sides before
# matching to make 'United Kingdom' and 'UnitedKingdom' resolve to the same entry.
uni_lookup = dict(zip(
    continent_list2['country_of_university'].str.replace(r'\s+', '', regex=True),
    continent_list2['uni_continent'],
))

df['born_continent'] = df['born_country_code'].map(born_lookup)
df['uni_continent'] = (
    df['country_of_university']
    .str.replace(r'\s+', '', regex=True)
    .map(uni_lookup)
)

In [None]:
#age of award and continent
# One point per row: award-age interval on x, continent of birth on y.
plt.figure(figsize=(16,6))
sns.stripplot(data=df, x='age_range', y='born_continent')

In [None]:
#continent and category
# One point per row: prize category on x, continent of birth on y.
plt.figure(figsize=(16,6))
sns.stripplot(data=df, x='category', y='born_continent')

In [None]:
# One point per row: continent of birth on x, continent of the university on y.
plt.figure(figsize=(16,6))
sns.stripplot(data=df, x='born_continent', y='uni_continent')

In [None]:
# Show the profiling report for df as this cell's output; the profile_report
# accessor relies on the pandas_profiling import at the top of the notebook.
df.profile_report()

In [None]:
# Build the profiling report for df and write it out as an HTML file
# in the current working directory.
report_title = 'Pandas Profile Report'
profile = ProfileReport(df, title=report_title)
profile.to_file('profile_report.html')

In [None]:
#TODO: what can we predict from this data?