## Imports

In [1]:
import numpy as np
import os
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

## Reading Data

In [24]:

dff = pd.read_csv('archive.csv')
df = dff.copy()
df.sample(5)

Unnamed: 0,Year,Category,Prize,Motivation,Prize Share,Laureate ID,Laureate Type,Full Name,Birth Date,Birth City,Birth Country,Sex,Organization Name,Organization City,Organization Country,Death Date,Death City,Death Country
364,1963,Medicine,The Nobel Prize in Physiology or Medicine 1963,"""for their discoveries concerning the ionic me...",1/3,377,Individual,Andrew Fielding Huxley,1917-11-22,Hampstead,United Kingdom,Male,University College,London,United Kingdom,2012-05-30,Grantchester,United Kingdom
915,2013,Chemistry,The Nobel Prize in Chemistry 2013,"""for the development of multiscale models for ...",1/3,889,Individual,Martin Karplus,1930-03-15,Vienna,Austria,Male,Harvard University,"Cambridge, MA",United States of America,,,
499,1977,Physics,The Nobel Prize in Physics 1977,"""for their fundamental theoretical investigati...",1/3,107,Individual,Philip Warren Anderson,1923-12-13,"Indianapolis, IN",United States of America,Male,Bell Telephone Laboratories,"Murray Hill, NJ",United States of America,,,
661,1994,Chemistry,The Nobel Prize in Chemistry 1994,"""for his contribution to carbocation chemistry""",1/1,280,Individual,George A. Olah,1927-05-22,Budapest,Hungary,Male,University of Southern California,"Los Angeles, CA",United States of America,,,
300,1955,Literature,The Nobel Prize in Literature 1955,"""for his vivid epic power which has renewed th...",1/1,626,Individual,Halldór Kiljan Laxness,1902-04-23,Reykjavik,Iceland,Male,,,,1998-02-08,Reykjavik,Iceland


## Data Exploration and Cleaning

In [4]:
print("Shape\n%s Rows \n%s Columns " % (df.shape[0],df.shape[1]))

Shape
969 Rows 
18 Columns 


In [5]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 969 entries, 0 to 968
Data columns (total 18 columns):
Year                    969 non-null int64
Category                969 non-null object
Prize                   969 non-null object
Motivation              881 non-null object
Prize Share             969 non-null object
Laureate ID             969 non-null int64
Laureate Type           969 non-null object
Full Name               969 non-null object
Birth Date              940 non-null object
Birth City              941 non-null object
Birth Country           943 non-null object
Sex                     943 non-null object
Organization Name       722 non-null object
Organization City       716 non-null object
Organization Country    716 non-null object
Death Date              617 non-null object
Death City              599 non-null object
Death Country           605 non-null object
dtypes: int64(2), object(16)
memory usage: 136.3+ KB


In [11]:
df.isnull().sum()

Year                      0
Category                  0
Prize                     0
Motivation               88
Prize Share               0
Laureate ID               0
Laureate Type             0
Full Name                 0
Birth Date               29
Birth City               28
Birth Country            26
Sex                      26
Organization Name       247
Organization City       253
Organization Country    253
Death Date              352
Death City              370
Death Country           364
dtype: int64

In [12]:
pd.unique(df["Laureate Type"])

array(['Individual', 'Organization'], dtype=object)

In [14]:
df[df["Laureate Type"] == "Organization"].shape

(30, 18)

In [15]:
df[df["Sex"].isnull()]

Unnamed: 0,Year,Category,Prize,Motivation,Prize Share,Laureate ID,Laureate Type,Full Name,Birth Date,Birth City,Birth Country,Sex,Organization Name,Organization City,Organization Country,Death Date,Death City,Death Country
24,1904,Peace,The Nobel Peace Prize 1904,,1/1,467,Organization,Institut de droit international (Institute of ...,,,,,,,,,,
61,1910,Peace,The Nobel Peace Prize 1910,,1/1,477,Organization,Bureau international permanent de la Paix (Per...,,,,,,,,,,
90,1917,Peace,The Nobel Peace Prize 1917,,1/1,482,Organization,Comité international de la Croix Rouge (Intern...,,,,,,,,,,
206,1938,Peace,The Nobel Peace Prize 1938,,1/1,503,Organization,Office international Nansen pour les Réfugiés ...,,,,,,,,,,
222,1944,Peace,The Nobel Peace Prize 1944,,1/1,482,Organization,Comité international de la Croix Rouge (Intern...,,,,,,,,,,
244,1947,Peace,The Nobel Peace Prize 1947,,1/2,508,Organization,Friends Service Council (The Quakers),,,,,,,,,,
245,1947,Peace,The Nobel Peace Prize 1947,,1/2,509,Organization,American Friends Service Committee (The Quakers),,,,,,,,,,
295,1954,Peace,The Nobel Peace Prize 1954,,1/1,515,Organization,Office of the United Nations High Commissioner...,,,,,,,,,,
365,1963,Peace,The Nobel Peace Prize 1963,,1/2,482,Organization,Comité international de la Croix Rouge (Intern...,,,,,,,,,,
366,1963,Peace,The Nobel Peace Prize 1963,,1/2,523,Organization,Ligue des Sociétés de la Croix-Rouge (League o...,,,,,,,,,,


In [18]:
#df[df["Prize Share"] == "1/2"]
df[df["Full Name"] == "Paul Ehrlich"]

Unnamed: 0,Year,Category,Prize,Motivation,Prize Share,Laureate ID,Laureate Type,Full Name,Birth Date,Birth City,Birth Country,Sex,Organization Name,Organization City,Organization Country,Death Date,Death City,Death Country
46,1908,Medicine,The Nobel Prize in Physiology or Medicine 1908,"""in recognition of their work on immunity""",1/2,302,Individual,Paul Ehrlich,1854-03-14,Strehlen (Strzelin),Prussia (Poland),Male,Goettingen University,Göttingen,Germany,1915-08-20,Bad Homburg vor der Höhe,Germany
47,1908,Medicine,The Nobel Prize in Physiology or Medicine 1908,"""in recognition of their work on immunity""",1/2,302,Individual,Paul Ehrlich,1854-03-14,Strehlen (Strzelin),Prussia (Poland),Male,Königliches Institut für experimentelle Therap...,Frankfurt-on-the-Main,Germany,1915-08-20,Bad Homburg vor der Höhe,Germany


In [19]:

#df[df["Organization Name"].isnull()]
l = df[df["Organization Name"].isnull()]
k = l[l["Laureate Type"] == "Organization"]
print("Shape\n%s Rows \n%s Columns " % (k.shape[0],k.shape[1]))


Shape
30 Rows 
18 Columns 


## Cleaning 

### Removing Duplicates 

In [20]:
print(len(df))
df.drop_duplicates(subset = "Laureate ID",keep = "first", inplace = True)
print(len(df))
df[df["Full Name"] == "Paul Ehrlich"]

969
904


Unnamed: 0,Year,Category,Prize,Motivation,Prize Share,Laureate ID,Laureate Type,Full Name,Birth Date,Birth City,Birth Country,Sex,Organization Name,Organization City,Organization Country,Death Date,Death City,Death Country
46,1908,Medicine,The Nobel Prize in Physiology or Medicine 1908,"""in recognition of their work on immunity""",1/2,302,Individual,Paul Ehrlich,1854-03-14,Strehlen (Strzelin),Prussia (Poland),Male,Goettingen University,Göttingen,Germany,1915-08-20,Bad Homburg vor der Höhe,Germany


### Deleting Motivation values

In [25]:
del df['Motivation']
df.sample(5)

Unnamed: 0,Year,Category,Prize,Prize Share,Laureate ID,Laureate Type,Full Name,Birth Date,Birth City,Birth Country,Sex,Organization Name,Organization City,Organization Country,Death Date,Death City,Death Country
626,1990,Economics,The Sveriges Riksbank Prize in Economic Scienc...,1/3,705,Individual,Merton H. Miller,1923-05-16,"Boston, MA",United States of America,Male,University of Chicago,"Chicago, IL",United States of America,2000-06-03,"Chicago, IL",United States of America
408,1969,Chemistry,The Nobel Prize in Chemistry 1969,1/2,237,Individual,Derek H. R. Barton,1918-09-08,Gravesend,United Kingdom,Male,Imperial College,London,United Kingdom,1998-03-16,"College Station, TX",United States of America
531,1980,Medicine,The Nobel Prize in Physiology or Medicine 1980,1/3,420,Individual,Jean Dausset,1916-10-19,Toulouse,France,Male,"Université de Paris, Laboratoire Immuno-Hémato...",Paris,France,2009-06-06,"Palma, Majorca",Spain
416,1969,Peace,The Nobel Peace Prize 1969,1/1,527,Organization,International Labour Organization (I.L.O.),,,,,,,,,,
955,2015,Peace,The Nobel Peace Prize 2015,1/1,925,Organization,National Dialogue Quartet,,,,,,,,,,


### Fixing Incorrect Data of Laureate Type ( Org -> Individual )

In [26]:

#df.loc[df["Sex"].isnull(), "Sex"] = "Org"

mask = (df["Laureate Type"] == "Organization") & (df["Birth Date"].notnull())
df["Laureate Type"][mask] = "Individual"

l = df[df["Laureate Type"] == "Organization"]
k = l[l["Birth Date"].notnull()]
k

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  after removing the cwd from sys.path.


Unnamed: 0,Year,Category,Prize,Prize Share,Laureate ID,Laureate Type,Full Name,Birth Date,Birth City,Birth Country,Sex,Organization Name,Organization City,Organization Country,Death Date,Death City,Death Country


In [27]:
k = df[df["Laureate Type"] == "Organization"]
print("Shape\n%s Rows \n%s Columns " % (k.shape[0],k.shape[1]))
k
#df.isnull().sum()

Shape
26 Rows 
17 Columns 


Unnamed: 0,Year,Category,Prize,Prize Share,Laureate ID,Laureate Type,Full Name,Birth Date,Birth City,Birth Country,Sex,Organization Name,Organization City,Organization Country,Death Date,Death City,Death Country
24,1904,Peace,The Nobel Peace Prize 1904,1/1,467,Organization,Institut de droit international (Institute of ...,,,,,,,,,,
61,1910,Peace,The Nobel Peace Prize 1910,1/1,477,Organization,Bureau international permanent de la Paix (Per...,,,,,,,,,,
90,1917,Peace,The Nobel Peace Prize 1917,1/1,482,Organization,Comité international de la Croix Rouge (Intern...,,,,,,,,,,
206,1938,Peace,The Nobel Peace Prize 1938,1/1,503,Organization,Office international Nansen pour les Réfugiés ...,,,,,,,,,,
222,1944,Peace,The Nobel Peace Prize 1944,1/1,482,Organization,Comité international de la Croix Rouge (Intern...,,,,,,,,,,
244,1947,Peace,The Nobel Peace Prize 1947,1/2,508,Organization,Friends Service Council (The Quakers),,,,,,,,,,
245,1947,Peace,The Nobel Peace Prize 1947,1/2,509,Organization,American Friends Service Committee (The Quakers),,,,,,,,,,
295,1954,Peace,The Nobel Peace Prize 1954,1/1,515,Organization,Office of the United Nations High Commissioner...,,,,,,,,,,
365,1963,Peace,The Nobel Peace Prize 1963,1/2,482,Organization,Comité international de la Croix Rouge (Intern...,,,,,,,,,,
366,1963,Peace,The Nobel Peace Prize 1963,1/2,523,Organization,Ligue des Sociétés de la Croix-Rouge (League o...,,,,,,,,,,


### Dropping organization since they only represent 2.5% of the dataframe ( 23 rows out of 904 rows)

In [28]:
print("Shape\n%s Rows \n%s Columns " % (df.shape[0],df.shape[1]))
df = df[df["Laureate Type"] == "Individual"]
print("Shape\n%s Rows \n%s Columns " % (df.shape[0],df.shape[1]))
df[df["Laureate Type"] == "Individual"]
df.isnull().sum()
#df[df["Death Date"].isnull()]

Shape
969 Rows 
17 Columns 
Shape
943 Rows 
17 Columns 


Year                      0
Category                  0
Prize                     0
Prize Share               0
Laureate ID               0
Laureate Type             0
Full Name                 0
Birth Date                3
Birth City                2
Birth Country             0
Sex                       0
Organization Name       221
Organization City       227
Organization Country    227
Death Date              326
Death City              344
Death Country           338
dtype: int64

### Dropping the Death City, Death Date, Death Country as they will not affect Data visualisation in any way

In [29]:
df = df.drop("Death Date", axis=1)
df = df.drop("Death City", axis=1)
df = df.drop("Death Country", axis=1)
df.isnull().sum()


Year                      0
Category                  0
Prize                     0
Prize Share               0
Laureate ID               0
Laureate Type             0
Full Name                 0
Birth Date                3
Birth City                2
Birth Country             0
Sex                       0
Organization Name       221
Organization City       227
Organization Country    227
dtype: int64


### Adding missing birth date and city for individuals 

In [43]:
df.loc[df["Laureate ID"] == 841, "Birth Date"] = "1952-04-01" #venk
df.loc[df["Laureate ID"] == 864, "Birth Date"] = "1959-09-22" #saul
df.loc[df["Laureate ID"] == 747, "Birth City"] = "Chaguanas" # sir Vidiadhar
df.loc[df["Laureate ID"] == 855, "Birth City"] = "Changchun" # liu Xiaobo
df.isnull().sum()

Year                      0
Category                  0
Prize                     0
Prize Share               0
Laureate ID               0
Laureate Type             0
Full Name                 0
Birth Date                0
Birth City                0
Birth Country             0
Sex                       0
Organization Name         0
Organization City         0
Organization Country    213
dtype: int64

In [45]:

df.loc[df["Laureate ID"] == 6, "Organization Country"] = "self" 
df.loc[df["Laureate ID"] == 6, "Organization City"] = "self" 
df.loc[df["Laureate ID"] == 6, "Organization Name"] = "self" 

df.loc[df["Laureate ID"] == 318, "Organization Country"] = "Tunisia" 

df.loc[df["Laureate ID"] == 684, "Organization Country"] = "self" 
df.loc[df["Laureate ID"] == 684, "Organization City"] = "self" 
df.loc[df["Laureate ID"] == 684, "Organization Name"] = "self" 

df.loc[df["Laureate ID"] == 685, "Organization Country"] = "self" 
df.loc[df["Laureate ID"] == 685, "Organization City"] = "self" 
df.loc[df["Laureate ID"] == 685, "Organization Name"] = "self" 

df.loc[df["Laureate ID"] == 270, "Organization Country"] = "USA" 
df.loc[df["Laureate ID"] == 270, "Organization City"] = "Maryland" 

df.loc[df["Laureate ID"] == 461, "Organization Country"] = "USA" 
df.loc[df["Laureate ID"] == 461, "Organization City"] = "Maryland" 

df.loc[df["Laureate ID"] == 770, "Organization Country"] = "USA" 
df.loc[df["Laureate ID"] == 770, "Organization City"] = "Maryland"

df.loc[df["Laureate ID"] == 811, "Organization Country"] = "USA" 
df.loc[df["Laureate ID"] == 811, "Organization City"] = "Maryland"

df.loc[df["Laureate ID"] == 831, "Organization Country"] = "USA" 
df.loc[df["Laureate ID"] == 831, "Organization City"] = "Maryland" 

df.loc[df["Laureate ID"] == 842, "Organization Country"] = "USA" 
df.loc[df["Laureate ID"] == 842, "Organization City"] = "Maryland" 

df.loc[df["Laureate ID"] == 837, "Organization Country"] = "USA" 
df.loc[df["Laureate ID"] == 837, "Organization City"] = "Maryland" 

df.loc[df["Laureate ID"] == 878, "Organization Country"] = "USA" 
df.loc[df["Laureate ID"] == 878, "Organization City"] = "Maryland" 

df.loc[df["Laureate ID"] == 885, "Organization Country"] = "USA" 
df.loc[df["Laureate ID"] == 885, "Organization City"] = "Maryland" 

df.loc[df["Laureate ID"] == 886, "Organization Country"] = "USA" 
df.loc[df["Laureate ID"] == 886, "Organization City"] = "Maryland" 

k = df[df["Organization Country"].isnull()]
l = k[k["Category"] != "Peace"]
l[l["Category"] != "Literature"]

df[df["Organization Country"].isnull()].shape


# ba3d keda el hytba22a meen b2a? el peace wl literature bas
# homa el wa7ideen el hyb2o be null, then momken terunno el 3 lines el ta7t
#dool be el text el t7boha, take care el order yb2a keda 3shan law 3mlto 
#3 lines el ta7t dool homa el nas el physics wl mediccine msh ht3rfo tgebeehom tany

df.loc[df["Organization Name"].isnull(), "Organization Name"] = "Self"
df.loc[df["Organization City"].isnull(), "Organization City"] = "Self"
df.loc[df["Organization Country"].isnull(), "Organization Country"] = "Self"

df.isnull().sum()


Year                    0
Category                0
Prize                   0
Prize Share             0
Laureate ID             0
Laureate Type           0
Full Name               0
Birth Date              0
Birth City              0
Birth Country           0
Sex                     0
Organization Name       0
Organization City       0
Organization Country    0
dtype: int64

In [46]:
df.to_csv('archiveData_Cleaned.csv')