# 9 Cleaning Data with Pandas

## Coming Clean About Dirty Data

In [1]:
import pandas as pd

In [58]:
# Load the scraped Nobel winners dataset. Passing the path (rather than an
# open() handle) lets pandas open and close the file itself, so no file
# descriptor is left dangling.
df = pd.read_json('nobel_winners/nobel_winners2.json')

In [59]:
# Display the country column as a Series.
df['country']

0        Austria
1        Austria
2               
3               
4        Austria
         ...    
321      Austria
322    Argentina
323    Argentina
324    Argentina
325    Argentina
Name: country, Length: 326, dtype: object

In [3]:
# Distinct values found in the country column (in order of first appearance).
df['country'].unique()

array(['Austria', '', 'Australia', 'Germany', 'Guatemala', 'Greece',
       'Hong Kong', 'Hungary', 'Ghana', 'France', 'Finland', 'Denmark',
       'East Timor', 'Egypt', 'Czech Republic',
       "China, People's Republic of", 'Colombia',
       'Congo, Democratic Republic', 'Costa Rica', 'China, Republic of',
       'Chile', 'Canada', 'Belgium', 'Bangladesh', 'Belarus', 'Argentina'],
      dtype=object)

## Inspecting the Data

In [4]:
# Per-column non-null counts and dtypes, plus the frame's memory footprint.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 326 entries, 0 to 325
Data columns (total 12 columns):
link              326 non-null object
name              326 non-null object
year              326 non-null int64
category          326 non-null object
country           326 non-null object
born_in           326 non-null object
text              326 non-null object
date_of_birth     321 non-null object
date_of_death     240 non-null object
place_of_birth    321 non-null object
place_of_death    240 non-null object
gender            321 non-null object
dtypes: int64(1), object(11)
memory usage: 30.7+ KB


In [5]:
# Summary statistics; with default arguments only numeric columns are described.
df.describe()

Unnamed: 0,year
count,326.0
mean,1960.01227
std,38.490596
min,1809.0
25%,1927.0
50%,1964.5
75%,1993.75
max,2018.0


In [6]:
# 9.1
# Describe the object-dtype columns instead: count, unique, top and freq.
df.describe(include=['object'])

Unnamed: 0,link,name,category,country,born_in,text,date_of_birth,date_of_death,place_of_birth,place_of_death,gender
count,326,326,326,326,326.0,326,321,240,321,240,321
unique,309,321,6,26,18.0,322,301,227,212,139,2
top,http://en.wikipedia.org/wiki/John_Polanyi,Vladimir Prelog *,Physiology or Medicine,Germany,,"Vladimir Prelog *, Ethnic Croat and national ...",23 January 1929,12 June 1982,Paris,Paris,male
freq,3,2,78,83,261.0,2,3,2,22,19,302


In [7]:
# 9.2
# Peek at the first five rows.
df.head()

Unnamed: 0,link,name,year,category,country,born_in,text,date_of_birth,date_of_death,place_of_birth,place_of_death,gender
0,http://en.wikipedia.org/wiki/Karl_Landsteiner,Karl Landsteiner,1930,Physiology or Medicine,Austria,,"Karl Landsteiner , Physiology or Medicine, 1930",14 June 1868,26 June 1943,Vienna,New York City,male
1,http://en.wikipedia.org/wiki/Elfriede_Jelinek,Elfriede Jelinek,2004,Literature,Austria,,"Elfriede Jelinek , Literature, 2004",20 October 1946,,Mürzzuschlag,,female
2,http://en.wikipedia.org/wiki/Eric_Kandel,Eric Kandel *,2000,Physiology or Medicine,,Austria,"Eric Kandel *, Physiology or Medicine, 2000",7 November 1929,,Vienna,,male
3,http://en.wikipedia.org/wiki/Walter_Kohn,Walter Kohn *,1998,Chemistry,,Austria,"Walter Kohn *, Chemistry, 1998",9 March 1923,19 April 2016,Vienna,Santa Barbara,male
4,http://en.wikipedia.org/wiki/Friedrich_Hayek,Friedrich Hayek,1974,Economics,Austria,,"Friedrich Hayek , Economics, 1974",8 May 1899,23 March 1992,Vienna,Freiburg im Breisgau,male


## Indices and Pandas Data Selection

In [8]:
# The column labels, held in a pandas Index.
df.columns

Index(['link', 'name', 'year', 'category', 'country', 'born_in', 'text',
       'date_of_birth', 'date_of_death', 'place_of_birth', 'place_of_death',
       'gender'],
      dtype='object')

In [9]:
# set_index returns a new frame keyed by 'name'; `df` itself is left unchanged.
df_name = df.set_index('name')

In [10]:
# First two rows of the name-indexed frame.
df_name.head(2)

Unnamed: 0_level_0,link,year,category,country,born_in,text,date_of_birth,date_of_death,place_of_birth,place_of_death,gender
name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
Karl Landsteiner,http://en.wikipedia.org/wiki/Karl_Landsteiner,1930,Physiology or Medicine,Austria,,"Karl Landsteiner , Physiology or Medicine, 1930",14 June 1868,26 June 1943,Vienna,New York City,male
Elfriede Jelinek,http://en.wikipedia.org/wiki/Elfriede_Jelinek,2004,Literature,Austria,,"Elfriede Jelinek , Literature, 2004",20 October 1946,,Mürzzuschlag,,female


In [11]:
# Move 'name' back from the index into a regular column. Rebinding the result
# (instead of inplace=True) keeps the step explicit and chainable.
df_name = df_name.reset_index()

In [12]:
# Note: this displays `df`, not `df_name`. `df` still has its original integer
# index because set_index above returned a new frame rather than modifying it.
df.head(2)

Unnamed: 0,link,name,year,category,country,born_in,text,date_of_birth,date_of_death,place_of_birth,place_of_death,gender
0,http://en.wikipedia.org/wiki/Karl_Landsteiner,Karl Landsteiner,1930,Physiology or Medicine,Austria,,"Karl Landsteiner , Physiology or Medicine, 1930",14 June 1868,26 June 1943,Vienna,New York City,male
1,http://en.wikipedia.org/wiki/Elfriede_Jelinek,Elfriede Jelinek,2004,Literature,Austria,,"Elfriede Jelinek , Literature, 2004",20 October 1946,,Mürzzuschlag,,female


In [13]:
# Pull a single column out of the frame by label.
bi_col = df['born_in']
bi_col

0             
1             
2      Austria
3      Austria
4             
        ...   
321           
322           
323           
324           
325           
Name: born_in, Length: 326, dtype: object

In [14]:
# A single selected column is a pandas Series.
type(bi_col)

pandas.core.series.Series

In [15]:
# Select the first row by integer position; the row comes back as a Series.
df.iloc[0]

link                http://en.wikipedia.org/wiki/Karl_Landsteiner
name                                             Karl Landsteiner
year                                                         1930
category                                   Physiology or Medicine
country                                                   Austria
born_in                                                          
text              Karl Landsteiner , Physiology or Medicine, 1930
date_of_birth                                        14 June 1868
date_of_death                                        26 June 1943
place_of_birth                                             Vienna
place_of_death                                      New York City
gender                                                       male
Name: 0, dtype: object

### Selecting Multiple Rows

In [16]:
# First ten rows, selected by position.
df.iloc[:10]

Unnamed: 0,link,name,year,category,country,born_in,text,date_of_birth,date_of_death,place_of_birth,place_of_death,gender
0,http://en.wikipedia.org/wiki/Karl_Landsteiner,Karl Landsteiner,1930,Physiology or Medicine,Austria,,"Karl Landsteiner , Physiology or Medicine, 1930",14 June 1868,26 June 1943,Vienna,New York City,male
1,http://en.wikipedia.org/wiki/Elfriede_Jelinek,Elfriede Jelinek,2004,Literature,Austria,,"Elfriede Jelinek , Literature, 2004",20 October 1946,,Mürzzuschlag,,female
2,http://en.wikipedia.org/wiki/Eric_Kandel,Eric Kandel *,2000,Physiology or Medicine,,Austria,"Eric Kandel *, Physiology or Medicine, 2000",7 November 1929,,Vienna,,male
3,http://en.wikipedia.org/wiki/Walter_Kohn,Walter Kohn *,1998,Chemistry,,Austria,"Walter Kohn *, Chemistry, 1998",9 March 1923,19 April 2016,Vienna,Santa Barbara,male
4,http://en.wikipedia.org/wiki/Friedrich_Hayek,Friedrich Hayek,1974,Economics,Austria,,"Friedrich Hayek , Economics, 1974",8 May 1899,23 March 1992,Vienna,Freiburg im Breisgau,male
5,http://en.wikipedia.org/wiki/John_Cornforth,John Cornforth *,1975,Chemistry,,Australia,"John Cornforth *, Chemistry, 1975",7 September 1917,8 December 2013,Sydney,Brighton,male
6,http://en.wikipedia.org/wiki/Patrick_White,Patrick White,1973,Literature,Australia,,"Patrick White , born in the United Kingdom , ...",28 May 1912,30 September 1990,London,Sydney,male
7,http://en.wikipedia.org/wiki/John_Eccles_(neur...,John Carew Eccles,1963,Physiology or Medicine,Australia,,"John Carew Eccles , Physiology or Medicine, 1963",27 January 1903,2 May 1997,Melbourne,Locarno,male
8,http://en.wikipedia.org/wiki/Frank_Macfarlane_...,Sir Frank Macfarlane Burnet,1960,Physiology or Medicine,Australia,,"Sir Frank Macfarlane Burnet , Physiology or Me...",3 September 1899,31 August 1985,Traralgon,Melbourne,male
9,http://en.wikipedia.org/wiki/Gerhard_Herzberg,Gerhard Herzberg *,1971,Chemistry,,Germany,"Gerhard Herzberg *, Chemistry, 1971",25 December 1904,3 March 1999,Hamburg,Ottawa,male


In [17]:
# Last four rows, selected by position.
df.iloc[-4:]

Unnamed: 0,link,name,year,category,country,born_in,text,date_of_birth,date_of_death,place_of_birth,place_of_death,gender
322,http://en.wikipedia.org/wiki/Adolfo_P%C3%A9rez...,Adolfo Pérez Esquivel,1980,Peace,Argentina,,"Adolfo Pérez Esquivel , Peace, 1980",26 November 1931,,Buenos Aires,,male
323,http://en.wikipedia.org/wiki/Luis_Federico_Leloir,Luis Federico Leloir,1970,Chemistry,Argentina,,"Luis Federico Leloir , Chemistry, 1970",6 September 1906,2 December 1987,Paris,Catamarca Province,male
324,http://en.wikipedia.org/wiki/Bernardo_Houssay,Bernardo Houssay,1947,Physiology or Medicine,Argentina,,"Bernardo Houssay , Physiology or Medicine, 1947",10 April 1887,21 September 1971,Buenos Aires,Buenos Aires,male
325,http://en.wikipedia.org/wiki/Carlos_Saavedra_L...,Carlos Saavedra Lamas,1936,Peace,Argentina,,"Carlos Saavedra Lamas , Peace, 1936",1 November 1878,5 May 1959,Buenos Aires,Buenos Aires,male


In [18]:
# Boolean Series: True for rows whose prize year is strictly after 2000.
mask = df['year'].gt(2000)
mask

0      False
1       True
2      False
3      False
4      False
       ...  
321    False
322    False
323    False
324    False
325    False
Name: year, Length: 326, dtype: bool

In [19]:
# Keep only the rows flagged by the mask, then count non-null values per column.
winners_since_2000 = df.loc[mask]
winners_since_2000.count()

link              53
name              53
year              53
category          53
country           53
born_in           53
text              53
date_of_birth     51
date_of_death     10
place_of_birth    51
place_of_death    10
gender            51
dtype: int64

In [20]:
# Same selection as above, with the boolean condition written inline.
winners_since_2000 = df.loc[df['year'] > 2000]

## Cleaning the Data

In [21]:
# Summary of born_in: the most frequent value is the empty string.
df['born_in'].describe()

count     326
unique     18
top          
freq      261
Name: born_in, dtype: object

### Finding Mixed Types

In [22]:
# Collect the Python type of every value; more than one entry means mixed types.
set(map(type, df['born_in']))

{str}

### Replacing Strings

In [23]:
import numpy as np

In [24]:
# Work on the born_in column on its own.
bio_col = df['born_in']

In [25]:
# Turn empty strings into NaN so pandas treats them as missing.
# Rebind instead of mutating in place: whether an in-place change to a column
# Series writes through to its parent frame depends on pandas' copy-on-write
# mode, so the in-place form is not reliable across versions.
bio_col = bio_col.replace('', np.nan)
bio_col

0          NaN
1          NaN
2      Austria
3      Austria
4          NaN
        ...   
321        NaN
322        NaN
323        NaN
324        NaN
325        NaN
Name: born_in, Length: 326, dtype: object

In [26]:
# count() ignores NaN, so this is the number of non-missing born_in values.
bio_col.count()

65

In [27]:
# Treat empty strings as missing values in every column. Rebinding the result
# (rather than inplace=True) keeps the step explicit and chainable.
df = df.replace('', np.nan)
df.head()

Unnamed: 0,link,name,year,category,country,born_in,text,date_of_birth,date_of_death,place_of_birth,place_of_death,gender
0,http://en.wikipedia.org/wiki/Karl_Landsteiner,Karl Landsteiner,1930,Physiology or Medicine,Austria,,"Karl Landsteiner , Physiology or Medicine, 1930",14 June 1868,26 June 1943,Vienna,New York City,male
1,http://en.wikipedia.org/wiki/Elfriede_Jelinek,Elfriede Jelinek,2004,Literature,Austria,,"Elfriede Jelinek , Literature, 2004",20 October 1946,,Mürzzuschlag,,female
2,http://en.wikipedia.org/wiki/Eric_Kandel,Eric Kandel *,2000,Physiology or Medicine,,Austria,"Eric Kandel *, Physiology or Medicine, 2000",7 November 1929,,Vienna,,male
3,http://en.wikipedia.org/wiki/Walter_Kohn,Walter Kohn *,1998,Chemistry,,Austria,"Walter Kohn *, Chemistry, 1998",9 March 1923,19 April 2016,Vienna,Santa Barbara,male
4,http://en.wikipedia.org/wiki/Friedrich_Hayek,Friedrich Hayek,1974,Economics,Austria,,"Friedrich Hayek , Economics, 1974",8 May 1899,23 March 1992,Vienna,Freiburg im Breisgau,male


In [28]:
# Names carrying a trailing asterisk marker. Match the literal character with
# regex=False: '\*' in a non-raw string is an invalid escape sequence that
# newer Python versions warn about.
df.loc[df['name'].str.contains('*', regex=False), 'name']

2              Eric Kandel *
3              Walter Kohn *
5           John Cornforth *
9         Gerhard Herzberg *
11            Bernard Katz *
               ...          
306             Ivo Andrić *
307        Vladimir Prelog *
308          Peter Medawar *
309          Elias Canetti *
315    Elizabeth Blackburn *
Name: name, Length: 65, dtype: object

In [29]:
# Remove the asterisk marker, then trim the whitespace left behind.
# regex=False is explicit because the default of str.replace changed between
# pandas versions and a bare '*' is not a valid regular expression.
df['name'] = df['name'].str.replace('*', '', regex=False).str.strip()

In [30]:
# Sanity check: no name should contain an asterisk any more (expect no rows).
# Literal match avoids the invalid '\*' escape in a non-raw string.
df[df['name'].str.contains('*', regex=False)]

Unnamed: 0,link,name,year,category,country,born_in,text,date_of_birth,date_of_death,place_of_birth,place_of_death,gender


### Removing Rows

In [31]:
# NaN never compares equal to itself, so missing values cannot be found with ==;
# use isnull()/notnull() instead.
np.nan == np.nan

False

In [32]:
# Rows that have a born_in value.
df.loc[df['born_in'].notna()]

Unnamed: 0,link,name,year,category,country,born_in,text,date_of_birth,date_of_death,place_of_birth,place_of_death,gender
2,http://en.wikipedia.org/wiki/Eric_Kandel,Eric Kandel,2000,Physiology or Medicine,,Austria,"Eric Kandel *, Physiology or Medicine, 2000",7 November 1929,,Vienna,,male
3,http://en.wikipedia.org/wiki/Walter_Kohn,Walter Kohn,1998,Chemistry,,Austria,"Walter Kohn *, Chemistry, 1998",9 March 1923,19 April 2016,Vienna,Santa Barbara,male
5,http://en.wikipedia.org/wiki/John_Cornforth,John Cornforth,1975,Chemistry,,Australia,"John Cornforth *, Chemistry, 1975",7 September 1917,8 December 2013,Sydney,Brighton,male
9,http://en.wikipedia.org/wiki/Gerhard_Herzberg,Gerhard Herzberg,1971,Chemistry,,Germany,"Gerhard Herzberg *, Chemistry, 1971",25 December 1904,3 March 1999,Hamburg,Ottawa,male
11,http://en.wikipedia.org/wiki/Bernard_Katz,Bernard Katz,1970,Physiology or Medicine,,Germany,"Bernard Katz *, Physiology or Medicine, 1970",26 March 1911,20 April 2003,Leipzig,London,male
...,...,...,...,...,...,...,...,...,...,...,...,...
306,http://en.wikipedia.org/wiki/Ivo_Andri%C4%87,Ivo Andrić,1961,Literature,,Bosnia and Herzegovina,"Ivo Andrić *, Ethnic Croat and national born...",9 October 1892,13 March 1975,Dolac,Belgrade,male
307,http://en.wikipedia.org/wiki/Vladimir_Prelog,Vladimir Prelog,1975,Chemistry,,Bosnia and Herzegovina,"Vladimir Prelog *, Ethnic Croat and national ...",23 July 1906,7 January 1998,Sarajevo,Zürich,male
308,http://en.wikipedia.org/wiki/Peter_Medawar,Peter Medawar,1960,Physiology or Medicine,,Brazil,"Peter Medawar *, Physiology or Medicine, 1960",28 February 1915,2 October 1987,Rio de Janeiro,London,male
309,http://en.wikipedia.org/wiki/Elias_Canetti,Elias Canetti,1981,Literature,,Bulgaria,"Elias Canetti *, Literature, 1981",25 July 1905,14 August 1994,Ruse,Zürich,male


### Finding Duplicates

In [33]:
# Rows whose name already appeared earlier in the frame (the first occurrence
# of each name is not flagged).
is_repeat = df.duplicated(subset='name')
dupes_by_name = df.loc[is_repeat]
dupes_by_name.count()

link              17
name              17
year              17
category          17
country            9
born_in            8
text              17
date_of_birth     17
date_of_death     13
place_of_birth    17
place_of_death    13
gender            17
dtype: int64

In [34]:
# Every row that shares its name with another row. keep=False flags all
# members of a duplicated group, first and last occurrences included.
all_dupes = df.loc[df.duplicated(subset='name', keep=False)]
all_dupes.count()

link              33
name              33
year              33
category          33
country           17
born_in           16
text              33
date_of_birth     33
date_of_death     26
place_of_birth    33
place_of_death    26
gender            33
dtype: int64

In [35]:
# Alternative route to the same rows: keep every row whose name occurs among
# the previously found duplicates.
repeated_names = dupes_by_name['name']
all_dupes = df.loc[df['name'].isin(repeated_names)]
all_dupes.count()

link              33
name              33
year              33
category          33
country           17
born_in           16
text              33
date_of_birth     33
date_of_death     26
place_of_birth    33
place_of_death    26
gender            33
dtype: int64

In [36]:
# Print how many rows each name has, in sorted name order.
for name, n_rows in df.groupby('name').size().items():
    print(f'name: {name}, number of rows: {n_rows}')

name: Aage Bohr, number of rows: 1
name: Adolf Butenandt, number of rows: 1
name: Adolf Otto Reinhold Windaus, number of rows: 1
name: Adolf von Baeyer, number of rows: 1
name: Adolfo Pérez Esquivel, number of rows: 1
name: Ahmed Zewail, number of rows: 1
name: Albert Camus, number of rows: 1
name: Albert Claude, number of rows: 1
name: Albert Einstein, number of rows: 1
name: Albert Fert, number of rows: 1
name: Albert Schweitzer, number of rows: 2
name: Albert Szent-Györgyi, number of rows: 1
name: Albrecht Kossel, number of rows: 1
name: Alexis Carrel, number of rows: 1
name: Alfred Hermann Fried, number of rows: 1
name: Alfred Kastler, number of rows: 1
name: Alfred Werner, number of rows: 1
name: Alice Munro, number of rows: 1
name: Anatole France, number of rows: 1
name: André Frédéric Cournand, number of rows: 1
name: André Gide, number of rows: 1
name: André Lwoff, number of rows: 1
name: Antoine Henri Becquerel, number of rows: 1
name: Anwar Sadat, number of rows: 1
name: Aris

In [37]:
# Stack the name column of every group that has more than one row, so
# duplicated winners end up next to each other.
repeated_groups = [
    group['name'] for _, group in df.groupby('name') if group.shape[0] > 1
]
pd.concat(repeated_groups)

64           Albert Schweitzer
185          Albert Schweitzer
231         Bertha von Suttner
321         Bertha von Suttner
21              Charles K. Kao
267             Charles K. Kao
253             Daniel C. Tsui
286             Daniel C. Tsui
203               Gao Xingjian
252               Gao Xingjian
9             Gerhard Herzberg
279           Gerhard Herzberg
48                John Polanyi
101               John Polanyi
275               John Polanyi
110            Karl von Frisch
167            Karl von Frisch
189       Luis Federico Leloir
323       Luis Federico Leloir
143                Marie Curie
160                Marie Curie
217        Niels Ryberg Finsen
228        Niels Ryberg Finsen
74                  Otto Loewi
192                 Otto Loewi
124             Peter Grünberg
247             Peter Grünberg
54     Richard Adolf Zsigmondy
245    Richard Adolf Zsigmondy
73                Richard Kuhn
191               Richard Kuhn
259            Vladimir Prelog
307     

### Sorting Data

In [38]:
# Toy frame for demonstrating multi-key sorting.
df2 = pd.DataFrame(
    [('zak', 4), ('alice', 3), ('bob', 5), ('mike', 2), ('bob', 3), ('bob', 7)],
    columns=['name', 'score'],
)

# Sort by name ascending, breaking ties by score descending.
df2.sort_values(by=['name', 'score'], ascending=[True, False])

Unnamed: 0,name,score
1,alice,3
5,bob,7
2,bob,5
4,bob,3
3,mike,2
0,zak,4


In [39]:
# Order the duplicated rows by name and show only the columns needed to
# compare them.
all_dupes.sort_values(by='name').loc[:, ['name', 'country', 'year']]

Unnamed: 0,name,country,year
64,Albert Schweitzer,,1952
185,Albert Schweitzer,France,1952
321,Bertha von Suttner,Austria,1905
231,Bertha von Suttner,,1905
21,Charles K. Kao,Hong Kong,2009
267,Charles K. Kao,,2009
286,Daniel C. Tsui,,1998
253,Daniel C. Tsui,,1998
252,Gao Xingjian,,2000
203,Gao Xingjian,France,2000


In [40]:
# Every row whose name contains 'Marie'.
df.loc[df['name'].str.contains('Marie')]

Unnamed: 0,link,name,year,category,country,born_in,text,date_of_birth,date_of_death,place_of_birth,place_of_death,gender
143,http://en.wikipedia.org/wiki/Marie_Curie,Marie Curie,1911,Chemistry,France,,"Marie Curie , born in partitioned Poland (Rus...",7 November 1867,4 July 1934,Warsaw,Sancellemoz,female
160,http://en.wikipedia.org/wiki/Marie_Curie,Marie Curie,1903,Physics,France,,"Marie Curie , born in partitioned Poland (Rus...",7 November 1867,4 July 1934,Warsaw,Sancellemoz,female
210,http://en.wikipedia.org/wiki/Jean-Marie_Lehn,Jean-Marie Lehn,1987,Chemistry,France,,"Jean-Marie Lehn , Chemistry, 1987",30 September 1939,,Rosheim,,male


In [41]:
# Exact-match lookup of a single winner by name.
df.loc[df['name'].eq('Sidney Altman')]

Unnamed: 0,link,name,year,category,country,born_in,text,date_of_birth,date_of_death,place_of_birth,place_of_death,gender
274,http://en.wikipedia.org/wiki/Sidney_Altman,Sidney Altman,1989,Chemistry,,Canada,"Sidney Altman *, Chemistry, 1989",7 May 1939,,Montreal,,male


### Dealing with Missing Fields

In [42]:
# Rows with no category, showing only name and text.
df.loc[df['category'].isna(), ['name', 'text']]

Unnamed: 0,name,text


In [43]:
# Names of the rows with no gender recorded.
df.loc[df['gender'].isna(), 'name']

165                  International Atomic Energy Agency
205                            Médecins Sans Frontières
269    Pugwash Conferences on Science and World Affairs
303                                        Grameen Bank
305                     Institut de Droit International
Name: name, dtype: object

In [44]:
# Non-null counts per column before rows without a gender are dropped.
df.count()

link              326
name              326
year              326
category          326
country           261
born_in            65
text              326
date_of_birth     321
date_of_death     240
place_of_birth    321
place_of_death    240
gender            321
dtype: int64

In [45]:
# Keep only the rows that have a gender. The explicit .copy() makes the result
# an independent frame, so adding columns to it later does not trigger
# SettingWithCopyWarning.
df = df[df.gender.notnull()].copy()

In [46]:
# Non-null counts per column after dropping the rows without a gender.
df.count()

link              321
name              321
year              321
category          321
country           256
born_in            65
text              321
date_of_birth     321
date_of_death     240
place_of_birth    321
place_of_death    240
gender            321
dtype: int64

### Dealing with Times and Dates

In [47]:
# The raw birth-date strings alongside each name.
df.loc[:, ['name', 'date_of_birth']]

Unnamed: 0,name,date_of_birth
0,Karl Landsteiner,14 June 1868
1,Elfriede Jelinek,20 October 1946
2,Eric Kandel,7 November 1929
3,Walter Kohn,9 March 1923
4,Friedrich Hayek,8 May 1899
...,...,...
321,Bertha von Suttner,9 June 1843
322,Adolfo Pérez Esquivel,26 November 1931
323,Luis Federico Leloir,6 September 1906
324,Bernardo Houssay,10 April 1887


In [48]:
# Parse the birth-date strings; errors='raise' makes any unparseable value
# fail loudly instead of being skipped.
pd.to_datetime(df['date_of_birth'], errors='raise')

0     1868-06-14
1     1946-10-20
2     1929-11-07
3     1923-03-09
4     1899-05-08
         ...    
321   1843-06-09
322   1931-11-26
323   1906-09-06
324   1887-04-10
325   1878-11-01
Name: date_of_birth, Length: 321, dtype: datetime64[ns]

In [49]:
# Restrict to rows that have a death date, then flag those whose date string
# cannot be parsed (errors='coerce' turns failures into NaT).
with_death_dates = df.loc[df['date_of_death'].notna()]
parsed_deaths = pd.to_datetime(with_death_dates['date_of_death'], errors='coerce')
bad_dates = parsed_deaths.isna()

In [50]:
# Show the rows whose death date failed to parse.
with_death_dates.loc[bad_dates, ['category', 'date_of_death']]

Unnamed: 0,category,date_of_death


In [51]:
# NOTE(review): this conversion is disabled, so date_of_death stays as raw
# strings in the saved output. Confirm whether that is intended before
# re-enabling or deleting it.
# df.date_of_death = pd.to_datetime(df.date_of_death, errors='coerce')

In [52]:
# Add a parsed datetime column next to the raw date_of_birth strings.
# assign() returns a new frame, so no column is written onto a frame that was
# produced by filtering (which would raise SettingWithCopyWarning).
df = df.assign(dob=pd.to_datetime(df['date_of_birth'], errors='raise'))

In [53]:
# Age at the time of the award: prize year minus birth year. The .dt accessor
# keeps the subtraction index-aligned with df, and assign() avoids writing a
# column onto a frame that was produced by filtering.
df = df.assign(award_age=df['year'] - df['dob'].dt.year)

In [54]:
# The ten lowest award ages, with the columns needed to sanity-check them.
youngest_cols = ['name', 'award_age', 'category', 'year',
                 'date_of_birth', 'dob', 'date_of_death']
df.sort_values(by='award_age').head(10)[youngest_cols]

Unnamed: 0,name,award_age,category,year,date_of_birth,dob,date_of_death
220,Ragnar Granit,-91,Physiology or Medicine,1809,30 October 1900,1900-10-30,12 March 1991
221,Artturi Ilmari Virtanen,-86,Chemistry,1809,15 January 1895,1895-01-15,11 November 1973
222,Frans Eemil Sillanpää,-79,Literature,1809,16 September 1888,1888-09-16,3 June 1964
122,William Lawrence Bragg,25,Physics,1915,31 March 1890,1890-03-31,1 July 1971
56,Werner Karl Heisenberg,31,Physics,1932,5 December 1901,1901-12-05,1 February 1976
262,Tsung-Dao Lee,31,Physics,1957,24 November 1926,1926-11-24,
89,Rudolf Mössbauer,32,Physics,1961,31 January 1929,1929-01-31,14 September 2011
283,Frederick G. Banting,32,Physiology or Medicine,1923,14 November 1891,1891-11-14,21 February 1941
18,Rigoberta Menchú,33,Peace,1992,9 January 1959,1959-01-09,
172,Frédéric Joliot-Curie,35,Chemistry,1935,19 March 1900,1900-03-19,14 August 1958


In [55]:
# The age-related columns for the whole frame.
df.loc[:, ['name', 'award_age', 'category', 'year',
           'date_of_birth', 'dob', 'date_of_death']]

Unnamed: 0,name,award_age,category,year,date_of_birth,dob,date_of_death
0,Karl Landsteiner,62,Physiology or Medicine,1930,14 June 1868,1868-06-14,26 June 1943
1,Elfriede Jelinek,58,Literature,2004,20 October 1946,1946-10-20,
2,Eric Kandel,71,Physiology or Medicine,2000,7 November 1929,1929-11-07,
3,Walter Kohn,75,Chemistry,1998,9 March 1923,1923-03-09,19 April 2016
4,Friedrich Hayek,75,Economics,1974,8 May 1899,1899-05-08,23 March 1992
...,...,...,...,...,...,...,...
321,Bertha von Suttner,62,Peace,1905,9 June 1843,1843-06-09,21 June 1914
322,Adolfo Pérez Esquivel,49,Peace,1980,26 November 1931,1931-11-26,
323,Luis Federico Leloir,64,Chemistry,1970,6 September 1906,1906-09-06,2 December 1987
324,Bernardo Houssay,60,Physiology or Medicine,1947,10 April 1887,1887-04-10,21 September 1971


In [56]:
# Write the cleaned frame, including the derived dob and award_age columns,
# to a JSON file at a path relative to the working directory.
df.to_json('cleaned_nobel.json')

In [57]:
# Final column list: the original twelve plus the derived dob and award_age.
df.columns

Index(['link', 'name', 'year', 'category', 'country', 'born_in', 'text',
       'date_of_birth', 'date_of_death', 'place_of_birth', 'place_of_death',
       'gender', 'dob', 'award_age'],
      dtype='object')