In [50]:
import pandas as pd
import numpy as np
import pymongo

# Functions

As a side note, here's how to delete the Mongo database, using the mongo shell:

    (from a command prompt) mongo shell
    use nobel_prize
    db.dropDatabase()

In [65]:
def get_mongo_database(db_name, host='localhost',
                       port=27017, username=None, password=None):
    """Return a handle to the MongoDB database named ``db_name``.

    Parameters
    ----------
    db_name : str
        Name of the database to return.
    host : str
        Server host name. When credentials are supplied and ``host``
        already contains a ``:`` (e.g. ``'example.org:27018'``) it is used
        verbatim and ``port`` is not appended.
    port : int
        Server port.
    username, password : str or None
        Raw (unescaped) credentials. Only when both are truthy is an
        authenticated ``mongodb://`` URI built; otherwise the client
        connects with ``host`` and ``port`` alone.

    Returns
    -------
    The object obtained by indexing the ``pymongo.MongoClient`` with
    ``db_name``.
    """
    if username and password:
        # Imported locally so this cell stays runnable on its own.
        from urllib.parse import quote_plus

        # Honour ``port`` (it used to be dropped on this branch) unless the
        # caller already embedded one in ``host``.
        netloc = host if ':' in host else '%s:%s' % (host, port)
        # Credentials must be percent-escaped, otherwise characters such as
        # '@', '/' or ':' in a password corrupt the URI.
        mongo_uri = 'mongodb://%s:%s@%s/%s' % (
            quote_plus(username), quote_plus(password), netloc, db_name)
        conn = pymongo.MongoClient(mongo_uri)
    else:
        conn = pymongo.MongoClient(host, port)

    return conn[db_name]

def mongo_to_dataframe(db_name, collection, query=None,
                       host='localhost', port=27017,
                       username=None, password=None, no_id=True):
    """Run ``query`` against a Mongo collection and return a DataFrame.

    Parameters
    ----------
    db_name : str
        Database name, passed to ``get_mongo_database``.
    collection : str
        Name of the collection to query.
    query : dict or None
        Filter passed to ``find``; ``None`` (the default) means ``{}``.
    host, port, username, password
        Connection settings forwarded to ``get_mongo_database``.
    no_id : bool
        When true, the ``_id`` column is removed from the result if present.

    Returns
    -------
    pandas.DataFrame
        One row per document returned by the cursor. A query that matches
        nothing yields an empty frame.
    """
    db = get_mongo_database(db_name, host, port, username, password)

    # ``None`` stands in for the empty filter so that no mutable dict is
    # shared between calls as a default argument.
    cursor = db[collection].find({} if query is None else query)
    df = pd.DataFrame(list(cursor))

    # An empty result produces a frame without any columns, so only drop
    # '_id' when it is actually there instead of raising KeyError.
    if no_id and '_id' in df.columns:
        del df['_id']

    return df

def dataframe_to_mongo(df, db_name, collection,
                       host='localhost', port=27017,
                       username=None, password=None):
    """Insert every row of ``df`` into a Mongo collection as one document.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to store; converted with ``df.to_dict('records')``.
    db_name : str
        Database name, passed to ``get_mongo_database``.
    collection : str
        Name of the target collection.
    host, port, username, password
        Connection settings forwarded to ``get_mongo_database``.

    Returns
    -------
    None. An empty frame is a no-op and no connection is opened.
    """
    records = df.to_dict('records')
    if not records:
        # insert_many refuses an empty document list, so there is nothing
        # to do for a frame without rows.
        return

    db = get_mongo_database(db_name, host, port, username, password)
    db[collection].insert_many(records)

In [66]:
# Mongo database used throughout this notebook.
MONGO_DB_NAME = 'nobel_prize'
# Collection in that database that the winners DataFrame is written to and read from.
MONGO_COLL_NAME_WINNERS = 'winners'

# Load from .json into Mongo

In [67]:
# Load the winners JSON file once so its contents can be pushed into
# Mongo; the rest of the notebook then reads the data back from Mongo
# (mostly as practice with the Mongo round trip).
df = pd.read_json('nobel_winners_scrapy/nobel_winners.json')
df.shape

(1106, 12)

In [68]:
df[:2]

Unnamed: 0,born_in,category,country,date_of_birth,date_of_death,gender,link,name,place_of_birth,place_of_death,text,year
0,,Physics,Australia,24 February 1967,,male,https://en.wikipedia.org/wiki/Brian_Schmidt,Brian Schmidt,Missoula,,"Brian Schmidt , born in the United States , P...",2011
1,Australia,Physiology or Medicine,,26 November 1948,,female,https://en.wikipedia.org/wiki/Elizabeth_Blackburn,Elizabeth Blackburn *,Hobart,,"Elizabeth Blackburn *, Physiology or Medicine,...",2009


In [69]:
df.columns

Index(['born_in', 'category', 'country', 'date_of_birth', 'date_of_death',
       'gender', 'link', 'name', 'place_of_birth', 'place_of_death', 'text',
       'year'],
      dtype='object')

In [70]:
# Empty the collection first so this cell can be re-run safely: the insert
# below always adds documents, so without the drop every re-run would
# append another full copy of the winners.
get_mongo_database(MONGO_DB_NAME)[MONGO_COLL_NAME_WINNERS].drop()
dataframe_to_mongo(df, MONGO_DB_NAME, MONGO_COLL_NAME_WINNERS)

# Load winners data from Mongo

In [71]:
# Read the winners back out of Mongo; no_id=True removes the '_id' column
# so the frame has the same columns as the one loaded from JSON.
df = mongo_to_dataframe(MONGO_DB_NAME, MONGO_COLL_NAME_WINNERS, no_id=True)
df.shape

(1106, 12)

In [72]:
df.columns

Index(['born_in', 'category', 'country', 'date_of_birth', 'date_of_death',
       'gender', 'link', 'name', 'place_of_birth', 'place_of_death', 'text',
       'year'],
      dtype='object')

In [73]:
df[:2]

Unnamed: 0,born_in,category,country,date_of_birth,date_of_death,gender,link,name,place_of_birth,place_of_death,text,year
0,,Physics,Australia,24 February 1967,,male,https://en.wikipedia.org/wiki/Brian_Schmidt,Brian Schmidt,Missoula,,"Brian Schmidt , born in the United States , P...",2011
1,Australia,Physiology or Medicine,,26 November 1948,,female,https://en.wikipedia.org/wiki/Elizabeth_Blackburn,Elizabeth Blackburn *,Hobart,,"Elizabeth Blackburn *, Physiology or Medicine,...",2009


# Initial data inspection

In [74]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1106 entries, 0 to 1105
Data columns (total 12 columns):
born_in           1106 non-null object
category          1106 non-null object
country           1106 non-null object
date_of_birth     1097 non-null object
date_of_death     725 non-null object
gender            1097 non-null object
link              1106 non-null object
name              1106 non-null object
place_of_birth    1097 non-null object
place_of_death    725 non-null object
text              1106 non-null object
year              1106 non-null int64
dtypes: int64(1), object(11)
memory usage: 103.8+ KB


In [75]:
df.describe()

Unnamed: 0,year
count,1106.0
mean,1968.169078
std,68.065801
min,0.0
25%,1948.0
50%,1976.0
75%,1998.0
max,2016.0


In [76]:
df.describe(include=['object'])

Unnamed: 0,born_in,category,country,date_of_birth,date_of_death,gender,link,name,place_of_birth,place_of_death,text
count,1106.0,1106,1106,1097,725,1097,1106,1106,1097,725,1106
unique,40.0,7,64,868,580,2,913,1024,605,313,1089
top,,Physiology or Medicine,United States,15 March 1930,9 March 1992,male,https://en.wikipedia.org/wiki/Marie_Curie,César Milstein,New York City,Cambridge,"Michael Levitt *, as an Israeli citizen , C..."
freq,943.0,262,351,4,4,1036,4,3,47,37,3


In [77]:
df.head()

Unnamed: 0,born_in,category,country,date_of_birth,date_of_death,gender,link,name,place_of_birth,place_of_death,text,year
0,,Physics,Australia,24 February 1967,,male,https://en.wikipedia.org/wiki/Brian_Schmidt,Brian Schmidt,Missoula,,"Brian Schmidt , born in the United States , P...",2011
1,Australia,Physiology or Medicine,,26 November 1948,,female,https://en.wikipedia.org/wiki/Elizabeth_Blackburn,Elizabeth Blackburn *,Hobart,,"Elizabeth Blackburn *, Physiology or Medicine,...",2009
2,,Chemistry,Germany,26 November 1898,12 August 1973,male,https://en.wikipedia.org/wiki/Karl_Ziegler,Karl Ziegler,Helsa,Mülheim,"Karl Ziegler , Chemistry, 1963",1963
3,Germany,Physics,,28 June 1906,20 February 1972,female,https://en.wikipedia.org/wiki/Maria_Goeppert-M...,Maria Goeppert-Mayer *,Katowice,San Diego,"Maria Goeppert-Mayer *, Physics, 1963",1963
4,,Physics,Germany,25 June 1907,11 February 1973,male,https://en.wikipedia.org/wiki/J._Hans_D._Jensen,J. Hans D. Jensen,Hamburg,Heidelberg,"J. Hans D. Jensen , Physics, 1963",1963


In [78]:
df['born_in']

0                             
1                    Australia
2                             
3                      Germany
4                             
5                             
6                             
7                      Germany
8                             
9                      Germany
10                            
11                            
12                            
13                            
14                            
15                      Poland
16                      Poland
17                      Poland
18                            
19                      Poland
20                 Netherlands
21                            
22                            
23                            
24                            
25                            
26                            
27                South Africa
28                            
29                            
                 ...          
1076                          
1077    

In [79]:
df.born_in.describe()

count     1106
unique      40
top           
freq       943
Name: born_in, dtype: object

In [80]:
set(df['born_in'].apply(type))

{str}

In [81]:
# Assign the result back to the column: replace(..., inplace=True) on a
# selected column is a chained in-place operation that newer pandas
# versions no longer propagate to df.
df['born_in'] = df['born_in'].replace('', np.nan)

In [82]:
df['born_in'].describe()

count         163
unique         39
top       Germany
freq           24
Name: born_in, dtype: object

And we can use replace generally w/ the dataframe to replace all empty strings with NaN.

In [83]:
# Rebind instead of mutating in place: same result, and the cell reads as
# an explicit transformation of df.
df = df.replace('', np.nan)

In [84]:
df.describe(include=['object'])

Unnamed: 0,born_in,category,country,date_of_birth,date_of_death,gender,link,name,place_of_birth,place_of_death,text
count,163,1102,943,1097,725,1097,1106,1106,1097,725,1106
unique,39,6,63,868,580,2,913,1024,605,313,1089
top,Germany,Physiology or Medicine,United States,15 March 1930,9 March 1992,male,https://en.wikipedia.org/wiki/Marie_Curie,César Milstein,New York City,Cambridge,"Michael Levitt *, as an Israeli citizen , C..."
freq,24,262,351,4,4,1036,4,3,47,37,3


Asterisks in the name field mean that these winners are recorded by country of birth, not by country at the time of winning the prize. We want to remove the asterisks and any additional whitespace.

In [86]:
# How many names carry an asterisk? Match it as a literal character
# (regex=False) rather than through the invalid '\*' string escape.
df.loc[df['name'].str.contains('*', regex=False), 'name']

1              Elizabeth Blackburn *
3             Maria Goeppert-Mayer *
7                 Jack Steinberger *
9                     John Polanyi *
15                  Joseph Rotblat *
16                    Shimon Peres *
17                 Georges Charpak *
19                  Roald Hoffmann *
20            Nicolaas Bloembergen *
27                Allan M. Cormack *
100                    T. S. Eliot *
161             Har Gobind Khorana *
187               Ben R. Mottelson *
224                 Czesław Miłosz *
245     Subrahmanyan Chandrasekhar *
332                Daniel Kahneman *
336                  Robert Aumann *
346                Rudyard Kipling *
351                    Ronald Ross *
357       Venkatraman Ramakrishnan *
389                 Shuji Nakamura *
390                   John O'Keefe *
394                  Arieh Warshel *
395                 Michael Levitt *
416                 Michael Levitt *
419     John James Rickard Macleod *
452                Oliver Smithies *
4

In [87]:
# '*' on its own is not a valid regular expression and the default for
# `regex` differs between pandas versions, so request a literal
# replacement explicitly; then trim the whitespace left behind.
df['name'] = df['name'].str.replace('*', '', regex=False).str.strip()

In [88]:
# Should now be empty: look for a literal '*' (no regex, no '\*' escape).
df[df['name'].str.contains('*', regex=False)]

Unnamed: 0,born_in,category,country,date_of_birth,date_of_death,gender,link,name,place_of_birth,place_of_death,text,year


The winner rows that have a value in the born_in field are duplicates: those winners have an entry in Wikipedia both for the country in which they were born and for their country at the time they were awarded the prize. We only want a single row per winner, so we'll keep only the winner rows that don't have a born_in value.

In [89]:
# Keep only the rows with no born_in value, then discard the column,
# which is all-null from here on.
df = df.loc[df['born_in'].isnull()].drop('born_in', axis=1)
df.head(3)

Unnamed: 0,category,country,date_of_birth,date_of_death,gender,link,name,place_of_birth,place_of_death,text,year
0,Physics,Australia,24 February 1967,,male,https://en.wikipedia.org/wiki/Brian_Schmidt,Brian Schmidt,Missoula,,"Brian Schmidt , born in the United States , P...",2011
2,Chemistry,Germany,26 November 1898,12 August 1973,male,https://en.wikipedia.org/wiki/Karl_Ziegler,Karl Ziegler,Helsa,Mülheim,"Karl Ziegler , Chemistry, 1963",1963
4,Physics,Germany,25 June 1907,11 February 1973,male,https://en.wikipedia.org/wiki/J._Hans_D._Jensen,J. Hans D. Jensen,Hamburg,Heidelberg,"J. Hans D. Jensen , Physics, 1963",1963


In [91]:
df.shape

(943, 11)

We think we still have some dupes, because a separate search online shows that 889 folks had received a Nobel Prize through 2014.

The 'duplicated' method takes a field/column and returns False for the first occurrence of any rows with the same values for that field and True for any subsequent occurrence. (The call has the 'keep' param to change this.)

In [94]:
# Rows whose name has already appeared earlier in the frame
# (the first occurrence of each name is not flagged).
dupes_by_name = df.loc[df.duplicated(subset='name', keep='first')]
dupes_by_name.shape

(61, 11)

A few people _have_ won multiple prizes, but not 61.

The book says one place to look at is whether multiple countries are 'claiming' the same winner... since the original list is by country, all this would take, I think, is for the same person to be listed more than once/for more than one country.

A few different ways to get all duplicates - since, by default, duplicated only returns True for duplicates after the first instance...

In [99]:
# Simplest approach (not in the book, perhaps because the option is new):
# keep=False makes duplicated flag every row of a repeated name,
# including the first one.
all_dupes_1 = df.loc[df.duplicated(subset='name', keep=False)]
all_dupes_1.shape

(121, 11)

In [100]:
# Same result built by hand: a row is a duplicate if it is flagged when
# keeping the last occurrence OR when keeping the first occurrence.
all_dupes_2 = df.loc[
    df.duplicated(subset='name', keep='last')
    | df.duplicated(subset='name', keep='first')
]
all_dupes_2.shape

(121, 11)

In [102]:
# Alternatively, select every row whose name appears among the names
# already flagged as duplicates.
all_dupes_3 = df.loc[df['name'].isin(dupes_by_name['name'].tolist())]
all_dupes_3.shape

(121, 11)

In [103]:
# groupby again (it can do pretty much anything, it seems): size() gives
# the number of rows in each name group, in sorted name order.
for name, row_count in df.groupby('name').size().items():
    print('name: %s, number of rows: %d'%(name, row_count))

name: 14th Dalai Lama, number of rows: 1
name: A. Michael Spence, number of rows: 1
name: Aage Bohr, number of rows: 1
name: Aaron Ciechanover, number of rows: 1
name: Aaron Klug, number of rows: 2
name: Abdus Salam, number of rows: 1
name: Ada Yonath, number of rows: 1
name: Adam G. Riess, number of rows: 1
name: Adolf Butenandt, number of rows: 1
name: Adolf Otto Reinhold Windaus, number of rows: 1
name: Adolf von Baeyer, number of rows: 1
name: Adolfo Pérez Esquivel, number of rows: 2
name: Ahmed H. Zewail, number of rows: 1
name: Ahmed Zewail, number of rows: 1
name: Akira Suzuki, number of rows: 1
name: Al Gore, number of rows: 1
name: Alan Heeger, number of rows: 1
name: Alan Lloyd Hodgkin, number of rows: 1
name: Alan MacDiarmid, number of rows: 2
name: Albert A. Michelson, number of rows: 1
name: Albert Camus, number of rows: 1
name: Albert Claude, number of rows: 1
name: Albert Einstein, number of rows: 2
name: Albert Fert, number of rows: 1
name: Albert Lutuli, number of rows

In [104]:
# Gather every name group with more than one row, stack the groups in
# group order, and keep only the name column.
repeated_groups = [rows for _, rows in df.groupby('name') if rows.shape[0] > 1]
all_dupes_4 = pd.concat(repeated_groups)['name']
all_dupes_4.shape

(121,)

In [105]:
all_dupes_4[:2]

26     Aaron Klug
496    Aaron Klug
Name: name, dtype: object

In [106]:
all_dupes = all_dupes_1

OK, so now let's use the full list of duplicates to see where these duplicates come from...

In [108]:
# Narrow to the columns of interest first, then order by name so the
# rows for the same winner sit next to each other.
all_dupes[['name', 'country', 'year']].sort_values(by='name')

Unnamed: 0,name,country,year
26,Aaron Klug,South Africa,1982
496,Aaron Klug,United Kingdom,1982
993,Adolfo Pérez Esquivel,Argentina,1980
988,Adolfo Pérez Esquivel,Summary,1980
635,Alan MacDiarmid,New Zealand,2000
309,Alan MacDiarmid,United States,2000
849,Albert Einstein,Germany,1921
549,Albert Einstein,Switzerland,1921
397,Angus Deaton,United States,2015
413,Angus Deaton,United Kingdom,2015


In [None]:
#TODO start up w/ removing duplicates on p240