In [80]:
import json
from urllib.parse import quote_plus

import numpy as np
import pandas as pd
import pymongo

In [56]:
# Load the scraped winners JSON into a DataFrame; .shape reports (rows, columns).
df = pd.read_json('nobel_winners_scrapy/nobel_winners.json', encoding='utf-8')
df.shape

(1106, 12)

In [57]:
# Peek at the first three rows.
df.head(3)

Unnamed: 0,born_in,category,country,date_of_birth,date_of_death,gender,link,name,place_of_birth,place_of_death,text,year
0,,Physics,Australia,24 February 1967,,male,https://en.wikipedia.org/wiki/Brian_Schmidt,Brian Schmidt,Missoula,,"Brian Schmidt , born in the United States , P...",2011
1,Australia,Physiology or Medicine,,26 November 1948,,female,https://en.wikipedia.org/wiki/Elizabeth_Blackburn,Elizabeth Blackburn *,Hobart,,"Elizabeth Blackburn *, Physiology or Medicine,...",2009
2,,Chemistry,Germany,26 November 1898,12 August 1973,male,https://en.wikipedia.org/wiki/Karl_Ziegler,Karl Ziegler,Helsa,Mülheim,"Karl Ziegler , Chemistry, 1963",1963


In [58]:
# dropna=False keeps a separate bucket for rows whose gender is missing.
df.gender.value_counts(dropna=False)

male      1036
female      61
NaN          9
Name: gender, dtype: int64

Since we're not getting rid of non-person prizes in the Scrapy code, we'll do it here.

In [59]:
# Discard every row that has no gender, then recount to confirm that the
# missing-value bucket is gone.
df = df.dropna(subset=['gender'])
df.gender.value_counts(dropna=False)

male      1036
female      61
Name: gender, dtype: int64

In [60]:
# .loc selects by index *label*, not by position. Passing the integer 2 works
# here only because the frame still carries its default integer index, so a
# row labelled 2 exists.
df.loc[2]

born_in                                                     
category                                           Chemistry
country                                              Germany
date_of_birth                               26 November 1898
date_of_death                                 12 August 1973
gender                                                  male
link              https://en.wikipedia.org/wiki/Karl_Ziegler
name                                            Karl Ziegler
place_of_birth                                         Helsa
place_of_death                                       Mülheim
text                          Karl Ziegler , Chemistry, 1963
year                                                    1963
Name: 2, dtype: object

In [61]:
# .iloc selects by integer position (0-based), so 2 is the third row
# whatever the index labels happen to be.
df.iloc[2]

born_in                                                     
category                                           Chemistry
country                                              Germany
date_of_birth                               26 November 1898
date_of_death                                 12 August 1973
gender                                                  male
link              https://en.wikipedia.org/wiki/Karl_Ziegler
name                                            Karl Ziegler
place_of_birth                                         Helsa
place_of_death                                       Mülheim
text                          Karl Ziegler , Chemistry, 1963
year                                                    1963
Name: 2, dtype: object

In [62]:
# Group the rows by prize category. .groups maps each distinct category value
# to the index labels of its rows, so its keys list the categories present.
foo = df.groupby('category')
foo.groups.keys()

dict_keys(['', 'Economics', 'Literature', 'Chemistry', 'Peace', 'Physiology or Medicine', 'Physics'])

In [64]:
# Boolean-mask selection: keep only the rows whose category is 'Physics'.
df.loc[df['category'] == 'Physics']

Unnamed: 0,born_in,category,country,date_of_birth,date_of_death,gender,link,name,place_of_birth,place_of_death,text,year
0,,Physics,Australia,24 February 1967,,male,https://en.wikipedia.org/wiki/Brian_Schmidt,Brian Schmidt,Missoula,,"Brian Schmidt , born in the United States , P...",2011
3,Germany,Physics,,28 June 1906,20 February 1972,female,https://en.wikipedia.org/wiki/Maria_Goeppert-M...,Maria Goeppert-Mayer *,Katowice,San Diego,"Maria Goeppert-Mayer *, Physics, 1963",1963
4,,Physics,Germany,25 June 1907,11 February 1973,male,https://en.wikipedia.org/wiki/J._Hans_D._Jensen,J. Hans D. Jensen,Hamburg,Heidelberg,"J. Hans D. Jensen , Physics, 1963",1963
5,,Physics,Germany,31 January 1929,14 September 2011,male,https://en.wikipedia.org/wiki/Rudolf_M%C3%B6ss...,Rudolf Mössbauer,Munich,Grünwald,"Rudolf Mössbauer , Physics, 1961",1961
7,Germany,Physics,,25 May 1921,,male,https://en.wikipedia.org/wiki/Jack_Steinberger,Jack Steinberger *,Bad Kissingen,,"Jack Steinberger *, Physics, 1988",1988
8,,Physics,Germany,16 May 1950,,male,https://en.wikipedia.org/wiki/J._Georg_Bednorz,J. Georg Bednorz,Neuenkirchen,,"J. Georg Bednorz , Physics, 1987",1987
17,Poland,Physics,,1 August 1924,29 September 2010,male,https://en.wikipedia.org/wiki/Georges_Charpak,Georges Charpak *,Dubrovytsia,Paris,"Georges Charpak *, born in then Poland (Secon...",1992
20,Netherlands,Physics,,11 March 1920,,male,https://en.wikipedia.org/wiki/Nicolaas_Bloembe...,Nicolaas Bloembergen *,Dordrecht,,"Nicolaas Bloembergen *, Physics, 1981",1981
30,,Physics,United States,20 February 1945,,male,https://en.wikipedia.org/wiki/George_F._Smoot,George F. Smoot,Yukon,,"George F. Smoot , Physics, 2006",2006
32,,Physics,United States,24 February 1967,,male,https://en.wikipedia.org/wiki/Brian_P._Schmidt,Brian P. Schmidt,Missoula,,"Brian P. Schmidt , Physics, 2011",2011


# Creating and saving data frames

In [67]:
# Build a small frame by hand: one entry per column, values aligned by row.
df = pd.DataFrame(
    dict(
        name=['Albert Einstein', 'Marie Curie', 'William Faulkner'],
        category=['Physics', 'Chemistry', 'Literature'],
    )
)
df

Unnamed: 0,category,name
0,Physics,Albert Einstein
1,Chemistry,Marie Curie
2,Literature,William Faulkner


In [68]:
# With no arguments the JSON is column-oriented: {column: {index label: value}}.
df.to_json()

'{"category":{"0":"Physics","1":"Chemistry","2":"Literature"},"name":{"0":"Albert Einstein","1":"Marie Curie","2":"William Faulkner"}}'

In [69]:
# orient='records' emits a list with one {column: value} object per row.
df.to_json(orient='records')

'[{"category":"Physics","name":"Albert Einstein"},{"category":"Chemistry","name":"Marie Curie"},{"category":"Literature","name":"William Faulkner"}]'

# MongoDB to/from

In [71]:
def get_mongo_database(db_name, host='localhost',
                       port=27017, username=None, password=None):
    """Open a MongoDB connection and return a handle to one database.

    Parameters
    ----------
    db_name : str
        Name of the database to return. On the authenticated path it is also
        placed in the path component of the connection URI.
    host : str
        Server host name, passed to ``pymongo.MongoClient``.
    port : int
        Server port. Used on both the authenticated and the unauthenticated
        path.
    username, password : str or None
        Credentials given as plain, unescaped text. An authenticated URI is
        built only when both are truthy; otherwise the client connects
        without credentials.

    Returns
    -------
    The ``conn[db_name]`` database object of the new client.
    """
    if username and password:
        # MongoDB URIs require percent-escaped credentials; without this a
        # ':', '/' or '@' inside the user name or password corrupts the URI.
        mongo_uri = 'mongodb://%s:%s@%s/%s' % (
            quote_plus(username), quote_plus(password), host, db_name)
        # Hand the port over next to the URI so it is no longer silently
        # dropped; pymongo applies it to URI hosts that name no port themselves.
        conn = pymongo.MongoClient(mongo_uri, port)
    else:
        conn = pymongo.MongoClient(host, port)

    return conn[db_name]

In [72]:
def mongo_to_dataframe(db_name, collection, query=None,
                       host='localhost', port=27017,
                       username=None, password=None, no_id=True):
    """Run a ``find`` on a MongoDB collection and return the hits as a DataFrame.

    Parameters
    ----------
    db_name : str
        Database that holds the collection.
    collection : str
        Name of the collection to query.
    query : dict or None
        Filter document passed to ``find``. ``None`` (the default) is sent as
        an empty filter ``{}``.
    host, port, username, password
        Forwarded unchanged to ``get_mongo_database``.
    no_id : bool
        When true, the ``'_id'`` column is removed from the result if present.

    Returns
    -------
    pandas.DataFrame
        One row per returned document. A query with no hits gives an empty
        frame instead of raising.
    """
    db = get_mongo_database(db_name, host, port, username, password)

    # A None sentinel replaces the former shared mutable default ``{}``.
    cursor = db[collection].find({} if query is None else query)
    df = pd.DataFrame(list(cursor))

    # An empty result set produces a frame with no columns at all, so only
    # delete '_id' when it is really there (previously a KeyError).
    if no_id and '_id' in df.columns:
        del df['_id']

    return df

def dataframe_to_mongo(df, db_name, collection,
                       host='localhost', port=27017,
                       username=None, password=None):
    """Insert every row of ``df`` into a MongoDB collection as one document.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to store; it is converted with ``df.to_dict('records')``, i.e.
        one ``{column: value}`` dict per row.
    db_name : str
        Target database.
    collection : str
        Target collection inside that database.
    host, port, username, password
        Forwarded unchanged to ``get_mongo_database``.

    Returns
    -------
    None. A frame without rows is a no-op.
    """
    records = df.to_dict('records')
    # insert_many refuses an empty document list, so there is nothing to
    # send (and no reason to connect) when the frame has no rows.
    if not records:
        return

    db = get_mongo_database(db_name, host, port, username, password)
    db[collection].insert_many(records)

In [76]:
# Read the 'winners' collection of the 'nobel_prize' database with the default
# (match-everything) query, then check how many rows and columns came back.
winners = mongo_to_dataframe(db_name='nobel_prize', collection='winners')
winners.shape

(3, 5)

In [77]:
# Display the rows that came back from MongoDB.
winners

Unnamed: 0,category,name,nationality,sex,year
0,Physics,Albert Einstein,Swiss,male,1921
1,Physics,Paul Dirac,British,male,1933
2,Chemistry,Marie Curie,Polish,female,1911


# A bit w/ Series

Series can be passed, without any additional work, to most NumPy functions that take an ndarray.

In [78]:
# Four integers labelled 'a' through 'd'; the second argument is the index.
s = pd.Series(data=[1, 2, 3, 4], index=list('abcd'))
s

a    1
b    2
c    3
d    4
dtype: int64

In [82]:
# The NumPy ufunc is applied element-wise; the result is again a Series that
# keeps the a..d index, with the values now floats.
np.sqrt(s)

a    1.000000
b    1.414214
c    1.732051
d    2.000000
dtype: float64

In [83]:
# Element-wise square; integer input with an integer exponent stays integer.
np.power(s, 2)

a     1
b     4
c     9
d    16
dtype: int64

# Panels

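Just like DataFrames are made of Series instances, Panels are made of DataFrame instances, which gives them a third dimension. Note that `pd.Panel` was deprecated in pandas 0.20 and removed in 0.25; newer code should use a DataFrame with a MultiIndex (or the xarray package) instead.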

In [84]:
# Three-row frame: an integer column 'foo' and a string column 'bar'.
df1 = pd.DataFrame(dict(foo=[1, 2, 3], bar=list('abc')))
df1

Unnamed: 0,bar,foo
0,a,1
1,b,2
2,c,3


In [85]:
# Four-row frame (one row more than df1) with different column names.
df2 = pd.DataFrame(dict(baz=[7, 8, 9, 11], sky=list('pqrt')))
df2

Unnamed: 0,baz,sky
0,7,p
1,8,q
2,9,r
3,11,t


In [86]:
# pd.Panel was deprecated in pandas 0.20 and removed in 0.25, so this cell
# used to fail on any recent install. Where Panel still exists it is used
# exactly as before; otherwise fall back to the recommended replacement, a
# DataFrame whose columns carry an (item, column) MultiIndex. In both cases
# pn['item1'] / pn['item2'] return one DataFrame per item, with rows aligned
# on the union of both indexes (missing cells become NaN).
try:
    pn = pd.Panel({'item1': df1, 'item2': df2})
except (AttributeError, TypeError):
    # AttributeError: Panel no longer exists. TypeError: only a placeholder
    # class that cannot be constructed is left.
    pn = pd.concat({'item1': df1, 'item2': df2}, axis=1)
pn

<class 'pandas.core.panel.Panel'>
Dimensions: 2 (items) x 4 (major_axis) x 4 (minor_axis)
Items axis: item1 to item2
Major_axis axis: 0 to 3
Minor_axis axis: bar to sky

In [90]:
# Selecting an item gives back a DataFrame on the shared axes, so the row and
# columns that df1 never had are filled with NaN.
pn['item1']

Unnamed: 0,bar,baz,foo,sky
0,a,,1.0,
1,b,,2.0,
2,c,,3.0,
3,,,,


In [91]:
# Same shared axes for the second item: the columns that only df1 has are NaN.
pn['item2']

Unnamed: 0,bar,baz,foo,sky
0,,7,,p
1,,8,,q
2,,9,,r
3,,11,,t
