In [31]:
# Python for Data Analysis

# p. 123

import numpy as np
import pandas as pd


from pandas import Series, DataFrame

In [6]:
# p. 123

# Introduction to Series: a one-dimensional array of values paired with an
# index of labels. No index is passed here, so pandas supplies the default
# integer positions.

obj = pd.Series(data=[4, 7, -5, -3])
obj

0    4
1    7
2   -5
3   -3
dtype: int64

In [7]:
# .values exposes the data of the Series as a NumPy array (without the index)
obj.values

array([ 4,  7, -5, -3], dtype=int64)

In [8]:
# .index returns the index object of the Series; obj was built without
# explicit labels, so this is the default RangeIndex
obj.index

RangeIndex(start=0, stop=4, step=1)

In [10]:
# A Series can also be given explicit labels: pass them through `index`,
# one label per value.
obj2 = pd.Series(
    data=[1, 2, 3, 4],
    index=['a', 'b', 'c', 'd'],
)
obj2

a    1
b    2
c    3
d    4
dtype: int64

In [11]:
# p. 125
# Look up a single value by its index label; 'a' is the label that was
# attached to the first value of obj2.
obj2['a']

1

In [14]:
# Boolean-mask filtering: `obj2>1` is evaluated element by element, and
# indexing with the resulting True/False mask keeps only the entries where it
# is True. The surviving entries keep their original labels.
obj2[obj2>1]

b    2
c    3
d    4
dtype: int64

In [15]:
# Arithmetic on a Series is element-wise: multiplying obj2 by itself squares
# every value and leaves the labels unchanged.
obj2 * obj2

a     1
b     4
c     9
d    16
dtype: int64

In [18]:
# p. 126

# Book note: a Series behaves much like a dict (label -> value) and the two
# can be converted into each other; that is not demonstrated in this cell.

# Missing-data check: pd.isnull returns a boolean Series with the same labels,
# True wherever a value is null. obj2 has no missing values, so every entry
# comes back False.

pd.isnull(obj2)

# pd.notnull() is the negated counterpart.



a    False
b    False
c    False
d    False
dtype: bool

In [19]:
# p. 127

# Arithmetic between two Series aligns them on their index labels, much like
# a join on the index. obj is labelled 0-3 and obj2 is labelled 'a'-'d', so no
# label occurs in both: the result covers the union of the labels and every
# value is NaN (which is also why the dtype becomes float64).
obj + obj2

0   NaN
1   NaN
2   NaN
3   NaN
a   NaN
b   NaN
c   NaN
d   NaN
dtype: float64

In [22]:
# p. 128

# ---DATAFRAMES---

# A DataFrame has both a row index and a column index:
# "like a dict of Series all sharing the same index."


# p. 129
# Build one from a dict of equal-length lists: each key becomes a column and
# the rows get the default integer index.

data = {
    'state': ['Ohio', 'Ohio', 'Ohio', 'Nevada', 'Nevada', 'Nevada'],
    'year': [2000, 2001, 2002, 2001, 2002, 2003],
    'pop': [1.5, 1.7, 3.6, 2.4, 2.9, 3.2],
}
frame = pd.DataFrame(data)

# Ending the cell on the bare name lets the notebook render it as a table.
frame

Unnamed: 0,state,year,pop
0,Ohio,2000,1.5
1,Ohio,2001,1.7
2,Ohio,2002,3.6
3,Nevada,2001,2.4
4,Nevada,2002,2.9
5,Nevada,2003,3.2


In [23]:
# head() (like R's head) shows only the first five rows by default; frame has
# six rows, so the last one is left out.
frame.head()

Unnamed: 0,state,year,pop
0,Ohio,2000,1.5
1,Ohio,2001,1.7
2,Ohio,2002,3.6
3,Nevada,2001,2.4
4,Nevada,2002,2.9


In [1]:
# p. 132

# Needs `pd` from the import cell at the top of the notebook; run that cell
# first after a kernel restart, otherwise this cell raises NameError.

# Nested dict of dicts: state -> {year -> population}.
pop = {
    'Nevada': {2001: 2.4, 2002: 2.9},
    'Ohio': {2000: 1.5, 2001: 1.7, 2002: 3.6},
}

# The outer keys become the columns and the inner keys become the row index.
# A year missing from one state's inner dict (2000 for Nevada) shows up as NaN.
frame3 = pd.DataFrame(pop)

frame3


NameError: name 'pd' is not defined

In [26]:
# p. 133

# .T transposes the frame: the states become the rows and the years become
# the columns.
frame3.T

Unnamed: 0,2000,2001,2002
Nevada,,2.4,2.9
Ohio,1.5,1.7,3.6


In [28]:


# The row index and the column index can each carry a name, which is shown as
# a header when the frame is displayed. Note that these assignments modify
# frame3 in place, so every later cell sees the named axes.
frame3.index.name = 'year'
frame3.columns.name = 'state'

frame3

state,Nevada,Ohio
year,Unnamed: 1_level_1,Unnamed: 2_level_1
2000,,1.5
2001,2.4,1.7
2002,2.9,3.6


In [29]:
# Table 5.1 has a complete list of things you can pass to DataFrame
# they include lists of lists, lists of tuples, stuff like that. 

In [33]:
# p. 140

# ---INDEXING, SELECTING, AND FILTERING---

# Float values 0.0 through 3.0, labelled 'a' through 'd'.
obj4 = pd.Series(np.arange(4.0), index=['a', 'b', 'c', 'd'])

obj4

a    0.0
b    1.0
c    2.0
d    3.0
dtype: float64

In [34]:
# Slicing with integers inside [] is positional and excludes the end point:
# 1:3 picks the second and third elements ('b' and 'c'). Slicing with labels
# behaves differently (see the next cell).
obj4[1:3]

b    1.0
c    2.0
dtype: float64

In [36]:
# Slicing with labels includes the end label: 'b':'d' returns 'b', 'c' AND 'd'.
obj4['b':'d']

b    1.0
c    2.0
d    3.0
dtype: float64

In [37]:
# p. 142
# Indexing DataFrames: redisplay frame3 (years as rows, states as columns)
# as the reference for the selections that follow.
frame3

state,Nevada,Ohio
year,Unnamed: 1_level_1,Unnamed: 2_level_1
2000,,1.5
2001,2.4,1.7
2002,2.9,3.6


In [38]:
# Indexing with a single column label returns that column as a Series.
frame3['Nevada']

year
2000    NaN
2001    2.4
2002    2.9
Name: Nevada, dtype: float64

In [42]:
# Indexing with a list of column labels returns a DataFrame with those columns.
frame3[['Nevada', 'Ohio']]

state,Nevada,Ohio
year,Unnamed: 1_level_1,Unnamed: 2_level_1
2000,,1.5
2001,2.4,1.7
2002,2.9,3.6


In [45]:
# p. 143

# .loc selects by label, rows first and then columns. Here: the rows labelled
# 2000 and 2001 of the 'Nevada' column.
frame3.loc[[2000, 2001], 'Nevada']

year
2000    NaN
2001    2.4
Name: Nevada, dtype: float64

In [48]:
# .iloc selects by integer position instead of by label: rows 0 and 1 of
# column 0, which is the same selection as rows 2000/2001 of 'Nevada'.
frame3.iloc[[0,1], 0]

year
2000    NaN
2001    2.4
Name: Nevada, dtype: float64

In [50]:
# .iloc also accepts slices (omit the nested square brackets); the end
# position is excluded, so 0:2 again selects rows 0 and 1.
frame3.iloc[0:2, 0]

year
2000    NaN
2001    2.4
Name: Nevada, dtype: float64

In [51]:
# p. 151-2

# ---FUNCTION APPLICATION AND MAPPING---

# applying a function to each column


def f(x):
    """Return the spread (maximum minus minimum) of the values in x.

    x must provide .max() and .min(), e.g. the Series that DataFrame.apply
    passes in for each column or row.
    """
    # A named def instead of a lambda bound to a name: same call signature,
    # but the function carries a real name and a docstring.
    return x.max() - x.min()

# By default apply() calls f once per column, so the result is indexed by the
# column labels (the states).
frame3.apply(f)

state
Nevada    0.5
Ohio      2.1
dtype: float64

In [52]:
# Passing axis='columns' makes apply() call f once per row instead (working
# across the columns), so the result is indexed by year. For 2000 only the
# Ohio value is present, and the result is 0.0.

frame3.apply(f, axis='columns')

year
2000    0.0
2001    0.7
2002    0.7
dtype: float64

In [53]:
# p. 160 - useful table of summary statistics like .sum() you can apply to these objects

# p. 162 - finding unique values, value counts, membership (isin)

In [None]:
# p. 191

# ---DATA CLEANING AND PREPARATION---

# key methods:
# .dropna()
# .fillna()

# .duplicated() - find duplicate rows
# .drop_duplicates()

# .map() - build a new column by passing each value through a lookup (dict) or a function