
# Pandas Review

## Handling Missing Data

In [64]:
# Mixing None into the list makes NumPy fall back to dtype=object.
vals1 = np.array([1, None, 3, 4])
vals1

array([1, None, 3, 4], dtype=object)

In [65]:
# np.nan is a floating-point value, so the array is stored as float64.
vals2 = np.array([1, np.nan, 3, 4])
vals2

array([ 1., nan,  3.,  4.])

In [66]:
# Confirm the dtype of the NaN-containing array.
vals2.dtype

dtype('float64')

In [67]:
# Arithmetic involving NaN yields NaN.
12090 + np.nan

nan

In [68]:
# A plain sum is poisoned by the NaN entry.
vals2.sum()

nan

In [69]:
# np.nansum skips the NaN entry when summing.
np.nansum(vals2)

8.0

In [70]:
# Both np.nan and None show up as NaN in the resulting float64 Series.
pd.Series([1, 2, np.nan, None, 4])

0    1.0
1    2.0
2    NaN
3    NaN
4    4.0
dtype: float64

### Operating on Null Values

In [71]:
# Random 3x4 integer frame; no seed is set, so the values differ on every run.
df = pd.DataFrame(np.random.randint(0, 10, (3, 4)),
                  columns=list('ABCD'))
# NaN is a float and cannot live in an integer column. Cast column A
# explicitly instead of relying on an implicit upcast during assignment,
# which recent pandas versions warn about.
df['A'] = df['A'].astype(float)
# Blank out rows 0 and 2 of the first column.
df.iloc[0::2, 0] = np.nan
df

Unnamed: 0,A,B,C,D
0,,4,1,7
1,9.0,2,6,3
2,,8,7,1


In [72]:
# Indexing with the not-null mask keeps the frame's shape; null cells stay NaN.
df[df.notnull()]

Unnamed: 0,A,B,C,D
0,,4,1,7
1,9.0,2,6,3
2,,8,7,1


In [73]:
# Default dropna() removes every row that contains a null value.
df.dropna()

Unnamed: 0,A,B,C,D
1,9.0,2,6,3


In [74]:
# axis=1 drops columns (rather than rows) that contain a null value.
df.dropna(axis=1)

Unnamed: 0,B,C,D
0,4,1,7
1,2,6,3
2,8,7,1


In [75]:
# Series with two missing entries (np.nan and None), labelled 'a'..'e'.
data = pd.Series([1, np.nan, 2, None, 3], index=list('abcde'))
data

a    1.0
b    NaN
c    2.0
d    NaN
e    3.0
dtype: float64

In [76]:
# Replace every missing entry with 0.
data.fillna(0)

a    1.0
b    0.0
c    2.0
d    0.0
e    3.0
dtype: float64

In [77]:
# Forward-fill: each missing entry takes the last valid value before it.
# Series.ffill() replaces the deprecated fillna(method='ffill') spelling.
data.ffill()

a    1.0
b    1.0
c    2.0
d    2.0
e    3.0
dtype: float64

---

## Hierarchical Indexing

### A Multiply Indexed Series

In [78]:
# (state, year) keys, grouped by state with the years in ascending order.
index = [(state, year)
         for state in ('California', 'New York', 'Texas')
         for year in (2000, 2010)]
# Population counts, listed in the same order as `index`.
populations = [
    33871648, 37253956,
    18976457, 19378102,
    20851820, 25145561,
]
pop = pd.Series(data=populations, index=index)
pop

(California, 2000)    33871648
(California, 2010)    37253956
(New York, 2000)      18976457
(New York, 2010)      19378102
(Texas, 2000)         20851820
(Texas, 2010)         25145561
dtype: int64

In [79]:
# Turn the list of (state, year) tuples into a two-level MultiIndex.
index = pd.MultiIndex.from_tuples(index)
index

MultiIndex([('California', 2000),
            ('California', 2010),
            (  'New York', 2000),
            (  'New York', 2010),
            (     'Texas', 2000),
            (     'Texas', 2010)],
           )

In [80]:
# Re-label pop against the MultiIndex; the displayed values are unchanged,
# but the index is now hierarchical.
pop = pop.reindex(index)
pop

California  2000    33871648
            2010    37253956
New York    2000    18976457
            2010    19378102
Texas       2000    20851820
            2010    25145561
dtype: int64

In [81]:
# Partial indexing on the second level: every state's 2010 entry.
pop[:, 2010]

California    37253956
New York      19378102
Texas         25145561
dtype: int64

In [82]:
# Partial indexing on the first level: all years for California.
pop['California']

2000    33871648
2010    37253956
dtype: int64

In [0]:
# Pivot the inner (year) level into columns, giving a state-by-year DataFrame.
pop_df = pop.unstack()

In [84]:
# Display the unstacked frame.
pop_df

Unnamed: 0,2000,2010
California,33871648,37253956
New York,18976457,19378102
Texas,20851820,25145561


In [85]:
# Put the total population next to the under-18 population for every
# (state, year) row; the list follows the row order of `pop`.
pop_df = pd.DataFrame(
    {
        'total': pop,
        'under18': [
            9267089, 9284094,  # California 2000, 2010
            4687374, 4318033,  # New York   2000, 2010
            5906301, 6879014,  # Texas      2000, 2010
        ],
    }
)
pop_df

Unnamed: 0,Unnamed: 1,total,under18
California,2000,33871648,9267089
California,2010,37253956,9284094
New York,2000,18976457,4687374
New York,2010,19378102,4318033
Texas,2000,20851820,5906301
Texas,2010,25145561,6879014


In [86]:
# Fraction of the population under 18 for each (state, year) row,
# then pivoted into a state-by-year table.
f_u18 = pop_df['under18'] / pop_df['total']
f_u18.unstack()

Unnamed: 0,2000,2010
California,0.273594,0.249211
New York,0.24701,0.222831
Texas,0.283251,0.273568


### Methods of MultiIndex Creation

In [87]:
# Passing a list of two index arrays builds a two-level row index implicitly.
# The data is random (no seed set), so values differ per run.
df = pd.DataFrame(np.random.rand(4, 2),
                  index=[['a', 'a', 'b', 'b'], [1, 2, 1, 2]],
                  columns=['data1', 'data2'])
df

Unnamed: 0,Unnamed: 1,data1,data2
a,1,0.024251,0.874112
a,2,0.225076,0.089542
b,1,0.657534,0.994704
b,2,0.891255,0.193048


In [88]:
# A dict keyed by (state, year) tuples; pd.Series turns the tuple keys
# into a two-level index, keeping the insertion order of the dict.
data = {
    ('California', 2000): 33871648, ('California', 2010): 37253956,
    ('Texas', 2000): 20851820, ('Texas', 2010): 25145561,
    ('New York', 2000): 18976457, ('New York', 2010): 19378102,
}
pd.Series(data)

California  2000    33871648
            2010    37253956
Texas       2000    20851820
            2010    25145561
New York    2000    18976457
            2010    19378102
dtype: int64

In [89]:
# Cartesian product of the two level lists: (A, 1), (A, 2), (B, 1), (B, 2).
some_index = pd.MultiIndex.from_product([['A', 'B'], [1, 2]])
some_index

MultiIndex([('A', 1),
            ('A', 2),
            ('B', 1),
            ('B', 2)],
           )

In [0]:
# Random integer data indexed by some_index (no seed set, so values vary per run).
some_df = pd.DataFrame(np.random.randint(0, 1000000, (4, 2)),
                       index=some_index,
                       columns=['data1', 'data2'])

In [91]:
# Display the frame built on the product MultiIndex.
some_df

Unnamed: 0,Unnamed: 1,data1,data2
A,1,214535,667002
A,2,263849,158685
B,1,858624,12822
B,2,530332,667860


In [92]:
# Low-level constructor: each `codes` list holds positions into the
# matching `levels` list, one entry per row of the index.
pd.MultiIndex(levels=[['a', 'b'], [1, 2]],
              codes=[[0, 0, 0, 1], [0, 1, 0, 1]])

MultiIndex([('a', 1),
            ('a', 2),
            ('a', 1),
            ('b', 2)],
           )

In [93]:
# Display pop before its index levels are given names.
pop

California  2000    33871648
            2010    37253956
New York    2000    18976457
            2010    19378102
Texas       2000    20851820
            2010    25145561
dtype: int64

In [94]:
# Name the two index levels; the names then appear in the display header.
pop.index.names = ['state', 'year']
pop

state       year
California  2000    33871648
            2010    37253956
New York    2000    18976457
            2010    19378102
Texas       2000    20851820
            2010    25145561
dtype: int64

In [0]:
# Row labels: every (year, visit) combination.
index = pd.MultiIndex.from_product(
    iterables=[[2013, 2014], [1, 2]],
    names=['year', 'visit'],
)
# Column labels: every (subject, type) combination.
columns = pd.MultiIndex.from_product(
    iterables=[['Bob', 'Guido', 'Sue'], ['HR', 'Temp']],
    names=['subject', 'type'],
)

In [96]:
# Display the (year, visit) row index.
index

MultiIndex([(2013, 1),
            (2013, 2),
            (2014, 1),
            (2014, 2)],
           names=['year', 'visit'])

In [97]:
# Display the (subject, type) column index.
columns

MultiIndex([(  'Bob',   'HR'),
            (  'Bob', 'Temp'),
            ('Guido',   'HR'),
            ('Guido', 'Temp'),
            (  'Sue',   'HR'),
            (  'Sue', 'Temp')],
           names=['subject', 'type'])

In [0]:
# Mock data: standard-normal draws rounded to one decimal
# (no seed set, so values vary per run).
data = np.round(np.random.randn(4, 6), 1)
# Multiply every other column (positions 0, 2, 4) by 10.
data[:, ::2] *= 10
# Shift all values up by 37.
data += 37

In [99]:
# DataFrame with a MultiIndex on both the rows (year, visit)
# and the columns (subject, type).
health_data = pd.DataFrame(data, index=index, columns=columns)
health_data

Unnamed: 0_level_0,subject,Bob,Bob,Guido,Guido,Sue,Sue
Unnamed: 0_level_1,type,HR,Temp,HR,Temp,HR,Temp
year,visit,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2
2013,1,55.0,36.7,37.0,37.2,48.0,36.4
2013,2,41.0,34.5,33.0,37.4,47.0,37.0
2014,1,49.0,34.9,34.0,37.0,44.0,37.8
2014,2,12.0,35.0,32.0,36.7,39.0,39.1


In [100]:
# Selecting a top-level column label returns that subject's sub-frame.
health_data['Guido']

Unnamed: 0_level_0,type,HR,Temp
year,visit,Unnamed: 2_level_1,Unnamed: 3_level_1
2013,1,37.0,37.2
2013,2,33.0,37.4
2014,1,34.0,37.0
2014,2,32.0,36.7


### Indexing and Slicing a MultiIndex

#### Multiply Indexed Series

In [101]:
# Display the multiply indexed Series used in the examples that follow.
pop

state       year
California  2000    33871648
            2010    37253956
New York    2000    18976457
            2010    19378102
Texas       2000    20851820
            2010    25145561
dtype: int64

In [102]:
# A full (state, year) key returns a single scalar.
pop['California', 2000]

33871648

In [103]:
# A partial key on the outer level returns a Series indexed by year.
pop['California']

year
2000    33871648
2010    37253956
dtype: int64

In [104]:
# Take every state (outer level) and fix the inner level to 2000.
pop.loc[:, 2000]

state
California    33871648
New York      18976457
Texas         20851820
dtype: int64

In [105]:
# Label slice on the outer level; both endpoints appear in the result.
pop.loc['California': 'New York']

state       year
California  2000    33871648
            2010    37253956
New York    2000    18976457
            2010    19378102
dtype: int64

In [106]:
# Boolean-mask selection: keep entries above 22 million.
pop[pop > 22000000]

state       year
California  2000    33871648
            2010    37253956
Texas       2010    25145561
dtype: int64

In [107]:
# Fancy indexing with a list of outer-level labels.
pop[['California', 'Texas']]

state       year
California  2000    33871648
            2010    37253956
Texas       2000    20851820
            2010    25145561
dtype: int64

#### Multiply Indexed DataFrame

In [108]:
# Display the multiply indexed DataFrame used in the examples that follow.
health_data

Unnamed: 0_level_0,subject,Bob,Bob,Guido,Guido,Sue,Sue
Unnamed: 0_level_1,type,HR,Temp,HR,Temp,HR,Temp
year,visit,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2
2013,1,55.0,36.7,37.0,37.2,48.0,36.4
2013,2,41.0,34.5,33.0,37.4,47.0,37.0
2014,1,49.0,34.9,34.0,37.0,44.0,37.8
2014,2,12.0,35.0,32.0,36.7,39.0,39.1


In [109]:
# A (subject, type) pair selects a single column, returned as a Series.
health_data['Guido', 'HR']

year  visit
2013  1        37.0
      2        33.0
2014  1        34.0
      2        32.0
Name: (Guido, HR), dtype: float64

In [110]:
# Positional selection: first two rows, first two columns.
health_data.iloc[:2, :2]

Unnamed: 0_level_0,subject,Bob,Bob
Unnamed: 0_level_1,type,HR,Temp
year,visit,Unnamed: 2_level_2,Unnamed: 3_level_2
2013,1,55.0,36.7
2013,2,41.0,34.5


In [111]:
# A (year, visit) tuple selects one row; the result is a Series
# indexed by the (subject, type) column labels.
health_data.loc[(2013, 2), :]

subject  type
Bob      HR      41.0
         Temp    34.5
Guido    HR      33.0
         Temp    37.4
Sue      HR      47.0
         Temp    37.0
Name: (2013, 2), dtype: float64

In [0]:
# Short alias for pd.IndexSlice, which lets slices be written inside index tuples.
idx = pd.IndexSlice

In [113]:
# Inspect the type of the IndexSlice helper object.
type(idx)

pandas.core.indexing._IndexSlice

In [114]:
# Rows: any year with visit == 1. Columns: every subject's HR reading.
health_data.loc[idx[:, 1], idx[:, 'HR']]

Unnamed: 0_level_0,subject,Bob,Guido,Sue
Unnamed: 0_level_1,type,HR,HR,HR
year,visit,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2
2013,1,55.0,37.0,48.0
2014,1,49.0,34.0,44.0


### Rearranging Multi-Indices

#### Sorted and Unsorted Indices

In [115]:
# MultiIndex whose outer level ('a', 'c', 'b') is not in sorted order,
# laid over random normal data (no seed set, so values vary per run).
index = pd.MultiIndex.from_product([['a', 'c', 'b'], [1, 2]],
                                   names=['char', 'int'])
data = pd.Series(np.random.normal(size=6),
                 index=index)
data

char  int
a     1     -1.929501
      2      0.363069
c     1     -0.504282
      2     -1.550620
b     1      0.086679
      2      0.122196
dtype: float64

In [116]:
# Display the Series with its index in sorted order (a, b, c).
data.sort_index()

char  int
a     1     -1.929501
      2      0.363069
b     1      0.086679
      2      0.122196
c     1     -0.504282
      2     -1.550620
dtype: float64

#### Stacking and Unstacking Indices

In [117]:
# Display pop again as the starting point for the unstack examples.
pop

state       year
California  2000    33871648
            2010    37253956
New York    2000    18976457
            2010    19378102
Texas       2000    20851820
            2010    25145561
dtype: int64

In [118]:
# level=0 moves the outer (state) level into the columns.
pop.unstack(level=0)

state,California,New York,Texas
year,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
2000,33871648,18976457,20851820
2010,37253956,19378102,25145561


In [119]:
# level=1 moves the inner (year) level into the columns.
pop.unstack(level=1)

year,2000,2010
state,Unnamed: 1_level_1,Unnamed: 2_level_1
California,33871648,37253956
New York,18976457,19378102
Texas,20851820,25145561


In [120]:
# With no argument the result here matches level=1 (the year level is unstacked).
pop.unstack()

year,2000,2010
state,Unnamed: 1_level_1,Unnamed: 2_level_1
California,33871648,37253956
New York,18976457,19378102
Texas,20851820,25145561


#### Index Setting and Resetting

In [121]:
# Turn the index levels into ordinary columns; name= labels the value column.
pop_flat = pop.reset_index(name='Population')
pop_flat

Unnamed: 0,state,year,Population
0,California,2000,33871648
1,California,2010,37253956
2,New York,2000,18976457
3,New York,2010,19378102
4,Texas,2000,20851820
5,Texas,2010,25145561


In [122]:
# Rebuild a two-level row index from the state and year columns.
pop_flat.set_index(['state', 'year'])

Unnamed: 0_level_0,Unnamed: 1_level_0,Population
state,year,Unnamed: 2_level_1
California,2000,33871648
California,2010,37253956
New York,2000,18976457
New York,2010,19378102
Texas,2000,20851820
Texas,2010,25145561


### Data Aggregation on Multi-Indices

In [123]:
# Display health_data again as the starting point for the aggregation examples.
health_data

Unnamed: 0_level_0,subject,Bob,Bob,Guido,Guido,Sue,Sue
Unnamed: 0_level_1,type,HR,Temp,HR,Temp,HR,Temp
year,visit,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2
2013,1,55.0,36.7,37.0,37.2,48.0,36.4
2013,2,41.0,34.5,33.0,37.4,47.0,37.0
2014,1,49.0,34.9,34.0,37.0,44.0,37.8
2014,2,12.0,35.0,32.0,36.7,39.0,39.1


In [124]:
# Average the visits within each year. The `level=` argument of
# DataFrame.mean was removed in pandas 2.0; grouping on the index level
# is the supported equivalent.
data_mean = health_data.groupby(level='year').mean()
data_mean

subject,Bob,Bob,Guido,Guido,Sue,Sue
type,HR,Temp,HR,Temp,HR,Temp
year,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2
2013,48.0,35.6,35.0,37.3,47.5,36.7
2014,30.5,34.95,33.0,36.85,41.5,38.45


In [125]:
# Average across years for each visit number. Uses groupby on the index
# level because DataFrame.mean(level=...) was removed in pandas 2.0.
health_data.groupby(level='visit').mean()

subject,Bob,Bob,Guido,Guido,Sue,Sue
type,HR,Temp,HR,Temp,HR,Temp
visit,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2
1,52.0,35.8,35.5,37.1,46.0,37.1
2,26.5,34.75,32.5,37.05,43.0,38.05


In [126]:
# Average over subjects for each measurement type (a column level).
# DataFrame.mean(axis=1, level=...) was removed in pandas 2.0, so transpose,
# group on the 'type' level of the (now row) index, and transpose back.
data_mean.T.groupby(level='type').mean().T

type,HR,Temp
year,Unnamed: 1_level_1,Unnamed: 2_level_1
2013,43.5,36.533333
2014,35.0,36.75


---

## Combining Datasets: Concat and Append

In [0]:
def make_df(cols, ind):
    """Build a small labelled DataFrame for demonstrations.

    Each cell holds the column label followed by the row label, e.g. the
    cell in column 'A', row 0 is the string 'A0'.

    Parameters
    ----------
    cols : iterable
        Column labels (a string is iterated character by character).
    ind : iterable
        Row labels; also used as the frame's index.

    Returns
    -------
    pandas.DataFrame
    """
    cells = {}
    for col in cols:
        cells[col] = [f"{col}{row}" for row in ind]
    return pd.DataFrame(cells, index=ind)

In [128]:
# Example: a 3x3 frame whose cells combine the column label and the row label.
make_df('ABC', range(3))

Unnamed: 0,A,B,C
0,A0,B0,C0
1,A1,B1,C1
2,A2,B2,C2


In [129]:
# Concatenating two Series joins them end to end, keeping each one's index labels.
ser1 = pd.Series(['A', 'B', 'C'], index=[1, 2, 3])
ser2 = pd.Series(['D', 'E', 'F'], index=[4, 5, 6])
pd.concat([ser1, ser2])

1    A
2    B
3    C
4    D
5    E
6    F
dtype: object

In [130]:
# Concatenate two frames that share the same columns; rows are appended.
df1 = make_df('AB', [1, 2])
df2 = make_df('AB', [3, 4])
something = pd.concat([df1, df2])
something

Unnamed: 0,A,B
1,A1,B1
2,A2,B2
3,A3,B3
4,A4,B4


In [131]:
# axis=1 places the frames side by side; both share the row labels 0 and 1.
df3 = make_df('AB', [0, 1])
df4 = make_df('CD', [0, 1])
print(df3)
print(df4)
print(pd.concat([df3, df4], axis=1))

    A   B
0  A0  B0
1  A1  B1
    C   D
0  C0  D0
1  C1  D1
    A   B   C   D
0  A0  B0  C0  D0
1  A1  B1  C1  D1


In [132]:
# Two frames forced to share the same row labels, to show that
# pd.concat keeps duplicate index values.
x = make_df('AB', [0, 1])
y = make_df('AB', [2, 3])
y.index = x.index  # make duplicate indices!
print(x)
print(y)
print(pd.concat([x, y]))

    A   B
0  A0  B0
1  A1  B1
    A   B
0  A2  B2
1  A3  B3
    A   B
0  A0  B0
1  A1  B1
0  A2  B2
1  A3  B3


In [133]:
# Stacking the concatenated frame keeps the duplicated row labels
# (0, 1, 0, 1) in the outer level of the result.
pd.concat([x, y]).stack()

0  A    A0
   B    B0
1  A    A1
   B    B1
0  A    A2
   B    B2
1  A    A3
   B    B3
dtype: object

In [134]:
# ignore_index=True discards the original row labels and numbers the rows afresh.
pd.concat([x, y], ignore_index=True)

Unnamed: 0,A,B
0,A0,B0
1,A1,B1
2,A2,B2
3,A3,B3


In [135]:
# keys= adds an outer index level recording which input each row came from.
pd.concat([x, y], keys=['x', 'y'])

Unnamed: 0,Unnamed: 1,A,B
x,0,A0,B0
x,1,A1,B1
y,0,A2,B2
y,1,A3,B3


In [136]:
# Frames with partly overlapping columns: the default join keeps the union
# of the columns and leaves the gaps as missing values.
df5 = make_df('ABC', [1, 2])
df6 = make_df('BCD', [3, 4])
pd.concat([df5, df6])

Unnamed: 0,A,B,C,D
1,A1,B1,C1,
2,A2,B2,C2,
3,,B3,C3,D3
4,,B4,C4,D4


In [137]:
# join='inner' keeps only the columns common to both frames (B and C).
pd.concat([df5, df6], join='inner')

Unnamed: 0,B,C
1,B1,C1
2,B2,C2
3,B3,C3
4,B4,C4


In [138]:
# DataFrame.append was removed in pandas 2.0; pd.concat gives the same
# result (a new frame with df2's rows after df1's).
pd.concat([df1, df2])

Unnamed: 0,A,B
1,A1,B1
2,A2,B2
3,A3,B3
4,A4,B4


---