
# Pandas Review

## Import Libraries

In [0]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

---

## Installing and Using Pandas

In [2]:
pd.__version__

'1.0.4'

---

## Introducing Pandas Objects

### The Pandas Series Object

In [3]:
data = pd.Series([0.25, 0.5, 0.75, 1])
data

0    0.25
1    0.50
2    0.75
3    1.00
dtype: float64

In [4]:
data.values

array([0.25, 0.5 , 0.75, 1.  ])

In [5]:
data.index

RangeIndex(start=0, stop=4, step=1)

In [6]:
data[1]

0.5

In [7]:
data[1:3]

1    0.50
2    0.75
dtype: float64

#### Series as generalized NumPy array

In [8]:
data = pd.Series([0.25, 0.5, 0.75, 1],
                 index=['a', 'b', 'c', 'd'])
data

a    0.25
b    0.50
c    0.75
d    1.00
dtype: float64

In [9]:
# State name -> population count. The dict's insertion order is the order in
# which the labels appear in the resulting Series.
population_dict = {
    'California': 38332521,
    'Texas': 26448193,
    'New York': 19651127,
    'Florida': 19552860,
    'Illinois': 12882135,
}
# Dict keys become the index labels, dict values become the data.
population = pd.Series(data=population_dict)
population

California    38332521
Texas         26448193
New York      19651127
Florida       19552860
Illinois      12882135
dtype: int64

In [10]:
population['California']

38332521

In [11]:
population['Texas': 'New York']

Texas       26448193
New York    19651127
dtype: int64

In [12]:
data = pd.Series(np.random.randint(0, 10, 10))
data

0    2
1    4
2    5
3    4
4    8
5    6
6    7
7    7
8    9
9    1
dtype: int64

In [13]:
pd.Series({2:'a', 1:'b', 3:'c'}, index=[3, 2])

3    c
2    a
dtype: object

### The Pandas DataFrame Object

In [14]:
population

California    38332521
Texas         26448193
New York      19651127
Florida       19552860
Illinois      12882135
dtype: int64

In [0]:
area_dict = {'California': 423967, 'Texas': 695662, 'New York': 141297,             
             'Florida': 170312, 'Illinois': 149995}


In [16]:
area = pd.Series(area_dict)
area

California    423967
Texas         695662
New York      141297
Florida       170312
Illinois      149995
dtype: int64

In [17]:
states = pd.DataFrame({'population': population, 'area': area})
states

Unnamed: 0,population,area
California,38332521,423967
Texas,26448193,695662
New York,19651127,141297
Florida,19552860,170312
Illinois,12882135,149995


In [18]:
states.index

Index(['California', 'Texas', 'New York', 'Florida', 'Illinois'], dtype='object')

In [19]:
states.columns

Index(['population', 'area'], dtype='object')

In [20]:
states['area']

California    423967
Texas         695662
New York      141297
Florida       170312
Illinois      149995
Name: area, dtype: int64

In [21]:
pd.DataFrame(population, columns=['population'])

Unnamed: 0,population
California,38332521
Texas,26448193
New York,19651127
Florida,19552860
Illinois,12882135


In [22]:
data = [{'a': i, 'b': 2**i} for i in range(3)]
data

[{'a': 0, 'b': 1}, {'a': 1, 'b': 2}, {'a': 2, 'b': 4}]

In [23]:
pd.DataFrame(data)

Unnamed: 0,a,b
0,0,1
1,1,2
2,2,4


In [24]:
 pd.DataFrame([{'a': 1, 'b': 2}, {'b': 3, 'c': 4}])

Unnamed: 0,a,b,c
0,1.0,2,
1,,3,4.0


In [25]:
pd.DataFrame(np.random.randint(0, 10, size=(3, 2)),
             columns=['foo', 'bar'],
             index=['a', 'b', 'c'])

Unnamed: 0,foo,bar
a,7,9
b,1,8
c,0,9


### The Pandas Index Object

In [26]:
ind = pd.Index([2, 3, 4, 5, 11])
ind

Int64Index([2, 3, 4, 5, 11], dtype='int64')

In [27]:
ind[2]

4

In [28]:
print(ind.size, ind.shape, ind.ndim, ind.dtype)

5 (5,) 1 int64


In [0]:
indA = pd.Index([1, 3, 5, 7, 9])        
indB = pd.Index([2, 3, 5, 7, 11])

In [30]:
# Set intersection via the explicit method. The `&` operator spelling was
# deprecated for Index set operations and is element-wise logic in pandas 2.x.
indA.intersection(indB)

Int64Index([3, 5, 7], dtype='int64')

In [31]:
# Set union via the explicit method. The `|` operator spelling was deprecated
# for Index set operations and is element-wise logic in pandas 2.x.
indA.union(indB)

Int64Index([1, 2, 3, 5, 7, 9, 11], dtype='int64')

In [32]:
# Labels present in exactly one of the two indexes. The `^` operator spelling
# was deprecated for Index set operations and is element-wise logic in pandas 2.x.
indA.symmetric_difference(indB)

Int64Index([1, 2, 9, 11], dtype='int64')

---

## Data Indexing and Selection

### Data Selection in Series

In [33]:
data = pd.Series(np.random.randint(0, 10, 4),
                 index=list('abcd'))
data

a    7
b    2
c    2
d    9
dtype: int64

In [34]:
data['b']

2

In [35]:
'a' in data

True

In [36]:
data.keys()

Index(['a', 'b', 'c', 'd'], dtype='object')

In [37]:
list(data.items())

[('a', 7), ('b', 2), ('c', 2), ('d', 9)]

In [38]:
data['a':'c']

a    7
b    2
c    2
dtype: int64

In [39]:
data[0:2]

a    7
b    2
dtype: int64

In [40]:
data[(data > 3) & (data < 7)]

Series([], dtype: int64)

In [41]:
data

a    7
b    2
c    2
d    9
dtype: int64

In [42]:
data.iloc[1]

2

### Data Selection in DataFrame

In [43]:
# Two Series that share the same state labels.
area = pd.Series({
    'California': 423967,
    'Texas': 695662,
    'New York': 141297,
    'Florida': 170312,
    'Illinois': 149995,
})
pop = pd.Series({
    'California': 38332521,
    'Texas': 26448193,
    'New York': 19651127,
    'Florida': 19552860,
    'Illinois': 12882135,
})
# Each Series becomes one column; rows are aligned on the shared index labels.
data = pd.DataFrame({'area': area, 'pop': pop})
data

Unnamed: 0,area,pop
California,423967,38332521
Texas,695662,26448193
New York,141297,19651127
Florida,170312,19552860
Illinois,149995,12882135


In [44]:
print(data['area'])
print(data['pop'])

California    423967
Texas         695662
New York      141297
Florida       170312
Illinois      149995
Name: area, dtype: int64
California    38332521
Texas         26448193
New York      19651127
Florida       19552860
Illinois      12882135
Name: pop, dtype: int64


In [45]:
data.area is data['area']

True

In [46]:
data['density'] = data['pop'] / data['area']
data

Unnamed: 0,area,pop,density
California,423967,38332521,90.413926
Texas,695662,26448193,38.01874
New York,141297,19651127,139.076746
Florida,170312,19552860,114.806121
Illinois,149995,12882135,85.883763


In [47]:
data.values

array([[4.23967000e+05, 3.83325210e+07, 9.04139261e+01],
       [6.95662000e+05, 2.64481930e+07, 3.80187404e+01],
       [1.41297000e+05, 1.96511270e+07, 1.39076746e+02],
       [1.70312000e+05, 1.95528600e+07, 1.14806121e+02],
       [1.49995000e+05, 1.28821350e+07, 8.58837628e+01]])

In [48]:
data.T

Unnamed: 0,California,Texas,New York,Florida,Illinois
area,423967.0,695662.0,141297.0,170312.0,149995.0
pop,38332520.0,26448190.0,19651130.0,19552860.0,12882140.0
density,90.41393,38.01874,139.0767,114.8061,85.88376


In [49]:
data.iloc[:3, :2]

Unnamed: 0,area,pop
California,423967,38332521
Texas,695662,26448193
New York,141297,19651127


In [50]:
data.loc[data['density'] > 100, ['pop', 'density']]

Unnamed: 0,pop,density
New York,19651127,139.076746
Florida,19552860,114.806121


In [51]:
data[data['density'] > 100]

Unnamed: 0,area,pop,density
New York,141297,19651127,139.076746
Florida,170312,19552860,114.806121


---

## Operating on Data in Pandas

### UFuncs: Index Preservation

In [52]:
ser = pd.Series(np.random.randint(0, 10, 4))
ser

0    7
1    9
2    1
3    4
dtype: int64

In [53]:
df = pd.DataFrame(np.random.randint(0, 10, (3, 4)),
                  columns=list('ABCD'))
df

Unnamed: 0,A,B,C,D
0,5,1,0,9
1,9,5,7,6
2,4,4,5,8


In [54]:
np.exp(ser)

0    1096.633158
1    8103.083928
2       2.718282
3      54.598150
dtype: float64

In [55]:
np.sin(df * np.pi  / 4)

Unnamed: 0,A,B,C,D
0,-0.7071068,0.7071068,0.0,0.7071068
1,0.7071068,-0.7071068,-0.707107,-1.0
2,1.224647e-16,1.224647e-16,-0.707107,-2.449294e-16


### UFuncs: Index Alignment

In [56]:
A = pd.Series([2, 3, 4], index=[0, 1, 2])
B = pd.Series([1, 2, 3], index=[1, 2, 3])
A + B

0    NaN
1    4.0
2    6.0
3    NaN
dtype: float64

In [57]:
A.add(B, fill_value=0)

0    2.0
1    4.0
2    6.0
3    3.0
dtype: float64

In [58]:
A = pd.DataFrame(np.random.randint(0, 10, (2, 2)))
B = pd.DataFrame(np.random.randint(0, 10, (3, 3)))
print(A)
print(B)

   0  1
0  5  2
1  9  8
   0  1  2
0  8  5  8
1  1  1  3
2  7  3  8


In [59]:
A + B

Unnamed: 0,0,1,2
0,13.0,7.0,
1,10.0,9.0,
2,,,


In [60]:
fill = A.stack().mean()
A.add(B, fill_value=fill)

Unnamed: 0,0,1,2
0,13.0,7.0,14.0
1,10.0,9.0,9.0
2,13.0,9.0,14.0


In [61]:
df = pd.DataFrame(np.random.randint(0, 10, (3, 4)),
                  columns=list('QRST'))
df - df.iloc[0]

Unnamed: 0,Q,R,S,T
0,0,0,0,0
1,1,4,1,1
2,6,2,-3,0


In [62]:
something = df.iloc[2, ::2]
something

Q    8
S    2
Name: 2, dtype: int64

In [63]:
type((df - something).iloc[0, 1]) # type of a NaN

numpy.float64

---

## Handling Missing Data

In [64]:
vals1 = np.array([1, None, 3, 4])
vals1

array([1, None, 3, 4], dtype=object)

In [65]:
vals2 = np.array([1, np.nan, 3, 4])
vals2

array([ 1., nan,  3.,  4.])

In [66]:
vals2.dtype

dtype('float64')

In [67]:
12090 + np.nan

nan

In [68]:
vals2.sum()

nan

In [69]:
np.nansum(vals2)

8.0

In [70]:
pd.Series([1, 2, np.nan, None, 4])

0    1.0
1    2.0
2    NaN
3    NaN
4    4.0
dtype: float64

### Operating on Null Values

In [71]:
# Random 3x4 integer frame. Column A is cast to float up front so it can hold
# NaN: writing NaN into an int64 column relies on implicit upcasting, which
# newer pandas versions deprecate (PDEP-6).
df = pd.DataFrame(np.random.randint(0, 10, (3, 4)),
                  columns=list('ABCD')).astype({'A': 'float64'})
# Blank out column A (position 0) on every second row, i.e. rows 0 and 2.
df.iloc[0::2, 0] = np.nan
df

Unnamed: 0,A,B,C,D
0,,4,1,7
1,9.0,2,6,3
2,,8,7,1


In [72]:
df[df.notnull()]

Unnamed: 0,A,B,C,D
0,,4,1,7
1,9.0,2,6,3
2,,8,7,1


In [73]:
df.dropna()

Unnamed: 0,A,B,C,D
1,9.0,2,6,3


In [74]:
df.dropna(axis=1)

Unnamed: 0,B,C,D
0,4,1,7
1,2,6,3
2,8,7,1


In [75]:
data = pd.Series([1, np.nan, 2, None, 3], index=list('abcde'))
data

a    1.0
b    NaN
c    2.0
d    NaN
e    3.0
dtype: float64

In [76]:
data.fillna(0)

a    1.0
b    0.0
c    2.0
d    0.0
e    3.0
dtype: float64

In [77]:
# Forward fill: each missing value takes the last valid value before it.
# `fillna(method='ffill')` is deprecated in recent pandas; `ffill()` is the
# direct equivalent and returns a new Series.
data.ffill()

a    1.0
b    1.0
c    2.0
d    2.0
e    3.0
dtype: float64

---

## Hierarchical Indexing

### A Multiply Indexed Series

In [78]:
# (state, year) label pairs, listed in the same order as the counts below.
index = [
    ('California', 2000), ('California', 2010),
    ('New York', 2000), ('New York', 2010),
    ('Texas', 2000), ('Texas', 2010),
]
populations = [
    33871648, 37253956,
    18976457, 19378102,
    20851820, 25145561,
]
# Each tuple is used as a single label for the matching count.
pop = pd.Series(populations, index=index)
pop

(California, 2000)    33871648
(California, 2010)    37253956
(New York, 2000)      18976457
(New York, 2010)      19378102
(Texas, 2000)         20851820
(Texas, 2010)         25145561
dtype: int64

In [79]:
index = pd.MultiIndex.from_tuples(index)
index

MultiIndex([('California', 2000),
            ('California', 2010),
            (  'New York', 2000),
            (  'New York', 2010),
            (     'Texas', 2000),
            (     'Texas', 2010)],
           )

In [80]:
pop = pop.reindex(index)
pop

California  2000    33871648
            2010    37253956
New York    2000    18976457
            2010    19378102
Texas       2000    20851820
            2010    25145561
dtype: int64

In [81]:
pop[:, 2010]

California    37253956
New York      19378102
Texas         25145561
dtype: int64

In [82]:
pop['California']

2000    33871648
2010    37253956
dtype: int64

In [0]:
pop_df = pop.unstack()

In [84]:
pop_df

Unnamed: 0,2000,2010
California,33871648,37253956
New York,18976457,19378102
Texas,20851820,25145561


In [85]:
pop_df = pd.DataFrame({'total': pop,
                      'under18': [9267089, 9284094,                                           
                                  4687374, 4318033,                                           
                                  5906301, 6879014]})
pop_df

Unnamed: 0,Unnamed: 1,total,under18
California,2000,33871648,9267089
California,2010,37253956,9284094
New York,2000,18976457,4687374
New York,2010,19378102,4318033
Texas,2000,20851820,5906301
Texas,2010,25145561,6879014


In [86]:
f_u18 = pop_df['under18'] / pop_df['total']
f_u18.unstack()

Unnamed: 0,2000,2010
California,0.273594,0.249211
New York,0.24701,0.222831
Texas,0.283251,0.273568


### Methods of MultiIndex Creation

In [87]:
df = pd.DataFrame(np.random.rand(4, 2),
                  index=[['a', 'a', 'b', 'b'], [1, 2, 1, 2]],
                  columns=['data1', 'data2'])
df

Unnamed: 0,Unnamed: 1,data1,data2
a,1,0.024251,0.874112
a,2,0.225076,0.089542
b,1,0.657534,0.994704
b,2,0.891255,0.193048


In [88]:
data = {('California', 2000): 33871648,                
        ('California', 2010): 37253956,                
        ('Texas', 2000): 20851820,                
        ('Texas', 2010): 25145561,                
        ('New York', 2000): 18976457,                
        ('New York', 2010): 19378102}        
pd.Series(data)

California  2000    33871648
            2010    37253956
Texas       2000    20851820
            2010    25145561
New York    2000    18976457
            2010    19378102
dtype: int64

In [89]:
some_index = pd.MultiIndex.from_product([['A', 'B'], [1, 2]])
some_index

MultiIndex([('A', 1),
            ('A', 2),
            ('B', 1),
            ('B', 2)],
           )

In [0]:
some_df = pd.DataFrame(np.random.randint(0, 1000000, (4, 2)),
                       index=some_index,
                       columns=['data1', 'data2'])

In [91]:
some_df

Unnamed: 0,Unnamed: 1,data1,data2
A,1,214535,667002
A,2,263849,158685
B,1,858624,12822
B,2,530332,667860


In [92]:
pd.MultiIndex(levels=[['a', 'b'], [1, 2]],                       
              codes=[[0, 0, 0, 1], [0, 1, 0, 1]])

MultiIndex([('a', 1),
            ('a', 2),
            ('a', 1),
            ('b', 2)],
           )

In [93]:
pop

California  2000    33871648
            2010    37253956
New York    2000    18976457
            2010    19378102
Texas       2000    20851820
            2010    25145561
dtype: int64

In [94]:
pop.index.names = ['state', 'year']
pop

state       year
California  2000    33871648
            2010    37253956
New York    2000    18976457
            2010    19378102
Texas       2000    20851820
            2010    25145561
dtype: int64

In [0]:
index = pd.MultiIndex.from_product([[2013, 2014], [1, 2]],
                                    names=['year', 'visit'])
columns = pd.MultiIndex.from_product([['Bob', 'Guido', 'Sue'], ['HR', 'Temp']],                                     
                                     names=['subject', 'type'])

In [96]:
index

MultiIndex([(2013, 1),
            (2013, 2),
            (2014, 1),
            (2014, 2)],
           names=['year', 'visit'])

In [97]:
columns

MultiIndex([(  'Bob',   'HR'),
            (  'Bob', 'Temp'),
            ('Guido',   'HR'),
            ('Guido', 'Temp'),
            (  'Sue',   'HR'),
            (  'Sue', 'Temp')],
           names=['subject', 'type'])

In [0]:
# 4x6 array of standard-normal draws, rounded to one decimal place.
data = np.round(np.random.randn(4, 6), 1)
# Multiply every second column (positions 0, 2, 4) by ten, in place.
data[:, ::2] *= 10
# Shift every value up by 37, in place.
data += 37

In [99]:
health_data = pd.DataFrame(data, index=index, columns=columns)
health_data

Unnamed: 0_level_0,subject,Bob,Bob,Guido,Guido,Sue,Sue
Unnamed: 0_level_1,type,HR,Temp,HR,Temp,HR,Temp
year,visit,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2
2013,1,55.0,36.7,37.0,37.2,48.0,36.4
2013,2,41.0,34.5,33.0,37.4,47.0,37.0
2014,1,49.0,34.9,34.0,37.0,44.0,37.8
2014,2,12.0,35.0,32.0,36.7,39.0,39.1


In [100]:
health_data['Guido']

Unnamed: 0_level_0,type,HR,Temp
year,visit,Unnamed: 2_level_1,Unnamed: 3_level_1
2013,1,37.0,37.2
2013,2,33.0,37.4
2014,1,34.0,37.0
2014,2,32.0,36.7


### Indexing and Slicing a MultiIndex

#### Multiply Indexed Series

In [101]:
pop

state       year
California  2000    33871648
            2010    37253956
New York    2000    18976457
            2010    19378102
Texas       2000    20851820
            2010    25145561
dtype: int64

In [102]:
pop['California', 2000]

33871648

In [103]:
pop['California']

year
2000    33871648
2010    37253956
dtype: int64

In [104]:
pop.loc[:, 2000]

state
California    33871648
New York      18976457
Texas         20851820
dtype: int64

In [105]:
pop.loc['California': 'New York']

state       year
California  2000    33871648
            2010    37253956
New York    2000    18976457
            2010    19378102
dtype: int64

In [106]:
pop[pop > 22000000]

state       year
California  2000    33871648
            2010    37253956
Texas       2010    25145561
dtype: int64

In [107]:
pop[['California', 'Texas']]

state       year
California  2000    33871648
            2010    37253956
Texas       2000    20851820
            2010    25145561
dtype: int64

#### Multiply Indexed DataFrame

In [108]:
health_data

Unnamed: 0_level_0,subject,Bob,Bob,Guido,Guido,Sue,Sue
Unnamed: 0_level_1,type,HR,Temp,HR,Temp,HR,Temp
year,visit,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2
2013,1,55.0,36.7,37.0,37.2,48.0,36.4
2013,2,41.0,34.5,33.0,37.4,47.0,37.0
2014,1,49.0,34.9,34.0,37.0,44.0,37.8
2014,2,12.0,35.0,32.0,36.7,39.0,39.1


In [109]:
health_data['Guido', 'HR']

year  visit
2013  1        37.0
      2        33.0
2014  1        34.0
      2        32.0
Name: (Guido, HR), dtype: float64

In [110]:
health_data.iloc[:2, :2]

Unnamed: 0_level_0,subject,Bob,Bob
Unnamed: 0_level_1,type,HR,Temp
year,visit,Unnamed: 2_level_2,Unnamed: 3_level_2
2013,1,55.0,36.7
2013,2,41.0,34.5


In [111]:
health_data.loc[(2013, 2), :]

subject  type
Bob      HR      41.0
         Temp    34.5
Guido    HR      33.0
         Temp    37.4
Sue      HR      47.0
         Temp    37.0
Name: (2013, 2), dtype: float64

In [0]:
idx = pd.IndexSlice

In [113]:
type(idx)

pandas.core.indexing._IndexSlice

In [114]:
health_data.loc[idx[:, 1], idx[:, 'HR']]

Unnamed: 0_level_0,subject,Bob,Guido,Sue
Unnamed: 0_level_1,type,HR,HR,HR
year,visit,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2
2013,1,55.0,37.0,48.0
2014,1,49.0,34.0,44.0


### Rearranging Multi-Indices

#### Sorted and Unsorted Indices

In [115]:
index = pd.MultiIndex.from_product([['a', 'c', 'b'], [1, 2]],
                                   names=['char', 'int'])
data = pd.Series(np.random.normal(size=6),
                 index=index)
data

char  int
a     1     -1.929501
      2      0.363069
c     1     -0.504282
      2     -1.550620
b     1      0.086679
      2      0.122196
dtype: float64

In [116]:
data.sort_index()

char  int
a     1     -1.929501
      2      0.363069
b     1      0.086679
      2      0.122196
c     1     -0.504282
      2     -1.550620
dtype: float64

#### Stacking and Unstacking Indices

In [117]:
pop

state       year
California  2000    33871648
            2010    37253956
New York    2000    18976457
            2010    19378102
Texas       2000    20851820
            2010    25145561
dtype: int64

In [118]:
pop.unstack(level=0)

state,California,New York,Texas
year,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
2000,33871648,18976457,20851820
2010,37253956,19378102,25145561


In [119]:
pop.unstack(level=1)

year,2000,2010
state,Unnamed: 1_level_1,Unnamed: 2_level_1
California,33871648,37253956
New York,18976457,19378102
Texas,20851820,25145561


In [120]:
pop.unstack()

year,2000,2010
state,Unnamed: 1_level_1,Unnamed: 2_level_1
California,33871648,37253956
New York,18976457,19378102
Texas,20851820,25145561


#### Index setting and resetting

In [121]:
pop_flat = pop.reset_index(name='Population')
pop_flat

Unnamed: 0,state,year,Population
0,California,2000,33871648
1,California,2010,37253956
2,New York,2000,18976457
3,New York,2010,19378102
4,Texas,2000,20851820
5,Texas,2010,25145561


In [122]:
pop_flat.set_index(['state', 'year'])

Unnamed: 0_level_0,Unnamed: 1_level_0,Population
state,year,Unnamed: 2_level_1
California,2000,33871648
California,2010,37253956
New York,2000,18976457
New York,2010,19378102
Texas,2000,20851820
Texas,2010,25145561


### Data Aggregation on Multi-Indices

In [123]:
health_data

Unnamed: 0_level_0,subject,Bob,Bob,Guido,Guido,Sue,Sue
Unnamed: 0_level_1,type,HR,Temp,HR,Temp,HR,Temp
year,visit,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2
2013,1,55.0,36.7,37.0,37.2,48.0,36.4
2013,2,41.0,34.5,33.0,37.4,47.0,37.0
2014,1,49.0,34.9,34.0,37.0,44.0,37.8
2014,2,12.0,35.0,32.0,36.7,39.0,39.1


In [124]:
# Average the visits within each year. The `level=` keyword on reductions such
# as `mean` was deprecated in pandas 1.3 and removed in 2.0; grouping on the
# index level is the supported spelling of the same reduction.
data_mean = health_data.groupby(level='year').mean()
data_mean

subject,Bob,Bob,Guido,Guido,Sue,Sue
type,HR,Temp,HR,Temp,HR,Temp
year,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2
2013,48.0,35.6,35.0,37.3,47.5,36.7
2014,30.5,34.95,33.0,36.85,41.5,38.45


In [125]:
# Average across years for each visit number. `mean(level=...)` was removed in
# pandas 2.0, so group on the index level instead.
health_data.groupby(level='visit').mean()

subject,Bob,Bob,Guido,Guido,Sue,Sue
type,HR,Temp,HR,Temp,HR,Temp
visit,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2
1,52.0,35.8,35.5,37.1,46.0,37.1
2,26.5,34.75,32.5,37.05,43.0,38.05


In [126]:
# Average over subjects for each measurement type (a level of the columns).
# `mean(axis=1, level=...)` was removed in pandas 2.0 and column-wise groupby
# is deprecated as well, so transpose, group on the `type` index level, and
# transpose back.
data_mean.T.groupby(level='type').mean().T

type,HR,Temp
year,Unnamed: 1_level_1,Unnamed: 2_level_1
2013,43.5,36.533333
2014,35.0,36.75


---

## Combining Datasets: Concat and Append

In [0]:
def make_df(cols, ind):
    """Build a small DataFrame of labelled placeholder strings.

    Each cell holds the column label followed by the row label, e.g. the
    cell in column 'A', row 1 is 'A1'.

    Parameters
    ----------
    cols : iterable
        Column labels; iterating a string yields one column per character.
    ind : iterable
        Row labels, used both to build the cell text and as the index.

    Returns
    -------
    pandas.DataFrame
        One column per entry of ``cols``, indexed by ``ind``.
    """
    columns = {}
    for col in cols:
        # Cell text is the column label concatenated with the row label.
        columns[col] = [str(col) + str(row) for row in ind]
    return pd.DataFrame(columns, index=ind)

In [128]:
make_df('ABC', range(3))

Unnamed: 0,A,B,C
0,A0,B0,C0
1,A1,B1,C1
2,A2,B2,C2


In [129]:
ser1 = pd.Series(['A', 'B', 'C'], index=[1, 2, 3])
ser2 = pd.Series(['D', 'E', 'F'], index=[4, 5, 6])
pd.concat([ser1, ser2])

1    A
2    B
3    C
4    D
5    E
6    F
dtype: object

In [130]:
df1 = make_df('AB', [1, 2])
df2 = make_df('AB', [3, 4])
something = pd.concat([df1, df2])
something

Unnamed: 0,A,B
1,A1,B1
2,A2,B2
3,A3,B3
4,A4,B4


In [131]:
df3 = make_df('AB', [0, 1])
df4 = make_df('CD', [0, 1])
print(df3)
print(df4)
print(pd.concat([df3, df4], axis=1))

    A   B
0  A0  B0
1  A1  B1
    C   D
0  C0  D0
1  C1  D1
    A   B   C   D
0  A0  B0  C0  D0
1  A1  B1  C1  D1


In [132]:
x = make_df('AB', [0, 1])
y = make_df('AB', [2, 3])
# Reuse x's row labels on y so the concatenation below contains repeated labels.
y.index = x.index
print(x)
print(y)
print(pd.concat([x, y]))

    A   B
0  A0  B0
1  A1  B1
    A   B
0  A2  B2
1  A3  B3
    A   B
0  A0  B0
1  A1  B1
0  A2  B2
1  A3  B3


In [133]:
pd.concat([x, y]).stack()

0  A    A0
   B    B0
1  A    A1
   B    B1
0  A    A2
   B    B2
1  A    A3
   B    B3
dtype: object

In [134]:
pd.concat([x, y], ignore_index=True)

Unnamed: 0,A,B
0,A0,B0
1,A1,B1
2,A2,B2
3,A3,B3


In [135]:
pd.concat([x, y], keys=['x', 'y'])

Unnamed: 0,Unnamed: 1,A,B
x,0,A0,B0
x,1,A1,B1
y,0,A2,B2
y,1,A3,B3


In [136]:
df5 = make_df('ABC', [1, 2])
df6 = make_df('BCD', [3, 4])
pd.concat([df5, df6])

Unnamed: 0,A,B,C,D
1,A1,B1,C1,
2,A2,B2,C2,
3,,B3,C3,D3
4,,B4,C4,D4


In [137]:
pd.concat([df5, df6], join='inner')

Unnamed: 0,B,C
1,B1,C1
2,B2,C2
3,B3,C3
4,B4,C4


In [138]:
# DataFrame.append was deprecated in pandas 1.4 and removed in 2.0.
# pd.concat gives the same stacked result and likewise leaves df1 and df2
# unmodified.
pd.concat([df1, df2])

Unnamed: 0,A,B
1,A1,B1
2,A2,B2
3,A3,B3
4,A4,B4


---

## Combining Datasets: Merge and Join

### Categories of Joins

In [0]:
# One row per employee with the group they belong to.
df1 = pd.DataFrame({
    'employee': ['Bob', 'Jake', 'Lisa', 'Sue'],
    'group': ['Accounting', 'Engineering', 'Engineering', 'HR'],
})
# Same employees with a hire year; the rows are in a different order than df1.
df2 = pd.DataFrame({
    'employee': ['Lisa', 'Bob', 'Jake', 'Sue'],
    'hire_date': [2004, 2008, 2012, 2014],
})

In [140]:
pd.merge(df1, df2)

Unnamed: 0,employee,group,hire_date
0,Bob,Accounting,2008
1,Jake,Engineering,2012
2,Lisa,Engineering,2004
3,Sue,HR,2014


In [141]:
df3 = pd.merge(df1, df2)
df3

Unnamed: 0,employee,group,hire_date
0,Bob,Accounting,2008
1,Jake,Engineering,2012
2,Lisa,Engineering,2004
3,Sue,HR,2014


In [0]:
df4 = pd.DataFrame({'group': ['Accounting', 'Engineering', 'HR'],
                     'supervisor': ['Carly', 'Guido', 'Steve']})

In [143]:
df4

Unnamed: 0,group,supervisor
0,Accounting,Carly
1,Engineering,Guido
2,HR,Steve


In [144]:
pd.merge(df3, df4)

Unnamed: 0,employee,group,hire_date,supervisor
0,Bob,Accounting,2008,Carly
1,Jake,Engineering,2012,Guido
2,Lisa,Engineering,2004,Guido
3,Sue,HR,2014,Steve


In [0]:
df5 = pd.DataFrame({'group': ['Accounting', 'Accounting','Engineering', 'Engineering', 'HR', 'HR'],
                    'skills': ['math', 'spreadsheets', 'coding', 'linux', 'spreadsheets', 'organization']}) 

In [146]:
df5

Unnamed: 0,group,skills
0,Accounting,math
1,Accounting,spreadsheets
2,Engineering,coding
3,Engineering,linux
4,HR,spreadsheets
5,HR,organization


In [147]:
pd.merge(df1, df5)

Unnamed: 0,employee,group,skills
0,Bob,Accounting,math
1,Bob,Accounting,spreadsheets
2,Jake,Engineering,coding
3,Jake,Engineering,linux
4,Lisa,Engineering,coding
5,Lisa,Engineering,linux
6,Sue,HR,spreadsheets
7,Sue,HR,organization


In [148]:
pd.merge(df1, df2, on='employee')

Unnamed: 0,employee,group,hire_date
0,Bob,Accounting,2008
1,Jake,Engineering,2012
2,Lisa,Engineering,2004
3,Sue,HR,2014


In [149]:
df3 = pd.DataFrame({'name': ['Bob', 'Jake', 'Lisa', 'Sue'],
                    'salary': [70000, 80000, 120000, 90000]})
df3

Unnamed: 0,name,salary
0,Bob,70000
1,Jake,80000
2,Lisa,120000
3,Sue,90000


In [150]:
df1

Unnamed: 0,employee,group
0,Bob,Accounting
1,Jake,Engineering
2,Lisa,Engineering
3,Sue,HR


### Specification of the Merge Key

In [151]:
pd.merge(df1, df3, left_on='employee', right_on='name').drop('name', axis=1)

Unnamed: 0,employee,group,salary
0,Bob,Accounting,70000
1,Jake,Engineering,80000
2,Lisa,Engineering,120000
3,Sue,HR,90000


In [0]:
df1a = df1.set_index('employee')
df2a = df2.set_index('employee')

In [153]:
df1a

Unnamed: 0_level_0,group
employee,Unnamed: 1_level_1
Bob,Accounting
Jake,Engineering
Lisa,Engineering
Sue,HR


In [154]:
df2a

Unnamed: 0_level_0,hire_date
employee,Unnamed: 1_level_1
Lisa,2004
Bob,2008
Jake,2012
Sue,2014


In [155]:
pd.merge(df1a, df2a, left_index=True, right_index=True)

Unnamed: 0_level_0,group,hire_date
employee,Unnamed: 1_level_1,Unnamed: 2_level_1
Bob,Accounting,2008
Jake,Engineering,2012
Lisa,Engineering,2004
Sue,HR,2014


In [156]:
df1a.join(df2a)

Unnamed: 0_level_0,group,hire_date
employee,Unnamed: 1_level_1,Unnamed: 2_level_1
Bob,Accounting,2008
Jake,Engineering,2012
Lisa,Engineering,2004
Sue,HR,2014


In [157]:
pd.merge(df1a, df3, left_index=True, right_on='name')

Unnamed: 0,group,name,salary
0,Accounting,Bob,70000
1,Engineering,Jake,80000
2,Engineering,Lisa,120000
3,HR,Sue,90000


### Specifying Set Arithmetic for Joins

In [0]:
 df6 = pd.DataFrame({'name': ['Peter', 'Paul', 'Mary'],                            
                     'food': ['fish', 'beans', 'bread']},                           
                    columns=['name', 'food'])
 df7 = pd.DataFrame({'name': ['Mary', 'Joseph'],                            
                     'drink': ['wine', 'beer']},                           
                    columns=['name', 'drink']) 

In [159]:
df6

Unnamed: 0,name,food
0,Peter,fish
1,Paul,beans
2,Mary,bread


In [160]:
df7

Unnamed: 0,name,drink
0,Mary,wine
1,Joseph,beer


In [161]:
pd.merge(df6, df7)

Unnamed: 0,name,food,drink
0,Mary,bread,wine


In [162]:
pd.merge(df6, df7, how='inner')

Unnamed: 0,name,food,drink
0,Mary,bread,wine


In [163]:
pd.merge(df6, df7, how='outer')

Unnamed: 0,name,food,drink
0,Peter,fish,
1,Paul,beans,
2,Mary,bread,wine
3,Joseph,,beer


In [164]:
pd.merge(df6, df7, how='left')

Unnamed: 0,name,food,drink
0,Peter,fish,
1,Paul,beans,
2,Mary,bread,wine


### Overlapping Column Names: The suffixes Keyword

In [0]:
# Two rankings of the same four names; both frames call the ranking column `rank`.
df8 = pd.DataFrame({
    'name': ['Bob', 'Jake', 'Lisa', 'Sue'],
    'rank': [1, 2, 3, 4],
})
df9 = pd.DataFrame({
    'name': ['Bob', 'Jake', 'Lisa', 'Sue'],
    'rank': [3, 1, 4, 2],
})

In [166]:
df8

Unnamed: 0,name,rank
0,Bob,1
1,Jake,2
2,Lisa,3
3,Sue,4


In [167]:
df9

Unnamed: 0,name,rank
0,Bob,3
1,Jake,1
2,Lisa,4
3,Sue,2


In [168]:
pd.merge(df8, df9, on='name')

Unnamed: 0,name,rank_x,rank_y
0,Bob,1,3
1,Jake,2,1
2,Lisa,3,4
3,Sue,4,2


In [169]:
pd.merge(df8, df9, on='name', suffixes=['_one', '_two'])

Unnamed: 0,name,rank_one,rank_two
0,Bob,1,3
1,Jake,2,1
2,Lisa,3,4
3,Sue,4,2


### Example: US States Data

In [170]:
!curl -O https://raw.githubusercontent.com/jakevdp/data-USstates/master/state-population.csv 
!curl -O https://raw.githubusercontent.com/jakevdp/data-USstates/master/state-areas.csv 
!curl -O https://raw.githubusercontent.com/jakevdp/data-USstates/master/state-abbrevs.csv 

  % Total    % Received % Xferd  Average Speed   Time    Time     Time  Current
                                 Dload  Upload   Total   Spent    Left  Speed
100 57935  100 57935    0     0   254k      0 --:--:-- --:--:-- --:--:--  254k
  % Total    % Received % Xferd  Average Speed   Time    Time     Time  Current
                                 Dload  Upload   Total   Spent    Left  Speed
100   835  100   835    0     0   3479      0 --:--:-- --:--:-- --:--:--  3464
  % Total    % Received % Xferd  Average Speed   Time    Time     Time  Current
                                 Dload  Upload   Total   Spent    Left  Speed
100   872  100   872    0     0   4152      0 --:--:-- --:--:-- --:--:--  4152


In [0]:
pop = pd.read_csv('state-population.csv')
areas = pd.read_csv('state-areas.csv')
abbrevs = pd.read_csv('state-abbrevs.csv')

In [172]:
pop.head()

Unnamed: 0,state/region,ages,year,population
0,AL,under18,2012,1117489.0
1,AL,total,2012,4817528.0
2,AL,under18,2010,1130966.0
3,AL,total,2010,4785570.0
4,AL,under18,2011,1125763.0


In [173]:
areas.head()

Unnamed: 0,state,area (sq. mi)
0,Alabama,52423
1,Alaska,656425
2,Arizona,114006
3,Arkansas,53182
4,California,163707


In [174]:
abbrevs.head()

Unnamed: 0,state,abbreviation
0,Alabama,AL
1,Alaska,AK
2,Arizona,AZ
3,Arkansas,AR
4,California,CA


In [175]:
# Both frames name their key column `state`, so a single `on=` replaces the
# redundant left_on/right_on pair. how='outer' keeps states that appear in
# only one of the two tables.
areas_abbrevs = pd.merge(areas, abbrevs, on='state', how='outer')
areas_abbrevs.head()

Unnamed: 0,state,area (sq. mi),abbreviation
0,Alabama,52423,AL
1,Alaska,656425,AK
2,Arizona,114006,AZ
3,Arkansas,53182,AR
4,California,163707,CA


In [176]:
merged = pd.merge(areas_abbrevs, pop, left_on='abbreviation', right_on='state/region', how='inner')
merged.head()

Unnamed: 0,state,area (sq. mi),abbreviation,state/region,ages,year,population
0,Alabama,52423,AL,AL,under18,2012,1117489.0
1,Alabama,52423,AL,AL,total,2012,4817528.0
2,Alabama,52423,AL,AL,under18,2010,1130966.0
3,Alabama,52423,AL,AL,total,2010,4785570.0
4,Alabama,52423,AL,AL,under18,2011,1125763.0


In [177]:
# After the inner merge on abbreviation == state/region the two columns hold
# the same values, so drop `abbreviation`.
# Rebinding (instead of inplace=True) with errors='ignore' keeps the cell safe
# to re-run: a second execution is a no-op rather than a KeyError.
merged = merged.drop(columns='abbreviation', errors='ignore')
merged.head()

Unnamed: 0,state,area (sq. mi),state/region,ages,year,population
0,Alabama,52423,AL,under18,2012,1117489.0
1,Alabama,52423,AL,total,2012,4817528.0
2,Alabama,52423,AL,under18,2010,1130966.0
3,Alabama,52423,AL,total,2010,4785570.0
4,Alabama,52423,AL,under18,2011,1125763.0


In [178]:
merged.isnull().any()

state            False
area (sq. mi)    False
state/region     False
ages             False
year             False
population       False
dtype: bool

---

## Aggregation and Grouping

### Planets Data

In [179]:
import seaborn as sns
planets = sns.load_dataset('planets')
planets.shape

  import pandas.util.testing as tm


(1035, 6)

### Simple Aggregation in Pandas

In [180]:
rng = np.random.RandomState(42)
ser = pd.Series(rng.rand(5))
ser

0    0.374540
1    0.950714
2    0.731994
3    0.598658
4    0.156019
dtype: float64

In [181]:
ser.sum()

2.811925491708157

In [182]:
ser.mean()

0.5623850983416314

In [183]:
df = pd.DataFrame({'A': rng.rand(5),
                   'B': rng.rand(5)})
df

Unnamed: 0,A,B
0,0.155995,0.020584
1,0.058084,0.96991
2,0.866176,0.832443
3,0.601115,0.212339
4,0.708073,0.181825


In [184]:
df.mean()

A    0.477888
B    0.443420
dtype: float64

In [185]:
df.mean(axis=1)

0    0.088290
1    0.513997
2    0.849309
3    0.406727
4    0.444949
dtype: float64

In [186]:
planets.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1035 entries, 0 to 1034
Data columns (total 6 columns):
 #   Column          Non-Null Count  Dtype  
---  ------          --------------  -----  
 0   method          1035 non-null   object 
 1   number          1035 non-null   int64  
 2   orbital_period  992 non-null    float64
 3   mass            513 non-null    float64
 4   distance        808 non-null    float64
 5   year            1035 non-null   int64  
dtypes: float64(3), int64(2), object(1)
memory usage: 48.6+ KB


In [187]:
planets.method.head()

0    Radial Velocity
1    Radial Velocity
2    Radial Velocity
3    Radial Velocity
4    Radial Velocity
Name: method, dtype: object

In [188]:
planets.dropna().describe()

Unnamed: 0,number,orbital_period,mass,distance,year
count,498.0,498.0,498.0,498.0,498.0
mean,1.73494,835.778671,2.50932,52.068213,2007.37751
std,1.17572,1469.128259,3.636274,46.596041,4.167284
min,1.0,1.3283,0.0036,1.35,1989.0
25%,1.0,38.27225,0.2125,24.4975,2005.0
50%,1.0,357.0,1.245,39.94,2009.0
75%,2.0,999.6,2.8675,59.3325,2011.0
max,6.0,17337.5,25.0,354.0,2014.0


### GroupBy: Split, Apply, Combine