In [1]:
%matplotlib inline
from IPython.core.display import HTML
from IPython.display import YouTubeVideo
from pandas_datareader import data, wb

import os
import pandas as pd
import numpy as np
import datetime

path1 = os.path.join(os.getcwd(),'style-table.css')
path2 = os.path.join(os.getcwd(),'style-notebook.css')

css = open(path1).read() + open(path2).read()
HTML('<style>{}</style>'.format(css))

### searching for outliers using describe()

In [52]:
# set a seed of 12345

np.random.seed(12345)

In [53]:
# make dataframe from 1000x4 numpy array of random numbers

df = pd.DataFrame( np.random.randn(1000,4) )
df.tail(2)

Unnamed: 0,0,1,2,3
998,-1.293122,-0.322542,-0.78296,-0.30334
999,0.089987,0.292291,1.177706,0.882755


In [54]:
# show basic stats about each column of df

df.describe()

Unnamed: 0,0,1,2,3
count,1000.0,1000.0,1000.0,1000.0
mean,-0.067684,0.067924,0.025598,-0.002298
std,0.998035,0.992106,1.006835,0.996794
min,-3.428254,-3.548824,-3.184377,-3.745356
25%,-0.77489,-0.591841,-0.641675,-0.644144
50%,-0.116401,0.101143,0.002073,-0.013611
75%,0.616366,0.780282,0.680391,0.654328
max,3.366626,2.653656,3.260383,3.927528


In [55]:
# show basic stats about first column of df
df.describe()[0]

count    1000.000000
mean       -0.067684
std         0.998035
min        -3.428254
25%        -0.774890
50%        -0.116401
75%         0.616366
max         3.366626
Name: 0, dtype: float64

### filtering through a series

In [56]:
# put first column of df into a series

col = df[0]

In [57]:
# return only rows where absolute value is above 3
# using pure python abs()

col[abs(col > 3)]

900    3.366626
Name: 0, dtype: float64

In [58]:
# return only rows where absolute value is above 3
# using numpy's abs()

col[np.abs(col > 3)]

900    3.366626
Name: 0, dtype: float64

### filtering through a dataframe

In [59]:
# return all rows where at least 1 cell has abs() > 3 

df[(np.abs(df) > 3).any(1)]

Unnamed: 0,0,1,2,3
5,-0.539741,0.476985,3.248944,-1.021228
97,-0.774363,0.552936,0.106061,3.927528
102,-0.655054,-0.56523,3.176873,0.959533
305,-2.315555,0.457246,-0.025907,-3.399312
324,0.050188,1.951312,3.260383,0.963301
400,0.146326,0.508391,-0.196713,-3.745356
499,-0.293333,-0.242459,-3.05699,1.918403
523,-3.428254,-0.296336,-0.439938,-0.867165
586,0.275144,1.179227,-3.184377,1.369891
808,-0.362528,-3.548824,1.553205,-2.186301


In [60]:
# how np.sign works: np.sign(7) = 1 / np.sign(-4) = -1

In [61]:
# cap outliers at abs() > 3
# any cell in df over 3 becomes 1*3
# any cell in df less than -3 becomes -1*3

df[np.abs(df)>3] = np.sign(df)*3

In [65]:
# notice min and max are capped to -3 and 3 now

df.describe()

Unnamed: 0,0,1,2,3
count,1000.0,1000.0,1000.0,1000.0
mean,-0.067623,0.068473,0.025153,-0.002081
std,0.995485,0.990253,1.003977,0.989736
min,-3.0,-3.0,-3.0,-3.0
25%,-0.77489,-0.591841,-0.641675,-0.644144
50%,-0.116401,0.101143,0.002073,-0.013611
75%,0.616366,0.780282,0.680391,0.654328
max,3.0,2.653656,3.0,3.0


### permutations

In [4]:
# make a dataframe from a 4x4 np array

df = pd.DataFrame( np.arange(16).reshape(4,4) )
df

Unnamed: 0,0,1,2,3
0,0,1,2,3
1,4,5,6,7
2,8,9,10,11
3,12,13,14,15


In [5]:
# make an array with random permutation of numbers 0-3

blender = np.random.permutation(4)
blender

array([0, 3, 1, 2])

In [6]:
# reorder row index order of df according to blender

df.take(blender)

Unnamed: 0,0,1,2,3
0,0,1,2,3
3,12,13,14,15
1,4,5,6,7
2,8,9,10,11


### permutations with replacement

In [25]:
# make a np array with numbers from 1 to 3

box = np.arange(1,4)
box

array([1, 2, 3])

In [26]:
# generate a random number between 0 and len(box), 10 times

shaker = np.random.randint(0,len(box),size=10)
shaker

array([2, 2, 0, 1, 0, 2, 1, 2, 2, 1])

In [27]:
# simulate: 
# 1. take a number from box
# 2. put it back in
# 3. shake
# 4. take again, repeat x10

box.take(shaker)

array([3, 3, 1, 2, 1, 3, 2, 3, 3, 2])

### grouping dataframes by dictionaries

In [None]:
# groupby() gives you categories
# you can really define anything as a category

In [32]:
# make a dataframe from a 4x4 np array
# add some null values to the dataframe

animals = pd.DataFrame( np.arange(16).reshape(4,4),
                      columns=['W','X','Y','Z'],
                      index=['Dog','Cat','Brid','Mouse'])

animals.ix[1:2,['W','Y']] = np.nan

animals

Unnamed: 0,W,X,Y,Z
Dog,0.0,1,2.0,3
Cat,,5,,7
Brid,8.0,9,10.0,11
Mouse,12.0,13,14.0,15


In [33]:
behavior_map = {'W':'good','X': 'bad','Y':'good','Z':'bad'}

In [39]:
# create groupby object, group columns by behavior_map

animal_col = animals.groupby(behavior_map,axis=1)

In [40]:
animal_col.sum()

Unnamed: 0,bad,good
Dog,4,2.0
Cat,12,
Brid,20,18.0
Mouse,28,26.0


In [41]:
animal_col.size()

bad     2
good    2
dtype: int64

### grouping dataframes by series

In [32]:
# make a dataframe from a 4x4 np array
# add some null values to the dataframe

animals = pd.DataFrame( np.arange(16).reshape(4,4),
                      columns=['W','X','Y','Z'],
                      index=['Dog','Cat','Brid','Mouse'])

animals.ix[1:2,['W','Y']] = np.nan

animals

Unnamed: 0,W,X,Y,Z
Dog,0.0,1,2.0,3
Cat,,5,,7
Brid,8.0,9,10.0,11
Mouse,12.0,13,14.0,15


In [58]:
behavior_map = {'W':'good','X': 'bad','Y':'good','Z':'bad'}

In [60]:
# make a series out of the dictionary

behav_series = pd.Series(behavior_map)
behav_series

W    good
X     bad
Y    good
Z     bad
dtype: object

In [64]:
# group by behav_series, show count
# dog had 2 instances on the bad columns and 2 on good

animals.groupby(behav_series,axis=1).count()

Unnamed: 0,bad,good
Dog,2,2
Cat,2,0
Brid,2,2
Mouse,2,2


In [65]:
animals

Unnamed: 0,W,X,Y,Z
Dog,0.0,1,2.0,3
Cat,,5,,7
Brid,8.0,9,10.0,11
Mouse,12.0,13,14.0,15


### grouping dataframes by functions

In [72]:
# make a dataframe from a 4x4 np array
# add some null values to the dataframe

animals = pd.DataFrame( np.arange(16).reshape(4,4),
                      columns=['W','X','Y','Z'],
                      index=['Dog','Cat','Brid','Mouse'])

animals.ix[1:2,['W','Y']] = np.nan

animals

Unnamed: 0,W,X,Y,Z
Dog,0.0,1,2.0,3
Cat,,5,,7
Brid,8.0,9,10.0,11
Mouse,12.0,13,14.0,15


In [73]:
# group by length of the indexes, show sums
# dog and cat get grouped into row 1 becase of same length

animals.groupby(len).sum()

Unnamed: 0,W,X,Y,Z
3,0,6,2,10
4,8,9,10,11
5,12,13,14,15


### grouping dataframes by mixed types

In [72]:
# make a dataframe from a 4x4 np array
# add some null values to the dataframe

animals = pd.DataFrame( np.arange(16).reshape(4,4),
                      columns=['W','X','Y','Z'],
                      index=['Dog','Cat','Brid','Mouse'])

animals.ix[1:2,['W','Y']] = np.nan

animals

Unnamed: 0,W,X,Y,Z
Dog,0.0,1,2.0,3
Cat,,5,,7
Brid,8.0,9,10.0,11
Mouse,12.0,13,14.0,15


In [71]:
keys = ['A','B','A','B']

In [74]:
# group by a function and a list

animals.groupby([len,keys]).max()

Unnamed: 0,Unnamed: 1,W,X,Y,Z
3,A,0.0,1,2.0,3
3,B,,5,,7
4,A,8.0,9,10.0,11
5,B,12.0,13,14.0,15


### grouping dataframes from a MultiIndex object

In [84]:
# make index
hier_col = pd.MultiIndex.from_arrays(
    [['NY','NY','NY','SF','SF'],[1,2,3,1,2]],
    names=['City','sub_value'])

In [83]:
# make dataframe, using the index above

dframe_hr = pd.DataFrame( np.arange(25).reshape(5,5),
                          columns=hier_col )
dframe_hr                          

City,NY,NY,NY,SF,SF
sub_value,1,2,3,1,2
0,0,1,2,3,4
1,5,6,7,8,9
2,10,11,12,13,14
3,15,16,17,18,19
4,20,21,22,23,24


In [81]:
dframe_hr = dframe_hr * 100
dframe_hr

City,NY,NY,NY,SF,SF
sub_value,1,2,3,1,2
0,0,100,200,300,400
1,500,600,700,800,900
2,1000,1100,1200,1300,1400
3,1500,1600,1700,1800,1900
4,2000,2100,2200,2300,2400


### grouping dataframes by their own columns

In [42]:
# create a dataframe from a dictionary with 4 keys/values

df1 = pd.DataFrame( { 'col1' : ['X','X','Y','Y','Z'] ,
                      'col2' : ['alpha','beta','alpha','beta','alpha'],
                  'dataset1' : np.random.randn(5),
                  'dataset2' : np.random.randn(5) })
df1

Unnamed: 0,col1,col2,dataset1,dataset2
0,X,alpha,-0.122771,-0.306369
1,X,beta,-0.494568,0.400328
2,Y,alpha,0.456802,-0.222658
3,Y,beta,-0.043058,-0.054694
4,Z,alpha,-1.914885,0.002023


In [43]:
# make a SeriesGroupBy object that can be queried with
# .max() .min() .mean() .size() and others

df1.dataset1.groupby(df1['col1'])

<pandas.core.groupby.SeriesGroupBy object at 0x000000000857AA90>

In [44]:
df1.dataset1.groupby(df1['col2']).mean()

col2
alpha   -0.526951
beta    -0.268813
Name: dataset1, dtype: float64

In [45]:
df1.dataset1.groupby(df1['col2']).size()

col2
alpha    3
beta     2
dtype: int64

In [46]:
type(df1.groupby('col1'))

pandas.core.groupby.DataFrameGroupBy

In [47]:
df1.groupby('col1').mean()

Unnamed: 0_level_0,dataset1,dataset2
col1,Unnamed: 1_level_1,Unnamed: 2_level_1
X,-0.30867,0.04698
Y,0.206872,-0.138676
Z,-1.914885,0.002023


In [48]:
df1.groupby(['col1','col2']).mean()

Unnamed: 0_level_0,Unnamed: 1_level_0,dataset1,dataset2
col1,col2,Unnamed: 2_level_1,Unnamed: 3_level_1
X,alpha,-0.122771,-0.306369
X,beta,-0.494568,0.400328
Y,alpha,0.456802,-0.222658
Y,beta,-0.043058,-0.054694
Z,alpha,-1.914885,0.002023


In [49]:
df1.groupby(['col1','col2']).mean().unstack()

Unnamed: 0_level_0,dataset1,dataset1,dataset2,dataset2
col2,alpha,beta,alpha,beta
col1,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2
X,-0.122771,-0.494568,-0.306369,0.400328
Y,0.456802,-0.043058,-0.222658,-0.054694
Z,-1.914885,,0.002023,


In [50]:
df1.groupby('col1').size()

col1
X    2
Y    2
Z    1
dtype: int64

### running for loops on groupby objects

In [51]:
for name,group in df1.groupby('col1'):
    print()
    print(name,"group:")
    print(group)


X group:
  col1   col2  dataset1  dataset2
0    X  alpha -0.122771 -0.306369
1    X   beta -0.494568  0.400328

Y group:
  col1   col2  dataset1  dataset2
2    Y  alpha  0.456802 -0.222658
3    Y   beta -0.043058 -0.054694

Z group:
  col1   col2  dataset1  dataset2
4    Z  alpha -1.914885  0.002023


In [52]:
df1

Unnamed: 0,col1,col2,dataset1,dataset2
0,X,alpha,-0.122771,-0.306369
1,X,beta,-0.494568,0.400328
2,Y,alpha,0.456802,-0.222658
3,Y,beta,-0.043058,-0.054694
4,Z,alpha,-1.914885,0.002023


In [53]:
for (c1,c2) , group in df1.groupby(['col1','col2']):
    print("||| Key 1 = {} ||| Key2 = {} |||".format(c1,c2))
    print(group,"\n")

||| Key 1 = X ||| Key2 = alpha |||
  col1   col2  dataset1  dataset2
0    X  alpha -0.122771 -0.306369 

||| Key 1 = X ||| Key2 = beta |||
  col1  col2  dataset1  dataset2
1    X  beta -0.494568  0.400328 

||| Key 1 = Y ||| Key2 = alpha |||
  col1   col2  dataset1  dataset2
2    Y  alpha  0.456802 -0.222658 

||| Key 1 = Y ||| Key2 = beta |||
  col1  col2  dataset1  dataset2
3    Y  beta -0.043058 -0.054694 

||| Key 1 = Z ||| Key2 = alpha |||
  col1   col2  dataset1  dataset2
4    Z  alpha -1.914885  0.002023 



### converting groupby objects to dictionaries

In [54]:
group_dict = dict(list(df1.groupby('col1')))
group_dict

{'X':   col1   col2  dataset1  dataset2
 0    X  alpha -0.122771 -0.306369
 1    X   beta -0.494568  0.400328, 'Y':   col1   col2  dataset1  dataset2
 2    Y  alpha  0.456802 -0.222658
 3    Y   beta -0.043058 -0.054694, 'Z':   col1   col2  dataset1  dataset2
 4    Z  alpha -1.914885  0.002023}

In [55]:
group_dict['X']

Unnamed: 0,col1,col2,dataset1,dataset2
0,X,alpha,-0.122771,-0.306369
1,X,beta,-0.494568,0.400328


In [56]:
df1[df1.col1 == 'X']

Unnamed: 0,col1,col2,dataset1,dataset2
0,X,alpha,-0.122771,-0.306369
1,X,beta,-0.494568,0.400328


In [57]:
# separate the dataframe by the types of data

group_dict_axis1 = dict(list(df1.groupby(df1.dtypes,axis=1)))
group_dict_axis1

{dtype('float64'):    dataset1  dataset2
 0 -0.122771 -0.306369
 1 -0.494568  0.400328
 2  0.456802 -0.222658
 3 -0.043058 -0.054694
 4 -1.914885  0.002023, dtype('O'):   col1   col2
 0    X  alpha
 1    X   beta
 2    Y  alpha
 3    Y   beta
 4    Z  alpha}

In [74]:
# look at dataset2 only
# group by col1 and col2
# average up all objects

dataset2group = df1.groupby(['col1','col2'])['dataset2']
dataset2group.mean()

col1  col2 
X     alpha   -1.130622
      beta    -0.151951
Y     alpha    0.800944
      beta    -0.473907
Z     alpha   -0.257534
Name: dataset2, dtype: float64