In [2]:
import numpy as np
import pandas as pd

In [1]:
seq1 = ['foo', 'bar', 'baz']

In [2]:
seq2 = ['one', 'two', 'three']

In [3]:
seq1

['foo', 'bar', 'baz']

In [4]:
seq2

['one', 'two', 'three']

In [5]:
list(zip(seq1, seq2))

[('foo', 'one'), ('bar', 'two'), ('baz', 'three')]

In [6]:
enumerate(list(zip(seq1, seq2)))

<enumerate at 0x2510cff5d68>

In [7]:
for index, value in enumerate(zip(seq1, seq2)):
    print(f"index:{index} - value: {value}")

index:0 - value: ('foo', 'one')
index:1 - value: ('bar', 'two')
index:2 - value: ('baz', 'three')


In [8]:
for index, (val1, val2) in enumerate(zip(seq1, seq2)):
    print(f"index: {index} - value1: {val1} & value2:{val2}")

index: 0 - value1: foo & value2:one
index: 1 - value1: bar & value2:two
index: 2 - value1: baz & value2:three


In [9]:
pitchers = [('Nolan', 'Ryan'), ('Roger', 'Clemens'),
           ('Schilling', 'Curt')]

In [10]:
first_name, last_name = zip(*pitchers)

In [11]:
first_name

('Nolan', 'Roger', 'Schilling')

In [12]:
last_name

('Ryan', 'Clemens', 'Curt')

In [13]:
seq1

['foo', 'bar', 'baz']

In [14]:
seq2

['one', 'two', 'three']

In [15]:
dict(zip(seq1, seq2))

{'foo': 'one', 'bar': 'two', 'baz': 'three'}

In [16]:
words  = ['apple', 'bat', 'bar','atom', 'book']

In [17]:
by_letter = {}

In [18]:
for word in words:
    letter = word[0]
    if letter not in by_letter:
        by_letter[letter] = [word]
    else:
        by_letter[letter].append(word)

In [19]:
by_letter

{'a': ['apple', 'atom'], 'b': ['bat', 'bar', 'book']}

In [20]:
by_letter2 = {}

In [21]:
for word in words:
    letter = word[0]
    by_letter2.setdefault(letter, []).append(word)

In [22]:
by_letter2

{'a': ['apple', 'atom'], 'b': ['bat', 'bar', 'book']}

In [23]:
a = {1,2,3}

In [24]:
a

{1, 2, 3}

In [25]:
a.add(2)

In [26]:
a

{1, 2, 3}

In [27]:
a.add(5)

In [28]:
a

{1, 2, 3, 5}

In [29]:
strings = ['a', 'as', 'bat', 'car', 'dove', 'python']

In [30]:
[string.upper() for string in strings]

['A', 'AS', 'BAT', 'CAR', 'DOVE', 'PYTHON']

In [31]:
strings

['a', 'as', 'bat', 'car', 'dove', 'python']

In [32]:
[string.upper() for string in strings if len(string)>2]

['BAT', 'CAR', 'DOVE', 'PYTHON']

In [33]:
strings

['a', 'as', 'bat', 'car', 'dove', 'python']

In [34]:
{len(string) for string  in strings}

{1, 2, 3, 4, 6}

In [35]:
set(map(len, strings))

{1, 2, 3, 4, 6}

In [36]:
strings

['a', 'as', 'bat', 'car', 'dove', 'python']

In [37]:
loc_mapping = {index:value for index, value in enumerate(strings)}

In [38]:
loc_mapping

{0: 'a', 1: 'as', 2: 'bat', 3: 'car', 4: 'dove', 5: 'python'}

In [39]:
loc_mapping = {value:index for index, value in enumerate(strings)}

In [40]:
loc_mapping

{'a': 0, 'as': 1, 'bat': 2, 'car': 3, 'dove': 4, 'python': 5}

In [41]:
all_data = [['John', 'Emily', 'Micheal', 'Mary', 'Steven'],
           ['Maria', 'Juan', 'Javier', 'Natalia', 'Pillar']]

In [42]:
name_of_interest = []

In [43]:
for names in all_data:
    enough_es = [name for name in names if name.count('e')>=2]
    name_of_interest.extend(enough_es)

In [44]:
name_of_interest

['Steven']

In [45]:
result = [name for names in all_data for name in names if name.count('e')>=2]

In [46]:
result

['Steven']

In [47]:
some_tuples = [(1,2,3),(4,5,6),(7,8,9)]

In [48]:
flattened = [tup for tuples in some_tuples for tup in tuples]

In [49]:
flattened

[1, 2, 3, 4, 5, 6, 7, 8, 9]

In [50]:
[[tup for tup in tuples]for tuples in some_tuples]

[[1, 2, 3], [4, 5, 6], [7, 8, 9]]

In [52]:
my_arr = np.arange(1000000)

In [53]:
my_list = list(range(1000000))

In [54]:
%time for _ in range(10): my_arr2 = my_arr * 2

Wall time: 48.9 ms


In [55]:
%time for _ in range(10): my_list2 = my_list * 2

Wall time: 655 ms


In [56]:
data2 = [[1,2,3],[4,5,6],[7,8,9]]

In [57]:
data2

[[1, 2, 3], [4, 5, 6], [7, 8, 9]]

In [58]:
np.full_like(data2, 5)

array([[5, 5, 5],
       [5, 5, 5],
       [5, 5, 5]])

In [59]:
arr3d= [[[1,2,3],[4,5,6]],[[7,8,9],[10,11,12]]]

In [60]:
arr3d

[[[1, 2, 3], [4, 5, 6]], [[7, 8, 9], [10, 11, 12]]]

In [61]:
arr3 = np.asarray(arr3d)

In [62]:
arr3

array([[[ 1,  2,  3],
        [ 4,  5,  6]],

       [[ 7,  8,  9],
        [10, 11, 12]]])

In [63]:
points = np.arange(-5,5,0.01)

In [66]:
xs, ys = np.meshgrid(points, points)

In [67]:
xs

array([[-5.  , -4.99, -4.98, ...,  4.97,  4.98,  4.99],
       [-5.  , -4.99, -4.98, ...,  4.97,  4.98,  4.99],
       [-5.  , -4.99, -4.98, ...,  4.97,  4.98,  4.99],
       ...,
       [-5.  , -4.99, -4.98, ...,  4.97,  4.98,  4.99],
       [-5.  , -4.99, -4.98, ...,  4.97,  4.98,  4.99],
       [-5.  , -4.99, -4.98, ...,  4.97,  4.98,  4.99]])

In [68]:
ys

array([[-5.  , -5.  , -5.  , ..., -5.  , -5.  , -5.  ],
       [-4.99, -4.99, -4.99, ..., -4.99, -4.99, -4.99],
       [-4.98, -4.98, -4.98, ..., -4.98, -4.98, -4.98],
       ...,
       [ 4.97,  4.97,  4.97, ...,  4.97,  4.97,  4.97],
       [ 4.98,  4.98,  4.98, ...,  4.98,  4.98,  4.98],
       [ 4.99,  4.99,  4.99, ...,  4.99,  4.99,  4.99]])

In [69]:
xarr = np.array([1.1,1.2,1.3,1.4,1.5])

In [70]:
yarr = np.array([2.1,2.2,2.3,2.4,2.5])

In [71]:
cond = np.array([True, False , True, True, False])

In [72]:
result = [(x if c else y) for x, y, c in zip(xarr, yarr, cond)]

In [73]:
result

[1.1, 2.2, 1.3, 1.4, 2.5]

In [75]:
np.where(cond, xarr, yarr)

array([1.1, 2.2, 1.3, 1.4, 2.5])

In [85]:
arr = np.random.randn(4,4)

In [86]:
arr

array([[ 0.44081534,  0.57038304,  0.57499943,  1.04038086],
       [ 0.35725698, -0.0769408 , -0.9199575 ,  1.48121799],
       [-0.63319167,  0.58169913,  0.65285194, -2.80756693],
       [-0.59260836,  1.71145509,  1.47815797,  1.59598609]])

In [87]:
np.where(arr < 0, 0, arr)

array([[0.44081534, 0.57038304, 0.57499943, 1.04038086],
       [0.35725698, 0.        , 0.        , 1.48121799],
       [0.        , 0.58169913, 0.65285194, 0.        ],
       [0.        , 1.71145509, 1.47815797, 1.59598609]])

In [88]:
values = np.array([6,0,0,3,2,5,6])

In [89]:
values

array([6, 0, 0, 3, 2, 5, 6])

In [90]:
np.in1d(values, [2,3,6])

array([ True, False, False,  True,  True, False,  True])

In [92]:
pop = {
        'Nevada': {'2000': 3.4, '2001': 3.5},
        'Ohio': {'2000': 5.5, '2001': 2.3, '2002': 5.5}
      }

In [93]:
pop

{'Nevada': {'2000': 3.4, '2001': 3.5},
 'Ohio': {'2000': 5.5, '2001': 2.3, '2002': 5.5}}

In [94]:
df = pd.DataFrame(pop)

In [95]:
df

Unnamed: 0,Nevada,Ohio
2000,3.4,5.5
2001,3.5,2.3
2002,,5.5


In [97]:
obj3  = pd.Series(['blue', 'purple', 'yellow'], index = [0,3,5])

In [102]:
obj3.reindex(range(7),method='ffill')

0      blue
1      blue
2      blue
3    purple
4    purple
5    yellow
6    yellow
dtype: object

In [100]:
df

Unnamed: 0,Nevada,Ohio
2000,3.4,5.5
2001,3.5,2.3
2002,,5.5


In [103]:
df.reindex(columns = ['Ohio', 'California'])

Unnamed: 0,Ohio,California
2000,5.5,
2001,2.3,
2002,5.5,


In [104]:
df

Unnamed: 0,Nevada,Ohio
2000,3.4,5.5
2001,3.5,2.3
2002,,5.5


In [105]:
df.reindex(columns =['California', 'Ohio'],fill_value = 9)

Unnamed: 0,California,Ohio
2000,9,5.5
2001,9,2.3
2002,9,5.5


In [109]:
df

Unnamed: 0,Nevada,Ohio
2000,3.4,5.5
2001,3.5,2.3
2002,,5.5


In [113]:
df.reindex(columns =['California', 'Ohio'], copy=False)

Unnamed: 0,California,Ohio
2000,,5.5
2001,,2.3
2002,,5.5


In [2]:
sdata = {'Ohio': 35000, 'Texas': 71000, 'Oregon': 16000, 'Utah' : 5000}

In [4]:
sdata2 = {'Ohio': [35000,45000,25000], 'Texas': [71000,50000,20000], 'Oregon': 16000, 'Utah' : [5000,15000,60000]}

In [8]:
new_df = pd.DataFrame(sdata2)

In [10]:
new_df.head()

Unnamed: 0,Ohio,Texas,Oregon,Utah
0,35000,71000,16000,5000
1,45000,50000,16000,15000
2,25000,20000,16000,60000


In [11]:
new_df.index

RangeIndex(start=0, stop=3, step=1)

In [14]:
new_df.loc[[2,1]]

Unnamed: 0,Ohio,Texas,Oregon,Utah
2,25000,20000,16000,60000
1,45000,50000,16000,15000


In [15]:
sdata

{'Ohio': 35000, 'Texas': 71000, 'Oregon': 16000, 'Utah': 5000}

In [16]:
new_df

Unnamed: 0,Ohio,Texas,Oregon,Utah
0,35000,71000,16000,5000
1,45000,50000,16000,15000
2,25000,20000,16000,60000


In [22]:
new_df2 = pd.DataFrame(sdata, index=[0,1,2,0,0,1])

In [23]:
new_df2

Unnamed: 0,Ohio,Texas,Oregon,Utah
0,35000,71000,16000,5000
1,35000,71000,16000,5000
2,35000,71000,16000,5000
0,35000,71000,16000,5000
0,35000,71000,16000,5000
1,35000,71000,16000,5000


In [28]:
new_df2.loc[0]

Unnamed: 0,Ohio,Texas,Oregon,Utah
0,35000,71000,16000,5000
0,35000,71000,16000,5000
0,35000,71000,16000,5000


In [36]:
new_df2.index.is_monotonic_decreasing

False

In [2]:
header = ['a', 'b', 'c']

In [3]:
values = [['1', '2', '3'], ['1', '2', '3']]

In [4]:
header

['a', 'b', 'c']

In [5]:
values

[['1', '2', '3'], ['1', '2', '3']]

In [8]:
list(zip(*(header, values)))

[('a', ['1', '2', '3']), ('b', ['1', '2', '3'])]

In [15]:
list(zip(header, zip(*values)))

[('a', ('1', '1')), ('b', ('2', '2')), ('c', ('3', '3'))]

In [16]:
data_dict = {h:v for h,v in zip(header, zip(*values))}

In [18]:
data_dict

{'a': ('1', '1'), 'b': ('2', '2'), 'c': ('3', '3')}

In [20]:
list(zip(*values))

[('1', '1'), ('2', '2'), ('3', '3')]

In [21]:
list(zip(header, zip(*values)))

[('a', ('1', '1')), ('b', ('2', '2')), ('c', ('3', '3'))]

In [23]:
data = pd.DataFrame({'k1' : ['one', 'two'] * 3 + ['two'],
                    'k2': [1,4,2,3,3,4,4]})

In [24]:
data

Unnamed: 0,k1,k2
0,one,1
1,two,4
2,one,2
3,two,3
4,one,3
5,two,4
6,two,4


In [25]:
data.duplicated()

0    False
1    False
2    False
3    False
4    False
5     True
6     True
dtype: bool

In [26]:
data.drop_duplicates()

Unnamed: 0,k1,k2
0,one,1
1,two,4
2,one,2
3,two,3
4,one,3


In [27]:
data


Unnamed: 0,k1,k2
0,one,1
1,two,4
2,one,2
3,two,3
4,one,3
5,two,4
6,two,4


In [28]:
data['v1'] = range(7)

In [29]:
data

Unnamed: 0,k1,k2,v1
0,one,1,0
1,two,4,1
2,one,2,2
3,two,3,3
4,one,3,4
5,two,4,5
6,two,4,6


In [30]:
data.drop_duplicates('k1')

Unnamed: 0,k1,k2,v1
0,one,1,0
1,two,4,1


In [31]:
data.duplicated('k1')

0    False
1    False
2     True
3     True
4     True
5     True
6     True
dtype: bool

In [34]:
data.duplicated('k1',keep='last')

0     True
1     True
2     True
3     True
4    False
5     True
6    False
dtype: bool

In [33]:
data.drop_duplicates('k1',keep='last')

Unnamed: 0,k1,k2,v1
4,one,3,4
6,two,4,6


In [35]:
data = pd.DataFrame({'food': ['bacon', 'pulled pork', 'bacon',
                             'Pastrami', 'corned beef', 'Bacon', 
                             'pastrami', 'honey ham', 'nova lox'],
                    'ounces': [4,3,12,6,7.5,8,3,5,6]})

In [36]:
data

Unnamed: 0,food,ounces
0,bacon,4.0
1,pulled pork,3.0
2,bacon,12.0
3,Pastrami,6.0
4,corned beef,7.5
5,Bacon,8.0
6,pastrami,3.0
7,honey ham,5.0
8,nova lox,6.0


In [37]:
meat_to_animal = {
    'bacon' : 'pig', 
    'pulled pork' : 'pig', 
    'pastrami': 'cow',
    'corned beef': 'cow', 
    'honey ham': 'pig',
    'nova lox': 'salmon'
}

In [40]:
lowercased = data['food'].str.lower()

In [41]:
lowercased

0          bacon
1    pulled pork
2          bacon
3       pastrami
4    corned beef
5          bacon
6       pastrami
7      honey ham
8       nova lox
Name: food, dtype: object

In [42]:
data['animal'] = lowercased.map(meat_to_animal)

In [43]:
data

Unnamed: 0,food,ounces,animal
0,bacon,4.0,pig
1,pulled pork,3.0,pig
2,bacon,12.0,pig
3,Pastrami,6.0,cow
4,corned beef,7.5,cow
5,Bacon,8.0,pig
6,pastrami,3.0,cow
7,honey ham,5.0,pig
8,nova lox,6.0,salmon


In [44]:
data

Unnamed: 0,food,ounces,animal
0,bacon,4.0,pig
1,pulled pork,3.0,pig
2,bacon,12.0,pig
3,Pastrami,6.0,cow
4,corned beef,7.5,cow
5,Bacon,8.0,pig
6,pastrami,3.0,cow
7,honey ham,5.0,pig
8,nova lox,6.0,salmon


In [45]:
data['food'].map(lambda x: meat_to_animal[x.lower()])

0       pig
1       pig
2       pig
3       cow
4       cow
5       pig
6       cow
7       pig
8    salmon
Name: food, dtype: object

In [46]:
data = np.random.randn(1000) #Normally distributed

In [47]:
cats = pd.cut(data, 4)

In [48]:
pd.value_counts(cats)

(-1.307, 0.204]     475
(0.204, 1.715]      390
(-2.824, -1.307]     89
(1.715, 3.226]       46
dtype: int64

In [3]:
df  = pd.DataFrame(np.random.randn(7,3))

In [4]:
df

Unnamed: 0,0,1,2
0,0.428804,-0.262492,0.746596
1,-0.009159,-0.96448,-0.504379
2,0.275848,0.396006,1.748347
3,-1.618112,-1.453277,-0.558604
4,0.071093,0.973038,-0.69252
5,-0.429191,2.640176,2.405671
6,-0.698094,-0.269514,-0.946636


In [6]:
from numpy import nan as NA

In [7]:
df.iloc[:4,1]= NA

In [8]:
df

Unnamed: 0,0,1,2
0,0.428804,,0.746596
1,-0.009159,,-0.504379
2,0.275848,,1.748347
3,-1.618112,,-0.558604
4,0.071093,0.973038,-0.69252
5,-0.429191,2.640176,2.405671
6,-0.698094,-0.269514,-0.946636


In [9]:
df.iloc[0,2]= NA
df.iloc[2,2] = NA

In [10]:
df

Unnamed: 0,0,1,2
0,0.428804,,
1,-0.009159,,-0.504379
2,0.275848,,
3,-1.618112,,-0.558604
4,0.071093,0.973038,-0.69252
5,-0.429191,2.640176,2.405671
6,-0.698094,-0.269514,-0.946636


In [11]:
df.dropna(thresh=2)

Unnamed: 0,0,1,2
1,-0.009159,,-0.504379
3,-1.618112,,-0.558604
4,0.071093,0.973038,-0.69252
5,-0.429191,2.640176,2.405671
6,-0.698094,-0.269514,-0.946636


In [12]:
df

Unnamed: 0,0,1,2
0,0.428804,,
1,-0.009159,,-0.504379
2,0.275848,,
3,-1.618112,,-0.558604
4,0.071093,0.973038,-0.69252
5,-0.429191,2.640176,2.405671
6,-0.698094,-0.269514,-0.946636


In [13]:
df.fillna({1: 5, 2: 4})

Unnamed: 0,0,1,2
0,0.428804,5.0,4.0
1,-0.009159,5.0,-0.504379
2,0.275848,5.0,4.0
3,-1.618112,5.0,-0.558604
4,0.071093,0.973038,-0.69252
5,-0.429191,2.640176,2.405671
6,-0.698094,-0.269514,-0.946636


In [14]:
df

Unnamed: 0,0,1,2
0,0.428804,,
1,-0.009159,,-0.504379
2,0.275848,,
3,-1.618112,,-0.558604
4,0.071093,0.973038,-0.69252
5,-0.429191,2.640176,2.405671
6,-0.698094,-0.269514,-0.946636


In [16]:
df.fillna(0, inplace=True)

In [17]:
df

Unnamed: 0,0,1,2
0,0.428804,0.0,0.0
1,-0.009159,0.0,-0.504379
2,0.275848,0.0,0.0
3,-1.618112,0.0,-0.558604
4,0.071093,0.973038,-0.69252
5,-0.429191,2.640176,2.405671
6,-0.698094,-0.269514,-0.946636


In [18]:
df = pd.DataFrame(np.random.randn(6,3))

In [19]:
df

Unnamed: 0,0,1,2
0,0.770862,-0.087363,-1.361171
1,-1.127598,0.89236,-1.863659
2,-1.06897,-0.996277,-0.364332
3,-1.938621,-0.880263,-0.035376
4,3.560948,-1.105801,-0.546737
5,1.264648,0.504557,-0.005544


In [20]:
df.iloc[2:,1] = NA
df.iloc[4:,2] = NA

In [21]:
df

Unnamed: 0,0,1,2
0,0.770862,-0.087363,-1.361171
1,-1.127598,0.89236,-1.863659
2,-1.06897,,-0.364332
3,-1.938621,,-0.035376
4,3.560948,,
5,1.264648,,


In [25]:
df.fillna(method='ffill', limit=1)

Unnamed: 0,0,1,2
0,0.770862,-0.087363,-1.361171
1,-1.127598,0.89236,-1.863659
2,-1.06897,0.89236,-0.364332
3,-1.938621,,-0.035376
4,3.560948,,-0.035376
5,1.264648,,


In [26]:
df

Unnamed: 0,0,1,2
0,0.770862,-0.087363,-1.361171
1,-1.127598,0.89236,-1.863659
2,-1.06897,,-0.364332
3,-1.938621,,-0.035376
4,3.560948,,
5,1.264648,,


In [27]:
data = pd.Series([1.,NA,3.5,NA,7])

In [28]:
data.fillna(data.mean())

0    1.000000
1    3.833333
2    3.500000
3    3.833333
4    7.000000
dtype: float64

In [32]:
data = pd.DataFrame({'k1' : ['one', 'two'] * 3 + ['two'],
                    'k2': [1,4,2,3,3,4,4]})

In [33]:
data

Unnamed: 0,k1,k2
0,one,1
1,two,4
2,one,2
3,two,3
4,one,3
5,two,4
6,two,4


In [34]:
data.duplicated()

0    False
1    False
2    False
3    False
4    False
5     True
6     True
dtype: bool

In [35]:
data.drop_duplicates()

Unnamed: 0,k1,k2
0,one,1
1,two,4
2,one,2
3,two,3
4,one,3


In [37]:
data

Unnamed: 0,k1,k2
0,one,1
1,two,4
2,one,2
3,two,3
4,one,3
5,two,4
6,two,4


In [40]:
data['v1'] =  range(7)

In [41]:
data

Unnamed: 0,k1,k2,v1
0,one,1,0
1,two,4,1
2,one,2,2
3,two,3,3
4,one,3,4
5,two,4,5
6,two,4,6


In [44]:
data.drop_duplicates(['k1','k2'], keep='last')

Unnamed: 0,k1,k2,v1
0,one,1,0
2,one,2,2
3,two,3,3
4,one,3,4
6,two,4,6


In [53]:
data = pd.DataFrame({'food': ['bacon', 'pulled pork', 'bacon',
                             'Pastrami', 'corned beef', 'Bacon', 
                             'pastrami', 'honey ham', 'nova lox'],
                    'ounces': [4,3,12,6,7.5,8,3,5,6]})

In [54]:
data

Unnamed: 0,food,ounces
0,bacon,4.0
1,pulled pork,3.0
2,bacon,12.0
3,Pastrami,6.0
4,corned beef,7.5
5,Bacon,8.0
6,pastrami,3.0
7,honey ham,5.0
8,nova lox,6.0


In [55]:
meat_to_animal = {
    'bacon' : 'pig', 
    'pulled pork' : 'pig', 
    'pastrami': 'cow',
    'corned beef': 'cow', 
    'honey ham': 'pig',
    'nova lox': 'salmon'
}

In [56]:
data['animal'] = data['food'].str.lower().map(meat_to_animal)

In [57]:
data

Unnamed: 0,food,ounces,animal
0,bacon,4.0,pig
1,pulled pork,3.0,pig
2,bacon,12.0,pig
3,Pastrami,6.0,cow
4,corned beef,7.5,cow
5,Bacon,8.0,pig
6,pastrami,3.0,cow
7,honey ham,5.0,pig
8,nova lox,6.0,salmon


In [59]:
animal = data['food'].map(lambda x: meat_to_animal[x.lower()])

In [60]:
data

Unnamed: 0,food,ounces,animal
0,bacon,4.0,pig
1,pulled pork,3.0,pig
2,bacon,12.0,pig
3,Pastrami,6.0,cow
4,corned beef,7.5,cow
5,Bacon,8.0,pig
6,pastrami,3.0,cow
7,honey ham,5.0,pig
8,nova lox,6.0,salmon


In [61]:
data = pd.Series([1.,-999.,2.,-999.,-1000.,3.])

In [62]:
data

0       1.0
1    -999.0
2       2.0
3    -999.0
4   -1000.0
5       3.0
dtype: float64

In [65]:
data.replace(-999.,np.nan)

0       1.0
1       NaN
2       2.0
3       NaN
4   -1000.0
5       3.0
dtype: float64

In [68]:
data

0       1.0
1    -999.0
2       2.0
3    -999.0
4   -1000.0
5       3.0
dtype: float64

In [69]:
data.replace([-999.,-1000.], np.nan)

0    1.0
1    NaN
2    2.0
3    NaN
4    NaN
5    3.0
dtype: float64

In [70]:
data.replace([-999.,-1000.], [np.nan,10])

0     1.0
1     NaN
2     2.0
3     NaN
4    10.0
5     3.0
dtype: float64

In [71]:
data

0       1.0
1    -999.0
2       2.0
3    -999.0
4   -1000.0
5       3.0
dtype: float64

In [72]:
data.replace({-999.: np.nan, -1000. : 20})

0     1.0
1     NaN
2     2.0
3     NaN
4    20.0
5     3.0
dtype: float64

In [73]:
data = pd.DataFrame(np.arange(12).reshape((3,4)),
                   index = ['Ohio', 'Colorado', 'New York'],
                   columns = ['one', 'two', 'three', 'four'])

In [74]:
data

Unnamed: 0,one,two,three,four
Ohio,0,1,2,3
Colorado,4,5,6,7
New York,8,9,10,11


In [78]:
transform = lambda x: x[:4].upper()

In [79]:
data.index.map(transform)

Index(['OHIO', 'COLO', 'NEW '], dtype='object')

In [80]:
data

Unnamed: 0,one,two,three,four
Ohio,0,1,2,3
Colorado,4,5,6,7
New York,8,9,10,11


In [81]:
data.index = data.index.map(transform)

In [82]:
data

Unnamed: 0,one,two,three,four
OHIO,0,1,2,3
COLO,4,5,6,7
NEW,8,9,10,11


In [83]:
data.rename(index = str.title, columns=str.upper)

Unnamed: 0,ONE,TWO,THREE,FOUR
Ohio,0,1,2,3
Colo,4,5,6,7
New,8,9,10,11


In [84]:
data

Unnamed: 0,one,two,three,four
OHIO,0,1,2,3
COLO,4,5,6,7
NEW,8,9,10,11


In [85]:
data.rename(index={'OHIO' : 'INDIANA'},
           columns = {'three': 'peekaboo'})

Unnamed: 0,one,two,peekaboo,four
INDIANA,0,1,2,3
COLO,4,5,6,7
NEW,8,9,10,11


In [86]:
data

Unnamed: 0,one,two,three,four
OHIO,0,1,2,3
COLO,4,5,6,7
NEW,8,9,10,11


In [87]:
data.rename(index = {
    'OHIO': 'INDIANA'
}, inplace = True)

In [88]:
data

Unnamed: 0,one,two,three,four
INDIANA,0,1,2,3
COLO,4,5,6,7
NEW,8,9,10,11


In [90]:
ages = [20,22,25,27,21,23,37,31,61,45,41,32]

In [91]:
ages

[20, 22, 25, 27, 21, 23, 37, 31, 61, 45, 41, 32]

In [93]:
bins= [18,25,35,69,100]

In [94]:
bins

[18, 25, 35, 69, 100]

In [95]:
cats = pd.cut(ages, bins)

In [96]:
cats

[(18, 25], (18, 25], (18, 25], (25, 35], (18, 25], ..., (25, 35], (35, 69], (35, 69], (35, 69], (25, 35]]
Length: 12
Categories (4, interval[int64]): [(18, 25] < (25, 35] < (35, 69] < (69, 100]]

In [97]:
pd.value_counts(cats)

(18, 25]     5
(35, 69]     4
(25, 35]     3
(69, 100]    0
dtype: int64

In [100]:
type(cats)

pandas.core.arrays.categorical.Categorical

In [101]:
cats

[(18, 25], (18, 25], (18, 25], (25, 35], (18, 25], ..., (25, 35], (35, 69], (35, 69], (35, 69], (25, 35]]
Length: 12
Categories (4, interval[int64]): [(18, 25] < (25, 35] < (35, 69] < (69, 100]]

In [102]:
cats.codes

array([0, 0, 0, 1, 0, 0, 2, 1, 2, 2, 2, 1], dtype=int8)

In [103]:
cats.categories

IntervalIndex([(18, 25], (25, 35], (35, 69], (69, 100]],
              closed='right',
              dtype='interval[int64]')

In [104]:
ages

[20, 22, 25, 27, 21, 23, 37, 31, 61, 45, 41, 32]

In [105]:
cats

[(18, 25], (18, 25], (18, 25], (25, 35], (18, 25], ..., (25, 35], (35, 69], (35, 69], (35, 69], (25, 35]]
Length: 12
Categories (4, interval[int64]): [(18, 25] < (25, 35] < (35, 69] < (69, 100]]

In [106]:
pd.cut(ages, [18,26,36,61,100], right = False)

[[18, 26), [18, 26), [18, 26), [26, 36), [18, 26), ..., [26, 36), [61, 100), [36, 61), [36, 61), [26, 36)]
Length: 12
Categories (4, interval[int64]): [[18, 26) < [26, 36) < [36, 61) < [61, 100)]

In [107]:
group_names = ['Youth', 'YoungAdult', 'MiddleAges', 'Senior']

In [108]:
pd.cut(ages, bins, labels=group_names)

['Youth', 'Youth', 'Youth', 'YoungAdult', 'Youth', ..., 'YoungAdult', 'MiddleAges', 'MiddleAges', 'MiddleAges', 'YoungAdult']
Length: 12
Categories (4, object): ['Youth' < 'YoungAdult' < 'MiddleAges' < 'Senior']

In [109]:
data = np.random.rand(20)

In [110]:
data

array([0.71588273, 0.06968744, 0.10808406, 0.17644121, 0.7489789 ,
       0.82363454, 0.11857382, 0.38332379, 0.32877333, 0.91020065,
       0.24603483, 0.31074366, 0.24299416, 0.52731214, 0.07758485,
       0.02601116, 0.18513692, 0.07843672, 0.72631185, 0.2416532 ])

In [111]:
pd.cut(data, 4, precision=2)

[(0.69, 0.91], (0.025, 0.25], (0.025, 0.25], (0.025, 0.25], (0.69, 0.91], ..., (0.025, 0.25], (0.025, 0.25], (0.025, 0.25], (0.69, 0.91], (0.025, 0.25]]
Length: 20
Categories (4, interval[float64]): [(0.025, 0.25] < (0.25, 0.47] < (0.47, 0.69] < (0.69, 0.91]]

In [112]:
data = np.random.randn(1000)

In [113]:
cats = pd.qcut(data, 4)

In [114]:
cats

[(-0.678, -0.031], (0.612, 3.369], (-0.678, -0.031], (-0.031, 0.612], (0.612, 3.369], ..., (0.612, 3.369], (-3.709, -0.678], (0.612, 3.369], (-3.709, -0.678], (-3.709, -0.678]]
Length: 1000
Categories (4, interval[float64]): [(-3.709, -0.678] < (-0.678, -0.031] < (-0.031, 0.612] < (0.612, 3.369]]

In [115]:
pd.value_counts(cats)

(0.612, 3.369]      250
(-0.031, 0.612]     250
(-0.678, -0.031]    250
(-3.709, -0.678]    250
dtype: int64

In [116]:
data = pd.DataFrame(np.random.randn(1000,4))

In [117]:
data.describe()

Unnamed: 0,0,1,2,3
count,1000.0,1000.0,1000.0,1000.0
mean,-0.006062,0.022802,0.015871,-0.020239
std,1.012038,0.963657,1.019846,1.006077
min,-3.472599,-3.715663,-3.13083,-3.208978
25%,-0.658185,-0.6129,-0.632618,-0.675935
50%,-0.011086,0.034876,-0.005436,0.014652
75%,0.698539,0.660931,0.690048,0.669253
max,2.812977,2.588634,3.74844,3.250815


In [118]:
col = data[2]

In [119]:
col

0     -0.062500
1     -1.679435
2     -0.892980
3      0.604058
4     -2.179345
         ...   
995   -0.494619
996   -0.094404
997    1.798832
998   -0.231438
999    0.685416
Name: 2, Length: 1000, dtype: float64

In [120]:
col[np.abs(col)>3]

288   -3.130830
348    3.748440
415    3.058063
580    3.179005
652   -3.126701
Name: 2, dtype: float64

In [121]:
data[(np.abs(data)>3).any(1)]

Unnamed: 0,0,1,2,3
24,0.146495,0.559809,-1.886348,3.250815
38,-0.632178,1.623431,0.51108,3.219104
169,-1.220253,0.461796,1.674199,-3.208978
288,2.226862,-3.715663,-3.13083,-0.257697
348,1.467,-0.6874,3.74844,2.13955
415,0.713467,-0.933494,3.058063,-0.536434
499,-0.394325,-3.622326,1.700886,0.021275
505,-3.472599,-0.034567,1.462081,-1.437962
580,1.263111,-0.548234,3.179005,0.294056
652,0.906078,-0.177722,-3.126701,-0.633239


In [122]:
np.sign(-5)

-1

In [123]:
np.sign(0)

0

In [125]:
np.sign(5)

1

In [126]:
df = pd.DataFrame(np.arange(5*4).reshape((5,4)))

In [127]:
df

Unnamed: 0,0,1,2,3
0,0,1,2,3
1,4,5,6,7
2,8,9,10,11
3,12,13,14,15
4,16,17,18,19


In [133]:
sampler = np.random.permutation(5)

In [134]:
sampler

array([3, 0, 1, 4, 2])

In [135]:
df.iloc[sampler]

Unnamed: 0,0,1,2,3
3,12,13,14,15
0,0,1,2,3
1,4,5,6,7
4,16,17,18,19
2,8,9,10,11


In [136]:
df.take(sampler)

Unnamed: 0,0,1,2,3
3,12,13,14,15
0,0,1,2,3
1,4,5,6,7
4,16,17,18,19
2,8,9,10,11


In [137]:
df

Unnamed: 0,0,1,2,3
0,0,1,2,3
1,4,5,6,7
2,8,9,10,11
3,12,13,14,15
4,16,17,18,19


In [139]:
df.sample(n=3)

Unnamed: 0,0,1,2,3
2,8,9,10,11
1,4,5,6,7
0,0,1,2,3


In [141]:
choices = pd.Series([5,7,-1,6,4])

In [142]:
draws = choices.sample(n=10, replace=True)

In [143]:
draws

2   -1
4    4
2   -1
2   -1
3    6
4    4
3    6
1    7
1    7
1    7
dtype: int64

In [150]:
dataframe = pd.DataFrame({'key': ['b', 'b', 'a', 'c', 'a', 'b'],
                  'data1': range(6)})

In [151]:
dataframe

Unnamed: 0,key,data1
0,b,0
1,b,1
2,a,2
3,c,3
4,a,4
5,b,5


In [154]:
pd.get_dummies(dataframe['key'])

Unnamed: 0,a,b,c
0,0,1,0
1,0,1,0
2,1,0,0
3,0,0,1
4,1,0,0
5,0,1,0


In [155]:
dummies = pd.get_dummies(dataframe['key'], prefix='key')

In [156]:
df_with_dummy = df[['data1']].join(dummies)

In [157]:
df_with_dummy

Unnamed: 0,data1,key_a,key_b,key_c
0,0,0,1,0
1,1,0,1,0
2,2,1,0,0
3,3,0,0,1
4,4,1,0,0
5,5,0,1,0


In [158]:
mnames = ['movie_id', 'title', 'genres']

In [159]:
movies = pd.read_table('pydata-book-2nd-edition/datasets/movielens/movies.dat',
                      sep = '::', header = None, names= mnames)

  return read_csv(**locals())


In [160]:
movies[:10]

Unnamed: 0,movie_id,title,genres
0,1,Toy Story (1995),Animation|Children's|Comedy
1,2,Jumanji (1995),Adventure|Children's|Fantasy
2,3,Grumpier Old Men (1995),Comedy|Romance
3,4,Waiting to Exhale (1995),Comedy|Drama
4,5,Father of the Bride Part II (1995),Comedy
5,6,Heat (1995),Action|Crime|Thriller
6,7,Sabrina (1995),Comedy|Romance
7,8,Tom and Huck (1995),Adventure|Children's
8,9,Sudden Death (1995),Action
9,10,GoldenEye (1995),Action|Adventure|Thriller


In [161]:
all_generes = []

In [162]:
for x in movies.genres:
    all_generes.extend(x.split('|'))

In [164]:
genres = pd.unique(all_generes)

In [165]:
genres

array(['Animation', "Children's", 'Comedy', 'Adventure', 'Fantasy',
       'Romance', 'Drama', 'Action', 'Crime', 'Thriller', 'Horror',
       'Sci-Fi', 'Documentary', 'War', 'Musical', 'Mystery', 'Film-Noir',
       'Western'], dtype=object)

In [167]:
zero_matrix = np.zeros((len(movies), len(genres)))

In [169]:
zero_matrix.shape

(3883, 18)

In [171]:
dummies = pd.DataFrame(zero_matrix, columns=genres)

In [172]:
dummies

Unnamed: 0,Animation,Children's,Comedy,Adventure,Fantasy,Romance,Drama,Action,Crime,Thriller,Horror,Sci-Fi,Documentary,War,Musical,Mystery,Film-Noir,Western
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3878,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3879,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3880,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3881,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [174]:
gen = movies.genres[0]

In [175]:
gen

"Animation|Children's|Comedy"

In [176]:
gen.split('|')

['Animation', "Children's", 'Comedy']

In [179]:
dummies.columns.get_indexer(gen.split('|'))

array([0, 1, 2], dtype=int64)

In [181]:
movies.genres

0        Animation|Children's|Comedy
1       Adventure|Children's|Fantasy
2                     Comedy|Romance
3                       Comedy|Drama
4                             Comedy
                    ...             
3878                          Comedy
3879                           Drama
3880                           Drama
3881                           Drama
3882                  Drama|Thriller
Name: genres, Length: 3883, dtype: object

In [182]:
for i, gen in enumerate(movies.genres):
    indices = dummies.columns.get_indexer(gen.split('|'))
    dummies.iloc[i, indices] = 1

In [183]:
movies_windic = movies.join(dummies.add_prefix('Genre_'))

In [185]:
movies_windic.iloc[0]

movie_id                                       1
title                           Toy Story (1995)
genres               Animation|Children's|Comedy
Genre_Animation                                1
Genre_Children's                               1
Genre_Comedy                                   1
Genre_Adventure                                0
Genre_Fantasy                                  0
Genre_Romance                                  0
Genre_Drama                                    0
Genre_Action                                   0
Genre_Crime                                    0
Genre_Thriller                                 0
Genre_Horror                                   0
Genre_Sci-Fi                                   0
Genre_Documentary                              0
Genre_War                                      0
Genre_Musical                                  0
Genre_Mystery                                  0
Genre_Film-Noir                                0
Genre_Western       

In [186]:
np.random.seed(12345)

In [187]:
values = np.random.rand(10)

In [188]:
values

array([0.92961609, 0.31637555, 0.18391881, 0.20456028, 0.56772503,
       0.5955447 , 0.96451452, 0.6531771 , 0.74890664, 0.65356987])

In [189]:
bins = [0.,0.2,0.4,0.6,0.8,1.]

In [191]:
pd.get_dummies(pd.cut(values, bins))

Unnamed: 0,"(0.0, 0.2]","(0.2, 0.4]","(0.4, 0.6]","(0.6, 0.8]","(0.8, 1.0]"
0,0,0,0,0,1
1,0,1,0,0,0
2,1,0,0,0,0
3,0,1,0,0,0
4,0,0,1,0,0
5,0,0,1,0,0
6,0,0,0,0,1
7,0,0,0,1,0
8,0,0,0,1,0
9,0,0,0,1,0


In [193]:
val = 'a,b,    guido'

In [195]:
pieces = [x.strip() for x in val.split(',')]

In [196]:
pieces

['a', 'b', 'guido']

In [197]:
data = {'Dave': 'dave@google.com', 'Steve':'steve@gmail.com',
       'Rob':'rob@mgail.com', 'Wes': np.nan}


In [200]:
data = pd.Series(data)

In [201]:
data.str.contains('gmail')

Dave     False
Steve     True
Rob      False
Wes        NaN
dtype: object

In [220]:
data = pd.Series(np.random.randn(9),
                index = [['a', 'a', 'a', 'b', 'b', 'c', 'c', 'd', 'd'],
                        [1,2,3,1,3,1,2,2,3]]
                )

In [221]:
data

a  1    0.852965
   2   -0.955869
   3   -0.023493
b  1   -2.304234
   3   -0.652469
c  1   -1.218302
   2   -1.332610
d  2    1.074623
   3    0.723642
dtype: float64

In [222]:
data['a'][2]

-0.9558688522944143

In [223]:
data.index

MultiIndex([('a', 1),
            ('a', 2),
            ('a', 3),
            ('b', 1),
            ('b', 3),
            ('c', 1),
            ('c', 2),
            ('d', 2),
            ('d', 3)],
           )

In [224]:
data

a  1    0.852965
   2   -0.955869
   3   -0.023493
b  1   -2.304234
   3   -0.652469
c  1   -1.218302
   2   -1.332610
d  2    1.074623
   3    0.723642
dtype: float64

In [225]:
data['b']

1   -2.304234
3   -0.652469
dtype: float64

In [226]:
data['b':'c']

b  1   -2.304234
   3   -0.652469
c  1   -1.218302
   2   -1.332610
dtype: float64

In [227]:
data

a  1    0.852965
   2   -0.955869
   3   -0.023493
b  1   -2.304234
   3   -0.652469
c  1   -1.218302
   2   -1.332610
d  2    1.074623
   3    0.723642
dtype: float64

In [228]:
data.unstack()

Unnamed: 0,1,2,3
a,0.852965,-0.955869,-0.023493
b,-2.304234,,-0.652469
c,-1.218302,-1.33261,
d,,1.074623,0.723642


In [229]:
data.unstack().stack()

a  1    0.852965
   2   -0.955869
   3   -0.023493
b  1   -2.304234
   3   -0.652469
c  1   -1.218302
   2   -1.332610
d  2    1.074623
   3    0.723642
dtype: float64

In [232]:
frame = pd.DataFrame(np.arange(12).reshape((4,3)),
                    index = [['a','a','b','b'], [1,2,1,2]],
                     columns = [['Ohio', 'Ohio', 'Colorado'],
                               ['Green', 'Red', 'Green']]
                    )

In [233]:
frame

Unnamed: 0_level_0,Unnamed: 1_level_0,Ohio,Ohio,Colorado
Unnamed: 0_level_1,Unnamed: 1_level_1,Green,Red,Green
a,1,0,1,2
a,2,3,4,5
b,1,6,7,8
b,2,9,10,11


In [234]:
frame.unstack()

Unnamed: 0_level_0,Ohio,Ohio,Ohio,Ohio,Colorado,Colorado
Unnamed: 0_level_1,Green,Green,Red,Red,Green,Green
Unnamed: 0_level_2,1,2,1,2,1,2
a,0,3,1,4,2,5
b,6,9,7,10,8,11


In [236]:
frame

Unnamed: 0_level_0,Unnamed: 1_level_0,Ohio,Ohio,Colorado
Unnamed: 0_level_1,Unnamed: 1_level_1,Green,Red,Green
a,1,0,1,2
a,2,3,4,5
b,1,6,7,8
b,2,9,10,11


In [237]:
frame.index.names = ['key1', 'key2']

In [238]:
frame.columns.names = ['state', 'color']

In [239]:
frame

Unnamed: 0_level_0,state,Ohio,Ohio,Colorado
Unnamed: 0_level_1,color,Green,Red,Green
key1,key2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2
a,1,0,1,2
a,2,3,4,5
b,1,6,7,8
b,2,9,10,11


In [241]:
frame['Ohio']

Unnamed: 0_level_0,color,Green,Red
key1,key2,Unnamed: 2_level_1,Unnamed: 3_level_1
a,1,0,1
a,2,3,4
b,1,6,7
b,2,9,10


In [242]:
from pandas import MultiIndex

In [243]:
MultiIndex.from_arrays([['Ohio','Ohio','Colorado'],['Green','Red','Green']],
                      names=['state','color'])

MultiIndex([(    'Ohio', 'Green'),
            (    'Ohio',   'Red'),
            ('Colorado', 'Green')],
           names=['state', 'color'])

In [244]:
frame

Unnamed: 0_level_0,state,Ohio,Ohio,Colorado
Unnamed: 0_level_1,color,Green,Red,Green
key1,key2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2
a,1,0,1,2
a,2,3,4,5
b,1,6,7,8
b,2,9,10,11


In [245]:
frame.swaplevel('key1','key2')

Unnamed: 0_level_0,state,Ohio,Ohio,Colorado
Unnamed: 0_level_1,color,Green,Red,Green
key2,key1,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2
1,a,0,1,2
2,a,3,4,5
1,b,6,7,8
2,b,9,10,11


In [246]:
frame

Unnamed: 0_level_0,state,Ohio,Ohio,Colorado
Unnamed: 0_level_1,color,Green,Red,Green
key1,key2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2
a,1,0,1,2
a,2,3,4,5
b,1,6,7,8
b,2,9,10,11


In [248]:
frame.index

MultiIndex([('a', 1),
            ('a', 2),
            ('b', 1),
            ('b', 2)],
           names=['key1', 'key2'])

In [252]:
frame.sort_index(level = 1)

Unnamed: 0_level_0,state,Ohio,Ohio,Colorado
Unnamed: 0_level_1,color,Green,Red,Green
key1,key2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2
a,1,0,1,2
b,1,6,7,8
a,2,3,4,5
b,2,9,10,11


In [254]:
frame

Unnamed: 0_level_0,state,Ohio,Ohio,Colorado
Unnamed: 0_level_1,color,Green,Red,Green
key1,key2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2
a,1,0,1,2
a,2,3,4,5
b,1,6,7,8
b,2,9,10,11


In [255]:
frame.swaplevel(0,1).sort_index(level = 0)

Unnamed: 0_level_0,state,Ohio,Ohio,Colorado
Unnamed: 0_level_1,color,Green,Red,Green
key2,key1,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2
1,a,0,1,2
1,b,6,7,8
2,a,3,4,5
2,b,9,10,11


In [256]:
frame

Unnamed: 0_level_0,state,Ohio,Ohio,Colorado
Unnamed: 0_level_1,color,Green,Red,Green
key1,key2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2
a,1,0,1,2
a,2,3,4,5
b,1,6,7,8
b,2,9,10,11


In [259]:
frame.sum(level='key2')

state,Ohio,Ohio,Colorado
color,Green,Red,Green
key2,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2
1,6,8,10
2,12,14,16


In [260]:
frame.sum(level='color', axis=1)

Unnamed: 0_level_0,color,Green,Red
key1,key2,Unnamed: 2_level_1,Unnamed: 3_level_1
a,1,2,1
a,2,8,4
b,1,14,7
b,2,20,10


In [261]:
frame = pd.DataFrame({'a': range(7), 'b':range(7,0,-1),
                     'c': ['one', 'one', 'one', 'two', 'two', 'two', 'two'],
                     'd': [0,1,2,9,1,2,3]})

In [262]:
frame

Unnamed: 0,a,b,c,d
0,0,7,one,0
1,1,6,one,1
2,2,5,one,2
3,3,4,two,9
4,4,3,two,1
5,5,2,two,2
6,6,1,two,3


In [263]:
frame2 = frame.set_index(keys=['c','d'])

In [264]:
frame2

Unnamed: 0_level_0,Unnamed: 1_level_0,a,b
c,d,Unnamed: 2_level_1,Unnamed: 3_level_1
one,0,0,7
one,1,1,6
one,2,2,5
two,9,3,4
two,1,4,3
two,2,5,2
two,3,6,1


In [265]:
frame.set_index(keys=['c','d'], drop=False)

Unnamed: 0_level_0,Unnamed: 1_level_0,a,b,c,d
c,d,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
one,0,0,7,one,0
one,1,1,6,one,1
one,2,2,5,one,2
two,9,3,4,two,9
two,1,4,3,two,1
two,2,5,2,two,2
two,3,6,1,two,3


In [266]:
frame2

Unnamed: 0_level_0,Unnamed: 1_level_0,a,b
c,d,Unnamed: 2_level_1,Unnamed: 3_level_1
one,0,0,7
one,1,1,6
one,2,2,5
two,9,3,4
two,1,4,3
two,2,5,2
two,3,6,1


In [267]:
frame2.reset_index()

Unnamed: 0,c,d,a,b
0,one,0,0,7
1,one,1,1,6
2,one,2,2,5
3,two,9,3,4
4,two,1,4,3
5,two,2,5,2
6,two,3,6,1


In [268]:
df1 = pd.DataFrame({'key':['b', 'b', 'a', 'c','a', 'a', 'b'],
                   'data1': range(7)})

In [269]:
df2 = pd.DataFrame({'key': ['a', 'b', 'd'],
                   'data2': range(3)})

In [270]:
df1

Unnamed: 0,key,data1
0,b,0
1,b,1
2,a,2
3,c,3
4,a,4
5,a,5
6,b,6


In [271]:
df2

Unnamed: 0,key,data2
0,a,0
1,b,1
2,d,2


In [273]:
pd.merge(df1, df2)

Unnamed: 0,key,data1,data2
0,b,0,1
1,b,1,1
2,b,6,1
3,a,2,0
4,a,4,0
5,a,5,0


In [278]:
pd.merge(df1, df2, on ='key')

Unnamed: 0,key,data1,data2
0,b,0,1
1,b,1,1
2,b,6,1
3,a,2,0
4,a,4,0
5,a,5,0


In [279]:
df3 = pd.DataFrame({'lkey': ['b', 'b', 'a', 'c', 'a', 'a', 'b'],
                  'data1': range(7)})

In [280]:
df4 = pd.DataFrame({'rkey': ['a', 'b', 'd'],
                   'data2': range(3)})

In [281]:
df3

Unnamed: 0,lkey,data1
0,b,0
1,b,1
2,a,2
3,c,3
4,a,4
5,a,5
6,b,6


In [282]:
df4

Unnamed: 0,rkey,data2
0,a,0
1,b,1
2,d,2


In [286]:
pd.merge(df1, df2, how='outer')

Unnamed: 0,key,data1,data2
0,b,0.0,1.0
1,b,1.0,1.0
2,b,6.0,1.0
3,a,2.0,0.0
4,a,4.0,0.0
5,a,5.0,0.0
6,c,3.0,
7,d,,2.0


In [287]:
df1  = pd.DataFrame({'key': ['b', 'b', 'a', 'c', 'a', 'b'],
                    'data1': range(6)})

In [288]:
df2 = pd.DataFrame({'key': ['a', 'b', 'a', 'b', 'd'],
                   'data2': range(5)})

In [289]:
df1

Unnamed: 0,key,data1
0,b,0
1,b,1
2,a,2
3,c,3
4,a,4
5,b,5


In [290]:
df2

Unnamed: 0,key,data2
0,a,0
1,b,1
2,a,2
3,b,3
4,d,4


In [291]:
pd.merge(df1, df2, on='key', how='left')

Unnamed: 0,key,data1,data2
0,b,0,1.0
1,b,0,3.0
2,b,1,1.0
3,b,1,3.0
4,a,2,0.0
5,a,2,2.0
6,c,3,
7,a,4,0.0
8,a,4,2.0
9,b,5,1.0


In [292]:
left = pd.DataFrame({
    'key1': ['foo', 'foo', 'bar'],
    'key2': ['one', 'two', 'one'],
    'lval': [1,2,3]
})

In [293]:
right = pd.DataFrame({
    'key1':['foo', 'foo', 'bar', 'bar'],
    'key2': ['one', 'one', 'one', 'two'],
    'rval': [4,5,6,7]
})

In [294]:
left

Unnamed: 0,key1,key2,lval
0,foo,one,1
1,foo,two,2
2,bar,one,3


In [295]:
right

Unnamed: 0,key1,key2,rval
0,foo,one,4
1,foo,one,5
2,bar,one,6
3,bar,two,7


In [296]:
pd.merge(left, right, on=['key1', 'key2'], how='outer')

Unnamed: 0,key1,key2,lval,rval
0,foo,one,1.0,4.0
1,foo,one,1.0,5.0
2,foo,two,2.0,
3,bar,one,3.0,6.0
4,bar,two,,7.0


In [298]:
pd.merge(left, right, on='key1')

Unnamed: 0,key1,key2_x,lval,key2_y,rval
0,foo,one,1,one,4
1,foo,one,1,one,5
2,foo,two,2,one,4
3,foo,two,2,one,5
4,bar,one,3,one,6
5,bar,one,3,two,7


In [299]:
pd.merge(left, right, on='key1', suffixes=('_left','_right'))

Unnamed: 0,key1,key2_left,lval,key2_right,rval
0,foo,one,1,one,4
1,foo,one,1,one,5
2,foo,two,2,one,4
3,foo,two,2,one,5
4,bar,one,3,one,6
5,bar,one,3,two,7


In [300]:
left1 = pd.DataFrame({'key': ['a', 'b', 'a', 'a', 'b', 'c'],
                     'value' : range(6)})

In [301]:
right1 = pd.DataFrame({'group_val': [3.5,7]},
                     index = ['a', 'b'])

In [302]:
left1

Unnamed: 0,key,value
0,a,0
1,b,1
2,a,2
3,a,3
4,b,4
5,c,5


In [303]:
right1

Unnamed: 0,group_val
a,3.5
b,7.0


In [305]:
pd.merge(left1, right1, left_on='key', right_index=True)

Unnamed: 0,key,value,group_val
0,a,0,3.5
2,a,2,3.5
3,a,3,3.5
1,b,1,7.0
4,b,4,7.0


In [306]:
pd.merge(left1, right1, left_on='key', right_index=True, how='outer')

Unnamed: 0,key,value,group_val
0,a,0,3.5
2,a,2,3.5
3,a,3,3.5
1,b,1,7.0
4,b,4,7.0
5,c,5,


In [307]:
lefth = pd.DataFrame({'key1': ['Ohio', 'Ohio', 'Ohio',
                              'Nevada', 'Neada'],
                     'key2': [2000,2001,2002,2001,2002],
                     'data': np.arange(5.)})

In [308]:
righth =pd.DataFrame(np.arange(12).reshape((6,2)),
                    index = [['Nevada', 'Nevada', 'Ohio', 'Ohio',
                             'Ohio', 'Ohio'],
                            [2001,2000,2000,2000,2001,2002]],
                    columns = ['event1', 'event2'])

In [309]:
lefth

Unnamed: 0,key1,key2,data
0,Ohio,2000,0.0
1,Ohio,2001,1.0
2,Ohio,2002,2.0
3,Nevada,2001,3.0
4,Neada,2002,4.0


In [310]:
righth

Unnamed: 0,Unnamed: 1,event1,event2
Nevada,2001,0,1
Nevada,2000,2,3
Ohio,2000,4,5
Ohio,2000,6,7
Ohio,2001,8,9
Ohio,2002,10,11


In [312]:
pd.merge(lefth, righth, left_on=['key1', 'key2'], right_index=True)

Unnamed: 0,key1,key2,data,event1,event2
0,Ohio,2000,0.0,4,5
0,Ohio,2000,0.0,6,7
1,Ohio,2001,1.0,8,9
2,Ohio,2002,2.0,10,11
3,Nevada,2001,3.0,0,1


In [313]:
pd.merge(lefth, righth, left_on=['key1','key2'],  right_index=True, how='outer')

Unnamed: 0,key1,key2,data,event1,event2
0,Ohio,2000,0.0,4.0,5.0
0,Ohio,2000,0.0,6.0,7.0
1,Ohio,2001,1.0,8.0,9.0
2,Ohio,2002,2.0,10.0,11.0
3,Nevada,2001,3.0,0.0,1.0
4,Neada,2002,4.0,,
4,Nevada,2000,,2.0,3.0


In [314]:
left2 = pd.DataFrame([[1., 2.], [3.,4.],[5.,6.]],
                    index = ['a', 'c', 'e'],
                    columns = ['Ohio', 'Nevada'])

In [315]:
right2 = pd.DataFrame([[7., 8.], [9.,10.], [11., 12.], [13,14]],
                     index = ['b','c', 'd', 'e'],
                     columns = ['MIssouri', 'Alabama'])

In [316]:
left2

Unnamed: 0,Ohio,Nevada
a,1.0,2.0
c,3.0,4.0
e,5.0,6.0


In [317]:
right2

Unnamed: 0,MIssouri,Alabama
b,7.0,8.0
c,9.0,10.0
d,11.0,12.0
e,13.0,14.0


In [318]:
pd.merge(left2, right2, left_index=True, right_index=True, how='outer')

Unnamed: 0,Ohio,Nevada,MIssouri,Alabama
a,1.0,2.0,,
b,,,7.0,8.0
c,3.0,4.0,9.0,10.0
d,,,11.0,12.0
e,5.0,6.0,13.0,14.0


In [319]:
left2

Unnamed: 0,Ohio,Nevada
a,1.0,2.0
c,3.0,4.0
e,5.0,6.0


In [320]:
right2

Unnamed: 0,MIssouri,Alabama
b,7.0,8.0
c,9.0,10.0
d,11.0,12.0
e,13.0,14.0


In [321]:
left2.join(right2, how='outer')

Unnamed: 0,Ohio,Nevada,MIssouri,Alabama
a,1.0,2.0,,
b,,,7.0,8.0
c,3.0,4.0,9.0,10.0
d,,,11.0,12.0
e,5.0,6.0,13.0,14.0


In [322]:
left1

Unnamed: 0,key,value
0,a,0
1,b,1
2,a,2
3,a,3
4,b,4
5,c,5


In [324]:
right1

Unnamed: 0,group_val
a,3.5
b,7.0


In [326]:
left1.join(right1, on='key')

Unnamed: 0,key,value,group_val
0,a,0,3.5
1,b,1,7.0
2,a,2,3.5
3,a,3,3.5
4,b,4,7.0
5,c,5,


In [327]:
another = pd.DataFrame([[7., 8.], [9., 10.], [11., 12.],[16.,17.]],
                      index=['a', 'c', 'e', 'f'],
                      columns = ['New York', 'Oregon'])

In [328]:
another

Unnamed: 0,New York,Oregon
a,7.0,8.0
c,9.0,10.0
e,11.0,12.0
f,16.0,17.0


In [331]:
left2

Unnamed: 0,Ohio,Nevada
a,1.0,2.0
c,3.0,4.0
e,5.0,6.0


In [332]:
right2

Unnamed: 0,MIssouri,Alabama
b,7.0,8.0
c,9.0,10.0
d,11.0,12.0
e,13.0,14.0


In [330]:
left2.join([right2, another])

Unnamed: 0,Ohio,Nevada,MIssouri,Alabama,New York,Oregon
a,1.0,2.0,,,7.0,8.0
c,3.0,4.0,9.0,10.0,9.0,10.0
e,5.0,6.0,13.0,14.0,11.0,12.0


In [333]:
left2.join([right2, another], how='outer')

Unnamed: 0,Ohio,Nevada,MIssouri,Alabama,New York,Oregon
a,1.0,2.0,,,7.0,8.0
c,3.0,4.0,9.0,10.0,9.0,10.0
e,5.0,6.0,13.0,14.0,11.0,12.0
b,,,7.0,8.0,,
d,,,11.0,12.0,,
f,,,,,16.0,17.0


In [334]:
arr = np.arange(12.).reshape((3,4))

In [335]:
arr

array([[ 0.,  1.,  2.,  3.],
       [ 4.,  5.,  6.,  7.],
       [ 8.,  9., 10., 11.]])

In [340]:
np.concatenate([arr,arr], axis = 1)

array([[ 0.,  1.,  2.,  3.,  0.,  1.,  2.,  3.],
       [ 4.,  5.,  6.,  7.,  4.,  5.,  6.,  7.],
       [ 8.,  9., 10., 11.,  8.,  9., 10., 11.]])

In [341]:
np.concatenate([arr,arr,arr], axis = 0)

array([[ 0.,  1.,  2.,  3.],
       [ 4.,  5.,  6.,  7.],
       [ 8.,  9., 10., 11.],
       [ 0.,  1.,  2.,  3.],
       [ 4.,  5.,  6.,  7.],
       [ 8.,  9., 10., 11.],
       [ 0.,  1.,  2.,  3.],
       [ 4.,  5.,  6.,  7.],
       [ 8.,  9., 10., 11.]])

In [342]:
s1 = pd.Series([0,1], index = ['a', 'b'])

In [343]:
s2 = pd.Series([2,3,4], index = ['c', 'd', 'e'])

In [344]:
s3 = pd.Series([5,6], index=['f', 'g'])

In [345]:
pd.concat([s1,s2,s3])

a    0
b    1
c    2
d    3
e    4
f    5
g    6
dtype: int64

In [346]:
pd.concat([s1,s2,s3], axis = 1)

Unnamed: 0,0,1,2
a,0.0,,
b,1.0,,
c,,2.0,
d,,3.0,
e,,4.0,
f,,,5.0
g,,,6.0


In [347]:
s4 = pd.concat([s1,s3])

In [348]:
s4

a    0
b    1
f    5
g    6
dtype: int64

In [349]:
pd.concat([s1,s4], axis =1)

Unnamed: 0,0,1
a,0.0,0
b,1.0,1
f,,5
g,,6


In [350]:
pd.concat([s1,s4], axis = 1, join='inner')

Unnamed: 0,0,1
a,0,0
b,1,1


In [351]:
result = pd.concat([s1,s2,s3], keys=['one', 'two', 'three'])

In [352]:
result

one    a    0
       b    1
two    c    2
       d    3
       e    4
three  f    5
       g    6
dtype: int64

In [356]:
pd.concat([s1,s2,s3], axis = 1, keys=['one', 'two', 'three'])

Unnamed: 0,one,two,three
a,0.0,,
b,1.0,,
c,,2.0,
d,,3.0,
e,,4.0,
f,,,5.0
g,,,6.0


In [357]:
df1 = pd.DataFrame(np.arange(6).reshape(3,2), index = ['a', 'b', 'c'],
                  columns = ['one', 'two'])

In [358]:
df2 = pd.DataFrame(5+np.arange(4).reshape(2,2), index = ['a', 'c'],
                  columns = ['three', 'four'])

In [359]:
df1

Unnamed: 0,one,two
a,0,1
b,2,3
c,4,5


In [360]:
df2

Unnamed: 0,three,four
a,5,6
c,7,8


In [363]:
pd.concat([df1, df2], axis = 1,  keys = ['level1', 'level2'])

Unnamed: 0_level_0,level1,level1,level2,level2
Unnamed: 0_level_1,one,two,three,four
a,0,1,5.0,6.0
b,2,3,,
c,4,5,7.0,8.0


In [364]:
pd.concat({'level1': df1, 'level2': df2}, axis = 1)

Unnamed: 0_level_0,level1,level1,level2,level2
Unnamed: 0_level_1,one,two,three,four
a,0,1,5.0,6.0
b,2,3,,
c,4,5,7.0,8.0


In [365]:
df1 = pd.DataFrame(np.random.randn(3,4), columns= ['a', 'b', 'c', 'd'])

In [366]:
df2 = pd.DataFrame(np.random.randn(2,3), columns=['b', 'd', 'a'])

In [367]:
df1

Unnamed: 0,a,b,c,d
0,0.690002,1.001543,-0.503087,-0.622274
1,-0.921169,-0.726213,0.222896,0.051316
2,-1.157719,0.816707,0.43361,1.010737


In [368]:
df2

Unnamed: 0,b,d,a
0,1.824875,-0.997518,0.850591
1,-0.131578,0.912414,0.188211


In [369]:
pd.concat([df1, df2], ignore_index=True)

Unnamed: 0,a,b,c,d
0,0.690002,1.001543,-0.503087,-0.622274
1,-0.921169,-0.726213,0.222896,0.051316
2,-1.157719,0.816707,0.43361,1.010737
3,0.850591,1.824875,,-0.997518
4,0.188211,-0.131578,,0.912414


In [370]:
a = pd.Series([np.nan, 2.5, np.nan, 3.5, 4.5, np.nan],
             index = ['f', 'e', 'd', 'c', 'b', 'a'])

In [371]:
b = pd.Series(np.arange(len(a), dtype=np.float64),
             index = ['f', 'e', 'd', 'c', 'b', 'a'])

In [372]:
a

f    NaN
e    2.5
d    NaN
c    3.5
b    4.5
a    NaN
dtype: float64

In [373]:
b

f    0.0
e    1.0
d    2.0
c    3.0
b    4.0
a    5.0
dtype: float64

In [374]:
b[-1] = np.nan

In [375]:
a

f    NaN
e    2.5
d    NaN
c    3.5
b    4.5
a    NaN
dtype: float64

In [376]:
b

f    0.0
e    1.0
d    2.0
c    3.0
b    4.0
a    NaN
dtype: float64

In [377]:
np.where(pd.isnull(a),b,a)

array([0. , 2.5, 2. , 3.5, 4.5, nan])

In [379]:
b[:-2]

f    0.0
e    1.0
d    2.0
c    3.0
dtype: float64

In [380]:
b[:-2].combine_first(a[2:])

a    NaN
b    4.5
c    3.0
d    2.0
e    1.0
f    0.0
dtype: float64

In [381]:
df1 = pd.DataFrame({'a': [1., np.nan, 5., np.nan],
                   'b': [np.nan, 2., np.nan, 6.],
                   'c': range(2,18,4)})

In [382]:
df2 = pd.DataFrame({'a': [5., 5., np.nan, 3., 7.],
                   'b': [np.nan, 3., 4., 6., 8.]})

In [383]:
df1

Unnamed: 0,a,b,c
0,1.0,,2
1,,2.0,6
2,5.0,,10
3,,6.0,14


In [384]:
df2

Unnamed: 0,a,b
0,5.0,
1,5.0,3.0
2,,4.0
3,3.0,6.0
4,7.0,8.0


In [385]:
df1.combine_first(df2)

Unnamed: 0,a,b,c
0,1.0,,2.0
1,5.0,2.0,6.0
2,5.0,4.0,10.0
3,3.0,6.0,14.0
4,7.0,8.0,


In [386]:
data = pd.DataFrame(np.arange(6).reshape((2,3)),
                   index = pd.Index(['Ohio', 'Colorado'], name = 'state'),
                   columns = pd.Index(['one', 'two', 'three'],
                                     name = 'number'))

In [387]:
data

number,one,two,three
state,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
Ohio,0,1,2
Colorado,3,4,5


In [388]:
result = data.stack()

In [390]:
result

state     number
Ohio      one       0
          two       1
          three     2
Colorado  one       3
          two       4
          three     5
dtype: int32

In [391]:
result.unstack()

number,one,two,three
state,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
Ohio,0,1,2
Colorado,3,4,5


In [392]:
result.unstack(0)

state,Ohio,Colorado
number,Unnamed: 1_level_1,Unnamed: 2_level_1
one,0,3
two,1,4
three,2,5


In [393]:
result.unstack('state')

state,Ohio,Colorado
number,Unnamed: 1_level_1,Unnamed: 2_level_1
one,0,3
two,1,4
three,2,5


In [394]:
s1 = pd.Series([0,1,2,3], index = ['a', 'b', 'c', 'd'])

In [395]:
s2 = pd.Series([4,5,6], index=['c', 'd', 'e'])

In [396]:
data2 = pd.concat([s1, s2], keys = ['one', 'two'])

In [397]:
data2

one  a    0
     b    1
     c    2
     d    3
two  c    4
     d    5
     e    6
dtype: int64

In [398]:
data2.unstack()

Unnamed: 0,a,b,c,d,e
one,0.0,1.0,2.0,3.0,
two,,,4.0,5.0,6.0


In [400]:
data2.unstack().stack(dropna=False)

one  a    0.0
     b    1.0
     c    2.0
     d    3.0
     e    NaN
two  a    NaN
     b    NaN
     c    4.0
     d    5.0
     e    6.0
dtype: float64

In [401]:
df = pd.DataFrame({'left': result, 'right': result + 5}, 
                 columns = pd.Index(['left', 'right'], name = 'side'))

In [402]:
df

Unnamed: 0_level_0,side,left,right
state,number,Unnamed: 2_level_1,Unnamed: 3_level_1
Ohio,one,0,5
Ohio,two,1,6
Ohio,three,2,7
Colorado,one,3,8
Colorado,two,4,9
Colorado,three,5,10


In [403]:
df.unstack('state')

side,left,left,right,right
state,Ohio,Colorado,Ohio,Colorado
number,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2
one,0,3,5,8
two,1,4,6,9
three,2,5,7,10


In [404]:
s1 = pd.Series([0,1,2,3], index= ['a', 'b', 'c', 'd'])

In [405]:
s2 = pd.Series([4,5,6], index=['c', 'd', 'e'])

In [406]:
data2 = pd.concat([s1, s2], keys = ['one', 'two'])

In [407]:
data2

one  a    0
     b    1
     c    2
     d    3
two  c    4
     d    5
     e    6
dtype: int64

In [408]:
data2.unstack()

Unnamed: 0,a,b,c,d,e
one,0.0,1.0,2.0,3.0,
two,,,4.0,5.0,6.0


In [409]:
df = pd.DataFrame({'left': result, 'right': result + 5}, 
                 columns = pd.Index(['left', 'right'], name = 'side'))

In [410]:
df

Unnamed: 0_level_0,side,left,right
state,number,Unnamed: 2_level_1,Unnamed: 3_level_1
Ohio,one,0,5
Ohio,two,1,6
Ohio,three,2,7
Colorado,one,3,8
Colorado,two,4,9
Colorado,three,5,10


In [411]:
df.unstack('state')

side,left,left,right,right
state,Ohio,Colorado,Ohio,Colorado
number,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2
one,0,3,5,8
two,1,4,6,9
three,2,5,7,10


In [412]:
df.unstack('state').stack('side')

Unnamed: 0_level_0,state,Colorado,Ohio
number,side,Unnamed: 2_level_1,Unnamed: 3_level_1
one,left,3,0
one,right,8,5
two,left,4,1
two,right,9,6
three,left,5,2
three,right,10,7


In [413]:
import matplotlib.pyplot as plt

In [414]:
%matplotlib notebook

In [415]:
data = np.arange(10)

In [416]:
data

array([0, 1, 2, 3, 4, 5, 6, 7, 8, 9])

In [417]:
plt.plot(data)

<IPython.core.display.Javascript object>

[<matplotlib.lines.Line2D at 0x198efef5588>]

In [418]:
fig = plt.figure()
ax1 = fig.add_subplot(2,2,1)
ax2 = fig.add_subplot(2,2,2)
ax3 = fig.add_subplot(2,2,3)

<IPython.core.display.Javascript object>

In [419]:
plt.plot(np.random.randn(50).cumsum(), 'k--')

[<matplotlib.lines.Line2D at 0x198f0bd1a48>]

In [420]:
fig, axes = plt.subplots(2,3)

<IPython.core.display.Javascript object>

In [421]:
macro = pd.read_csv('pydata-book-2nd-edition/examples/macrodata.csv')

In [422]:
macro.head()

Unnamed: 0,year,quarter,realgdp,realcons,realinv,realgovt,realdpi,cpi,m1,tbilrate,unemp,pop,infl,realint
0,1959.0,1.0,2710.349,1707.4,286.898,470.045,1886.9,28.98,139.7,2.82,5.8,177.146,0.0,0.0
1,1959.0,2.0,2778.801,1733.7,310.859,481.301,1919.7,29.15,141.7,3.08,5.1,177.83,2.34,0.74
2,1959.0,3.0,2775.488,1751.8,289.226,491.26,1916.4,29.35,140.5,3.82,5.3,178.657,2.74,1.09
3,1959.0,4.0,2785.204,1753.7,299.356,484.052,1931.3,29.37,140.0,4.33,5.6,179.386,0.27,4.06
4,1960.0,1.0,2847.699,1770.5,331.722,462.199,1955.5,29.54,139.6,3.5,5.2,180.007,2.31,1.19


In [425]:
data = macro[['cpi', 'm1', 'tbilrate', 'unemp']]

In [426]:
data

Unnamed: 0,cpi,m1,tbilrate,unemp
0,28.980,139.7,2.82,5.8
1,29.150,141.7,3.08,5.1
2,29.350,140.5,3.82,5.3
3,29.370,140.0,4.33,5.6
4,29.540,139.6,3.50,5.2
...,...,...,...,...
198,216.889,1474.7,1.17,6.0
199,212.174,1576.5,0.12,6.9
200,212.671,1592.8,0.22,8.1
201,214.469,1653.6,0.18,9.2


In [427]:
trans_data = np.log(data).diff().dropna()

In [429]:
trans_data[-5:]

Unnamed: 0,cpi,m1,tbilrate,unemp
198,-0.007904,0.045361,-0.396881,0.105361
199,-0.021979,0.066753,-2.277267,0.139762
200,0.00234,0.010286,0.606136,0.160343
201,0.008419,0.037461,-0.200671,0.127339
202,0.008894,0.012202,-0.405465,0.04256


In [430]:
import seaborn as sns

In [431]:
fig = plt.figure()
sns.regplot('m1', 'unemp', data = trans_data)

<IPython.core.display.Javascript object>



<AxesSubplot:xlabel='m1', ylabel='unemp'>

In [434]:
sns.pairplot(trans_data, diag_kind='hist', plot_kws={'alpha': 0.2})

<IPython.core.display.Javascript object>

<seaborn.axisgrid.PairGrid at 0x198f5699ec8>

In [2]:
df = pd.DataFrame({
    'key1': ['a', 'a', 'b', 'b', 'a'],
    'key2': ['one', 'two', 'one', 'two', 'one'],
    'data1': np.random.randn(5),
    'data2': np.random.randn(5)
})

In [3]:
df

Unnamed: 0,key1,key2,data1,data2
0,a,one,-1.863704,-0.93535
1,a,two,0.931855,-0.617428
2,b,one,-1.474662,-0.179682
3,b,two,-0.979452,-0.327887
4,a,one,1.053181,-0.554547


In [13]:
grouped = df['data1'].groupby(df['key1'])

In [14]:
grouped.mean()

key1
a    0.040444
b   -1.227057
Name: data1, dtype: float64

In [15]:
grouped2 = df['data1'].groupby([df['key1'], df['key2']])

In [16]:
grouped2.mean()

key1  key2
a     one    -0.405261
      two     0.931855
b     one    -1.474662
      two    -0.979452
Name: data1, dtype: float64

In [23]:
states = np.array(['Ohio', 'California', 'California', 'Ohio', 'Ohio'])

In [24]:
years = np.array([2005,2005,2006,2005,2006])

In [25]:
df

Unnamed: 0,key1,key2,data1,data2
0,a,one,-1.863704,-0.93535
1,a,two,0.931855,-0.617428
2,b,one,-1.474662,-0.179682
3,b,two,-0.979452,-0.327887
4,a,one,1.053181,-0.554547


In [26]:
grouped3 = df['data1'].groupby([states, years])

In [29]:
grouped3.mean()

California  2005    0.931855
            2006   -1.474662
Ohio        2005   -1.421578
            2006    1.053181
Name: data1, dtype: float64

In [30]:
df

Unnamed: 0,key1,key2,data1,data2
0,a,one,-1.863704,-0.93535
1,a,two,0.931855,-0.617428
2,b,one,-1.474662,-0.179682
3,b,two,-0.979452,-0.327887
4,a,one,1.053181,-0.554547


In [32]:
df.groupby(['key1', 'key2']).size()

key1  key2
a     one     2
      two     1
b     one     1
      two     1
dtype: int64

In [33]:
df

Unnamed: 0,key1,key2,data1,data2
0,a,one,-1.863704,-0.93535
1,a,two,0.931855,-0.617428
2,b,one,-1.474662,-0.179682
3,b,two,-0.979452,-0.327887
4,a,one,1.053181,-0.554547


In [37]:
for name, group in df.groupby('key1'):
#     print(f"name: {name}")
#     print(f"group: {group}")
    print(f"{name}- {group}")

a-   key1 key2     data1     data2
0    a  one -1.863704 -0.935350
1    a  two  0.931855 -0.617428
4    a  one  1.053181 -0.554547
b-   key1 key2     data1     data2
2    b  one -1.474662 -0.179682
3    b  two -0.979452 -0.327887


In [39]:
for (n1,n2), group in df.groupby(['key1','key2']):
    print(f"{n1}:{n2} - {group}")

a:one -   key1 key2     data1     data2
0    a  one -1.863704 -0.935350
4    a  one  1.053181 -0.554547
a:two -   key1 key2     data1     data2
1    a  two  0.931855 -0.617428
b:one -   key1 key2     data1     data2
2    b  one -1.474662 -0.179682
b:two -   key1 key2     data1     data2
3    b  two -0.979452 -0.327887


In [40]:
df

Unnamed: 0,key1,key2,data1,data2
0,a,one,-1.863704,-0.93535
1,a,two,0.931855,-0.617428
2,b,one,-1.474662,-0.179682
3,b,two,-0.979452,-0.327887
4,a,one,1.053181,-0.554547


In [41]:
pieces = dict(list(df.groupby('key1')))

In [42]:
pieces

{'a':   key1 key2     data1     data2
 0    a  one -1.863704 -0.935350
 1    a  two  0.931855 -0.617428
 4    a  one  1.053181 -0.554547, 'b':   key1 key2     data1     data2
 2    b  one -1.474662 -0.179682
 3    b  two -0.979452 -0.327887}

In [44]:
dict(list(df.groupby('key1')))

{'a':   key1 key2     data1     data2
 0    a  one -1.863704 -0.935350
 1    a  two  0.931855 -0.617428
 4    a  one  1.053181 -0.554547, 'b':   key1 key2     data1     data2
 2    b  one -1.474662 -0.179682
 3    b  two -0.979452 -0.327887}

In [45]:
pieces['b']

Unnamed: 0,key1,key2,data1,data2
2,b,one,-1.474662,-0.179682
3,b,two,-0.979452,-0.327887


In [46]:
pieces['a']

Unnamed: 0,key1,key2,data1,data2
0,a,one,-1.863704,-0.93535
1,a,two,0.931855,-0.617428
4,a,one,1.053181,-0.554547


In [47]:
df.dtypes

key1      object
key2      object
data1    float64
data2    float64
dtype: object

In [48]:
grouped = df.groupby(df.dtypes, axis = 1)

In [53]:
dict(list(grouped))

{dtype('float64'):       data1     data2
 0 -1.863704 -0.935350
 1  0.931855 -0.617428
 2 -1.474662 -0.179682
 3 -0.979452 -0.327887
 4  1.053181 -0.554547, dtype('O'):   key1 key2
 0    a  one
 1    a  two
 2    b  one
 3    b  two
 4    a  one}

In [60]:
df

Unnamed: 0,key1,key2,data1,data2
0,a,one,-1.863704,-0.93535
1,a,two,0.931855,-0.617428
2,b,one,-1.474662,-0.179682
3,b,two,-0.979452,-0.327887
4,a,one,1.053181,-0.554547


In [62]:
df['data1'].groupby(df['key1']).mean()

key1
a    0.040444
b   -1.227057
Name: data1, dtype: float64

In [64]:
df.groupby(df['key1'])['data1'].mean()

key1
a    0.040444
b   -1.227057
Name: data1, dtype: float64

In [65]:
df

Unnamed: 0,key1,key2,data1,data2
0,a,one,-1.863704,-0.93535
1,a,two,0.931855,-0.617428
2,b,one,-1.474662,-0.179682
3,b,two,-0.979452,-0.327887
4,a,one,1.053181,-0.554547


In [70]:
df.groupby(['key1', 'key2'])[['data2']].mean()

Unnamed: 0_level_0,Unnamed: 1_level_0,data2
key1,key2,Unnamed: 2_level_1
a,one,-0.744948
a,two,-0.617428
b,one,-0.179682
b,two,-0.327887


In [71]:
df

Unnamed: 0,key1,key2,data1,data2
0,a,one,-1.863704,-0.93535
1,a,two,0.931855,-0.617428
2,b,one,-1.474662,-0.179682
3,b,two,-0.979452,-0.327887
4,a,one,1.053181,-0.554547


In [72]:
s_grouped = df.groupby(['key1', 'key2'])[['data2']].mean()

In [73]:
s_grouped

Unnamed: 0_level_0,Unnamed: 1_level_0,data2
key1,key2,Unnamed: 2_level_1
a,one,-0.744948
a,two,-0.617428
b,one,-0.179682
b,two,-0.327887


In [74]:
people= pd.DataFrame(np.random.randn(5,5),
                    columns = ['a', 'b', 'c', 'd', 'e'],
                    index = ['Joe', 'Steve', 'Wes', 'Jim', 'Travis'])

In [75]:
people

Unnamed: 0,a,b,c,d,e
Joe,-0.359031,-0.925894,-0.914579,-0.592197,0.412886
Steve,-0.484773,1.593077,1.179107,-1.093002,-0.359124
Wes,0.047319,0.112646,-0.111642,0.954856,-0.312909
Jim,0.568368,-1.680978,-0.347616,1.94824,-0.602898
Travis,1.042241,-0.982923,0.257114,-0.481697,0.516824


In [76]:
people.iloc[2:3, [1,2]]

Unnamed: 0,b,c
Wes,0.112646,-0.111642


In [87]:
people.loc[['Steve', 'Wes'], ['b','c']]

Unnamed: 0,b,c
Steve,1.593077,1.179107
Wes,0.112646,-0.111642


In [91]:
people.loc['Steve':'Jim', 'b':'d']

Unnamed: 0,b,c,d
Steve,1.593077,1.179107,-1.093002
Wes,0.112646,-0.111642,0.954856
Jim,-1.680978,-0.347616,1.94824


In [92]:
people

Unnamed: 0,a,b,c,d,e
Joe,-0.359031,-0.925894,-0.914579,-0.592197,0.412886
Steve,-0.484773,1.593077,1.179107,-1.093002,-0.359124
Wes,0.047319,0.112646,-0.111642,0.954856,-0.312909
Jim,0.568368,-1.680978,-0.347616,1.94824,-0.602898
Travis,1.042241,-0.982923,0.257114,-0.481697,0.516824


In [93]:
people.iloc[2:3, [1,2]]

Unnamed: 0,b,c
Wes,0.112646,-0.111642


In [94]:
people.iloc[2:3, [1,2]] =  np.nan

In [95]:
people

Unnamed: 0,a,b,c,d,e
Joe,-0.359031,-0.925894,-0.914579,-0.592197,0.412886
Steve,-0.484773,1.593077,1.179107,-1.093002,-0.359124
Wes,0.047319,,,0.954856,-0.312909
Jim,0.568368,-1.680978,-0.347616,1.94824,-0.602898
Travis,1.042241,-0.982923,0.257114,-0.481697,0.516824


In [96]:
mapping = {
    'a': 'red',
    'b': 'red', 
    'c': 'blue', 
    'd': 'blue', 
    'e': 'red',
    'f': 'orange'
}

In [98]:
by_column = people.groupby(mapping,axis=1)

In [100]:
by_column.size()

blue    2
red     3
dtype: int64

In [102]:
by_column.sum()

Unnamed: 0,blue,red
Joe,-1.506776,-0.872039
Steve,0.086105,0.749179
Wes,0.954856,-0.26559
Jim,1.600624,-1.715507
Travis,-0.224583,0.576142


In [103]:
map_series = pd.Series(mapping)

In [104]:
map_series

a       red
b       red
c      blue
d      blue
e       red
f    orange
dtype: object

In [106]:
people

Unnamed: 0,a,b,c,d,e
Joe,-0.359031,-0.925894,-0.914579,-0.592197,0.412886
Steve,-0.484773,1.593077,1.179107,-1.093002,-0.359124
Wes,0.047319,,,0.954856,-0.312909
Jim,0.568368,-1.680978,-0.347616,1.94824,-0.602898
Travis,1.042241,-0.982923,0.257114,-0.481697,0.516824


In [105]:
people.groupby(map_series, axis=1).count()

Unnamed: 0,blue,red
Joe,2,3
Steve,2,3
Wes,1,2
Jim,2,3
Travis,2,3


In [107]:
people

Unnamed: 0,a,b,c,d,e
Joe,-0.359031,-0.925894,-0.914579,-0.592197,0.412886
Steve,-0.484773,1.593077,1.179107,-1.093002,-0.359124
Wes,0.047319,,,0.954856,-0.312909
Jim,0.568368,-1.680978,-0.347616,1.94824,-0.602898
Travis,1.042241,-0.982923,0.257114,-0.481697,0.516824


In [109]:
people.groupby(len).sum()

Unnamed: 0,a,b,c,d,e
3,0.256656,-2.606872,-1.262195,2.310899,-0.50292
5,-0.484773,1.593077,1.179107,-1.093002,-0.359124
6,1.042241,-0.982923,0.257114,-0.481697,0.516824


In [110]:
key_list = ['one', 'one', 'one', 'two', 'two']

In [111]:
people.groupby([len, key_list]).min()

Unnamed: 0,Unnamed: 1,a,b,c,d,e
3,one,-0.359031,-0.925894,-0.914579,-0.592197,-0.312909
3,two,0.568368,-1.680978,-0.347616,1.94824,-0.602898
5,one,-0.484773,1.593077,1.179107,-1.093002,-0.359124
6,two,1.042241,-0.982923,0.257114,-0.481697,0.516824


In [112]:
columns = pd.MultiIndex.from_arrays([['US', 'US', 'US', 'JP', 'JP'],
                                    [1,3,5,1,3]],
                                   names = ['city', 'tenor'])

In [114]:
columns

MultiIndex([('US', 1),
            ('US', 3),
            ('US', 5),
            ('JP', 1),
            ('JP', 3)],
           names=['city', 'tenor'])

In [115]:
hier_df = pd.DataFrame(np.random.randn(4,5), columns = columns)

In [116]:
hier_df

city,US,US,US,JP,JP
tenor,1,3,5,1,3
0,0.591165,0.316431,0.155095,0.011442,-1.508263
1,-0.426253,0.365988,0.387498,0.142233,-0.115713
2,2.15487,0.11111,-0.658173,-0.874129,3.265463
3,0.186113,-0.592396,-1.052681,-0.912316,0.798161


In [121]:
hier_df.groupby(level='tenor', axis=1).sum()

tenor,1,3,5
0,0.602606,-1.191832,0.155095
1,-0.28402,0.250275,0.387498
2,1.280741,3.376573,-0.658173
3,-0.726203,0.205765,-1.052681


In [122]:
df

Unnamed: 0,key1,key2,data1,data2
0,a,one,-1.863704,-0.93535
1,a,two,0.931855,-0.617428
2,b,one,-1.474662,-0.179682
3,b,two,-0.979452,-0.327887
4,a,one,1.053181,-0.554547


In [123]:
grouped = df.groupby('key1')

In [128]:
grouped['data1'].quantile(0.9)

key1
a    1.028916
b   -1.028973
Name: data1, dtype: float64

In [129]:
def peak_to_peak(arr):
    return arr.max()-arr.min()

In [130]:
grouped.agg(peak_to_peak)

Unnamed: 0_level_0,data1,data2
key1,Unnamed: 1_level_1,Unnamed: 2_level_1
a,2.916885,0.380803
b,0.49521,0.148205


In [131]:
grouped.describe()

Unnamed: 0_level_0,data1,data1,data1,data1,data1,data1,data1,data1,data2,data2,data2,data2,data2,data2,data2,data2
Unnamed: 0_level_1,count,mean,std,min,25%,50%,75%,max,count,mean,std,min,25%,50%,75%,max
key1,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2,Unnamed: 10_level_2,Unnamed: 11_level_2,Unnamed: 12_level_2,Unnamed: 13_level_2,Unnamed: 14_level_2,Unnamed: 15_level_2,Unnamed: 16_level_2
a,3.0,0.040444,1.650156,-1.863704,-0.465924,0.931855,0.992518,1.053181,3.0,-0.702441,0.20414,-0.93535,-0.776389,-0.617428,-0.585987,-0.554547
b,2.0,-1.227057,0.350166,-1.474662,-1.350859,-1.227057,-1.103254,-0.979452,2.0,-0.253785,0.104797,-0.327887,-0.290836,-0.253785,-0.216734,-0.179682


In [132]:
tips = pd.read_csv('pydata-book-2nd-edition/examples/tips.csv')

In [133]:
tips['tip_pct'] = tips['tip'] / tips['total_bill']

In [135]:
tips[:10]

Unnamed: 0,total_bill,tip,smoker,day,time,size,tip_pct
0,16.99,1.01,No,Sun,Dinner,2,0.059447
1,10.34,1.66,No,Sun,Dinner,3,0.160542
2,21.01,3.5,No,Sun,Dinner,3,0.166587
3,23.68,3.31,No,Sun,Dinner,2,0.13978
4,24.59,3.61,No,Sun,Dinner,4,0.146808
5,25.29,4.71,No,Sun,Dinner,4,0.18624
6,8.77,2.0,No,Sun,Dinner,2,0.22805
7,26.88,3.12,No,Sun,Dinner,4,0.116071
8,15.04,1.96,No,Sun,Dinner,2,0.130319
9,14.78,3.23,No,Sun,Dinner,2,0.218539


In [136]:
grouped = tips.groupby(['tip', 'smoker'])

In [137]:
grouped_pct = grouped['tip_pct']

In [138]:
grouped_pct.agg('mean')

tip    smoker
1.00   No        0.137931
       Yes       0.193004
1.01   No        0.059447
1.10   Yes       0.085271
1.17   Yes       0.035638
                   ...   
6.70   No        0.195335
6.73   No        0.139424
7.58   No        0.192288
9.00   No        0.186220
10.00  Yes       0.196812
Name: tip_pct, Length: 141, dtype: float64

In [141]:
grouped_pct.agg([('foo','mean'), ('bar','std'), ('pk',peak_to_peak)])

Unnamed: 0_level_0,Unnamed: 1_level_0,foo,bar,pk
tip,smoker,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
1.00,No,0.137931,,0.000000
1.00,Yes,0.193004,0.124288,0.246368
1.01,No,0.059447,,0.000000
1.10,Yes,0.085271,,0.000000
1.17,Yes,0.035638,,0.000000
...,...,...,...,...
6.70,No,0.195335,,0.000000
6.73,No,0.139424,,0.000000
7.58,No,0.192288,,0.000000
9.00,No,0.186220,,0.000000


In [142]:
functions = ['count', 'mean', 'max']

In [143]:
result = grouped['tip_pct', 'total_bill'].agg(functions)

  """Entry point for launching an IPython kernel.


In [144]:
result

Unnamed: 0_level_0,Unnamed: 1_level_0,tip_pct,tip_pct,tip_pct,total_bill,total_bill,total_bill
Unnamed: 0_level_1,Unnamed: 1_level_1,count,mean,max,count,mean,max
tip,smoker,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2
1.00,No,1,0.137931,0.137931,1,7.25,7.25
1.00,Yes,3,0.193004,0.325733,3,7.14,12.60
1.01,No,1,0.059447,0.059447,1,16.99,16.99
1.10,Yes,1,0.085271,0.085271,1,12.90,12.90
1.17,Yes,1,0.035638,0.035638,1,32.83,32.83
...,...,...,...,...,...,...,...
6.70,No,1,0.195335,0.195335,1,34.30,34.30
6.73,No,1,0.139424,0.139424,1,48.27,48.27
7.58,No,1,0.192288,0.192288,1,39.42,39.42
9.00,No,1,0.186220,0.186220,1,48.33,48.33


In [148]:
result['tip_pct']

Unnamed: 0_level_0,Unnamed: 1_level_0,count,mean,max
tip,smoker,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
1.00,No,1,0.137931,0.137931
1.00,Yes,3,0.193004,0.325733
1.01,No,1,0.059447,0.059447
1.10,Yes,1,0.085271,0.085271
1.17,Yes,1,0.035638,0.035638
...,...,...,...,...
6.70,No,1,0.195335,0.195335
6.73,No,1,0.139424,0.139424
7.58,No,1,0.192288,0.192288
9.00,No,1,0.186220,0.186220


In [150]:
grouped.mean()

Unnamed: 0_level_0,Unnamed: 1_level_0,total_bill,size,tip_pct
tip,smoker,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
1.00,No,7.25,1.000000,0.137931
1.00,Yes,7.14,1.666667,0.193004
1.01,No,16.99,2.000000,0.059447
1.10,Yes,12.90,2.000000,0.085271
1.17,Yes,32.83,2.000000,0.035638
...,...,...,...,...
6.70,No,34.30,6.000000,0.195335
6.73,No,48.27,4.000000,0.139424
7.58,No,39.42,4.000000,0.192288
9.00,No,48.33,4.000000,0.186220


In [151]:
grouped.agg({'tip': np.max, 'size': 'sum'})

Unnamed: 0_level_0,Unnamed: 1_level_0,tip,size
tip,smoker,Unnamed: 2_level_1,Unnamed: 3_level_1
1.00,No,1.00,1
1.00,Yes,1.00,5
1.01,No,1.01,2
1.10,Yes,1.10,2
1.17,Yes,1.17,2
...,...,...,...
6.70,No,6.70,6
6.73,No,6.73,4
7.58,No,7.58,4
9.00,No,9.00,4


In [172]:
frame = pd.DataFrame({
    'data1': np.random.randn(1000),
    'data2': np.random.randn(1000)
})

In [173]:
frame

Unnamed: 0,data1,data2
0,-0.353440,1.964974
1,1.356128,0.447628
2,-0.368136,-0.341970
3,-0.022097,1.344991
4,0.396337,0.386976
...,...,...
995,1.009347,-0.116755
996,-0.374717,-0.678840
997,0.055518,0.808868
998,1.678930,-0.980496


In [174]:
quartiles = pd.cut(frame.data1,4)

In [175]:
quartiles

0      (-0.418, 1.186]
1        (1.186, 2.79]
2      (-0.418, 1.186]
3      (-0.418, 1.186]
4      (-0.418, 1.186]
            ...       
995    (-0.418, 1.186]
996    (-0.418, 1.186]
997    (-0.418, 1.186]
998      (1.186, 2.79]
999    (-0.418, 1.186]
Name: data1, Length: 1000, dtype: category
Categories (4, interval[float64]): [(-3.631, -2.021] < (-2.021, -0.418] < (-0.418, 1.186] < (1.186, 2.79]]

In [176]:
def get_stats(group):
    return {
        'min': group.min(),
        'max': group.max(),
        'co
        unt': group.count(),
        'mean': group.mean()
    }


In [177]:
grouped = frame.data1.groupby(quartiles)

In [180]:
grouped.apply(get_stats).unstack()

Unnamed: 0_level_0,min,max,count,mean
data1,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
"(-3.631, -2.021]",-3.624737,-2.032254,13.0,-2.390853
"(-2.021, -0.418]",-2.011341,-0.427669,295.0,-0.960519
"(-0.418, 1.186]",-0.417386,1.177488,564.0,0.315005
"(1.186, 2.79]",1.189226,2.789591,128.0,1.619868


In [181]:
frame

Unnamed: 0,data1,data2
0,-0.353440,1.964974
1,1.356128,0.447628
2,-0.368136,-0.341970
3,-0.022097,1.344991
4,0.396337,0.386976
...,...,...
995,1.009347,-0.116755
996,-0.374717,-0.678840
997,0.055518,0.808868
998,1.678930,-0.980496


In [182]:
grouping = pd.qcut(frame.data1, 10,labels = False)

In [187]:
grouping.value_counts

<bound method IndexOpsMixin.value_counts of 0      3
1      9
2      3
3      4
4      6
      ..
995    8
996    3
997    5
998    9
999    5
Name: data1, Length: 1000, dtype: int64>

In [185]:
grouped = frame.data2.groupby(grouping)

In [186]:
grouped.apply(get_stats).unstack()

Unnamed: 0_level_0,min,max,count,mean
data1,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
0,-2.88941,2.567668,100.0,-0.180117
1,-2.234899,2.153446,100.0,0.129112
2,-2.15494,2.173216,100.0,0.011188
3,-2.921947,1.964974,100.0,-0.009411
4,-2.519584,2.863753,100.0,0.081412
5,-2.336837,2.518377,100.0,0.146892
6,-2.5617,2.84554,100.0,0.142292
7,-1.945087,3.292814,100.0,0.08736
8,-2.799528,2.952368,100.0,-0.241157
9,-2.867378,2.598417,100.0,0.091831


<h1>Time Series</h1>

In [2]:
import numpy as np
import pandas as pd

In [3]:
from datetime import datetime

In [4]:
now = datetime.now()

In [5]:
now

datetime.datetime(2021, 3, 26, 8, 42, 56, 611580)

In [6]:
now.year

2021

In [7]:
now.month

3

In [8]:
now.day

26

In [9]:
now.year, now.month, now.day

(2021, 3, 26)

<b>timedelta</b> represents the temporal difference between two datetime objects:

In [10]:
delta  = datetime(2011,1,7) - datetime(2008,6,24,8,15)

In [11]:
delta

datetime.timedelta(days=926, seconds=56700)

In [12]:
delta.days, delta.seconds

(926, 56700)

In [13]:
type(datetime.now())

datetime.datetime

In [14]:
type(delta)

datetime.timedelta

In [15]:
from datetime import timedelta

In [16]:
start = datetime(2011,1,7)

In [17]:
type(start)

datetime.datetime

In [19]:
start + timedelta(12)

datetime.datetime(2011, 1, 19, 0, 0)

In [20]:
start - 2 * timedelta(12)

datetime.datetime(2010, 12, 14, 0, 0)

In [22]:
stamp = datetime(2011,1,3)

In [23]:
stamp

datetime.datetime(2011, 1, 3, 0, 0)

In [24]:
str(stamp)

'2011-01-03 00:00:00'

In [25]:
stamp.strftime("%Y-%m-%d")

'2011-01-03'

In [26]:
value = '2011-01-03'

In [30]:
datetime.strptime(value, '%Y-%m-%d')

datetime.datetime(2011, 1, 3, 0, 0)

In [31]:
datestrs = ['7/6/2011', '8/6/2011']

In [32]:
[datetime.strptime(x, '%m/%d/%Y') for x in datestrs]

[datetime.datetime(2011, 7, 6, 0, 0), datetime.datetime(2011, 8, 6, 0, 0)]

In [33]:
from dateutil.parser import parse

In [34]:
parse('2011-01-03')

datetime.datetime(2011, 1, 3, 0, 0)

In [35]:
parse('Jan 31, 1997 10:45 PM')

datetime.datetime(1997, 1, 31, 22, 45)

In [36]:
parse('6/12/2011', dayfirst = True)

datetime.datetime(2011, 12, 6, 0, 0)

In [37]:
datestrs = ['2011-07-06 12:00:00',  '2011-08-06 00:00:00']

In [38]:
pd.to_datetime(datestrs)

DatetimeIndex(['2011-07-06 12:00:00', '2011-08-06 00:00:00'], dtype='datetime64[ns]', freq=None)

In [39]:
idx = pd.to_datetime(datestrs + [None])

In [40]:
idx

DatetimeIndex(['2011-07-06 12:00:00', '2011-08-06 00:00:00', 'NaT'], dtype='datetime64[ns]', freq=None)

In [41]:
parse('29 March 2021 12:00:00 AM')

datetime.datetime(2021, 3, 29, 0, 0)

In [43]:
now = datetime.now()

In [44]:
now 

datetime.datetime(2021, 3, 26, 9, 27, 53, 615780)

In [45]:
now + timedelta(days = 2, hours = 14)

datetime.datetime(2021, 3, 28, 23, 27, 53, 615780)

In [46]:
idx = pd.to_datetime(datestrs + [None])

In [47]:
idx

DatetimeIndex(['2011-07-06 12:00:00', '2011-08-06 00:00:00', 'NaT'], dtype='datetime64[ns]', freq=None)

In [48]:
idx[2]

NaT

In [49]:
pd.isnull(idx[2])

True

In [50]:
pd.isnull(idx)

array([False, False,  True])

In [51]:
dates  = [
    datetime(2011,1,2), datetime(2011,1,5),
    datetime(2011,1,7), datetime(2011,1,8),
    datetime(2011,1,10), datetime(2011,1,12)
]

In [52]:
ts = pd.Series(np.random.randn(6), index = dates)

In [53]:
ts

2011-01-02    2.805302
2011-01-05    0.726154
2011-01-07    1.170842
2011-01-08    1.731764
2011-01-10   -1.446217
2011-01-12    0.133618
dtype: float64

In [54]:
ts.index

DatetimeIndex(['2011-01-02', '2011-01-05', '2011-01-07', '2011-01-08',
               '2011-01-10', '2011-01-12'],
              dtype='datetime64[ns]', freq=None)

In [56]:
ts[::2]

2011-01-02    2.805302
2011-01-07    1.170842
2011-01-10   -1.446217
dtype: float64

In [57]:
ts + ts[::2]

2011-01-02    5.610604
2011-01-05         NaN
2011-01-07    2.341685
2011-01-08         NaN
2011-01-10   -2.892434
2011-01-12         NaN
dtype: float64

In [59]:
ts.index.dtype

dtype('<M8[ns]')

In [61]:
stamp=ts.index[0]

In [62]:
stamp

Timestamp('2011-01-02 00:00:00')

In [63]:
stamp = ts.index[2]

In [64]:
stamp

Timestamp('2011-01-07 00:00:00')

In [65]:
ts

2011-01-02    2.805302
2011-01-05    0.726154
2011-01-07    1.170842
2011-01-08    1.731764
2011-01-10   -1.446217
2011-01-12    0.133618
dtype: float64

In [66]:
ts[stamp]

1.170842264190419

In [67]:
ts['1/10/2011']

-1.446217138405679

In [68]:
longer_ts = pd.Series(np.random.randn(1000), 
                     index = pd.date_range('1/1/2000', periods = 1000))

In [69]:
longer_ts

2000-01-01    2.799478
2000-01-02    1.758983
2000-01-03    1.070903
2000-01-04    0.998386
2000-01-05    0.605289
                ...   
2002-09-22    0.757012
2002-09-23    0.666724
2002-09-24    0.822526
2002-09-25   -2.264295
2002-09-26   -1.515041
Freq: D, Length: 1000, dtype: float64

In [70]:
longer_ts['2001']

2001-01-01   -0.886638
2001-01-02   -0.745005
2001-01-03   -0.671671
2001-01-04   -0.363357
2001-01-05   -1.084059
                ...   
2001-12-27   -1.268228
2001-12-28    0.440659
2001-12-29    0.982075
2001-12-30    0.496634
2001-12-31    0.029949
Freq: D, Length: 365, dtype: float64

In [72]:
longer_ts['2001-05']

2001-05-01   -0.906154
2001-05-02   -0.156108
2001-05-03    0.893806
2001-05-04    0.455296
2001-05-05   -1.596238
2001-05-06    0.197045
2001-05-07    1.396461
2001-05-08    1.195038
2001-05-09    1.572306
2001-05-10   -0.955620
2001-05-11   -1.042671
2001-05-12   -1.315182
2001-05-13    0.362656
2001-05-14    0.320782
2001-05-15    0.346669
2001-05-16   -1.817095
2001-05-17    0.424134
2001-05-18   -0.451409
2001-05-19   -0.802196
2001-05-20   -1.060092
2001-05-21   -0.282661
2001-05-22    0.052737
2001-05-23    0.611813
2001-05-24   -0.881240
2001-05-25   -1.161392
2001-05-26    0.904522
2001-05-27    0.033724
2001-05-28    1.417831
2001-05-29   -0.287362
2001-05-30   -0.539099
2001-05-31   -1.358437
Freq: D, dtype: float64

In [73]:
ts

2011-01-02    2.805302
2011-01-05    0.726154
2011-01-07    1.170842
2011-01-08    1.731764
2011-01-10   -1.446217
2011-01-12    0.133618
dtype: float64

In [74]:
ts[datetime(2011,1,7):]

2011-01-07    1.170842
2011-01-08    1.731764
2011-01-10   -1.446217
2011-01-12    0.133618
dtype: float64

In [75]:
ts

2011-01-02    2.805302
2011-01-05    0.726154
2011-01-07    1.170842
2011-01-08    1.731764
2011-01-10   -1.446217
2011-01-12    0.133618
dtype: float64

In [76]:
ts['1/6/2011': '1/11/2011']

2011-01-07    1.170842
2011-01-08    1.731764
2011-01-10   -1.446217
dtype: float64

In [79]:
ts[datetime(2011,1,6):datetime(2011,1,11)]

2011-01-07    1.170842
2011-01-08    1.731764
2011-01-10   -1.446217
dtype: float64

In [80]:
ts

2011-01-02    2.805302
2011-01-05    0.726154
2011-01-07    1.170842
2011-01-08    1.731764
2011-01-10   -1.446217
2011-01-12    0.133618
dtype: float64

In [81]:
ts.truncate(after='1/9/2011')

2011-01-02    2.805302
2011-01-05    0.726154
2011-01-07    1.170842
2011-01-08    1.731764
dtype: float64

In [82]:
dates = pd.date_range('1/1/2000', periods = 100, freq='W-WED')

In [83]:
long_df = pd.DataFrame(np.random.randn(100,4), 
                      index = dates,
                      columns = ['Colorado', 'Texas', 'New York', 'Ohio'])

In [84]:
long_df

Unnamed: 0,Colorado,Texas,New York,Ohio
2000-01-05,0.743581,0.929669,1.849847,0.555137
2000-01-12,-1.264048,-2.958110,0.356731,1.270586
2000-01-19,0.862725,0.749359,0.407087,-0.339360
2000-01-26,-0.014673,1.365473,-0.632747,0.600996
2000-02-02,2.121345,0.889332,0.486982,-0.031714
...,...,...,...,...
2001-10-31,-0.246570,1.296153,-1.173605,1.307924
2001-11-07,-0.391968,2.054442,-0.573583,-0.202464
2001-11-14,-0.177216,-0.140838,-0.476051,0.819174
2001-11-21,-2.153745,-0.129930,1.184476,1.996590


In [91]:
long_df.loc['5-2001']

Unnamed: 0,Colorado,Texas,New York,Ohio
2001-05-02,0.171642,-0.94885,-0.71689,0.711062
2001-05-09,0.957391,-1.447376,0.923881,-0.363817
2001-05-16,0.4019,-0.444813,-1.928321,-0.239915
2001-05-23,-0.190813,1.700462,-0.804239,-1.047613
2001-05-30,0.670451,-0.796955,-0.172609,-0.269949


In [92]:
long_df.loc['6-2001']

Unnamed: 0,Colorado,Texas,New York,Ohio
2001-06-06,0.88196,0.254552,-0.83985,0.638704
2001-06-13,0.368749,-0.541932,-0.845652,-1.593746
2001-06-20,0.406381,-2.006162,0.139939,1.565559
2001-06-27,0.933774,-0.837663,0.5219,-0.706138


In [93]:
dates = pd.DatetimeIndex(['1/1/2000','1/2/2000','1/2/2000',
                         '1/2/2000','1/3/2000'])

In [94]:
dup_ts = pd.Series(np.arange(5), index = dates)

In [95]:
dup_ts

2000-01-01    0
2000-01-02    1
2000-01-02    2
2000-01-02    3
2000-01-03    4
dtype: int32

In [98]:
dup_ts.index.is_unique

False

In [99]:
dup_ts['1/2/2000']

2000-01-02    1
2000-01-02    2
2000-01-02    3
dtype: int32

In [100]:
grouped = dup_ts.groupby(level=0)

In [102]:
grouped.mean()

2000-01-01    0
2000-01-02    2
2000-01-03    4
dtype: int32

In [103]:
grouped.count()

2000-01-01    1
2000-01-02    3
2000-01-03    1
dtype: int64

In [104]:
ts

2011-01-02    2.805302
2011-01-05    0.726154
2011-01-07    1.170842
2011-01-08    1.731764
2011-01-10   -1.446217
2011-01-12    0.133618
dtype: float64

In [105]:
resampler = ts.resample('D')

In [106]:
list(resampler)

[(Timestamp('2011-01-02 00:00:00', freq='D'), 2011-01-02    2.805302
  dtype: float64),
 (Timestamp('2011-01-03 00:00:00', freq='D'), Series([], dtype: float64)),
 (Timestamp('2011-01-04 00:00:00', freq='D'), Series([], dtype: float64)),
 (Timestamp('2011-01-05 00:00:00', freq='D'), 2011-01-05    0.726154
  dtype: float64),
 (Timestamp('2011-01-06 00:00:00', freq='D'), Series([], dtype: float64)),
 (Timestamp('2011-01-07 00:00:00', freq='D'), 2011-01-07    1.170842
  dtype: float64),
 (Timestamp('2011-01-08 00:00:00', freq='D'), 2011-01-08    1.731764
  dtype: float64),
 (Timestamp('2011-01-09 00:00:00', freq='D'), Series([], dtype: float64)),
 (Timestamp('2011-01-10 00:00:00', freq='D'), 2011-01-10   -1.446217
  dtype: float64),
 (Timestamp('2011-01-11 00:00:00', freq='D'), Series([], dtype: float64)),
 (Timestamp('2011-01-12 00:00:00', freq='D'), 2011-01-12    0.133618
  dtype: float64)]

In [107]:
index = pd.date_range('2012-04-01', '2012-06-01')

In [108]:
index

DatetimeIndex(['2012-04-01', '2012-04-02', '2012-04-03', '2012-04-04',
               '2012-04-05', '2012-04-06', '2012-04-07', '2012-04-08',
               '2012-04-09', '2012-04-10', '2012-04-11', '2012-04-12',
               '2012-04-13', '2012-04-14', '2012-04-15', '2012-04-16',
               '2012-04-17', '2012-04-18', '2012-04-19', '2012-04-20',
               '2012-04-21', '2012-04-22', '2012-04-23', '2012-04-24',
               '2012-04-25', '2012-04-26', '2012-04-27', '2012-04-28',
               '2012-04-29', '2012-04-30', '2012-05-01', '2012-05-02',
               '2012-05-03', '2012-05-04', '2012-05-05', '2012-05-06',
               '2012-05-07', '2012-05-08', '2012-05-09', '2012-05-10',
               '2012-05-11', '2012-05-12', '2012-05-13', '2012-05-14',
               '2012-05-15', '2012-05-16', '2012-05-17', '2012-05-18',
               '2012-05-19', '2012-05-20', '2012-05-21', '2012-05-22',
               '2012-05-23', '2012-05-24', '2012-05-25', '2012-05-26',
      

In [109]:
pd.date_range(end='2012-06-01', periods = 20)

DatetimeIndex(['2012-05-13', '2012-05-14', '2012-05-15', '2012-05-16',
               '2012-05-17', '2012-05-18', '2012-05-19', '2012-05-20',
               '2012-05-21', '2012-05-22', '2012-05-23', '2012-05-24',
               '2012-05-25', '2012-05-26', '2012-05-27', '2012-05-28',
               '2012-05-29', '2012-05-30', '2012-05-31', '2012-06-01'],
              dtype='datetime64[ns]', freq='D')

In [110]:
pd.date_range('2000-01-01', '2000-12-01', freq='BM')

DatetimeIndex(['2000-01-31', '2000-02-29', '2000-03-31', '2000-04-28',
               '2000-05-31', '2000-06-30', '2000-07-31', '2000-08-31',
               '2000-09-29', '2000-10-31', '2000-11-30'],
              dtype='datetime64[ns]', freq='BM')

In [111]:
pd.date_range('2012-5-02 12:56:31', periods=5)

DatetimeIndex(['2012-05-02 12:56:31', '2012-05-03 12:56:31',
               '2012-05-04 12:56:31', '2012-05-05 12:56:31',
               '2012-05-06 12:56:31'],
              dtype='datetime64[ns]', freq='D')

In [112]:
pd.date_range('2012-05-02 12:56:31', periods = 5, normalize=True)

DatetimeIndex(['2012-05-02', '2012-05-03', '2012-05-04', '2012-05-05',
               '2012-05-06'],
              dtype='datetime64[ns]', freq='D')

In [113]:
from pandas.tseries.offsets import Hour, Minute

In [114]:
hour= Hour()

In [115]:
hour

<Hour>

In [116]:
four_hours = Hour(4)

In [117]:
four_hours

<4 * Hours>

In [118]:
pd.date_range('2000-01-01', '2000-01-03', freq='4h')

DatetimeIndex(['2000-01-01 00:00:00', '2000-01-01 04:00:00',
               '2000-01-01 08:00:00', '2000-01-01 12:00:00',
               '2000-01-01 16:00:00', '2000-01-01 20:00:00',
               '2000-01-02 00:00:00', '2000-01-02 04:00:00',
               '2000-01-02 08:00:00', '2000-01-02 12:00:00',
               '2000-01-02 16:00:00', '2000-01-02 20:00:00',
               '2000-01-03 00:00:00'],
              dtype='datetime64[ns]', freq='4H')

In [119]:
Hour(2) + Minute(30)

<150 * Minutes>

In [120]:
pd.date_range('2000-01-01', periods=10, freq='1h30min')

DatetimeIndex(['2000-01-01 00:00:00', '2000-01-01 01:30:00',
               '2000-01-01 03:00:00', '2000-01-01 04:30:00',
               '2000-01-01 06:00:00', '2000-01-01 07:30:00',
               '2000-01-01 09:00:00', '2000-01-01 10:30:00',
               '2000-01-01 12:00:00', '2000-01-01 13:30:00'],
              dtype='datetime64[ns]', freq='90T')

In [121]:
rng = pd.date_range('2012-01-01', '2012-09-01', freq='WOM-3FRI')

In [122]:
rng

DatetimeIndex(['2012-01-20', '2012-02-17', '2012-03-16', '2012-04-20',
               '2012-05-18', '2012-06-15', '2012-07-20', '2012-08-17'],
              dtype='datetime64[ns]', freq='WOM-3FRI')

In [123]:
list(rng)

[Timestamp('2012-01-20 00:00:00', freq='WOM-3FRI'),
 Timestamp('2012-02-17 00:00:00', freq='WOM-3FRI'),
 Timestamp('2012-03-16 00:00:00', freq='WOM-3FRI'),
 Timestamp('2012-04-20 00:00:00', freq='WOM-3FRI'),
 Timestamp('2012-05-18 00:00:00', freq='WOM-3FRI'),
 Timestamp('2012-06-15 00:00:00', freq='WOM-3FRI'),
 Timestamp('2012-07-20 00:00:00', freq='WOM-3FRI'),
 Timestamp('2012-08-17 00:00:00', freq='WOM-3FRI')]

In [124]:
ts = pd.Series(np.random.randn(4), 
              index = pd.date_range('1/1/2000', periods=4, freq='M'))

In [125]:
ts

2000-01-31    0.756378
2000-02-29   -0.526569
2000-03-31   -2.033405
2000-04-30   -0.632540
Freq: M, dtype: float64

In [126]:
ts.shift(2)

2000-01-31         NaN
2000-02-29         NaN
2000-03-31    0.756378
2000-04-30   -0.526569
Freq: M, dtype: float64

In [127]:
ts

2000-01-31    0.756378
2000-02-29   -0.526569
2000-03-31   -2.033405
2000-04-30   -0.632540
Freq: M, dtype: float64

In [128]:
ts.shift(-2)

2000-01-31   -2.033405
2000-02-29   -0.632540
2000-03-31         NaN
2000-04-30         NaN
Freq: M, dtype: float64

In [129]:
ts

2000-01-31    0.756378
2000-02-29   -0.526569
2000-03-31   -2.033405
2000-04-30   -0.632540
Freq: M, dtype: float64

In [130]:
ts/ts.shift(1) - 1

2000-01-31         NaN
2000-02-29   -1.696172
2000-03-31    2.861610
2000-04-30   -0.688926
Freq: M, dtype: float64

In [131]:
ts

2000-01-31    0.756378
2000-02-29   -0.526569
2000-03-31   -2.033405
2000-04-30   -0.632540
Freq: M, dtype: float64

In [132]:
ts.shift(2, freq='M')

2000-03-31    0.756378
2000-04-30   -0.526569
2000-05-31   -2.033405
2000-06-30   -0.632540
Freq: M, dtype: float64

In [133]:
ts

2000-01-31    0.756378
2000-02-29   -0.526569
2000-03-31   -2.033405
2000-04-30   -0.632540
Freq: M, dtype: float64

In [134]:
ts.shift(3, freq='D')

2000-02-03    0.756378
2000-03-03   -0.526569
2000-04-03   -2.033405
2000-05-03   -0.632540
dtype: float64

In [135]:
ts.shift(1, freq='90T')

2000-01-31 01:30:00    0.756378
2000-02-29 01:30:00   -0.526569
2000-03-31 01:30:00   -2.033405
2000-04-30 01:30:00   -0.632540
dtype: float64

In [136]:
from pandas.tseries.offsets import Day, MonthEnd

In [137]:
now = datetime(2011,11,17)

In [138]:
now + 3 * Day()

Timestamp('2011-11-20 00:00:00')

In [139]:
now

datetime.datetime(2011, 11, 17, 0, 0)

In [140]:
now + MonthEnd(2)

Timestamp('2011-12-31 00:00:00')

In [141]:
offset = MonthEnd()

In [143]:
offset

<MonthEnd>

In [144]:
now

datetime.datetime(2011, 11, 17, 0, 0)

In [145]:
offset.rollforward(now)

Timestamp('2011-11-30 00:00:00')

In [146]:
offset.rollback(now)

Timestamp('2011-10-31 00:00:00')

In [147]:
ts = pd.Series(np.random.randn(20),
              index = pd.date_range('1/15/2000', periods = 20, freq='4d'))

In [148]:
ts

2000-01-15    0.054142
2000-01-19    0.955559
2000-01-23    0.445535
2000-01-27   -0.353374
2000-01-31   -0.877984
2000-02-04    1.066486
2000-02-08   -1.100912
2000-02-12    1.537338
2000-02-16   -0.147089
2000-02-20   -0.861035
2000-02-24    0.415052
2000-02-28   -0.791668
2000-03-03    0.264702
2000-03-07   -1.065872
2000-03-11    0.187459
2000-03-15   -0.867418
2000-03-19   -1.205771
2000-03-23   -0.311638
2000-03-27    0.714520
2000-03-31    0.122078
Freq: 4D, dtype: float64

In [149]:
ts.groupby(offset.rollforward).mean()

2000-01-31    0.044776
2000-02-29    0.016882
2000-03-31   -0.270243
dtype: float64

In [150]:
ts

2000-01-15    0.054142
2000-01-19    0.955559
2000-01-23    0.445535
2000-01-27   -0.353374
2000-01-31   -0.877984
2000-02-04    1.066486
2000-02-08   -1.100912
2000-02-12    1.537338
2000-02-16   -0.147089
2000-02-20   -0.861035
2000-02-24    0.415052
2000-02-28   -0.791668
2000-03-03    0.264702
2000-03-07   -1.065872
2000-03-11    0.187459
2000-03-15   -0.867418
2000-03-19   -1.205771
2000-03-23   -0.311638
2000-03-27    0.714520
2000-03-31    0.122078
Freq: 4D, dtype: float64

In [153]:
ts.resample('M').mean()

2000-01-31    0.044776
2000-02-29    0.016882
2000-03-31   -0.270243
Freq: M, dtype: float64

In [154]:
import pytz

In [156]:
pytz.common_timezones[-5:]

['US/Eastern', 'US/Hawaii', 'US/Mountain', 'US/Pacific', 'UTC']

In [157]:
tz = pytz.timezone('America/New_York')

In [158]:
tz

<DstTzInfo 'America/New_York' LMT-1 day, 19:04:00 STD>

In [159]:
rng = pd.date_range('3/9/2012 9:30', periods=6, freq='D')

In [160]:
ts = pd.Series(np.random.randn(len(rng)), index = rng)

In [161]:
ts

2012-03-09 09:30:00   -1.206039
2012-03-10 09:30:00    1.003950
2012-03-11 09:30:00   -1.387435
2012-03-12 09:30:00   -0.849456
2012-03-13 09:30:00   -0.464058
2012-03-14 09:30:00    0.638738
Freq: D, dtype: float64

In [162]:
print(ts.index.tz)

None


In [163]:
pd.date_range('3/9/2012 9:30', periods=10, freq='D', tz='UTC')

DatetimeIndex(['2012-03-09 09:30:00+00:00', '2012-03-10 09:30:00+00:00',
               '2012-03-11 09:30:00+00:00', '2012-03-12 09:30:00+00:00',
               '2012-03-13 09:30:00+00:00', '2012-03-14 09:30:00+00:00',
               '2012-03-15 09:30:00+00:00', '2012-03-16 09:30:00+00:00',
               '2012-03-17 09:30:00+00:00', '2012-03-18 09:30:00+00:00'],
              dtype='datetime64[ns, UTC]', freq='D')

In [164]:
ts

2012-03-09 09:30:00   -1.206039
2012-03-10 09:30:00    1.003950
2012-03-11 09:30:00   -1.387435
2012-03-12 09:30:00   -0.849456
2012-03-13 09:30:00   -0.464058
2012-03-14 09:30:00    0.638738
Freq: D, dtype: float64

In [165]:
ts_utc = ts.tz_localize('UTC')

In [166]:
ts_utc

2012-03-09 09:30:00+00:00   -1.206039
2012-03-10 09:30:00+00:00    1.003950
2012-03-11 09:30:00+00:00   -1.387435
2012-03-12 09:30:00+00:00   -0.849456
2012-03-13 09:30:00+00:00   -0.464058
2012-03-14 09:30:00+00:00    0.638738
Freq: D, dtype: float64

In [167]:
ts_utc.index

DatetimeIndex(['2012-03-09 09:30:00+00:00', '2012-03-10 09:30:00+00:00',
               '2012-03-11 09:30:00+00:00', '2012-03-12 09:30:00+00:00',
               '2012-03-13 09:30:00+00:00', '2012-03-14 09:30:00+00:00'],
              dtype='datetime64[ns, UTC]', freq='D')

In [168]:
ts_utc.tz_convert('America/New_York')

2012-03-09 04:30:00-05:00   -1.206039
2012-03-10 04:30:00-05:00    1.003950
2012-03-11 05:30:00-04:00   -1.387435
2012-03-12 05:30:00-04:00   -0.849456
2012-03-13 05:30:00-04:00   -0.464058
2012-03-14 05:30:00-04:00    0.638738
Freq: D, dtype: float64

In [169]:
ts

2012-03-09 09:30:00   -1.206039
2012-03-10 09:30:00    1.003950
2012-03-11 09:30:00   -1.387435
2012-03-12 09:30:00   -0.849456
2012-03-13 09:30:00   -0.464058
2012-03-14 09:30:00    0.638738
Freq: D, dtype: float64

In [170]:
ts_eastern = ts.tz_localize('America/New_York')

In [171]:
ts_eastern

2012-03-09 09:30:00-05:00   -1.206039
2012-03-10 09:30:00-05:00    1.003950
2012-03-11 09:30:00-04:00   -1.387435
2012-03-12 09:30:00-04:00   -0.849456
2012-03-13 09:30:00-04:00   -0.464058
2012-03-14 09:30:00-04:00    0.638738
dtype: float64

In [172]:
ts_eastern.tz_convert('UTC')

2012-03-09 14:30:00+00:00   -1.206039
2012-03-10 14:30:00+00:00    1.003950
2012-03-11 13:30:00+00:00   -1.387435
2012-03-12 13:30:00+00:00   -0.849456
2012-03-13 13:30:00+00:00   -0.464058
2012-03-14 13:30:00+00:00    0.638738
dtype: float64

In [173]:
ts_eastern.tz_convert('Europe/Berlin')

2012-03-09 15:30:00+01:00   -1.206039
2012-03-10 15:30:00+01:00    1.003950
2012-03-11 14:30:00+01:00   -1.387435
2012-03-12 14:30:00+01:00   -0.849456
2012-03-13 14:30:00+01:00   -0.464058
2012-03-14 14:30:00+01:00    0.638738
dtype: float64

In [174]:
ts.index.tz_localize('Asia/Shanghai')

DatetimeIndex(['2012-03-09 09:30:00+08:00', '2012-03-10 09:30:00+08:00',
               '2012-03-11 09:30:00+08:00', '2012-03-12 09:30:00+08:00',
               '2012-03-13 09:30:00+08:00', '2012-03-14 09:30:00+08:00'],
              dtype='datetime64[ns, Asia/Shanghai]', freq=None)

In [175]:
stamp = pd.Timestamp('2011-03-12 04:00')

In [176]:
stamp

Timestamp('2011-03-12 04:00:00')

In [177]:
stamp_utc = stamp.tz_localize('utc')

In [178]:
stamp_utc

Timestamp('2011-03-12 04:00:00+0000', tz='UTC')

In [179]:
stamp_utc.tz_convert('America/New_York')

Timestamp('2011-03-11 23:00:00-0500', tz='America/New_York')

In [180]:
stamp_moscow = pd.Timestamp('2011-03-12 04:00', tz='Europe/Moscow')

In [181]:
stamp_moscow

Timestamp('2011-03-12 04:00:00+0300', tz='Europe/Moscow')

In [182]:
stamp_utc.value

1299902400000000000

In [183]:
stamp_utc.tz_convert('America/New_York').value

1299902400000000000

In [184]:
from pandas.tseries.offsets import Hour

In [185]:
stamp = pd.Timestamp('2012-03-12 01:30', tz='US/Eastern')

In [186]:
stamp

Timestamp('2012-03-12 01:30:00-0400', tz='US/Eastern')

In [187]:
stamp + Hour()

Timestamp('2012-03-12 02:30:00-0400', tz='US/Eastern')

In [188]:
stamp = pd.Timestamp('2012-11-04 00:30', tz='US/Eastern')

In [189]:
stamp

Timestamp('2012-11-04 00:30:00-0400', tz='US/Eastern')

In [190]:
stamp + 2 * Hour()

Timestamp('2012-11-04 01:30:00-0500', tz='US/Eastern')

In [191]:
rng = pd.date_range('3/7/2012 9:30', periods = 10, freq='B')

In [192]:
ts = pd.Series(np.random.randn(len(rng)), index = rng)

In [193]:
ts

2012-03-07 09:30:00   -1.102489
2012-03-08 09:30:00    1.856388
2012-03-09 09:30:00   -0.991128
2012-03-12 09:30:00   -0.636950
2012-03-13 09:30:00    0.561893
2012-03-14 09:30:00    0.999570
2012-03-15 09:30:00    1.955502
2012-03-16 09:30:00    1.437956
2012-03-19 09:30:00    0.516988
2012-03-20 09:30:00   -0.943637
Freq: B, dtype: float64

In [194]:
ts1 = ts[:7].tz_localize('Europe/London')

In [195]:
ts2 = ts1[2:].tz_convert('Europe/Moscow')

In [196]:
ts1

2012-03-07 09:30:00+00:00   -1.102489
2012-03-08 09:30:00+00:00    1.856388
2012-03-09 09:30:00+00:00   -0.991128
2012-03-12 09:30:00+00:00   -0.636950
2012-03-13 09:30:00+00:00    0.561893
2012-03-14 09:30:00+00:00    0.999570
2012-03-15 09:30:00+00:00    1.955502
dtype: float64

In [197]:
ts2

2012-03-09 13:30:00+04:00   -0.991128
2012-03-12 13:30:00+04:00   -0.636950
2012-03-13 13:30:00+04:00    0.561893
2012-03-14 13:30:00+04:00    0.999570
2012-03-15 13:30:00+04:00    1.955502
dtype: float64

In [198]:
result = ts1 + ts2

In [199]:
result.index

DatetimeIndex(['2012-03-07 09:30:00+00:00', '2012-03-08 09:30:00+00:00',
               '2012-03-09 09:30:00+00:00', '2012-03-12 09:30:00+00:00',
               '2012-03-13 09:30:00+00:00', '2012-03-14 09:30:00+00:00',
               '2012-03-15 09:30:00+00:00'],
              dtype='datetime64[ns, UTC]', freq=None)

In [200]:
p = pd.Period(2007, freq='A-DEC')

In [201]:
p

Period('2007', 'A-DEC')

In [202]:
p + 5

Period('2012', 'A-DEC')

In [203]:
p - 2

Period('2005', 'A-DEC')

In [204]:
pd.Period('2014', freq='A-DEC')- p

<7 * YearEnds: month=12>

In [206]:
rng = pd.period_range('2000-01-01', '2000-6-30', freq='M')

In [207]:
rng

PeriodIndex(['2000-01', '2000-02', '2000-03', '2000-04', '2000-05', '2000-06'], dtype='period[M]', freq='M')

In [208]:
pd.Series(np.random.randn(6), index = rng)

2000-01   -1.381242
2000-02   -1.208826
2000-03    1.793028
2000-04    1.470607
2000-05    0.309390
2000-06   -1.356316
Freq: M, dtype: float64

In [209]:
values = ['2001Q3', '2002Q2', '2003Q1']

In [210]:
index = pd.PeriodIndex(values, freq='Q-DEC')

In [211]:
index

PeriodIndex(['2001Q3', '2002Q2', '2003Q1'], dtype='period[Q-DEC]', freq='Q-DEC')

In [212]:
p = pd.Period('2007', freq='A-DEC')

In [213]:
p

Period('2007', 'A-DEC')

In [214]:
p.asfreq('M', how='start')

Period('2007-01', 'M')

In [215]:
p.asfreq('M', how='end')

Period('2007-12', 'M')

In [216]:
p = pd.Period('2007', freq='A-JUN')

In [217]:
p

Period('2007', 'A-JUN')

In [218]:
p.asfreq('M', 'start')

Period('2006-07', 'M')

In [219]:
p.asfreq('M', 'end')

Period('2007-06', 'M')

In [220]:
p = pd.Period('Aug-2007', 'M')

In [222]:
p

Period('2007-08', 'M')

In [223]:
p.asfreq('A-JUN')

Period('2008', 'A-JUN')

In [224]:
rng = pd.period_range('2006', '2009', freq='A-DEC')

In [225]:
ts = pd.Series(np.random.randn(len(rng)), index=rng)

In [226]:
ts

2006   -0.461861
2007   -0.019486
2008   -0.730646
2009    0.756388
Freq: A-DEC, dtype: float64

In [227]:
ts.asfreq('M', how='start')

2006-01   -0.461861
2007-01   -0.019486
2008-01   -0.730646
2009-01    0.756388
Freq: M, dtype: float64

In [228]:
rng = pd.date_range('2000-01-01', periods = 100, freq='D')

In [229]:
ts = pd.Series(np.random.randn(len(rng)), index = rng)

In [230]:
ts

2000-01-01    1.105402
2000-01-02    0.299500
2000-01-03    0.490053
2000-01-04    0.691938
2000-01-05    0.024474
                ...   
2000-04-05   -1.154214
2000-04-06   -0.558583
2000-04-07    1.252412
2000-04-08   -0.015424
2000-04-09    0.219863
Freq: D, Length: 100, dtype: float64

In [232]:
ts.resample('M').mean()

2000-01-31    0.321704
2000-02-29   -0.177398
2000-03-31    0.205958
2000-04-30    0.333670
Freq: M, dtype: float64

In [233]:
ts.resample('M', kind = 'period').mean()

2000-01    0.321704
2000-02   -0.177398
2000-03    0.205958
2000-04    0.333670
Freq: M, dtype: float64

In [234]:
rng = pd.date_range('2000-01-01', periods=12, freq='T')

In [235]:
ts = pd.Series(np.arange(12), index = rng)

In [236]:
ts

2000-01-01 00:00:00     0
2000-01-01 00:01:00     1
2000-01-01 00:02:00     2
2000-01-01 00:03:00     3
2000-01-01 00:04:00     4
2000-01-01 00:05:00     5
2000-01-01 00:06:00     6
2000-01-01 00:07:00     7
2000-01-01 00:08:00     8
2000-01-01 00:09:00     9
2000-01-01 00:10:00    10
2000-01-01 00:11:00    11
Freq: T, dtype: int32

In [237]:
ts.resample('5min', closed='right').sum()

1999-12-31 23:55:00     0
2000-01-01 00:00:00    15
2000-01-01 00:05:00    40
2000-01-01 00:10:00    11
Freq: 5T, dtype: int32

In [238]:
ts.resample('5min', closed='right', label='right').sum()

2000-01-01 00:00:00     0
2000-01-01 00:05:00    15
2000-01-01 00:10:00    40
2000-01-01 00:15:00    11
Freq: 5T, dtype: int32

In [239]:
ts.resample('5min', closed='right', label='right', loffset='-1s').sum()


>>> df.resample(freq="3s", loffset="8H")

becomes:

>>> from pandas.tseries.frequencies import to_offset
>>> df = df.resample(freq="3s").mean()
>>> df.index = df.index.to_timestamp() + to_offset("8H")

  """Entry point for launching an IPython kernel.


1999-12-31 23:59:59     0
2000-01-01 00:04:59    15
2000-01-01 00:09:59    40
2000-01-01 00:14:59    11
Freq: 5T, dtype: int32

In [240]:
ts

2000-01-01 00:00:00     0
2000-01-01 00:01:00     1
2000-01-01 00:02:00     2
2000-01-01 00:03:00     3
2000-01-01 00:04:00     4
2000-01-01 00:05:00     5
2000-01-01 00:06:00     6
2000-01-01 00:07:00     7
2000-01-01 00:08:00     8
2000-01-01 00:09:00     9
2000-01-01 00:10:00    10
2000-01-01 00:11:00    11
Freq: T, dtype: int32

In [241]:
ts.resample('5min', closed='right', label='right').ohlc()

Unnamed: 0,open,high,low,close
2000-01-01 00:00:00,0,0,0,0
2000-01-01 00:05:00,1,5,1,5
2000-01-01 00:10:00,6,10,6,10
2000-01-01 00:15:00,11,11,11,11


In [242]:
frame = pd.DataFrame(np.random.randn(2,4),
                    index = pd.date_range('1/1/2000', periods=2,
                                         freq='W-Wed'),
                    columns = ['Colorado', 'Texas', 'New York', 'Ohio'])

In [243]:
df_daile = frame.resample('D').asfreq()

In [244]:
df_daile

Unnamed: 0,Colorado,Texas,New York,Ohio
2000-01-05,0.055134,-0.847662,0.516482,-0.02807
2000-01-06,,,,
2000-01-07,,,,
2000-01-08,,,,
2000-01-09,,,,
2000-01-10,,,,
2000-01-11,,,,
2000-01-12,-0.603141,-1.43956,-0.504135,1.092807


In [245]:
frame.resample('D').ffill()

Unnamed: 0,Colorado,Texas,New York,Ohio
2000-01-05,0.055134,-0.847662,0.516482,-0.02807
2000-01-06,0.055134,-0.847662,0.516482,-0.02807
2000-01-07,0.055134,-0.847662,0.516482,-0.02807
2000-01-08,0.055134,-0.847662,0.516482,-0.02807
2000-01-09,0.055134,-0.847662,0.516482,-0.02807
2000-01-10,0.055134,-0.847662,0.516482,-0.02807
2000-01-11,0.055134,-0.847662,0.516482,-0.02807
2000-01-12,-0.603141,-1.43956,-0.504135,1.092807


In [246]:
frame.resample('D').ffill(limit = 2)

Unnamed: 0,Colorado,Texas,New York,Ohio
2000-01-05,0.055134,-0.847662,0.516482,-0.02807
2000-01-06,0.055134,-0.847662,0.516482,-0.02807
2000-01-07,0.055134,-0.847662,0.516482,-0.02807
2000-01-08,,,,
2000-01-09,,,,
2000-01-10,,,,
2000-01-11,,,,
2000-01-12,-0.603141,-1.43956,-0.504135,1.092807


In [247]:
frame.resample('W-THU').ffill()

Unnamed: 0,Colorado,Texas,New York,Ohio
2000-01-06,0.055134,-0.847662,0.516482,-0.02807
2000-01-13,-0.603141,-1.43956,-0.504135,1.092807


In [3]:
values = pd.Series(['apple', 'orange', 'apple', 'apple']*2)

In [4]:
values

0     apple
1    orange
2     apple
3     apple
4     apple
5    orange
6     apple
7     apple
dtype: object

In [5]:
values.unique()

array(['apple', 'orange'], dtype=object)

In [6]:
values.value_counts()

apple     6
orange    2
dtype: int64

In [7]:
pd.unique(values)

array(['apple', 'orange'], dtype=object)

In [8]:
pd.value_counts(values)

apple     6
orange    2
dtype: int64

In [9]:
values = pd.Series([0,1,0,0,]*2)

In [10]:
dim = pd.Series(['apple', 'orange'])

In [11]:
values

0    0
1    1
2    0
3    0
4    0
5    1
6    0
7    0
dtype: int64

In [12]:
dim

0     apple
1    orange
dtype: object

In [13]:
dim.take(values)

0     apple
1    orange
0     apple
0     apple
0     apple
1    orange
0     apple
0     apple
dtype: object

In [14]:
type(dim.take(values))

pandas.core.series.Series

In [16]:
fruits = ['apple', 'orange', 'apple', 'apple'] * 2

In [17]:
N = len(fruits)

In [21]:
df = pd.DataFrame({
    'fruit': fruits,
    'basket_id': np.arange(N), 
    'count': np.random.randint(3,15, size = N), 
    'weight': np.random.randint(0,4,size=N)
},
    columns= ['basket_id', 'fruit', 'count', 'weight']
)

In [22]:
df

Unnamed: 0,basket_id,fruit,count,weight
0,0,apple,5,0
1,1,orange,13,1
2,2,apple,4,3
3,3,apple,6,0
4,4,apple,14,3
5,5,orange,9,1
6,6,apple,3,3
7,7,apple,3,3


In [23]:
fruit_cat = df['fruit'].astype('category')

In [24]:
fruit_cat

0     apple
1    orange
2     apple
3     apple
4     apple
5    orange
6     apple
7     apple
Name: fruit, dtype: category
Categories (2, object): ['apple', 'orange']

In [25]:
c = fruit_cat.values

In [26]:
c

['apple', 'orange', 'apple', 'apple', 'apple', 'orange', 'apple', 'apple']
Categories (2, object): ['apple', 'orange']

In [27]:
type(c)

pandas.core.arrays.categorical.Categorical

In [28]:
c.categories

Index(['apple', 'orange'], dtype='object')

In [30]:
c.codes

array([0, 1, 0, 0, 0, 1, 0, 0], dtype=int8)

In [31]:
df['fruit'] = df['fruit'].astype('category')

In [32]:
df['fruit']

0     apple
1    orange
2     apple
3     apple
4     apple
5    orange
6     apple
7     apple
Name: fruit, dtype: category
Categories (2, object): ['apple', 'orange']

In [33]:
my_categories = pd.Categorical(['foo', 'bar', 'baz', 'foo','bar'])

In [34]:
my_categories

['foo', 'bar', 'baz', 'foo', 'bar']
Categories (3, object): ['bar', 'baz', 'foo']

In [35]:
my_categories.categories

Index(['bar', 'baz', 'foo'], dtype='object')

In [36]:
my_categories.codes

array([2, 0, 1, 2, 0], dtype=int8)

If we have obtained categorical encoded data from another source, we can use the alternative from_codes constructor:

In [37]:
categories = ['foo', 'bar', 'baz']

In [38]:
codes = [0,1,2,0,0,1]

In [39]:
my_cats_2 = pd.Categorical.from_codes(codes, categories)

In [40]:
my_cats_2

['foo', 'bar', 'baz', 'foo', 'foo', 'bar']
Categories (3, object): ['foo', 'bar', 'baz']

In [41]:
ordered_cat = pd.Categorical.from_codes(codes, categories, ordered = True)

In [42]:
ordered_cat

['foo', 'bar', 'baz', 'foo', 'foo', 'bar']
Categories (3, object): ['foo' < 'bar' < 'baz']

In [43]:
my_cats_2.as_ordered()

['foo', 'bar', 'baz', 'foo', 'foo', 'bar']
Categories (3, object): ['foo' < 'bar' < 'baz']

In [44]:
np.random.seed(12345)

In [45]:
draws = np.random.randn(1000)

In [46]:
draws[:5]

array([-0.20470766,  0.47894334, -0.51943872, -0.5557303 ,  1.96578057])

In [47]:
bins = pd.qcut(draws, 4)

In [50]:
bins.value_counts()

(-2.9499999999999997, -0.684]    250
(-0.684, -0.0101]                250
(-0.0101, 0.63]                  250
(0.63, 3.928]                    250
dtype: int64

In [51]:
bins = pd.qcut(draws, 4, labels = ['Q1', 'Q2', 'Q3', 'Q4'])

In [52]:
bins

['Q2', 'Q3', 'Q2', 'Q2', 'Q4', ..., 'Q3', 'Q2', 'Q1', 'Q3', 'Q4']
Length: 1000
Categories (4, object): ['Q1' < 'Q2' < 'Q3' < 'Q4']

In [53]:
bins.codes[:10]

array([1, 2, 1, 1, 3, 3, 2, 2, 3, 3], dtype=int8)

In [54]:
bins = pd.Series(bins, name='quartile')

In [55]:
bins

0      Q2
1      Q3
2      Q2
3      Q2
4      Q4
       ..
995    Q3
996    Q2
997    Q1
998    Q3
999    Q4
Name: quartile, Length: 1000, dtype: category
Categories (4, object): ['Q1' < 'Q2' < 'Q3' < 'Q4']

In [57]:
results = (pd.Series(draws).groupby(bins)
.agg(['count', 'min', 'max']).reset_index())

In [58]:
results

Unnamed: 0,quartile,count,min,max
0,Q1,250,-2.949343,-0.685484
1,Q2,250,-0.683066,-0.010115
2,Q3,250,-0.010032,0.628894
3,Q4,250,0.634238,3.927528


In [59]:
results['quartile']

0    Q1
1    Q2
2    Q3
3    Q4
Name: quartile, dtype: category
Categories (4, object): ['Q1' < 'Q2' < 'Q3' < 'Q4']

In [60]:
N = 10000000

In [61]:
draws = pd.Series(np.random.randn(N))

In [62]:
labels = pd.Series(['foo', 'bar', 'baz','qux']*(N//4))

In [63]:
categories = labels.astype('category')

In [64]:
labels.memory_usage()

80000128

In [65]:
categories.memory_usage()

10000320

In [66]:
%time _ = labels.astype('category')

Wall time: 450 ms


In [67]:
s = pd.Series(['a', 'b', 'c','d']* 2)

In [68]:
s

0    a
1    b
2    c
3    d
4    a
5    b
6    c
7    d
dtype: object

In [69]:
cat_s = s.astype('category')

In [73]:
cat_s

0    a
1    b
2    c
3    d
4    a
5    b
6    c
7    d
dtype: category
Categories (4, object): ['a', 'b', 'c', 'd']

In [75]:
cat_s.cat.codes

0    0
1    1
2    2
3    3
4    0
5    1
6    2
7    3
dtype: int8

In [77]:
cat_s.cat.categories

Index(['a', 'b', 'c', 'd'], dtype='object')

In [78]:
actual_categories = ['a','b','c','d','e']

In [79]:
cat_s2  = cat_s.cat.set_categories(actual_categories)

In [80]:
cat_s2

0    a
1    b
2    c
3    d
4    a
5    b
6    c
7    d
dtype: category
Categories (5, object): ['a', 'b', 'c', 'd', 'e']

In [81]:
cat_s2.value_counts()

d    2
c    2
b    2
a    2
e    0
dtype: int64

In [82]:
cat_s.value_counts()

d    2
c    2
b    2
a    2
dtype: int64

In [83]:
cat_s3 = cat_s[cat_s.isin(['a','b'])]

In [84]:
cat_s3

0    a
1    b
4    a
5    b
dtype: category
Categories (4, object): ['a', 'b', 'c', 'd']

In [85]:
cat_s3.cat.remove_unused_categories()

0    a
1    b
4    a
5    b
dtype: category
Categories (2, object): ['a', 'b']

In [86]:
cat_s  = pd.Series(['a','b','c','d'] *2, dtype='category')

In [87]:
pd.get_dummies(cat_s)

Unnamed: 0,a,b,c,d
0,1,0,0,0
1,0,1,0,0
2,0,0,1,0
3,0,0,0,1
4,1,0,0,0
5,0,1,0,0
6,0,0,1,0
7,0,0,0,1


In [88]:
df = pd.DataFrame({
    'key': ['a', 'b','c'] * 4,
    'value': np.arange(12.)
})

In [89]:
df

Unnamed: 0,key,value
0,a,0.0
1,b,1.0
2,c,2.0
3,a,3.0
4,b,4.0
5,c,5.0
6,a,6.0
7,b,7.0
8,c,8.0
9,a,9.0


In [90]:
g = df.groupby('key').value

In [93]:
list(g)

[('a', 0    0.0
  3    3.0
  6    6.0
  9    9.0
  Name: value, dtype: float64), ('b', 1      1.0
  4      4.0
  7      7.0
  10    10.0
  Name: value, dtype: float64), ('c', 2      2.0
  5      5.0
  8      8.0
  11    11.0
  Name: value, dtype: float64)]

In [94]:
g.mean()

key
a    4.5
b    5.5
c    6.5
Name: value, dtype: float64

In [97]:
df['value']

0      0.0
1      1.0
2      2.0
3      3.0
4      4.0
5      5.0
6      6.0
7      7.0
8      8.0
9      9.0
10    10.0
11    11.0
Name: value, dtype: float64

In [98]:
g.transform(lambda x: x.mean())

0     4.5
1     5.5
2     6.5
3     4.5
4     5.5
5     6.5
6     4.5
7     5.5
8     6.5
9     4.5
10    5.5
11    6.5
Name: value, dtype: float64

In [99]:
g.transform('mean')

0     4.5
1     5.5
2     6.5
3     4.5
4     5.5
5     6.5
6     4.5
7     5.5
8     6.5
9     4.5
10    5.5
11    6.5
Name: value, dtype: float64

In [100]:
g.transform(lambda x: x ** 2)

0       0.0
1       1.0
2       4.0
3       9.0
4      16.0
5      25.0
6      36.0
7      49.0
8      64.0
9      81.0
10    100.0
11    121.0
Name: value, dtype: float64

In [106]:
g.apply(lambda x: x ** 2)

0       0.0
1       1.0
2       4.0
3       9.0
4      16.0
5      25.0
6      36.0
7      49.0
8      64.0
9      81.0
10    100.0
11    121.0
Name: value, dtype: float64

In [101]:
g.transform(lambda x: x.rank(ascending = False))

0     4.0
1     4.0
2     4.0
3     3.0
4     3.0
5     3.0
6     2.0
7     2.0
8     2.0
9     1.0
10    1.0
11    1.0
Name: value, dtype: float64

In [102]:
def normalize(x):
    return (x-x.mean())/x.std()

In [103]:
g.transform(normalize)

0    -1.161895
1    -1.161895
2    -1.161895
3    -0.387298
4    -0.387298
5    -0.387298
6     0.387298
7     0.387298
8     0.387298
9     1.161895
10    1.161895
11    1.161895
Name: value, dtype: float64

In [104]:
g.apply(normalize)

0    -1.161895
1    -1.161895
2    -1.161895
3    -0.387298
4    -0.387298
5    -0.387298
6     0.387298
7     0.387298
8     0.387298
9     1.161895
10    1.161895
11    1.161895
Name: value, dtype: float64

In [105]:
g.transform('mean')

0     4.5
1     5.5
2     6.5
3     4.5
4     5.5
5     6.5
6     4.5
7     5.5
8     6.5
9     4.5
10    5.5
11    6.5
Name: value, dtype: float64

In [107]:
normalized = (df['value'] - g.transform('mean'))/g.transform('std')

In [108]:
normalized

0    -1.161895
1    -1.161895
2    -1.161895
3    -0.387298
4    -0.387298
5    -0.387298
6     0.387298
7     0.387298
8     0.387298
9     1.161895
10    1.161895
11    1.161895
Name: value, dtype: float64

In [109]:
N = 15

In [110]:
times = pd.date_range('2017-05-20 00:00', freq='1min', periods = N)

In [111]:
times

DatetimeIndex(['2017-05-20 00:00:00', '2017-05-20 00:01:00',
               '2017-05-20 00:02:00', '2017-05-20 00:03:00',
               '2017-05-20 00:04:00', '2017-05-20 00:05:00',
               '2017-05-20 00:06:00', '2017-05-20 00:07:00',
               '2017-05-20 00:08:00', '2017-05-20 00:09:00',
               '2017-05-20 00:10:00', '2017-05-20 00:11:00',
               '2017-05-20 00:12:00', '2017-05-20 00:13:00',
               '2017-05-20 00:14:00'],
              dtype='datetime64[ns]', freq='T')

In [112]:
df = pd.DataFrame({
    'time': times, 
    'value': np.arange(N)
})

In [113]:
df

Unnamed: 0,time,value
0,2017-05-20 00:00:00,0
1,2017-05-20 00:01:00,1
2,2017-05-20 00:02:00,2
3,2017-05-20 00:03:00,3
4,2017-05-20 00:04:00,4
5,2017-05-20 00:05:00,5
6,2017-05-20 00:06:00,6
7,2017-05-20 00:07:00,7
8,2017-05-20 00:08:00,8
9,2017-05-20 00:09:00,9


In [116]:
df.set_index('time').resample('5min').count()

Unnamed: 0_level_0,value
time,Unnamed: 1_level_1
2017-05-20 00:00:00,5
2017-05-20 00:05:00,5
2017-05-20 00:10:00,5
