In [1]:
import numpy as np
import pandas as pd
import matplotlib as plt

In [2]:
values = pd.Series(['apple','orange','apple','apple']*2)

In [3]:
values

0     apple
1    orange
2     apple
3     apple
4     apple
5    orange
6     apple
7     apple
dtype: object

In [4]:
pd.unique(values)

array(['apple', 'orange'], dtype=object)

In [5]:
pd.value_counts(values)

apple     6
orange    2
Name: count, dtype: int64

In [6]:
values = pd.Series([0,1,0,0]*2)

In [7]:
dim = pd.Series(['apple','orange'])

In [8]:
values

0    0
1    1
2    0
3    0
4    0
5    1
6    0
7    0
dtype: int64

In [9]:
dim

0     apple
1    orange
dtype: object

In [10]:
dim.take(values)

0     apple
1    orange
0     apple
0     apple
0     apple
1    orange
0     apple
0     apple
dtype: object

In [11]:
fruits = ['apple','orange','apple','apple']*2

In [12]:
N = len(fruits)

In [13]:
df = pd.DataFrame({'fruit':fruits,
                   'basket_id':np.arange(N),
                   'count':np.random.randint(3,15,size=N),
                   'weight':np.random.uniform(0,4,size=N),},
                   columns=['basket_id','fruit','count','weight'])

In [14]:
df

Unnamed: 0,basket_id,fruit,count,weight
0,0,apple,5,3.6903
1,1,orange,14,1.522812
2,2,apple,10,2.793713
3,3,apple,14,2.554104
4,4,apple,6,0.986794
5,5,orange,13,3.340644
6,6,apple,3,3.952243
7,7,apple,12,3.363779


In [15]:
fruits_cat = df['fruit'].astype('category')

In [16]:
fruits_cat

0     apple
1    orange
2     apple
3     apple
4     apple
5    orange
6     apple
7     apple
Name: fruit, dtype: category
Categories (2, object): ['apple', 'orange']

In [17]:
c=fruits_cat.values

In [18]:
type(c)

pandas.core.arrays.categorical.Categorical

In [19]:
c.categories

Index(['apple', 'orange'], dtype='object')

In [20]:
c.codes

array([0, 1, 0, 0, 0, 1, 0, 0], dtype=int8)

In [21]:
df['fruit'] = df['fruit'].astype('category')

In [22]:
df.fruit

0     apple
1    orange
2     apple
3     apple
4     apple
5    orange
6     apple
7     apple
Name: fruit, dtype: category
Categories (2, object): ['apple', 'orange']

In [23]:
my_categories = pd.Categorical(['foo','bar','baz','foo','bar'])

In [24]:
my_categories

['foo', 'bar', 'baz', 'foo', 'bar']
Categories (3, object): ['bar', 'baz', 'foo']

In [25]:
categories = ['foo','bar','baz']
codes = [2,1,2,0,2,1]
my_cats_2 = pd.Categorical.from_codes(codes,categories)
my_cats_2

['baz', 'bar', 'baz', 'foo', 'baz', 'bar']
Categories (3, object): ['foo', 'bar', 'baz']

In [26]:
orderd_cat = pd.Categorical.from_codes(codes,categories,ordered=True)

In [27]:
orderd_cat

['baz', 'bar', 'baz', 'foo', 'baz', 'bar']
Categories (3, object): ['foo' < 'bar' < 'baz']

In [28]:
my_cats_2.as_ordered()

['baz', 'bar', 'baz', 'foo', 'baz', 'bar']
Categories (3, object): ['foo' < 'bar' < 'baz']

In [29]:
mydf = pd.DataFrame(np.random.randn(10,4),columns=['A','B','C','D'],)

In [30]:
mydf.A = ['apple','apple','banana','orange','apple']*2
mydf.B = ['one','two']*5
# mydf.C = ['ichi','ni','san']*3+

In [31]:
type(mydf[['A','B']].astype('category'))

pandas.core.frame.DataFrame

In [32]:
np.random.seed(12345)

In [33]:
draws = np.random.randn(1000)

In [34]:
draws[:5]

array([-0.20470766,  0.47894334, -0.51943872, -0.5557303 ,  1.96578057])

In [35]:
bins = pd.qcut(draws,4)

In [36]:
bins = pd.qcut(draws,q=4,labels=['Q1','Q2','Q3','Q4'])

In [37]:
bins.codes[:10]

array([1, 2, 1, 1, 3, 3, 2, 2, 3, 3], dtype=int8)

In [38]:
bins = pd.Series(bins,name='quartile')

In [39]:
results = (pd.Series(draws).groupby(bins)
           .agg(['count','min','max']).reset_index())

In [40]:
results

Unnamed: 0,quartile,count,min,max
0,Q1,250,-2.949343,-0.685484
1,Q2,250,-0.683066,-0.010115
2,Q3,250,-0.010032,0.628894
3,Q4,250,0.634238,3.927528


In [41]:
results['quartile']

0    Q1
1    Q2
2    Q3
3    Q4
Name: quartile, dtype: category
Categories (4, object): ['Q1' < 'Q2' < 'Q3' < 'Q4']

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

In [2]:
N = 1000_0000

In [3]:
draws = pd.Series(np.random.randn(N))

In [4]:
labels = pd.Series(['foo','bar','baz','qux']*(N//4))

In [5]:
categories = labels.astype('category')

In [6]:
labels.memory_usage()

80000132

In [7]:
categories.memory_usage()

10000336

In [8]:
%time _ =labels.astype('category')

CPU times: total: 328 ms
Wall time: 324 ms


In [9]:
%timeit _ = labels.astype('category')

324 ms ± 32.2 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)


In [10]:
%timeit labels.groupby(labels.values)

7.83 µs ± 1.48 µs per loop (mean ± std. dev. of 7 runs, 100,000 loops each)


In [11]:
list(categories.groupby(labels.values))

[('bar',
  1          bar
  5          bar
  9          bar
  13         bar
  17         bar
            ... 
  9999981    bar
  9999985    bar
  9999989    bar
  9999993    bar
  9999997    bar
  Length: 2500000, dtype: category
  Categories (4, object): ['bar', 'baz', 'foo', 'qux']),
 ('baz',
  2          baz
  6          baz
  10         baz
  14         baz
  18         baz
            ... 
  9999982    baz
  9999986    baz
  9999990    baz
  9999994    baz
  9999998    baz
  Length: 2500000, dtype: category
  Categories (4, object): ['bar', 'baz', 'foo', 'qux']),
 ('foo',
  0          foo
  4          foo
  8          foo
  12         foo
  16         foo
            ... 
  9999980    foo
  9999984    foo
  9999988    foo
  9999992    foo
  9999996    foo
  Length: 2500000, dtype: category
  Categories (4, object): ['bar', 'baz', 'foo', 'qux']),
 ('qux',
  3          qux
  7          qux
  11         qux
  15         qux
  19         qux
            ... 
  9999983    qux
  999998

In [12]:
s = pd.Series(['a','b','c','d']*2)

In [13]:
cat_s = s.astype('category',)

In [14]:
type(cat_s)

pandas.core.series.Series

In [16]:
s = pd.Series(['a','b','c','d']*2)

In [18]:
cat_s = s.astype('category')

In [20]:
cat_s.cat.codes

0    0
1    1
2    2
3    3
4    0
5    1
6    2
7    3
dtype: int8

In [21]:
cat_s.cat.categories

Index(['a', 'b', 'c', 'd'], dtype='object')

In [22]:
actual_categories = ['a','b','c','d','e']

In [23]:
cat_s2 = cat_s.cat.set_categories(actual_categories,)

In [27]:
cat_s2.value_counts()

a    2
b    2
c    2
d    2
e    0
Name: count, dtype: int64

In [28]:
cat_s3 = cat_s[cat_s.isin(['a','b'])]

In [31]:
cat_s3.cat.remove_unused_categories()

0    a
1    b
4    a
5    b
dtype: category
Categories (2, object): ['a', 'b']

In [45]:
cat_s.cat.set_categories(['a','b'],rename='one')

0      a
1      b
2    NaN
3    NaN
4      a
5      b
6    NaN
7    NaN
dtype: category
Categories (2, object): ['a', 'b']

In [56]:
cat_s.cat.set_categories(['a','b','d','e'],rename=False)

0      a
1      b
2    NaN
3      d
4      a
5      b
6    NaN
7      d
dtype: category
Categories (4, object): ['a', 'b', 'd', 'e']

In [58]:
cat_s.cat.add_categories(['f','e'],)

0    a
1    b
2    c
3    d
4    a
5    b
6    c
7    d
dtype: category
Categories (6, object): ['a', 'b', 'c', 'd', 'f', 'e']

In [59]:
cat_s.cat.as_ordered()

0    a
1    b
2    c
3    d
4    a
5    b
6    c
7    d
dtype: category
Categories (4, object): ['a' < 'b' < 'c' < 'd']

In [64]:
def rn(x):
    if x == 'a':
        return 'one'
    elif x =='b':
        return 'two'
    else:
        return 'three'
cat_s.cat.rename_categories(rn,)

ValueError: Categorical categories must be unique

In [79]:
cat_s = pd.Series(['a','b','c','d']*2,
                  dtype='category',
                  )

In [80]:
cat_s

0    a
1    b
2    c
3    d
4    a
5    b
6    c
7    d
dtype: category
Categories (4, object): ['a', 'b', 'c', 'd']

In [81]:
pd.get_dummies(cat_s)

Unnamed: 0,a,b,c,d
0,True,False,False,False
1,False,True,False,False
2,False,False,True,False
3,False,False,False,True
4,True,False,False,False
5,False,True,False,False
6,False,False,True,False
7,False,False,False,True


In [90]:
def f(x):
    result = ''
    for i in x:
        result +=i
    return result
cat_s.groupby(cat_s.values).apply(f)

a    aa
b    bb
c    cc
d    dd
dtype: object

In [91]:
cat_s.values.groupby()

['a', 'b', 'c', 'd', 'a', 'b', 'c', 'd']
Categories (4, object): ['a', 'b', 'c', 'd']

In [92]:
df = pd.DataFrame({'key':['a','b','c']*4,
                   'value':np.arange(12)})

In [93]:
df

Unnamed: 0,key,value
0,a,0
1,b,1
2,c,2
3,a,3
4,b,4
5,c,5
6,a,6
7,b,7
8,c,8
9,a,9


In [110]:
g = df.groupby('key')['value']

In [111]:
list(g)

[('a',
  0    0
  3    3
  6    6
  9    9
  Name: value, dtype: int32),
 ('b',
  1      1
  4      4
  7      7
  10    10
  Name: value, dtype: int32),
 ('c',
  2      2
  5      5
  8      8
  11    11
  Name: value, dtype: int32)]

In [112]:
g.transform(lambda x:x.mean())

0     4.5
1     5.5
2     6.5
3     4.5
4     5.5
5     6.5
6     4.5
7     5.5
8     6.5
9     4.5
10    5.5
11    6.5
Name: value, dtype: float64

In [113]:
g.transform('mean')

0     4.5
1     5.5
2     6.5
3     4.5
4     5.5
5     6.5
6     4.5
7     5.5
8     6.5
9     4.5
10    5.5
11    6.5
Name: value, dtype: float64

In [114]:
g.transform(lambda x:x*2)

0      0
1      2
2      4
3      6
4      8
5     10
6     12
7     14
8     16
9     18
10    20
11    22
Name: value, dtype: int32

In [115]:
g.transform(lambda x:x.rank(ascending=False))

0     4.0
1     4.0
2     4.0
3     3.0
4     3.0
5     3.0
6     2.0
7     2.0
8     2.0
9     1.0
10    1.0
11    1.0
Name: value, dtype: float64

In [116]:
def normalize(x):
    return (x-x.mean())/x.std()

In [117]:
g.transform(normalize)

0    -1.161895
1    -1.161895
2    -1.161895
3    -0.387298
4    -0.387298
5    -0.387298
6     0.387298
7     0.387298
8     0.387298
9     1.161895
10    1.161895
11    1.161895
Name: value, dtype: float64

In [118]:
g.apply(normalize)

key    
a    0    -1.161895
     3    -0.387298
     6     0.387298
     9     1.161895
b    1    -1.161895
     4    -0.387298
     7     0.387298
     10    1.161895
c    2    -1.161895
     5    -0.387298
     8     0.387298
     11    1.161895
Name: value, dtype: float64

In [119]:
g.transform('mean',)

0     4.5
1     5.5
2     6.5
3     4.5
4     5.5
5     6.5
6     4.5
7     5.5
8     6.5
9     4.5
10    5.5
11    6.5
Name: value, dtype: float64

In [120]:
normalized = (df['value']-g.transform('mean'))/g.transform('std')

In [121]:
normalized

0    -1.161895
1    -1.161895
2    -1.161895
3    -0.387298
4    -0.387298
5    -0.387298
6     0.387298
7     0.387298
8     0.387298
9     1.161895
10    1.161895
11    1.161895
Name: value, dtype: float64

In [122]:
N = 15

In [123]:
times = pd.date_range('2017-05-20 00:00',freq='T',periods=N)

In [124]:
df = pd.DataFrame({'time':times,
                   'value':np.arange(N)})

In [125]:
df

Unnamed: 0,time,value
0,2017-05-20 00:00:00,0
1,2017-05-20 00:01:00,1
2,2017-05-20 00:02:00,2
3,2017-05-20 00:03:00,3
4,2017-05-20 00:04:00,4
5,2017-05-20 00:05:00,5
6,2017-05-20 00:06:00,6
7,2017-05-20 00:07:00,7
8,2017-05-20 00:08:00,8
9,2017-05-20 00:09:00,9


In [126]:
df.set_index('time').resample('5T').count()

Unnamed: 0_level_0,value
time,Unnamed: 1_level_1
2017-05-20 00:00:00,5
2017-05-20 00:05:00,5
2017-05-20 00:10:00,5


In [127]:
df2 = pd.DataFrame({'time':times.repeat(3),
                    'key':np.tile(['a','b','c'],N),
                    'values':np.arange(N*3)})

In [129]:
df2[:7]

Unnamed: 0,time,key,values
0,2017-05-20 00:00:00,a,0
1,2017-05-20 00:00:00,b,1
2,2017-05-20 00:00:00,c,2
3,2017-05-20 00:01:00,a,3
4,2017-05-20 00:01:00,b,4
5,2017-05-20 00:01:00,c,5
6,2017-05-20 00:02:00,a,6


In [147]:
time_grouper = pd.core.resample.TimeGrouper('5T')
resampled = (df2.set_index('time').groupby(['key',time_grouper]).sum())
resampled

Unnamed: 0_level_0,Unnamed: 1_level_0,values
key,time,Unnamed: 2_level_1
a,2017-05-20 00:00:00,30
a,2017-05-20 00:05:00,105
a,2017-05-20 00:10:00,180
b,2017-05-20 00:00:00,35
b,2017-05-20 00:05:00,110
b,2017-05-20 00:10:00,185
c,2017-05-20 00:00:00,40
c,2017-05-20 00:05:00,115
c,2017-05-20 00:10:00,190


In [150]:
def f(x):
    return x.resample('5T',).values.sum()
my_resampled = df2.set_index('time').groupby('key').apply(f)
pd.DataFrame(my_resampled.stack())


Unnamed: 0_level_0,Unnamed: 1_level_0,0
key,time,Unnamed: 2_level_1
a,2017-05-20 00:00:00,30
a,2017-05-20 00:05:00,105
a,2017-05-20 00:10:00,180
b,2017-05-20 00:00:00,35
b,2017-05-20 00:05:00,110
b,2017-05-20 00:10:00,185
c,2017-05-20 00:00:00,40
c,2017-05-20 00:05:00,115
c,2017-05-20 00:10:00,190


In [143]:
my_resampled

time,2017-05-20 00:00:00,2017-05-20 00:05:00,2017-05-20 00:10:00
key,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
a,30,105,180
b,35,110,185
c,40,115,190


In [165]:
df1 = pd.DataFrame(np.random.randn(10,4),
                   columns=['col1','col2','col3','col4'])
df1['key'] = ['a','b','c','a','c']*2

In [166]:
df2 = df1[df1['col1']>0]

In [167]:
df2['col1_demeand'] = df2['col1']-df2['col1'].mean()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df2['col1_demeand'] = df2['col1']-df2['col1'].mean()


In [168]:
df2

Unnamed: 0,col1,col2,col3,col4,key,col1_demeand
0,1.599329,-1.088542,0.57083,-0.254046,a,0.582285
1,0.754265,0.250417,1.209039,0.272329,b,-0.262779
3,0.333107,1.485727,0.527276,0.339034,a,-0.683937
4,0.585796,-1.904688,0.920257,-0.48938,c,-0.431248
5,1.007638,-0.017614,1.950778,-0.424692,a,-0.009406
7,1.822129,0.149543,-0.988405,-0.061145,c,0.805085


In [170]:
result = df2.groupby('key').col1_demeand.std()

In [171]:
result

key
a    0.633563
b         NaN
c    0.874219
Name: col1_demeand, dtype: float64

In [186]:
np.random.seed(12345)
result = (pd.DataFrame(np.random.randn(100,4),columns=['col1','col2','col3','col4'])
            .assign(key=['a','b','c','b','a']*20)
            # [lambda x: x.col1 > 0]
            .query('col1 > 0')
            .assign(col1_demeaned=lambda x: x['col1']-x['col1'].mean())
            .groupby('key').col1_demeaned.std()
            ) 
result

key
a    0.522315
b    0.574511
c    0.625781
Name: col1_demeaned, dtype: float64

In [183]:
def groupby_demean(df,by,cols):
    result = df.copy()
    g = df.groupby(by)
    for c in cols:
        result[c] = df[c] - g[c].transform('mean')
    return result


In [184]:
result = (df[df.])

SyntaxError: invalid syntax (2407298609.py, line 1)