In [1]:
import numpy as np
import pandas as pd
#matplotlib inline
from matplotlib import pyplot as plt


In [2]:
columns = pd.MultiIndex.from_arrays([['US', 'US', 'US', 'JP', 'JP'], [1, 3, 5, 1, 3]], names=['city', 'tenor'])
hier_df = pd.DataFrame(np.random.randn(4, 5), columns=columns)
hier_df

city,US,US,US,JP,JP
tenor,1,3,5,1,3
0,-0.79164,1.831387,-1.878284,0.713854,-1.530472
1,1.401598,1.207293,-2.015177,0.083552,-0.753449
2,1.452922,0.031919,1.727432,1.620893,-1.750242
3,-1.291159,0.311164,0.797567,-0.647641,1.896994


In [3]:
hier_df.groupby(level=0, axis=1).count()

city,JP,US
0,2,3
1,2,3
2,2,3
3,2,3


In [4]:
frame = pd.DataFrame({'data1': np.random.randn(1000), 'data2': np.random.randn(1000)})
quartiles = pd.cut(frame.data1, 4)
quartiles[:10]

0     (0.144, 1.767]
1     (0.144, 1.767]
2    (-1.478, 0.144]
3    (-1.478, 0.144]
4     (0.144, 1.767]
5    (-1.478, 0.144]
6    (-1.478, 0.144]
7     (0.144, 1.767]
8    (-1.478, 0.144]
9     (0.144, 1.767]
Name: data1, dtype: category
Categories (4, interval[float64]): [(-3.107, -1.478] < (-1.478, 0.144] < (0.144, 1.767] < (1.767, 3.389]]

In [5]:
def get_stats(group):
    return {'min': group.min(), 'max': group.max(), 'count': group.count(), 'mean': group.mean()}

In [6]:
grouped = frame.data2.groupby(quartiles)
grouped.apply(get_stats).unstack()

Unnamed: 0_level_0,count,max,mean,min
data1,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
"(-3.107, -1.478]",58.0,2.720276,3.9e-05,-2.280451
"(-1.478, 0.144]",474.0,2.749893,-0.026966,-2.92348
"(0.144, 1.767]",422.0,3.332802,0.065296,-2.993589
"(1.767, 3.389]",46.0,2.452914,0.060133,-2.073758


In [7]:
grouping = pd.qcut(frame.data1, 10, labels=False)
grouping

0      9
1      7
2      2
3      4
4      7
5      0
6      5
7      9
8      3
9      6
10     2
11     4
12     9
13     4
14     2
15     0
16     4
17     0
18     6
19     0
20     6
21     4
22     3
23     1
24     0
25     7
26     8
27     4
28     8
29     5
      ..
970    5
971    7
972    1
973    7
974    8
975    5
976    9
977    9
978    2
979    8
980    4
981    2
982    8
983    3
984    1
985    6
986    9
987    6
988    0
989    9
990    3
991    0
992    7
993    4
994    3
995    9
996    7
997    2
998    7
999    3
Name: data1, Length: 1000, dtype: int64

In [8]:
grouped = frame.data1.groupby(grouping)
grouped.apply(get_stats).unstack()

Unnamed: 0_level_0,count,max,mean,min
data1,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
0,100.0,-1.261034,-1.692174,-3.100752
1,100.0,-0.795712,-0.990434,-1.244674
2,100.0,-0.437656,-0.617621,-0.791187
3,100.0,-0.199452,-0.317744,-0.437648
4,100.0,0.068111,-0.051954,-0.19794
5,100.0,0.331859,0.206379,0.073136
6,100.0,0.593335,0.469285,0.332756
7,100.0,0.921012,0.750329,0.594766
8,100.0,1.324039,1.120658,0.92118
9,100.0,3.38941,1.834898,1.327054


In [9]:
s = pd.Series(np.random.randn(6))
s[::2] = np.nan
s

0         NaN
1   -0.675902
2         NaN
3    0.516477
4         NaN
5   -0.450760
dtype: float64

In [10]:
s.fillna(s.mean())

0   -0.203395
1   -0.675902
2   -0.203395
3    0.516477
4   -0.203395
5   -0.450760
dtype: float64

In [11]:
states = ['Ohio', 'New York', 'Vermont', 'Florida', 'Oregon', 'Nevada', 'California', 'Idaho']
group_key = ['East'] * 4 + ['West'] * 4
data = pd.Series(np.random.randn(8), index=states)
data

Ohio          1.265741
New York      1.138765
Vermont       0.606044
Florida       0.160820
Oregon       -0.444231
Nevada       -0.369161
California   -1.109645
Idaho         0.431959
dtype: float64

In [12]:
data[['Vermont', 'Nevada', 'Idaho']] = np.nan
data

Ohio          1.265741
New York      1.138765
Vermont            NaN
Florida       0.160820
Oregon       -0.444231
Nevada             NaN
California   -1.109645
Idaho              NaN
dtype: float64

In [13]:
data.groupby(group_key).mean()


East    0.855109
West   -0.776938
dtype: float64

In [14]:
fill_mean = lambda g: g.fillna(g.mean())
data.groupby(group_key).apply(fill_mean)

Ohio          1.265741
New York      1.138765
Vermont       0.855109
Florida       0.160820
Oregon       -0.444231
Nevada       -0.776938
California   -1.109645
Idaho        -0.776938
dtype: float64

In [16]:
df = pd.DataFrame({'category': ['a'] * 4 + ['b'] * 4,
                  'data': np.random.randn(8),
                   'weights': np.random.randn(8)})
df

Unnamed: 0,category,data,weights
0,a,0.626191,-1.387737
1,a,0.447117,0.485557
2,a,-0.783357,-1.523252
3,a,1.992902,1.119677
4,b,0.2287,1.581389
5,b,-0.461513,0.117926
6,b,-0.597102,1.135408
7,b,0.94981,1.17805


In [17]:
import statsmodels.api as sm

In [18]:
def regress(data, year, xvars):
    Y = data[yvar]
    X = data[xvars]
    X['intercept'] = 1.
    reult = sm.OLS(Y, X).fit()
    return result.params