/
data.py
638 lines (541 loc) · 21.2 KB
/
data.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
from linearmodels.compat.numpy import lstsq
from linearmodels.compat.pandas import (concat, get_codes, is_categorical,
is_datetime64_any_dtype,
is_numeric_dtype, is_string_dtype,
is_string_like)
from itertools import product
import numpy as np
from pandas import (Categorical, DataFrame, Index, MultiIndex, Series,
get_dummies)
from linearmodels.utility import ensure_unique_column, panel_to_frame
__all__ = ['PanelData']
class _Panel(object):
    """
    Convert a MI DataFrame to a 3-d structure where columns are items

    Parameters
    ----------
    df : DataFrame
        MultiIndex DataFrame containing floats

    Notes
    -----
    Contains the logic needed to transform a MI DataFrame with 2 levels
    into a minimal pandas Panel-like object
    """

    def __init__(self, df):
        self._items = df.columns
        index = df.index
        # Unique level values in order of first appearance.  Level 1 of the
        # MultiIndex is treated as time (major axis), level 0 as entity
        # (minor axis).
        self._major_axis = Index(index.levels[1][get_codes(index)[1]]).unique()
        self._minor_axis = Index(index.levels[0][get_codes(index)[0]]).unique()
        # Full cartesian (entity, time) index; reindexing onto it balances
        # the panel, inserting NaN rows for missing combinations.
        self._full_index = MultiIndex.from_product([self._minor_axis,
                                                    self._major_axis])
        new_df = df.reindex(self._full_index)
        new_df.index.names = df.index.names
        self._frame = new_df
        # Shape is (items, major, minor) = (nvar, nobs, nentity)
        i, j, k = len(self._items), len(self._major_axis), len(self.minor_axis)
        self._shape = (i, j, k)
        # Transpose stacks items first; reshape to (i, k, j) then swap the
        # last two axes to obtain the (items, major, minor) layout.
        self._values = np.swapaxes(np.reshape(np.asarray(new_df).copy().T, (i, k, j)), 1, 2)

    @classmethod
    def from_array(cls, values, items, major_axis, minor_axis):
        """
        Build a _Panel from a 3-d array shaped (items, major, minor).

        The array is flattened to the ((minor x major) by items) 2-d layout
        used by the MultiIndex DataFrame representation.
        """
        # product() iterates the last argument fastest, so rows are ordered
        # entity-major / time-minor, matching the reshape below.
        index = list(product(minor_axis, major_axis))
        index = MultiIndex.from_tuples(index)
        i, j, k = len(items), len(major_axis), len(minor_axis)
        values = np.swapaxes(values.copy(), 0, 2).ravel()
        values = np.reshape(values, ((j * k), i))
        df = DataFrame(values, index=index, columns=items)
        return cls(df)

    @property
    def shape(self):
        # (nvar, nobs, nentity)
        return self._shape

    @property
    def items(self):
        # Variable (column) labels
        return self._items

    @property
    def major_axis(self):
        # Time labels
        return self._major_axis

    @property
    def minor_axis(self):
        # Entity labels
        return self._minor_axis

    @property
    def values(self):
        # 3-d ndarray view, shape (nvar, nobs, nentity)
        return self._values

    def to_frame(self):
        """Return the balanced MultiIndex DataFrame representation."""
        return self._frame
def convert_columns(s, drop_first):
    """
    Convert a single Series to dummy variables if categorical or string.

    Parameters
    ----------
    s : Series
        Column to possibly convert
    drop_first : bool
        Flag indicating whether to drop the first dummy category

    Returns
    -------
    Series or DataFrame
        Dummy-variable DataFrame (columns named ``name.category``) when the
        input is categorical or all-string; otherwise the input unchanged.
    """
    # Treat an object column whose every element is string-like as categorical
    all_strings = is_string_dtype(s.dtype) and s.map(is_string_like).all()
    if all_strings:
        s = s.astype('category')
    if not is_categorical(s):
        return s
    dummies = get_dummies(s, drop_first=drop_first)
    dummies.columns = [str(s.name) + '.' + str(col) for col in dummies]
    return dummies
def expand_categoricals(x, drop_first):
    """
    Expand all categorical or string columns of a DataFrame into dummies.

    Parameters
    ----------
    x : DataFrame
        Data whose columns are converted one at a time
    drop_first : bool
        Flag indicating whether to drop the first dummy category

    Returns
    -------
    DataFrame
        Data with categorical/string columns replaced by dummy columns
    """
    converted = []
    for col in x.columns:
        converted.append(convert_columns(x[col], drop_first))
    return concat(converted, axis=1)
class PanelData(object):
    """
    Abstraction to handle alternative formats for panel data

    Parameters
    ----------
    x : {ndarray, Series, DataFrame, DataArray}
        Input data
    var_name : str, optional
        Variable name to use when naming variables in NumPy arrays or
        xarray DataArrays
    convert_dummies : bool, optional
        Flag indicating whether pandas categoricals or string input data
        should be converted to dummy variables
    drop_first : bool, optional
        Flag indicating to drop first dummy category when converting
    copy : bool, optional
        Flag indicating whether to copy the input. Only has an effect when
        x is a DataFrame

    Notes
    -----
    Data can be either 2- or 3-dimensional. The three key dimensions are

    * nvar - number of variables
    * nobs - number of time periods
    * nentity - number of entities

    All 3-d inputs should be in the form (nvar, nobs, nentity). With one
    exception, 2-d inputs are treated as (nobs, nentity) so that the input
    can be treated as-if being (1, nobs, nentity).

    If the 2-d input is a pandas DataFrame with a 2-level MultiIndex then the
    input is treated differently. Index level 0 is assumed to be entity.
    Index level 1 is time. The columns are the variables. MultiIndex Series
    are also accepted and treated as single column MultiIndex DataFrames.

    Raises
    ------
    TypeError
        If the input type is not supported
    ValueError
        If the input has the wrong number of dimensions or a MultiIndex
        DataFrame does not have 2 levels
    """

    def __init__(self, x, var_name='x', convert_dummies=True, drop_first=True, copy=True):
        self._var_name = var_name
        self._convert_dummies = convert_dummies
        self._drop_first = drop_first
        # Lazily-built 3-d view and cached shape; populated on first access
        self._panel = None
        self._shape = None
        index_names = ['entity', 'time']
        # Unwrap an existing PanelData to its DataFrame representation
        if isinstance(x, PanelData):
            x = x.dataframe
        self._original = x

        # xarray is optional; only attempt conversion when it is installed
        if not isinstance(x, (Series, DataFrame, np.ndarray)):
            try:
                from xarray import DataArray
                if isinstance(x, DataArray):
                    if x.ndim not in (2, 3):
                        raise ValueError('Only 2-d or 3-d DataArrays are supported')
                    if x.ndim == 2:
                        x = x.to_pandas()
                    else:
                        # 3-d DataArray: dims are (items, major/time, minor/entity)
                        items = x.coords[x.dims[0]].values.tolist()
                        major = x.coords[x.dims[1]].values.tolist()
                        minor = x.coords[x.dims[2]].values.tolist()
                        values = x.values
                        x = panel_to_frame(values, items, major, minor, True)
            except ImportError:
                pass

        # A MultiIndex Series is treated as a single-column MI DataFrame
        if isinstance(x, Series) and isinstance(x.index, MultiIndex):
            x = DataFrame(x)
        elif isinstance(x, Series):
            raise ValueError('Series can only be used with a 2-level MultiIndex')

        if isinstance(x, DataFrame):
            # NOTE(review): duplicated isinstance test -- the inner check is
            # always True, so the swapaxes/to_frame branch below is
            # unreachable (it appears to be a leftover from pandas Panel
            # support).  Left as-is; confirm before removing.
            if isinstance(x, DataFrame):
                if isinstance(x.index, MultiIndex):
                    if len(x.index.levels) != 2:
                        raise ValueError('DataFrame input must have a '
                                         'MultiIndex with 2 levels')
                    # Preserve user-provided index level names when present
                    if isinstance(self._original, (DataFrame, PanelData, Series)):
                        for i in range(2):
                            index_names[i] = x.index.levels[i].name or index_names[i]
                    self._frame = x
                    if copy:
                        self._frame = self._frame.copy()
                else:
                    # Plain 2-d DataFrame: treat as (nobs, nentity) for a
                    # single variable; stacking the transpose yields the
                    # (entity, time) MultiIndex layout
                    self._frame = DataFrame({var_name: x.T.stack(dropna=False)})
            else:
                self._frame = x.swapaxes(1, 2).to_frame(filter_observations=False)
        elif isinstance(x, np.ndarray):
            if x.ndim not in (2, 3):
                raise ValueError('2 or 3-d array required for numpy input')
            if x.ndim == 2:
                # Promote (nobs, nentity) to (1, nobs, nentity)
                x = x[None, :, :]

            k, t, n = x.shape
            # Zero-pad generated names so they sort lexicographically
            var_str = var_name + '.{0:0>' + str(int(np.log10(k) + .01)) + '}'
            variables = [var_name] if k == 1 else [var_str.format(i) for i in range(k)]
            entity_str = 'entity.{0:0>' + str(int(np.log10(n) + .01)) + '}'
            entities = [entity_str.format(i) for i in range(n)]
            time = list(range(t))
            x = x.astype(np.float64, copy=False)
            panel = _Panel.from_array(x, items=variables, major_axis=time,
                                      minor_axis=entities)
            self._fake_panel = panel
            self._frame = panel.to_frame()
        else:
            raise TypeError('Only ndarrays, DataFrames or DataArrays are '
                            'supported')
        if convert_dummies:
            self._frame = expand_categoricals(self._frame, drop_first)
            self._frame = self._frame.astype(np.float64, copy=False)

        time_index = Series(self._frame.index.levels[1])
        if not (is_numeric_dtype(time_index.dtype) or
                is_datetime64_any_dtype(time_index.dtype)):
            raise ValueError('The index on the time dimension must be either '
                             'numeric or date-like')
        # self._k, self._t, self._n = self.panel.shape
        self._k, self._t, self._n = self.shape
        # Apply the resolved names directly to the index levels
        # NOTE(review): mutating levels in place; newer pandas may require
        # index.set_names instead -- confirm against supported versions.
        levels = self._frame.index.levels
        for i in range(2):
            levels[i].name = index_names[i]

    @property
    def panel(self):
        """pandas Panel view of data"""
        # Built lazily; invalidated by drop()
        if self._panel is None:
            self._panel = _Panel(self._frame)
        return self._panel

    @property
    def dataframe(self):
        """pandas DataFrame view of data"""
        return self._frame

    @property
    def values2d(self):
        """NumPy ndarray view of dataframe"""
        return np.asarray(self._frame)

    @property
    def values3d(self):
        """NumPy ndarray view of panel"""
        return self.panel.values

    def drop(self, locs):
        """
        Drop observations from the panel.

        Parameters
        ----------
        locs : ndarray
            Boolean array indicating observations to drop with reference to
            the dataframe view of the data
        """
        self._frame = self._frame.loc[~locs.ravel()]
        self._frame = self._minimize_multiindex(self._frame)
        # Reset panel and shape after a drop
        self._panel = self._shape = None
        self._k, self._t, self._n = self.shape

    @property
    def shape(self):
        """Shape of panel view of data"""
        if self._shape is None:
            k = self._frame.shape[1]
            index = self._frame.index
            # Count distinct time periods and entities actually present
            t = index.get_level_values(1).unique().shape[0]
            n = index.get_level_values(0).unique().shape[0]
            self._shape = k, t, n
        return self._shape

    @property
    def ndim(self):
        """Number of dimensions of panel view of data"""
        return 3

    @property
    def isnull(self):
        """Locations with missing observations"""
        # True for any row of the dataframe view containing a NaN
        return np.any(self._frame.isnull(), axis=1)

    @property
    def nobs(self):
        """Number of time observations"""
        return self._t

    @property
    def nvar(self):
        """Number of variables"""
        return self._k

    @property
    def nentity(self):
        """Number of entities"""
        return self._n

    @property
    def vars(self):
        """List of variable names"""
        return list(self._frame.columns)

    @property
    def time(self):
        """List of time index names"""
        index = self._frame.index
        # Expand codes to values, then de-duplicate preserving order
        return list(index.levels[1][get_codes(index)[1]].unique())

    @property
    def entities(self):
        """List of entity index names"""
        index = self._frame.index
        return list(index.levels[0][get_codes(index)[0]].unique())

    @property
    def entity_ids(self):
        """
        Get array containing entity group membership information

        Returns
        -------
        id : ndarray
            2d array containing entity ids corresponding dataframe view
        """
        return np.asarray(get_codes(self._frame.index)[0])[:, None]

    @property
    def time_ids(self):
        """
        Get array containing time membership information

        Returns
        -------
        id : ndarray
            2d array containing time ids corresponding dataframe view
        """
        return np.asarray(get_codes(self._frame.index)[1])[:, None]

    def _demean_both_low_mem(self, weights):
        # Two-way demeaning via iterative group demeaning; avoids building
        # the dummy-variable matrix used by _demean_both
        groups = PanelData(DataFrame(np.c_[self.entity_ids, self.time_ids],
                                     index=self._frame.index),
                           convert_dummies=False,
                           copy=False)
        return self.general_demean(groups, weights=weights)

    def _demean_both(self, weights):
        """
        Entity and time demean

        Parameters
        ----------
        weights : PanelData, optional
            Weights to use in demeaning
        """
        # Demean over the larger dimension; absorb the smaller one with
        # dummies to keep the regression below small
        if self.nentity > self.nobs:
            group = 'entity'
            dummy = 'time'
        else:
            group = 'time'
            dummy = 'entity'
        e = self.demean(group, weights=weights)
        d = self.dummies(dummy, drop_first=True)
        d.index = e.index
        d = PanelData(d).demean(group, weights=weights)
        d = d.values2d
        e = e.values2d
        # Partial out the demeaned dummies via least squares
        resid = e - d @ lstsq(d, e)[0]
        resid = DataFrame(resid, index=self._frame.index,
                          columns=self._frame.columns)
        return PanelData(resid)

    def general_demean(self, groups, weights=None):
        """
        Multi-way demeaning using only groupby

        Parameters
        ----------
        groups : PanelData
            Arrays with the same size containing group identifiers
        weights : PanelData, optional
            Weights to use in the weighted demeaning

        Returns
        -------
        demeaned : PanelData
            Weighted, demeaned data according to groups

        Notes
        -----
        Iterates until convergence
        """
        if not isinstance(groups, PanelData):
            groups = PanelData(groups)
        if weights is None:
            # Unit weights reduce to simple group means
            weights = PanelData(DataFrame(np.ones((self._frame.shape[0], 1)),
                                          index=self.index,
                                          columns=['weights']))
        weights = weights.values2d
        groups = groups.values2d.astype(np.int64, copy=False)

        # Cache per-level weight sums across passes -- they never change
        weight_sum = {}

        def weighted_group_mean(df, weights, root_w, level):
            # Weighted mean broadcast back to the original rows
            num = (root_w * df).groupby(level=level).transform('sum')
            if level in weight_sum:
                denom = weight_sum[level]
            else:
                denom = weights.groupby(level=level).transform('sum')
                weight_sum[level] = denom
            return np.asarray(num) / np.asarray(denom)

        def demean_pass(frame, weights, root_w):
            # One sweep removing each level's weighted group mean in turn
            levels = groups.shape[1]
            for level in range(levels):
                mu = weighted_group_mean(frame, weights, root_w, level)
                if level == 0:
                    frame = frame - root_w * mu
                else:
                    frame -= root_w * mu
            return frame

        # Swap out the index for better performance
        init_index = DataFrame(groups)
        init_index.set_index(list(init_index.columns), inplace=True)

        root_w = np.sqrt(weights)
        weights = DataFrame(weights, index=init_index.index)
        wframe = root_w * self._frame
        wframe.index = init_index.index

        previous = wframe
        current = demean_pass(previous, weights, root_w)
        # Single group: one pass is exact, no iteration needed
        if groups.shape[1] == 1:
            current.index = self._frame.index
            return PanelData(current)

        # Per-column convergence scale; constant or near-constant columns
        # get a floor so the relative test below is well defined
        exclude = np.ptp(np.asarray(self._frame), 0) == 0
        max_rmse = np.sqrt(np.asarray(self._frame).var(0).max())
        scale = np.asarray(self._frame.std())
        exclude = exclude | (scale < 1e-14 * max_rmse)
        replacement = np.maximum(scale, 1)
        scale[exclude] = replacement[exclude]
        scale = scale[None, :]
        # Alternate demeaning passes until the largest scaled change is tiny
        while np.max(np.abs(np.asarray(current) - np.asarray(previous)) / scale) > 1e-8:
            previous = current
            current = demean_pass(previous, weights, root_w)
        current.index = self._frame.index

        return PanelData(current)

    def demean(self, group='entity', weights=None, return_panel=True, low_memory=False):
        """
        Demeans data by either entity or time group

        Parameters
        ----------
        group : {'entity', 'time', 'both'}
            Group to use in demeaning
        weights : PanelData, optional
            Weights to implement weighted averaging
        return_panel : bool
            Flag indicating to return a PanelData object. If False, a 2-d
            NumPy representation of the panel is returned
        low_memory : bool
            Flag indicating whether to use a low memory implementation
            that avoids constructing dummy variables. Only relevant when
            group is 'both'

        Returns
        -------
        demeaned : PanelData
            Demeaned data according to type

        Notes
        -----
        If weights are provided, the values returned will be scaled by
        the square root of the weights so that they can be used in WLS
        estimation.
        """
        if group not in ('entity', 'time', 'both'):
            raise ValueError
        if group == 'both':
            if not low_memory:
                return self._demean_both(weights)
            else:
                return self._demean_both_low_mem(weights)

        # MultiIndex level 0 is entity, level 1 is time
        level = 0 if group == 'entity' else 1
        if weights is None:
            group_mu = self._frame.groupby(level=level).transform('mean')
            out = self._frame - group_mu
            if not return_panel:
                return np.asarray(out)
            return PanelData(out)
        else:
            w = weights.values2d
            frame = self._frame.copy()
            frame = w * frame
            weighted_sum = frame.groupby(level=level).transform('sum')
            # Reuse the frame to compute group weight sums with the same index
            frame.iloc[:, :] = w
            sum_weights = frame.groupby(level=level).transform('sum')
            group_mu = weighted_sum / sum_weights
            # sqrt(w) scaling makes the output directly usable in WLS
            out = np.sqrt(w) * (self._frame - group_mu)
            if not return_panel:
                return np.asarray(out)
            return PanelData(out)

    def __str__(self):
        return self.__class__.__name__ + '\n' + str(self._frame)

    def __repr__(self):
        return self.__str__() + '\n' + self.__class__.__name__ + ' object, id: ' + hex(id(self))

    def _repr_html_(self):
        # Rich display hook for Jupyter notebooks
        return self.__class__.__name__ + '<br/>' + self._frame._repr_html_()

    def count(self, group='entity'):
        """
        Count number of observations by entity or time

        Parameters
        ----------
        group : {'entity', 'time'}
            Group to count

        Returns
        -------
        count : DataFrame
            Counts according to type. Either (entity by var) or (time by var)
        """
        level = 0 if group == 'entity' else 1
        reindex = self.entities if group == 'entity' else self.time
        out = self._frame.groupby(level=level).count()
        # Reindex so rows follow the original appearance order
        return out.reindex(reindex)

    @property
    def index(self):
        """Return the index of the multi-index dataframe view"""
        return self._frame.index

    def copy(self):
        """Return a deep copy"""
        return PanelData(self._frame.copy(), var_name=self._var_name,
                         convert_dummies=self._convert_dummies, drop_first=self._drop_first)

    def mean(self, group='entity', weights=None):
        """
        Compute data mean by either entity or time group

        Parameters
        ----------
        group : {'entity', 'time'}
            Group to use in demeaning
        weights : PanelData, optional
            Weights to implement weighted averaging

        Returns
        -------
        mean : DataFrame
            Data mean according to type. Either (entity by var) or (time by var)
        """
        level = 0 if group == 'entity' else 1
        if weights is None:
            mu = self._frame.groupby(level=level).mean()
        else:
            w = weights.values2d
            frame = self._frame.copy()
            frame = w * frame
            weighted_sum = frame.groupby(level=level).sum()
            frame.iloc[:, :] = w
            sum_weights = frame.groupby(level=level).sum()
            mu = weighted_sum / sum_weights
        reindex = self.entities if group == 'entity' else self.time
        out = mu.reindex(reindex)
        return out

    def first_difference(self):
        """
        Compute first differences of variables

        Returns
        -------
        diffs : PanelData
            Differenced values
        """
        # Difference along the time axis of the 3-d view, so the first
        # period of each entity has no difference
        diffs = self.panel.values
        diffs = diffs[:, 1:] - diffs[:, :-1]
        diffs = panel_to_frame(diffs, self.panel.items, self.panel.major_axis[1:],
                               self.panel.minor_axis, True)
        # Drop rows introduced by balancing and any missing differences
        diffs = diffs.reindex(self._frame.index).dropna(how='any')
        return PanelData(diffs)

    @staticmethod
    def _minimize_multiindex(df):
        # Round-trip the MultiIndex through columns to drop unused index
        # categories after rows have been removed; temporary unique column
        # names avoid collisions with existing data columns
        index_cols = list(df.index.names)
        orig_names = index_cols[:]
        for i, col in enumerate(index_cols):
            col = ensure_unique_column(col, df)
            index_cols[i] = col
        df.index.names = index_cols
        df = df.reset_index()
        df = df.set_index(index_cols)
        df.index.names = orig_names
        return df

    def dummies(self, group='entity', drop_first=False):
        """
        Generate entity or time dummies

        Parameters
        ----------
        group : {'entity', 'time'}, optional
            Type of dummies to generate
        drop_first : bool, optional
            Flag indicating that the dummy column corresponding to the first
            entity or time period should be dropped

        Returns
        -------
        dummies : DataFrame
            Dummy variables
        """
        if group not in ('entity', 'time'):
            raise ValueError
        axis = 0 if group == 'entity' else 1
        labels = get_codes(self._frame.index)
        levels = self._frame.index.levels
        cat = Categorical(levels[axis][labels[axis]])
        dummies = get_dummies(cat, drop_first=drop_first)
        # Reorder columns to match appearance order; filter handles the
        # column removed when drop_first is True
        cols = self.entities if group == 'entity' else self.time
        return dummies[[c for c in cols if c in dummies]].astype(np.float64, copy=False)