forked from fastai/fastai
-
Notifications
You must be signed in to change notification settings - Fork 0
/
data.py
177 lines (158 loc) · 9.06 KB
/
data.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
"Data loading pipeline for structured data support. Loads from pandas DataFrame"
from ..torch_core import *
from .transform import *
from ..basic_data import *
from ..data_block import *
from ..basic_train import *
from .models import *
from pandas.api.types import is_numeric_dtype, is_categorical_dtype
__all__ = ['TabularDataBunch', 'TabularLine', 'TabularList', 'TabularProcessor', 'tabular_learner']
OptTabTfms = Optional[Collection[TabularProc]]
def emb_sz_rule(n_cat:int)->int:
    "Rule of thumb to pick the embedding size for a categorical variable with `n_cat` levels."
    # Empirical heuristic (replaced the older `min(50, (n_cat//2)+1)` rule):
    # grow with the 0.56th power of the cardinality, capped at 600.
    return min(600, round(1.6 * n_cat**0.56))
def def_emb_sz(classes, n, sz_dict=None):
    "Pick an embedding size for `n` depending on `classes` if not given in `sz_dict`."
    sizes = ifnone(sz_dict, {})
    cardinality = len(classes[n])
    # An explicit entry in `sz_dict` wins; otherwise fall back to the heuristic.
    if n in sizes: width = sizes[n]
    else:          width = int(emb_sz_rule(cardinality))
    return cardinality, width
class TabularLine(ItemBase):
    "Basic item for tabular data."
    def __init__(self, cats, conts, classes, names):
        self.cats    = cats
        self.conts   = conts
        self.classes = classes
        self.names   = names
        # What the model consumes: one tensor of categorical codes, one of continuous values.
        self.data = [tensor(cats), tensor(conts)]
    def __str__(self):
        "Render each column as `name value; ` — categories mapped back to labels, floats to 4 decimals."
        n_cat = len(self.cats)
        pieces  = [f"{name} {self.classes[name][code]}; " for code, name in zip(self.cats, self.names[:n_cat])]
        pieces += [f'{name} {val:.4f}; ' for val, name in zip(self.conts, self.names[n_cat:])]
        return ''.join(pieces)
class TabularProcessor(PreProcessor):
    "Regroup the `procs` in one `PreProcessor`."
    def __init__(self, ds:ItemBase=None, procs=None):
        # Default to the procs stored on the dataset when none are passed explicitly.
        procs = ifnone(procs, ds.procs if ds is not None else None)
        self.procs = listify(procs)
    def process_one(self, item):
        # Apply all procs to a single row and return it as a `TabularLine`.
        # Requires `process` to have run first: `self.cat_names`/`self.cont_names`
        # are only set there.
        # NOTE(review): the row is duplicated to build the DataFrame and only
        # row 0 of the result is used below — presumably the procs need more
        # than one row to behave; confirm.
        df = pd.DataFrame([item,item])
        for proc in self.procs: proc(df, test=True)
        if len(self.cat_names) != 0:
            # Categorical columns -> integer codes, shifted by +1 so code 0 maps
            # to the '#na#' entry prepended to `classes` in `process`.
            codes = np.stack([c.cat.codes.values for n,c in df[self.cat_names].items()], 1).astype(np.int64) + 1
        else: codes = [[]]
        if len(self.cont_names) != 0:
            conts = np.stack([c.astype('float32').values for n,c in df[self.cont_names].items()], 1)
        else: conts = [[]]
        classes = None
        col_names = list(df[self.cat_names].columns.values) + list(df[self.cont_names].columns.values)
        # Only the first of the two duplicated rows is returned.
        return TabularLine(codes[0], conts[0], classes, col_names)
    def process(self, ds):
        # Apply all procs to `ds.inner_df`, then cache codes/conts/classes on `ds`.
        # A dataset without an inner DataFrame just inherits the state computed
        # on the dataset this processor was fitted to.
        if ds.inner_df is None:
            ds.classes,ds.cat_names,ds.cont_names = self.classes,self.cat_names,self.cont_names
            ds.preprocessed = True
            return
        for i,proc in enumerate(self.procs):
            # An already-instantiated proc is applied in test mode (reusing its
            # fitted state); otherwise instantiate it with the current column
            # names, fit-and-apply it, and keep the instance for later datasets.
            if isinstance(proc, TabularProc): proc(ds.inner_df, test=True)
            else:
                #cat and cont names may have been changed by transform (like Fill_NA)
                proc = proc(ds.cat_names, ds.cont_names)
                proc(ds.inner_df)
                ds.cat_names,ds.cont_names = proc.cat_names,proc.cont_names
                self.procs[i] = proc
        self.cat_names,self.cont_names = ds.cat_names,ds.cont_names
        if len(ds.cat_names) != 0:
            # +1 shift reserves code 0 for the '#na#' class prepended just below.
            ds.codes = np.stack([c.cat.codes.values for n,c in ds.inner_df[ds.cat_names].items()], 1).astype(np.int64) + 1
            self.classes = ds.classes = OrderedDict({n:np.concatenate([['#na#'],c.cat.categories.values])
                                                  for n,c in ds.inner_df[ds.cat_names].items()})
            cat_cols = list(ds.inner_df[ds.cat_names].columns.values)
        else: ds.codes,ds.classes,self.classes,cat_cols = None,None,None,[]
        if len(ds.cont_names) != 0:
            ds.conts = np.stack([c.astype('float32').values for n,c in ds.inner_df[ds.cont_names].items()], 1)
            cont_cols = list(ds.inner_df[ds.cont_names].columns.values)
        else: ds.conts,cont_cols = None,[]
        # Column order everywhere downstream: all categoricals, then all continuous.
        ds.col_names = cat_cols + cont_cols
        ds.preprocessed = True
class TabularDataBunch(DataBunch):
    "Create a `DataBunch` suitable for tabular data."
    @classmethod
    def from_df(cls, path, df:DataFrame, dep_var:str, valid_idx:Collection[int], procs:OptTabTfms=None,
                cat_names:OptStrList=None, cont_names:OptStrList=None, classes:Collection=None,
                test_df=None, bs:int=64, val_bs:int=None, num_workers:int=defaults.cpus, dl_tfms:Optional[Collection[Callable]]=None,
                device:torch.device=None, collate_fn:Callable=data_collate, no_check:bool=False)->DataBunch:
        "Create a `DataBunch` from `df` and `valid_idx` with `dep_var`. `kwargs` are passed to `DataBunch.create`."
        # NOTE(review): `dl_tfms` is accepted but never used in this body — confirm intent.
        # Copy so the procs cannot mutate the caller's list.
        cat_names = ifnone(cat_names, []).copy()
        # Default: every column that is neither categorical nor the target is continuous.
        cont_names = ifnone(cont_names, list(set(df)-set(cat_names)-{dep_var}))
        procs = listify(procs)
        src = (TabularList.from_df(df, path=path, cat_names=cat_names, cont_names=cont_names, procs=procs)
                           .split_by_idx(valid_idx))
        src = src.label_from_df(cols=dep_var) if classes is None else src.label_from_df(cols=dep_var, classes=classes)
        # The test set reuses the processor fitted on the training split, so the
        # same encodings/statistics are applied to it.
        if test_df is not None: src.add_test(TabularList.from_df(test_df, cat_names=cat_names, cont_names=cont_names,
                                                                 processor = src.train.x.processor))
        return src.databunch(path=path, bs=bs, val_bs=val_bs, num_workers=num_workers, device=device,
                             collate_fn=collate_fn, no_check=no_check)
class TabularList(ItemList):
    "Basic `ItemList` for tabular data."
    _item_cls=TabularLine
    _processor=TabularProcessor
    _bunch=TabularDataBunch
    def __init__(self, items:Iterator, cat_names:OptStrList=None, cont_names:OptStrList=None,
                 procs=None, **kwargs)->'TabularList':
        super().__init__(range_of(items), **kwargs)
        #dataframe is in inner_df, items is just a range of index
        if cat_names is None: cat_names = []
        if cont_names is None: cont_names = []
        self.cat_names,self.cont_names,self.procs = cat_names,cont_names,procs
        # Make these attributes survive `new`/copy operations on the list.
        self.copy_new += ['cat_names', 'cont_names', 'procs']
        # Flipped to True by `TabularProcessor.process` once codes/conts are cached.
        self.preprocessed = False
    @classmethod
    def from_df(cls, df:DataFrame, cat_names:OptStrList=None, cont_names:OptStrList=None, procs=None, **kwargs)->'ItemList':
        "Get the list of inputs in the `col` of `path/csv_name`."
        # The DataFrame is copied so the procs can mutate it without touching the caller's.
        return cls(items=range(len(df)), cat_names=cat_names, cont_names=cont_names, procs=procs, inner_df=df.copy(), **kwargs)
    def get(self, o):
        # Before preprocessing: return the raw DataFrame row (or the raw index when
        # there is no inner_df). After: build a TabularLine from the cached arrays.
        if not self.preprocessed: return self.inner_df.iloc[o] if hasattr(self, 'inner_df') else self.items[o]
        codes = [] if self.codes is None else self.codes[o]
        conts = [] if self.conts is None else self.conts[o]
        return self._item_cls(codes, conts, self.classes, self.col_names)
    def get_emb_szs(self, sz_dict=None):
        "Return the default embedding sizes suitable for this data or takes the ones in `sz_dict`."
        return [def_emb_sz(self.classes, n, sz_dict) for n in self.cat_names]
    def reconstruct(self, t:Tensor):
        # `t` is the (codes, conts) pair produced by `TabularLine.data`.
        return self._item_cls(t[0], t[1], self.classes, self.col_names)
    def show_xys(self, xs, ys)->None:
        "Show the `xs` (inputs) and `ys` (targets)."
        from IPython.display import display, HTML
        items,names = [], xs[0].names + ['target']
        for i, (x,y) in enumerate(zip(xs,ys)):
            res = []
            # Fall back to empty when the tensor is 0-dim (no cats/conts for this item).
            cats = x.cats if len(x.cats.size()) > 0 else []
            conts = x.conts if len(x.conts.size()) > 0 else []
            # Map category codes back to their string labels for display.
            for c, n in zip(cats, x.names[:len(cats)]):
                res.append(x.classes[n][c])
            res += [f'{c:.4f}' for c in conts] + [y]
            items.append(res)
        items = np.array(items)
        df = pd.DataFrame({n:items[:,i] for i,n in enumerate(names)}, columns=names)
        # NOTE(review): max_colwidth=-1 is deprecated in pandas >= 1.0 (use None instead).
        with pd.option_context('display.max_colwidth', -1):
            display(HTML(df.to_html(index=False)))
    def show_xyzs(self, xs, ys, zs):
        "Show `xs` (inputs), `ys` (targets) and `zs` (predictions)."
        from IPython.display import display, HTML
        items,names = [], xs[0].names + ['target', 'prediction']
        for i, (x,y,z) in enumerate(zip(xs,ys,zs)):
            res = []
            # Same rendering as `show_xys`, with the prediction appended.
            cats = x.cats if len(x.cats.size()) > 0 else []
            conts = x.conts if len(x.conts.size()) > 0 else []
            for c, n in zip(cats, x.names[:len(cats)]):
                res.append(str(x.classes[n][c]))
            res += [f'{c:.4f}' for c in conts] + [y, z]
            items.append(res)
        items = np.array(items)
        df = pd.DataFrame({n:items[:,i] for i,n in enumerate(names)}, columns=names)
        # NOTE(review): max_colwidth=-1 is deprecated in pandas >= 1.0 (use None instead).
        with pd.option_context('display.max_colwidth', -1):
            display(HTML(df.to_html(index=False)))
def tabular_learner(data:DataBunch, layers:Collection[int], emb_szs:Dict[str,int]=None, metrics=None,
        ps:Collection[float]=None, emb_drop:float=0., y_range:OptRange=None, use_bn:bool=True, **learn_kwargs):
    "Get a `Learner` using `data`, with `metrics`, including a `TabularModel` created using the remaining params."
    # Resolve embedding sizes from the data, honoring any user-provided overrides.
    sizes = data.get_emb_szs(ifnone(emb_szs, {}))
    tab_model = TabularModel(sizes, len(data.cont_names), out_sz=data.c, layers=layers,
                             ps=ps, emb_drop=emb_drop, y_range=y_range, use_bn=use_bn)
    return Learner(data, tab_model, metrics=metrics, **learn_kwargs)