/
data.py
104 lines (93 loc) · 2.91 KB
/
data.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
from .. import utils as U
from ..imports import *
from . import preprocessor as pp
def tabular_from_df(
    train_df,
    label_columns=[],
    date_columns=[],
    val_df=None,
    val_pct=0.1,
    is_regression=False,
    max_card=20,
    random_state=None,
    verbose=1,
):
    """
    ```
    Builds preprocessed training and validation data from a DataFrame.

    Args:
      train_df: DataFrame holding both predictor and label columns.
                A copy is taken, so this function does not assign to the
                caller's frame directly.
      label_columns: name of the label column, or a list/array of names.
                     Required. A one-element list/array is reduced to its
                     single element before being handed to the preprocessor.
      date_columns: forwarded to pp.TabularPreprocessor as date_columns.
      val_df: optional DataFrame to use for validation. When None, rows are
              randomly held out of train_df according to val_pct.
      val_pct: probability with which each row is assigned to validation
               when val_df is None (rows are drawn independently, so the
               realized fraction only approximates this value).
               A falsy value disables the split; val is then None.
      is_regression: forwarded to pp.TabularPreprocessor.
      max_card: forwarded to pp.TabularPreprocessor.
      random_state: if not None, seeds NumPy's global random generator
                    before the split so that the split is reproducible.
      verbose: forwarded to the preprocessing calls; when truthy, also
               prints which integer columns are treated as categorical.

    Returns:
      tuple (trn, val, preproc): preprocessed training data, preprocessed
      validation data (or None), and the fitted pp.TabularPreprocessor.

    Raises:
      ValueError: if label_columns is None or an empty list/array.
    ```
    """
    # Validate the cheap argument first so that an invalid call fails before
    # any DataFrame is copied or cleaned.
    if label_columns is None or (
        isinstance(label_columns, (list, np.ndarray)) and len(label_columns) == 0
    ):
        raise ValueError("label_columns is required")
    if isinstance(label_columns, (list, np.ndarray)) and len(label_columns) == 1:
        label_columns = label_columns[0]

    # Work on copies of both frames. val_df is copied *before* cleaning so
    # that whatever clean_df does to the frames it is given is applied to our
    # copy rather than to the caller's object (train_df is treated the same).
    train_df = train_df.copy()
    if val_df is not None:
        val_df = val_df.copy()

    # strip space from string columns and check supplied val_df
    train_type_dict = pp.clean_df(train_df, val_df=val_df, return_types=True)

    # define original predictor_columns
    # A bare string must be wrapped before the membership test: `col in "abc"`
    # is a substring check, which would wrongly drop a predictor whose name is
    # contained in the label name (and raise TypeError for non-string names).
    label_names = [label_columns] if isinstance(label_columns, str) else label_columns
    predictor_columns = [
        col for col in train_df.columns.values if col not in label_names
    ]

    # create validation set
    if val_df is None and val_pct:
        if random_state is not None:
            # honor the caller's seed (previously a fixed seed was used)
            np.random.seed(random_state)
        msk = np.random.rand(len(train_df)) < 1 - val_pct
        train_df, val_df = train_df[msk], train_df[~msk]

    procs = [pp.FillMissing, pp.Categorify, pp.Normalize]
    preproc = pp.TabularPreprocessor(
        predictor_columns,
        label_columns,
        date_columns=date_columns,
        is_regression=is_regression,
        procs=procs,
        max_card=max_card,
    )
    trn = preproc.preprocess_train(train_df, verbose=verbose)

    if verbose:
        # Integer-typed columns that ended up categorical are easy to miss,
        # so tell the user how to override that.
        integer_cats = [
            col for col in preproc.cat_names if train_type_dict.get(col) == "integer"
        ]
        if integer_cats:
            print(
                f"\nThe following integer column(s) are being treated as categorical variables:\n{integer_cats}\n"
                + "To treat any of these column(s) as numerical, cast the column to float in DataFrame or CSV\n and re-run tabular_from* function.\n"
            )

    val = None if val_df is None else preproc.preprocess_test(val_df, verbose=verbose)
    return (trn, val, preproc)
def tabular_from_csv(
    train_csv,
    label_columns=[],
    date_columns=[],
    val_csv=None,
    val_pct=0.1,
    index_col=None,
    is_regression=False,
    max_card=20,
    random_state=None,
    verbose=1,
):
    """
    ```
    Loads tabular data from CSV file(s) and delegates to tabular_from_df.

    Args:
      train_csv: path (or anything pd.read_csv accepts) of the training CSV.
      label_columns: forwarded to tabular_from_df.
      date_columns: forwarded to tabular_from_df.
      val_csv: optional validation CSV; when None no validation frame is
               read and val_df=None is passed to tabular_from_df.
      val_pct: forwarded to tabular_from_df.
      index_col: passed to pd.read_csv as index_col for both CSV files.
      is_regression: forwarded to tabular_from_df.
      max_card: forwarded to tabular_from_df.
      random_state: forwarded to tabular_from_df.
      verbose: forwarded to tabular_from_df; the default matches that
               function's own default.

    Returns:
      whatever tabular_from_df returns for the loaded frames.
    ```
    """
    # read in dataset
    train_df = pd.read_csv(train_csv, index_col=index_col)
    val_df = None if val_csv is None else pd.read_csv(val_csv, index_col=index_col)

    return tabular_from_df(
        train_df,
        label_columns=label_columns,
        date_columns=date_columns,
        val_df=val_df,
        val_pct=val_pct,
        is_regression=is_regression,
        max_card=max_card,
        random_state=random_state,
        verbose=verbose,
    )