forked from huggingface/datasets
-
Notifications
You must be signed in to change notification settings - Fork 0
/
pandas.py
35 lines (29 loc) · 1.33 KB
/
pandas.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
# coding=utf-8
import pandas as pd
import pyarrow as pa
import datasets
class Pandas(datasets.ArrowBasedBuilder):
    """Arrow-based builder that loads pickled pandas objects from data files.

    Each input file is read with ``pandas.read_pickle`` and converted with
    ``pyarrow.Table.from_pandas``; one table is produced per file.
    """

    def _info(self):
        """Return a default-constructed ``DatasetInfo`` (nothing is declared here)."""
        return datasets.DatasetInfo()

    def _split_generators(self, dl_manager):
        """Build the split generators from ``self.config.data_files``.

        We handle string, list and dicts in datafiles:

        * a single path, or a list/tuple of paths, becomes one ``train`` split;
        * a dict becomes one split per key. The ``train``, ``validation`` and
          ``test`` keys come first (in that order), followed by any other
          split names in the dict's own order.

        Args:
            dl_manager: Object whose ``download_and_extract`` is applied to
                ``self.config.data_files``; the type checks below are made on
                its return value.

        Returns:
            A list of ``datasets.SplitGenerator``, each carrying its files in
            ``gen_kwargs["files"]``.

        Raises:
            ValueError: If ``self.config.data_files`` is empty or unset.
        """
        if not self.config.data_files:
            raise ValueError(f"At least one data file must be specified, but got data_files={self.config.data_files}")
        data_files = dl_manager.download_and_extract(self.config.data_files)

        if isinstance(data_files, (str, list, tuple)):
            files = [data_files] if isinstance(data_files, str) else data_files
            return [datasets.SplitGenerator(name=datasets.Split.TRAIN, gen_kwargs={"files": files})]

        standard_splits = [datasets.Split.TRAIN, datasets.Split.VALIDATION, datasets.Split.TEST]
        standard_names = {str(split) for split in standard_splits}
        # Standard splits keep their historical order; custom split names used
        # to be silently dropped and are now appended after them.
        split_names = [split for split in standard_splits if split in data_files]
        split_names.extend(key for key in data_files if str(key) not in standard_names)

        splits = []
        for split_name in split_names:
            files = data_files[split_name]
            if isinstance(files, str):
                files = [files]
            splits.append(datasets.SplitGenerator(name=split_name, gen_kwargs={"files": files}))
        return splits

    def _generate_tables(self, files):
        """Yield one ``(index, pyarrow.Table)`` pair per entry of ``files``.

        Args:
            files: Iterable of pickle file locations, as placed in
                ``gen_kwargs["files"]`` by ``_split_generators``.

        Yields:
            Tuples ``(i, pa_table)`` where ``i`` is the position of the file in
            ``files`` and ``pa_table`` is the table built from its content.
        """
        for i, file in enumerate(files):
            # NOTE(security): ``pd.read_pickle`` unpickles the file, and
            # unpickling can execute arbitrary code. Only load data files that
            # come from a trusted source.
            pa_table = pa.Table.from_pandas(pd.read_pickle(file))
            yield i, pa_table