In [None]:
# Enable IPython's autoreload extension in mode 2 so that imported modules are
# reloaded before each cell executes; edits to the project sources then take
# effect without restarting the kernel.
%load_ext autoreload
%autoreload 2

In [None]:
import numpy as np
import pandas as pd

from authorship_tool.types_ import Para2dStr, Tag
from authorship_tool.util.feature.dataset_generator import (
    ParagraphFeatureDatasetGenerator,
)
from authorship_tool.util.feature.pos import PosFeature
from dataclasses import dataclass
from authorship_tool.util.path_util import BasePaths

In [None]:
# Route NumPy divide-by-zero events to the callback registered with
# np.seterrcall instead of emitting the default warning.
# NOTE(review): no np.seterrcall call is visible in this notebook -- presumably
# the handler is registered inside authorship_tool; confirm.
np.seterr(divide="call")

# Lift pandas' display limits so wide/long frames and Series print in full.
pd.options.display.max_columns = None
pd.options.display.max_rows = None

In [None]:
@dataclass(frozen=True, init=False)
class ManualDatasetPath(BasePaths):
    """Path settings for the manually assembled dataset built in this notebook.

    Subclasses ``BasePaths`` and declares a single field, ``basename``. The
    dataclass is created with ``init=False`` and the cells shown here never
    instantiate it: paths are read from the class itself, e.g.
    ``ManualDatasetPath.dataset_output_dir``.
    """

    # Presumably the name BasePaths uses to derive this dataset's
    # directories/files -- TODO confirm against BasePaths.
    basename: str = "manual"


# Invoked on the class, not an instance. Presumably this populates the path
# attributes (such as dataset_output_dir) -- verify in BasePaths.init_paths.
ManualDatasetPath.init_paths()

In [None]:
# Paragraphs that are paired with the answer np.bool_(True) further down.
# Left empty here; presumably meant to be filled in by hand before the
# following cells are run -- TODO confirm.
paras_a: list[Para2dStr] = []

In [None]:
# Paragraphs that are paired with the answer np.bool_(False) further down.
# Left empty here; presumably meant to be filled in by hand before the
# following cells are run -- TODO confirm.
paras_b: list[Para2dStr] = []

In [None]:
# Pool both paragraph groups, then collect the POS tags reported for the pooled
# text; these tags are handed to the dataset generator afterwards.
all_paras: list[Para2dStr] = [*paras_a, *paras_b]
all_pos: tuple[Tag, ...] = (
    PosFeature(all_paras)
    .tag_subcategories()
    .all_pos
)
print(all_pos)

In [None]:
# Build the feature generator with the POS tags gathered from all paragraphs,
# so every paragraph is processed against the same tag set.
dataset_generator = ParagraphFeatureDatasetGenerator(tags=all_pos)

In [None]:
# Attach the answer label to every paragraph: True for paras_a, False for
# paras_b. The paras_a entries come first, followed by the paras_b entries.
para_ans_pairs: tuple[tuple[Para2dStr, np.bool_], ...] = (
    *((paragraph, np.bool_(True)) for paragraph in paras_a),
    *((paragraph, np.bool_(False)) for paragraph in paras_b),
)

In [None]:
# Run the generator once per (paragraph, answer) pair, keeping the pair order.
dataset_tuple = tuple(
    dataset_generator.generate_from_paragraph(*para_and_answer)
    for para_and_answer in para_ans_pairs
)

In [None]:
# Lay the per-paragraph results side by side as columns, discard the original
# row labels, then transpose so the paragraph axis becomes the row axis.
# NOTE(review): assumes each element of dataset_tuple is a pandas Series -- confirm.
datasets: pd.DataFrame = (
    pd.concat(dataset_tuple, axis=1)
    .reset_index(drop=True)
    .transpose()
)

In [None]:
# Name the columns after the generator's feature columns plus the trailing
# "answer" label column.
datasets.columns = (*dataset_generator.columns, "answer")

# Cast every column to its declared dtype (bool for the answer column).
# strict=True makes zip raise if the number of columns and dtypes disagree.
for column_name, target_dtype in zip(
    datasets.columns,
    [*dataset_generator.dtypes, bool],
    strict=True,
):
    datasets[column_name] = datasets[column_name].astype(target_dtype)

In [None]:
# Preview the first 10 rows using the notebook's rich table rendering.
display(datasets.head(10))

In [None]:
# Sanity check: (row count, column count) of the assembled dataset.
print(datasets.shape)

In [None]:
# Sanity check: per-column dtypes after the cast above.
print(datasets.dtypes)

In [None]:
# Sanity check: number of missing values in each column.
print(datasets.isna().sum())

In [None]:
# Write the finished table to dataset.csv under the configured output directory.
# The DataFrame index is written as well, since to_csv's index option is left
# at its default.
datasets.to_csv(
    path_or_buf=ManualDatasetPath.dataset_output_dir.joinpath("dataset.csv"),
)