In [16]:
import nltk

nltk.download("gutenberg")
nltk.download("inaugural")

[nltk_data] Downloading package gutenberg to /home/vscode/nltk_data...
[nltk_data]   Package gutenberg is already up-to-date!
[nltk_data] Downloading package inaugural to /home/vscode/nltk_data...
[nltk_data]   Package inaugural is already up-to-date!


True

In [17]:
from typing import Final, assert_type

import numpy as np
import pandas as pd
from authorship_tool.types import Para2dStr, Tag
from authorship_tool.util.feature.dataset_generator import (
    ParagraphFeatureDatasetGenerator,
)
from authorship_tool.util.feature.pos import PosFeature
from authorship_tool.util.path_util import PathUtil
from nltk.corpus import gutenberg, inaugural

In [18]:
np.seterr(divide="call")
pd.set_option("display.max_columns", None)
pd.set_option("display.max_rows", None)

In [None]:
dataset_dir = PathUtil.DATASET_DIR.joinpath("manual")
dataset_dir.mkdir(parents=True, exist_ok=True)

In [19]:
# コーパスからパラグラフを取得
# Obtaining paragraphs from corpus
AUTHOR_A: Final[str] = "chesterton"
AUTHOR_B: Final[str] = "bryant"

articles_a: list[list[Para2dStr]] = [
    gutenberg.paras(fileids=file_id)
    for file_id in gutenberg.fileids()
    if AUTHOR_A in file_id
]
assert_type(articles_a, list[list[Para2dStr]])

paras_a: list[Para2dStr] = [para for article in articles_a for para in article]
assert_type(paras_a, list[Para2dStr])

[[['[',
   'The',
   'Ball',
   'and',
   'The',
   'Cross',
   'by',
   'G',
   '.',
   'K',
   '.',
   'Chesterton',
   '1909',
   ']']],
 [['I', '.'], ['A', 'DISCUSSION', 'SOMEWHAT', 'IN', 'THE', 'AIR']],
 [['The',
   'flying',
   'ship',
   'of',
   'Professor',
   'Lucifer',
   'sang',
   'through',
   'the',
   'skies',
   'like',
   'a',
   'silver',
   'arrow',
   ';',
   'the',
   'bleak',
   'white',
   'steel',
   'of',
   'it',
   ',',
   'gleaming',
   'in',
   'the',
   'bleak',
   'blue',
   'emptiness',
   'of',
   'the',
   'evening',
   '.'],
  ['That',
   'it',
   'was',
   'far',
   'above',
   'the',
   'earth',
   'was',
   'no',
   'expression',
   'for',
   'it',
   ';',
   'to',
   'the',
   'two',
   'men',
   'in',
   'it',
   ',',
   'it',
   'seemed',
   'to',
   'be',
   'far',
   'above',
   'the',
   'stars',
   '.'],
  ['The',
   'professor',
   'had',
   'himself',
   'invented',
   'the',
   'flying',
   'machine',
   ',',
   'and',
   'had',
   'also

In [20]:
articles_b: list[list[Para2dStr]] = [
    gutenberg.paras(fileids=file_id)
    for file_id in gutenberg.fileids()
    if AUTHOR_B in file_id
]
assert_type(articles_b, list[list[Para2dStr]])

paras_b: list[Para2dStr] = [para for article in articles_b for para in article]
assert_type(paras_b, list[Para2dStr])

[[['[',
   'Stories',
   'to',
   'Tell',
   'to',
   'Children',
   'by',
   'Sara',
   'Cone',
   'Bryant',
   '1918',
   ']']],
 [['TWO', 'LITTLE', 'RIDDLES', 'IN', 'RHYME']],
 [['There',
   "'",
   's',
   'a',
   'garden',
   'that',
   'I',
   'ken',
   ',',
   'Full',
   'of',
   'little',
   'gentlemen',
   ';',
   'Little',
   'caps',
   'of',
   'blue',
   'they',
   'wear',
   ',',
   'And',
   'green',
   'ribbons',
   ',',
   'very',
   'fair',
   '.'],
  ['(', 'Flax', '.)']],
 [['From',
   'house',
   'to',
   'house',
   'he',
   'goes',
   ',',
   'A',
   'messenger',
   'small',
   'and',
   'slight',
   ',',
   'And',
   'whether',
   'it',
   'rains',
   'or',
   'snows',
   ',',
   'He',
   'sleeps',
   'outside',
   'in',
   'the',
   'night',
   '.'],
  ['(', 'The', 'path', '.)']],
 [['THE', 'LITTLE', 'YELLOW', 'TULIP']],
 [['Once',
   'there',
   'was',
   'a',
   'little',
   'yellow',
   'Tulip',
   ',',
   'and',
   'she',
   'lived',
   'down',
   'in',
   'a

In [21]:
all_paras: list[Para2dStr] = paras_a + paras_b
all_pos: tuple[Tag, ...] = PosFeature(all_paras).tag_subcategories().all_pos
print(all_pos)

('$', "''", '(', ')', ',', '.', ':', 'CC', 'CD', 'DT', 'EX', 'FW', 'IN', 'JJ', 'JJR', 'JJS', 'JJ_pp', 'MD', 'NN', 'NNP', 'NNPS', 'NNS', 'PDT', 'POS', 'PRP', 'PRP$', 'RB', 'RBR', 'RBS', 'RP', 'SYM', 'TO', 'UH', 'VB', 'VBD', 'VBG', 'VBN', 'VBP', 'VBZ', 'WDT', 'WP', 'WP$', 'WRB', '``')


In [22]:
dataset_generator = ParagraphFeatureDatasetGenerator(tags=all_pos)

In [33]:
para_ans_pairs: tuple[tuple[Para2dStr, np.bool_], ...] = tuple(
    (para, np.bool_(True)) for para in paras_a
) + tuple((para, np.bool_(False)) for para in paras_b)

In [34]:
dataset_tuple = tuple(
    dataset_generator.generate_from_paragraph(para, answer)
    for para, answer in para_ans_pairs
)

In [35]:
datasets: pd.DataFrame = pd.concat(dataset_tuple, axis=1).reset_index(drop=True).T

In [43]:
datasets.columns = (*dataset_generator.columns, "answer")
for col, dtype in zip(datasets.columns, dataset_generator.dtypes + (np.bool_,)):
    datasets[col] = datasets[col].astype(dtype)

In [45]:
display(datasets.head(10))

Unnamed: 0,v1 sentences per paragraph,v2 words per paragraph,v3 close parenthesis present,v4 dash present,v5 semi-colon or colon present,v6 question mark present,v7 apostrophe present,v8 standard deviation of sentence length,v9 length difference for consecutive sentences,v10 sentence with < 11 words,v11 sentence with > 34 words,v12 contains although,v13 contains however,v14 contains but,v15 contains because,v16 contains this,v17 contains others or researchers,v18 contains numbers,v19 contains 2 times more capitals than period,v20 contains et,word variation,average token length,non alphabetic characters frequency,uncommon word frequency,non-alphabetic characters frequency,numeric value frequency,$,'',(,),",",.,:,CC,CD,DT,EX,FW,IN,JJ,JJR,JJS,JJ_pp,MD,NN,NNP,NNPS,NNS,PDT,POS,PRP,PRP$,RB,RBR,RBS,RP,SYM,TO,UH,VB,VBD,VBG,VBN,VBP,VBZ,WDT,WP,WP$,WRB,``,answer
0,1,14,False,False,False,False,False,0.0,0.0,False,False,False,False,False,False,False,False,True,True,False,0.857143,2.857143,0.571429,0.857143,0.571429,0.0,0.0,0.0,0.0,0.0,0.0,0.142857,0.0,0.071429,0.071429,0.142857,0.0,0.0,0.142857,0.0,0.0,0.0,0.0,0.0,0.071429,0.357143,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,True
1,2,8,False,False,False,False,False,2.0,4.0,True,False,False,False,False,False,False,False,False,True,False,1.0,3.625,0.125,1.0,0.125,0.0,0.0,0.0,0.0,0.0,0.0,0.125,0.0,0.0,0.0,0.125,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.625,0.0,0.0,0.0,0.0,0.125,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,True
2,5,164,False,False,True,False,False,12.983066,13.25,False,True,False,False,False,False,False,False,False,False,False,0.573171,4.103659,0.091463,0.530488,0.091463,0.0,0.0,0.0,0.0,0.0,0.042683,0.030488,0.018293,0.04878,0.006098,0.134146,0.0,0.006098,0.176829,0.073171,0.006098,0.0,0.0,0.0,0.164634,0.012195,0.0,0.036585,0.0,0.0,0.060976,0.0,0.036585,0.0,0.0,0.0,0.0,0.02439,0.0,0.006098,0.042683,0.018293,0.012195,0.012195,0.02439,0.006098,0.0,0.0,0.0,0.0,True
3,6,149,False,True,True,False,False,4.669642,6.2,False,False,False,False,True,False,True,False,False,False,False,0.637584,4.389262,0.120805,0.590604,0.120805,0.0,0.0,0.0,0.0,0.0,0.067114,0.040268,0.013423,0.013423,0.013423,0.120805,0.0,0.0,0.100671,0.053691,0.0,0.006711,0.0,0.006711,0.127517,0.013423,0.0,0.04698,0.013423,0.0,0.04698,0.013423,0.060403,0.0,0.0,0.0,0.0,0.020134,0.0,0.020134,0.087248,0.006711,0.067114,0.013423,0.0,0.026846,0.0,0.0,0.0,0.0,True
4,8,294,True,False,True,False,False,18.699933,22.857143,False,True,False,True,True,False,False,False,True,False,False,0.544218,4.12585,0.136054,0.57483,0.136054,0.0,0.0,0.0,0.003401,0.003401,0.085034,0.027211,0.003401,0.037415,0.010204,0.088435,0.003401,0.0,0.115646,0.105442,0.0,0.0,0.0,0.003401,0.122449,0.010204,0.003401,0.034014,0.003401,0.0,0.064626,0.020408,0.091837,0.0,0.003401,0.003401,0.0,0.017007,0.0,0.02381,0.068027,0.006803,0.030612,0.003401,0.0,0.003401,0.0,0.003401,0.0,0.0,True
5,5,91,False,True,False,False,False,13.511477,15.25,True,True,False,False,False,False,True,False,False,False,False,0.758242,4.637363,0.175824,0.593407,0.175824,0.0,0.0,0.0,0.0,0.0,0.032967,0.043956,0.0,0.021978,0.0,0.076923,0.0,0.0,0.131868,0.054945,0.0,0.0,0.0,0.010989,0.142857,0.054945,0.0,0.065934,0.0,0.0,0.054945,0.021978,0.054945,0.0,0.0,0.0,0.0,0.054945,0.0,0.054945,0.010989,0.032967,0.010989,0.021978,0.032967,0.010989,0.0,0.0,0.0,0.0,True
6,1,50,False,False,True,True,False,0.0,0.0,False,True,False,False,True,False,True,False,False,True,False,0.86,3.48,0.18,0.58,0.18,0.0,0.0,0.0,0.0,0.0,0.04,0.0,0.02,0.02,0.0,0.06,0.0,0.0,0.18,0.02,0.0,0.0,0.0,0.04,0.14,0.02,0.0,0.06,0.0,0.0,0.16,0.02,0.04,0.0,0.0,0.0,0.0,0.0,0.0,0.08,0.04,0.0,0.0,0.06,0.0,0.0,0.0,0.0,0.0,0.0,True
7,14,313,True,False,False,False,True,10.539769,12.923077,True,True,False,False,True,False,True,False,False,False,False,0.485623,3.638978,0.162939,0.555911,0.162939,0.0,0.0,0.0,0.003195,0.0,0.089457,0.041534,0.0,0.028754,0.0,0.134185,0.003195,0.0,0.102236,0.063898,0.01278,0.003195,0.0,0.01278,0.121406,0.015974,0.0,0.038339,0.003195,0.003195,0.076677,0.015974,0.041534,0.003195,0.0,0.0,0.0,0.00639,0.0,0.025559,0.009585,0.0,0.015974,0.070288,0.028754,0.01278,0.0,0.0,0.015974,0.0,True
8,1,25,False,True,False,False,False,0.0,0.0,False,False,False,False,True,False,False,False,False,True,False,0.84,3.76,0.44,0.68,0.44,0.0,0.0,0.0,0.0,0.0,0.08,0.0,0.0,0.04,0.0,0.04,0.0,0.0,0.04,0.04,0.0,0.0,0.0,0.04,0.12,0.04,0.0,0.0,0.0,0.0,0.16,0.04,0.04,0.0,0.0,0.0,0.0,0.0,0.0,0.04,0.04,0.04,0.04,0.16,0.0,0.0,0.0,0.0,0.0,0.0,True
9,1,25,False,False,False,False,False,0.0,0.0,False,False,False,False,False,False,False,False,False,True,False,0.84,3.4,0.36,0.8,0.36,0.0,0.0,0.0,0.0,0.0,0.12,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.12,0.04,0.0,0.0,0.0,0.0,0.04,0.16,0.0,0.04,0.0,0.0,0.08,0.04,0.12,0.0,0.0,0.04,0.0,0.04,0.0,0.04,0.04,0.0,0.0,0.08,0.0,0.0,0.0,0.0,0.0,0.0,True


In [46]:
print(datasets.shape)

(5249, 71)


In [47]:
print(datasets.dtypes)

v1 sentences per paragraph                          int64
v2 words per paragraph                              int64
v3 close parenthesis present                         bool
v4 dash present                                      bool
v5 semi-colon or colon present                       bool
v6 question mark present                             bool
v7 apostrophe present                                bool
v8 standard deviation of sentence length          float64
v9 length difference for consecutive sentences    float64
v10 sentence with < 11 words                         bool
v11 sentence with > 34 words                         bool
v12 contains although                                bool
v13 contains however                                 bool
v14 contains but                                     bool
v15 contains because                                 bool
v16 contains this                                    bool
v17 contains others or researchers                   bool
v18 contains n

In [48]:
print(datasets.isna().sum())

v1 sentences per paragraph                        0
v2 words per paragraph                            0
v3 close parenthesis present                      0
v4 dash present                                   0
v5 semi-colon or colon present                    0
v6 question mark present                          0
v7 apostrophe present                             0
v8 standard deviation of sentence length          0
v9 length difference for consecutive sentences    0
v10 sentence with < 11 words                      0
v11 sentence with > 34 words                      0
v12 contains although                             0
v13 contains however                              0
v14 contains but                                  0
v15 contains because                              0
v16 contains this                                 0
v17 contains others or researchers                0
v18 contains numbers                              0
v19 contains 2 times more capitals than period    0
v20 contains

In [50]:
datasets.to_csv(dataset_dir.joinpath("dataset.csv"))