In [None]:
# Install stylometrix and dependencies
!pip install -U pip setuptools wheel
!pip install -U spacy
!python -m spacy download en_core_web_trf
!python -m pip install stylo_metrix

In [None]:
import os
import pandas as pd
import spacy
import stylo_metrix
import time
import json
import random
# Load the spaCy pipeline 'en_core_web_trf' (the model downloaded in the install cell).
nlp = spacy.load('en_core_web_trf')  # for English
# Add the "stylo_metrix" component to the pipeline; get_text_info() reads the
# `_.stylo_metrix_vector` attribute of documents produced by `nlp`.
# NOTE(review): presumably `import stylo_metrix` is what registers this
# component name with spaCy — confirm before removing that import.
nlp.add_pipe("stylo_metrix")

In [27]:
def get_text_info(text):
    """Return the stylometric metric values for ``text``.

    The text is run through the module-level ``nlp`` pipeline and the
    ``"value"`` entry of every item in the resulting document's
    ``_.stylo_metrix_vector`` is collected, in order. All other fields of
    each item are discarded.
    """
    doc = nlp(text)
    metrics = doc._.stylo_metrix_vector
    return [metric["value"] for metric in metrics]

In [None]:
# Feature extraction

# Root directory; each author's texts live in the sub-directory named after them.
path_main = 'data/'
n = 1000  # chunk length in characters (the chunking step treats it as a minimum)

# Authors to process, each mapped to the list of file paths found for them.
# Commented-out entries are authors that are currently excluded.
works_files = {
    "austen": [],
    # "carroll": [],
    # "dickens": [],
    # "kipling": [],
    # "poe": [],
    "shakespeare": [],
    "twain": [],
    # "verne": [],
    "wells": []
}

# Collect every file below each author's directory (recursively).
# os.walk() yields directory entries in arbitrary, filesystem-dependent order,
# so both the sub-directories and the file names are sorted: the file order,
# and therefore the chunk order downstream, is then the same on every run.
for author in works_files:
    path = os.path.join(path_main, author)
    for root, dirs, files in os.walk(path):
        dirs.sort()  # in place: this is what controls os.walk's descent order
        for file in sorted(files):
            works_files[author].append(os.path.join(root, file))

# convert the files to chunks of whole sentences, each at least n characters long
def _split_into_chunks(text, min_length):
    """Split ``text`` into chunks of at least ``min_length`` characters.

    The text is cut at every ``"."`` and the pieces are re-joined (each with
    its period restored) until the running chunk reaches ``min_length``
    characters, at which point it is emitted and a new chunk is started.
    A period is appended after every piece, including the last one.
    A trailing remainder shorter than ``min_length`` is discarded.

    Returns the list of chunks, in document order.
    """
    chunks = []
    current = ""
    for sentence in text.split("."):
        current += sentence + "."  # removed during splitting
        if len(current) >= min_length:
            chunks.append(current)
            current = ""
    return chunks


# Derived from works_files so the author sets of the two dicts cannot drift apart.
works_chunks = {author: [] for author in works_files}

for author in works_chunks:
    for file in works_files[author]:
        if ".DS_Store" in file:
            continue  # not a text file
        # Explicit encoding: without it the decoding depends on the platform's
        # default locale. UTF-8 matches the default on Linux and macOS.
        with open(file, "r", encoding="utf-8") as open_file:
            # there are lots of unnecessary line breaks
            text = open_file.read().replace("\n", " ")
        works_chunks[author] += _split_into_chunks(text, n)

# Per-author cap on the number of chunks: 4,000,000 characters' worth at the
# configured chunk length n.
altmax = round(4000000/n)
# Total number of chunks expected to be processed, with every author capped at
# altmax. Summing over works_chunks itself (instead of naming each author)
# keeps the total correct when authors are added to or removed from the run.
numchunks = sum(min(len(chunks), altmax) for chunks in works_chunks.values())


# convert the chunks to stylometric lists
# Derived from works_chunks so the author sets of the two dicts cannot drift apart.
works_features = {author: [] for author in works_chunks}

i = 0  # chunks attempted so far across all authors (drives the progress output)
start_time = time.time()
for author in works_features:
    print(author)
    chunknum = 0  # chunks successfully converted for this author
    for chunk in works_chunks[author]:
        # Stop once this author has altmax feature lists. Checking before the
        # work (with >=) also handles altmax == 0, where nothing is processed.
        if chunknum >= altmax:
            break
        # Count the attempt up front so that failures are counted too and
        # "error at {i}" identifies the chunk that actually failed.
        i += 1
        try:
            works_features[author].append(get_text_info(chunk))
        except Exception as e:
            # Best effort: report the failing chunk and carry on with the next.
            print(f"error at {i}: {e}")
            continue

        chunknum += 1
        if i % 10 == 0:
            elapsed = time.time() - start_time
            # Failed chunks can push i past numchunks; never report negative time.
            remaining = max(numchunks - i, 0)
            print(
                f"{i} / {numchunks}, ~{round(((elapsed/i)*remaining)/60)} minutes left")

In [None]:
# Split data into training and testing data

# 1 = only train, 0 = only test
train_test_ratio = 0.5

# Fixed seed so the same split is produced on every run of this cell.
split_seed = 42
# Private generator: seeding it leaves the global `random` state untouched.
split_rng = random.Random(split_seed)

# [train, test]
split_data = [{}, {}]

# randomise data
for author in works_features:
    print(author)

    # Shuffle a copy rather than works_features itself, so the extracted
    # features keep their order and re-running this cell gives the same split.
    shuffled = list(works_features[author])
    split_rng.shuffle(shuffled)
    split_at = round(len(shuffled) * train_test_ratio)
    split_data[0][author] = shuffled[:split_at]
    split_data[1][author] = shuffled[split_at:]

In [None]:
# Convert raw data format to rectangular data format and save to file

# Output directory for the JSON files ('' = current working directory).
path_main = ''


def _labelled_rows(features_by_author):
    """Flatten ``{author: [feature_list, ...]}`` into one list of rows.

    Each row is an author's feature list with an integer label appended.
    The label is the author's position in the dictionary's iteration order
    (0 for the first author, 1 for the second, ...).
    """
    return [
        entry + [author_index]
        for author_index, author in enumerate(features_by_author)
        for entry in features_by_author[author]
    ]


# Seeded private generator so the row order in the output files is
# reproducible; the global `random` state is left untouched.
shuffle_rng = random.Random(42)

# split_data is [train, test]; each half is written to its own file.
for split_index, file_name in enumerate(("train_rectangle.json", "test_rectangle.json")):
    new_format = _labelled_rows(split_data[split_index])
    shuffle_rng.shuffle(new_format)
    # "w" is enough: the file is only written here, never read back.
    with open(os.path.join(path_main, file_name), "w") as out_file:
        json.dump(new_format, out_file)