In [2]:
!pip install -r ../requirements.txt

Defaulting to user installation because normal site-packages is not writeable


In [22]:
import json
from pathlib import Path

from lexical_diversity import lex_div as ld
import neurokit2 as nk
from nltk.tokenize import sent_tokenize, word_tokenize
import nltk
import spacy
from tqdm import tqdm

from utils import *

# Turn off pandas' chained-assignment (SettingWithCopy) warning for this session.
# NOTE(review): `pd` is not imported explicitly in this cell; presumably it is
# re-exported by `from utils import *` -- confirm.
pd.set_option("mode.chained_assignment", None)

### Set-up 

In [16]:
# USER SETTINGS -- adjust these before running the rest of the notebook.

# Language name handed to the NLTK tokenizers and to the sentiment/entropy helpers;
# the readability and Roget features are only computed when it is "english".
language = "english"
# Sentiment backend passed to `get_sentarc`.
sentiment_method = "vader"

# Folder that is searched for the input `*.txt` files.
in_dir = "../../test_files/"
# Folder that receives the per-file spaCy tables and `books_features.json`.
out_dir = "../output"

In [5]:
# ENGLISH
!python3 -m spacy download en_core_web_sm

nlp = spacy.load("en_core_web_sm")

Defaulting to user installation because normal site-packages is not writeable
Collecting en-core-web-sm==3.6.0
  Downloading https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-3.6.0/en_core_web_sm-3.6.0-py3-none-any.whl (12.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m12.8/12.8 MB[0m [31m32.4 MB/s[0m eta [36m0:00:00[0m00:01[0m00:01[0m
[38;5;2m✔ Download and installation successful[0m
You can now load the package via spacy.load('en_core_web_sm')


In [None]:
# DANISH
!python3 -m spacy download da_core_news_sm

nlp = spacy.load("da_core_news_sm")

In [6]:
# Raise the maximum number of characters spaCy accepts per document so that
# whole texts can be passed to `nlp(text)` in one call.
nlp.max_length = 3_500_000

# Fetch the Punkt data used by NLTK's `sent_tokenize` / `word_tokenize`.
nltk.download("punkt")


[nltk_data] Downloading package punkt to /Users/au643202/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

### Getting features

In [25]:
# Collect the input texts. Sorting makes the processing order (and therefore the
# key order of the saved JSON) independent of the file system's listing order.
filelist = sorted(Path(in_dir).glob("*.txt"))

# Fail early, naming the notebook variable to fix, when there is nothing to process.
if not filelist:
    raise ValueError(
        f"The folder specified as in_dir ({in_dir!r}) contains no .txt files. "
        "Check that the path is correct."
    )

# Make sure the output folder exists, creating missing parent folders as well.
Path(out_dir).mkdir(parents=True, exist_ok=True)

In [14]:
# Accumulator for the per-file feature dictionaries, keyed by file stem.
master_dict = dict()

In [26]:
# Minimum lengths below which the corresponding feature groups are skipped.
# NOTE(review): the values are carried over unchanged from the former inline
# literals; their rationale is not documented in this notebook.
MIN_SENTS_FOR_STYLOMETRICS = 1502
MIN_ARC_LEN_FOR_SENTIMENT = 60

# Compute every feature group for each input text and store the result in
# `master_dict` under the file's stem. The optional feature groups are
# best-effort: when one of them fails, the file name and the error are printed
# and the remaining features are still computed. Only `Exception` is caught, so
# interrupting the kernel still stops the loop.
for filename in tqdm(filelist):
    temp = {}

    # extract text and tokenize
    text = extract_text(filename)
    sents = sent_tokenize(text, language=language)
    words = word_tokenize(text, language=language)

    # get spacy attributes for every token, build the per-file table and save it
    spacy_attributes = [get_spacy_attributes(token) for token in nlp(text)]
    spacy_df = create_spacy_df(spacy_attributes)
    save_spacy_df(spacy_df, filename, out_dir)

    # stylometrics
    # for words
    temp["word_count"] = len(words)
    temp["average_wordlen"] = avg_wordlen(words)
    temp["msttr"] = ld.msttr(words, window_length=100)

    # for sentences
    if len(sents) < MIN_SENTS_FOR_STYLOMETRICS:
        print(f"\n{filename.name}")
        print("text not long enough for stylometrics\n")
    else:
        temp["average_sentlen"] = avg_sentlen(sents)
        temp["gzipr"], temp["bzipr"] = compressrat(sents)

    # bigram and word entropy
    try:
        temp["bigram_entropy"], temp["word_entropy"] = text_entropy(
            text, language=language, base=2, asprob=False
        )
    except Exception as err:
        print(f"\n{filename.name}")
        print(f"error in bigram and/or word entropy: {err!r}\n")

    # sentence-level sentiment arc, used by all sentiment-based features below
    arc = get_sentarc(sents, sent_method=sentiment_method, lang=language)

    # basic sentiment features
    if len(arc) < MIN_ARC_LEN_FOR_SENTIMENT:
        print(f"\n{filename.name}")
        print("arc not long enough for basic sentiment features\n")
    else:
        (
            temp["mean_sentiment"],
            temp["std_sentiment"],
            temp["mean_sentiment_per_segment"],
            temp["mean_sentiment_first_ten_percent"],
            temp["mean_sentiment_last_ten_percent"],
            temp["difference_lastten_therest"],
        ) = get_basic_sentarc_features(arc)

    # approximate entropy
    # NOTE(review): recent neurokit2 releases document a (value, info) return for
    # entropy_approximate -- confirm what ends up stored (and serialised) here.
    try:
        temp["approximate_entropy"] = nk.entropy_approximate(
            arc, dimension=2, tolerance="sd"
        )
    except Exception as err:
        print(f"\n{filename.name}")
        print(f"error with approximate entropy: {err!r}\n")

    # hurst
    try:
        temp["hurst"] = get_hurst(arc)
    except Exception as err:
        print(f"\n{filename.name}")
        print(f"error with hurst: {err!r}\n")

    # doing the things that only work in English
    if language == "english":
        # readability
        try:
            (
                temp["flesch_grade"],
                temp["flesch_ease"],
                temp["smog"],
                temp["ari"],
                temp["dale_chall_new"],
            ) = text_readability(text)
        except Exception as err:
            print(f"\n{filename.name}")
            print(f"error in readability: {err!r}\n")

        # roget
        all_roget_categories = roget.list_all_categories()

        roget_df = filter_spacy_df(spacy_df)

        temp["roget_n_tokens"] = len(spacy_df)
        temp["roget_n_tokens_filtered"] = len(roget_df)

        token_categories = get_token_categories(roget_df)
        doc_categories = re.findall(r"(rog\d{3} \w*)", token_categories)

        # one count per known category, so every file gets the same set of keys
        for roget_cat in all_roget_categories:
            temp[roget_cat] = doc_categories.count(roget_cat)

        temp["roget_n_cats"] = len(doc_categories)

    # save arc (for every language, not only English)
    temp["arc"] = arc

    # saving it all
    master_dict[filename.stem] = temp




 50%|█████     | 1/2 [00:13<00:13, 13.66s/it]


00022345.txt
text not long enough for stylometrics



100%|██████████| 2/2 [00:15<00:00,  7.80s/it]


In [21]:
# save the thing
with open(Path(out_dir).joinpath("books_features.json"), "w") as f:
        json.dump(master_dict, f)