# Text Complexity Analysis

In [3]:
# Standard imports
import numpy as np
import pandas as pd
import spacy

In [18]:
from TRUNAJOD import surface_proxies
from TRUNAJOD.entity_grid import EntityGrid
from TRUNAJOD.lexico_semantic_norms import LexicoSemanticNorm
import pickle
import spacy
import tarfile


class ModelLoader(object):
    """Class to load model."""
    def __init__(self, model_file):
        tar = tarfile.open(model_file, "r:gz")
        self.crea_frequency = {}
        self.infinitive_map = {}
        self.lemmatizer = {}
        self.spanish_lexicosemantic_norms = {}
        self.stopwords = {}
        self.wordnet_noun_synsets = {}
        self.wordnet_verb_synsets = {}

        for member in tar.getmembers():
            f = tar.extractfile(member)
            if "crea_frequency" in member.name:
                self.crea_frequency = pickle.loads(f.read())
            if "infinitive_map" in member.name:
                self.infinitive_map = pickle.loads(f.read())
            if "lemmatizer" in member.name:
                self.lemmatizer = pickle.loads(f.read())
            if "spanish_lexicosemantic_norms" in member.name:
                self.spanish_lexicosemantic_norms = pickle.loads(f.read())
            if "stopwords" in member.name:
                self.stopwords = pickle.loads(f.read())
            if "wordnet_noun_synsets" in member.name:
                self.wordnet_noun_synsets = pickle.loads(f.read())
            if "wordnet_verb_synsets" in member.name:
                self.wordnet_verb_synsets = pickle.loads(f.read())

In [19]:
# Load TRUNAJOD models
model = ModelLoader("trunajod_models_v0.1.tar.gz")

# Load spaCy model
nlp = spacy.load("en_core_web_lg")

In [9]:
def load_text(file_path: str) -> str:
    """
    Loads the text from a file and returns it as a string.
    """
    with open(file_path, 'r', encoding='utf-8') as f:
        text = f.read()
    return text

# Load text
colonization_address = load_text("data/colonization_address.txt")
house_divided = load_text("data/house_divided.txt")

In [11]:
colonization_doc = nlp(colonization_address)
house_divided_doc = nlp(house_divided)
labels = ["Colonization Address", "House Divided"]

In [15]:
# Count tokens in each document
colonization_token_count = len(colonization_doc)
house_divided_token_count = len(house_divided_doc)

# Print token counts
print(f"Token count in {labels[0]}: {colonization_token_count}")
print(f"Token count in {labels[1]}: {house_divided_token_count}")

Token count in Colonization Address: 1880
Token count in House Divided: 3619


In [21]:
for i, doc in enumerate([colonization_doc, house_divided_doc]):

    # Lexico-semantic norms
    lexico_semantic_norms = LexicoSemanticNorm(
        doc,
        model.lemmatizer
    )

    # Frequency index
    freq_index = surface_proxies.frequency_index(doc, model.crea_frequency)

    # Clause count (heurístically)
    clause_count = surface_proxies.clause_count(doc, model.infinitive_map)

    # Compute Entity Grid
    egrid = EntityGrid(doc)

    print("\n")
    print(labels[i])
    # Print document statistics
    print(f"Document statistics:")
    print(f"  - Number of sentences: {len(list(doc.sents))}")
    print(f"  - Number of tokens: {len(doc)}")
    print(f"  - Number of unique tokens: {len(set(doc))}")
    print(f"  - Number of named entities: {len([ent for ent in doc.ents if ent.label_ == 'PERSON'])}")


    print("Concreteness: {}".format(lexico_semantic_norms.get_concreteness()))
    print("Frequency Index: {}".format(freq_index))
    print("Clause count: {}".format(clause_count))
    print("Entity grid:")
    print(egrid.get_egrid())



Colonization Address
Document statistics:
  - Number of sentences: 90
  - Number of tokens: 1880
  - Number of unique tokens: 1880
  - Number of named entities: 2
Concreteness: 0
Frequency Index: 0.0499327087767838
Clause count: 0
Entity grid:
{'YOU': ['S', '-', 'S', 'X', '-', 'O', '-', '-', 'X', '-', 'O', '-', '-', 'O', '-', '-', '-', 'O', '-', 'S', '-', '-', 'S', 'X', 'S', '-', '-', '-', 'O', 'S', '-', '-', '-', '-', '-', '-', '-', '-', '-', '-', '-', '-', '-', '-', 'O', 'S', 'S', 'X', '-', '-', '-', '-', '-', '-', '-', '-', '-', '-', '-', '-', 'S', 'S', '-', 'X', '-', 'S', 'S', '-', 'O', 'S', '-', '-', '-', 'O', 'S', 'S', '-', '-', '-', '-', 'S', '-', '-', '-', '-', '-', 'S', 'O', '-', 'O'], 'ALL': ['X', '-', '-', '-', '-', '-', '-', '-', 'X', '-', '-', '-', '-', '-', '-', '-', '-', '-', '-', '-', '-', '-', '-', '-', '-', '-', '-', '-', '-', '-', '-', '-', '-', '-', '-', '-', '-', '-', '-', '-', '-', '-', '-', '-', '-', '-', '-', '-', '-', '-', '-', '-', '-', '-', '-', '-', '-', '