In [1]:
import textacy

# Sample passage analysed by the cells below. The fragments carry no trailing
# whitespace; they are stitched together with single spaces.
text = " ".join(
    [
        "Many years later, as he faced the firing squad, Colonel Aureliano Buendía",
        "was to remember that distant afternoon when his father took him to discover ice.",
        "At that time Macondo was a village of twenty adobe houses, built on the bank",
        "of a river of clear water that ran along a bed of polished stones, which were",
        "white and enormous, like prehistoric eggs. The world was so recent",
        "that many things lacked names, and in order to indicate them it was necessary to point.",
    ]
)
# Parse the passage with the "en_core_web_md" pipeline, then print the doc's
# short preview (token count plus the start of the text).
doc = textacy.make_spacy_doc(text, lang="en_core_web_md")
print(doc._.preview)

2023-10-07 16:36:38.744816: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.
2023-10-07 16:36:41.103244: I tensorflow/compiler/xla/stream_executor/cuda/cuda_gpu_executor.cc:996] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero. See more at https://github.com/torvalds/linux/blob/v6.0/Documentation/ABI/testing/sysfs-bus-pci#L344-L355
2023-10-07 16:36:41.104263: I tensorflow/compiler/xla/stream_executor/cuda/cuda_gpu_executor.cc:996] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero. See more at https://github.com/torvalds/linux/blob/v6.0/Documentation/ABI/testing/sysfs-bus-pci#L344-L355
2023-10-

Doc(93 tokens: "Many years later, as he faced the firing squad,...")


In [2]:
from textacy import extract

# Keep only people and places. spaCy's English pipelines tag places as "LOC"
# (non-political locations) or "GPE" (countries, cities, states); they have no
# "LOCATION" label, so filtering on it could never match anything but "PERSON".
list(extract.entities(doc, include_types={"PERSON", "LOC", "GPE"}))

[Aureliano Buendía]

In [3]:
# Extract subject-verb-object triples from the parsed doc; list() collects the
# results so the cell displays them.
list(extract.subject_verb_object_triples(doc))


[SVOTriple(subject=[he], verb=[faced], object=[firing, squad]),
 SVOTriple(subject=[father], verb=[took], object=[him]),
 SVOTriple(subject=[things], verb=[lacked], object=[names])]

In [4]:
from textacy import text_stats as ts

# Word count and unique-word count for the doc, displayed together as a tuple.
ts.n_words(doc), ts.n_unique_words(doc)

(84, 66)

In [5]:
# Type-token ratio ("ttr") lexical-diversity score; for this doc the recorded value
# equals unique words / words (66 / 84).
ts.diversity.ttr(doc)


0.7857142857142857

In [6]:
# Flesch-Kincaid grade level readability score for the doc.
ts.flesch_kincaid_grade_level(doc)


10.922857142857143

In [8]:
# A second passage, parsed with the same "en_core_web_md" pipeline, to compare
# against `doc` in the cells that follow.
other_doc = textacy.make_spacy_doc(
    (
        "Finally, one Tuesday in December, at lunchtime, "
        "all at once he released the whole weight of his torment. "
        "The children would remember for the rest of their lives "
        "the august solemnity with which their father, "
        "devastated by his prolonged vigil and by the wrath of his imagination, "
        "revealed his discovery to them: "
        "'The earth is round, like an orange.'"
    ),
    lang="en_core_web_md",
)


In [9]:
from textacy import similarity

# Levenshtein-based similarity between the raw text of the two docs; judging by the
# recorded output this is a normalized score, not a raw edit count.
similarity.levenshtein(doc.text, other_doc.text)

0.2693965517241379

In [11]:
# Cosine similarity between the two docs, each represented by the lemmas of the
# words extracted from it.
similarity.cosine(
    (word.lemma_ for word in extract.words(doc)),
    (word.lemma_ for word in extract.words(other_doc)),
)


0.0914991421995628

In [13]:
# Surface forms of the extracted words that occur in both docs.
{word.text for word in extract.words(doc)} & {
    word.text for word in extract.words(other_doc)
}

{'father', 'like', 'remember'}

In [14]:
# True when `doc` has a higher Flesch reading-ease score than `other_doc`.
ts.flesch_reading_ease(doc) > ts.flesch_reading_ease(other_doc)

True

In [15]:
# (text, metadata) pairs: each passage is paired with the title and publication
# year ("pub_yr") of the work it is taken from.
records = [
    (
        "Many years later, as he faced the firing squad, Colonel Aureliano Buendía was to remember that distant afternoon when his father took him to discover ice. At that time Macondo was a village of twenty adobe houses, built on the bank of a river of clear water that ran along a bed of polished stones, which were white and enormous, like prehistoric eggs. The world was so recent that many things lacked names, and in order to indicate them it was necessary to point.",
        {"title": "One Hundred Years of Solitude", "pub_yr": 1967},
    ),
    (
        # Quotation corrected: "a great man dead" (the text previously read "dad").
        "Over the weekend the vultures got into the presidential palace by pecking through the screens on the balcony windows and the flapping of their wings stirred up the stagnant time inside, and at dawn on Monday the city awoke out of its lethargy of centuries with the warm, soft breeze of a great man dead and rotting grandeur.",
        {"title": "The Autumn of the Patriarch", "pub_yr": 1975},
    ),
    (
        "On the day they were going to kill him, Santiago Nasar got up at five-thirty in the morning to wait for the boat the bishop was coming on. He'd dreamed he was going through a grove of timber trees where a gentle drizzle was falling, and for an instant he was happy in his dream, but when he awoke he felt completely spattered with bird shit.",
        {"title": "Chronicle of a Death Foretold", "pub_yr": 1981},
    ),
    (
        "It was inevitable: the scent of bitter almonds always reminded him of the fate of unrequited love. Dr. Juvenal Urbino noticed it as soon as he entered the still darkened house where he had hurried on an urgent call to attend a case that for him had lost all urgency many years before. The Antillean refugee Jeremiah de Saint-Amour, disabled war veteran, photographer of children, and his most sympathetic opponent in chess, had escaped the torments of memory with the aromatic fumes of gold cyanide.",
        {"title": "Love in the Time of Cholera", "pub_yr": 1985},
    ),
    (
        "José Palacios, his oldest servant, found him floating naked with his eyes open in the purifying waters of his bath and thought he had drowned. He knew this was one of the many ways the General meditated, but the ecstasy in which he lay drifting seemed that of a man no longer of this world.",
        {"title": "The General in His Labyrinth", "pub_yr": 1989},
    ),
]

In [17]:
# Build a Corpus from the (text, metadata) records using the "en_core_web_md"
# pipeline, then print its summary (document and token counts).
corpus = textacy.Corpus("en_core_web_md", records)
print(corpus)


Corpus(5 docs, 383 tokens)


In [18]:
# Sentence count for the corpus as a whole.
corpus.n_sents


11

In [19]:
import statistics

# Aggregate the "pub_yr" metadata field across the corpus with statistics.median.
corpus.agg_metadata("pub_yr", statistics.median)

1981

In [20]:
# Lemma frequencies over the whole corpus, most frequent first. sorted() is stable,
# so lemmas with equal counts keep the order in which word_counts() returned them.
sorted(
    corpus.word_counts(by="lemma_").items(),
    key=lambda lemma_count: lemma_count[1],
    reverse=True,
)

[('year', 2),
 ('time', 2),
 ('house', 2),
 ('water', 2),
 ('world', 2),
 ('get', 2),
 ('awake', 2),
 ('man', 2),
 ('go', 2),
 ('dream', 2),
 ('later', 1),
 ('face', 1),
 ('firing', 1),
 ('squad', 1),
 ('Colonel', 1),
 ('Aureliano', 1),
 ('Buendía', 1),
 ('remember', 1),
 ('distant', 1),
 ('afternoon', 1),
 ('father', 1),
 ('take', 1),
 ('discover', 1),
 ('ice', 1),
 ('Macondo', 1),
 ('village', 1),
 ('adobe', 1),
 ('build', 1),
 ('bank', 1),
 ('river', 1),
 ('clear', 1),
 ('run', 1),
 ('bed', 1),
 ('polished', 1),
 ('stone', 1),
 ('white', 1),
 ('enormous', 1),
 ('like', 1),
 ('prehistoric', 1),
 ('egg', 1),
 ('recent', 1),
 ('thing', 1),
 ('lack', 1),
 ('name', 1),
 ('order', 1),
 ('indicate', 1),
 ('necessary', 1),
 ('point', 1),
 ('weekend', 1),
 ('vulture', 1),
 ('presidential', 1),
 ('palace', 1),
 ('peck', 1),
 ('screen', 1),
 ('balcony', 1),
 ('window', 1),
 ('flapping', 1),
 ('wing', 1),
 ('stir', 1),
 ('stagnant', 1),
 ('inside', 1),
 ('dawn', 1),
 ('Monday', 1),
 ('city', 1)

In [21]:
from textacy.representations import Vectorizer

# Weight terms by linear term frequency combined with smoothed inverse document
# frequency.
vectorizer = Vectorizer(tf_type="linear", idf_type="smooth")
# Feed the vectorizer one lazily generated stream of lemmas (unigrams plus entities)
# per corpus document; the loop variable is named so it does not read like the
# notebook-level `doc`.
doc_term_matrix = vectorizer.fit_transform(
    (
        (term.lemma_ for term in extract.terms(corpus_doc, ngs=1, ents=True))
        for corpus_doc in corpus
    )
)

In [22]:
# Print the matrix's repr (shape, dtype, stored-element count, storage format)
# rather than its contents.
print(repr(doc_term_matrix))


<5x168 sparse matrix of type '<class 'numpy.float64'>'
	with 176 stored elements in Compressed Sparse Row format>


In [23]:
# Weights of the term "year" in every document: look up the term's column index in
# the fitted vocabulary, slice that column, and densify it for display.
doc_term_matrix[:, vectorizer.vocabulary_terms["year"]].toarray()

array([[1.69314718],
       [0.        ],
       [0.        ],
       [1.69314718],
       [0.        ]])

In [24]:
from textacy.representations import build_cooccurrence_network

# Build a term co-occurrence graph from one list of lemmas (unigrams plus entities)
# per corpus document, using a window of 5 terms.
cooc_graph = build_cooccurrence_network(
    [
        [term.lemma_ for term in extract.terms(corpus_doc, ngs=1, ents=True)]
        for corpus_doc in corpus
    ],
    window_size=5,
)
# A notebook only displays a cell's last expression, so the graph size has to be
# printed explicitly; as a bare expression mid-cell it was computed and discarded.
print(cooc_graph.number_of_nodes(), cooc_graph.number_of_edges())
# Second (node, neighbours) pair once the adjacency items are sorted.
sorted(cooc_graph.adjacency())[1]

('Aureliano',
 {'Colonel': {'weight': 4},
  'face': {'weight': 1},
  'firing': {'weight': 2},
  'squad': {'weight': 3},
  'Buendía': {'weight': 4},
  'remember': {'weight': 3},
  'distant': {'weight': 2},
  'afternoon': {'weight': 1}})

In [25]:
# Re-create the doc from `text` with the "en_core_web_md" pipeline; as the cell's
# last expression, the preview string is shown as the cell output.
doc = textacy.make_spacy_doc(text, lang="en_core_web_md")
doc._.preview

'Doc(93 tokens: "Many years later, as he faced the firing squad,...")'

In [27]:
from textacy import extract

# N-grams of size 3 from the doc, with punctuation filtering enabled; list()
# collects the results so the cell displays them.
list(extract.ngrams(doc, 3, filter_punct=True))


[faced the firing,
 Colonel Aureliano Buendía,
 remember that distant,
 river of clear,
 water that ran,
 bed of polished,
 white and enormous,
 like prehistoric eggs,
 things lacked names,
 order to indicate,
 necessary to point]

In [28]:
# Noun chunks from the doc with determiners dropped (drop_determiners=True).
list(extract.noun_chunks(doc, drop_determiners=True))

[he,
 firing squad,
 Colonel Aureliano Buendía,
 distant afternoon,
 his father,
 him,
 ice,
 time,
 Macondo,
 village,
 twenty adobe houses,
 bank,
 river,
 clear water,
 that,
 bed,
 polished stones,
 which,
 prehistoric eggs,
 world,
 many things,
 names,
 order,
 them,
 it]

In [29]:
# Subject-verb-object triples found in the doc, collected into a list for display.
list(extract.subject_verb_object_triples(doc))


[SVOTriple(subject=[he], verb=[faced], object=[firing, squad]),
 SVOTriple(subject=[father], verb=[took], object=[him]),
 SVOTriple(subject=[things], verb=[lacked], object=[names])]

In [30]:
# Top 10 key terms ranked by TextRank, with terms normalized to their lemmas; the
# recorded output is a list of (term, score) pairs.
extract.keyterms.textrank(doc, normalize="lemma", topn=10)

[('Colonel Aureliano Buendía', 0.03514023923288048),
 ('distant afternoon', 0.025702252401637468),
 ('prehistoric egg', 0.02413546693014402),
 ('adobe house', 0.02373257673368041),
 ('firing squad', 0.02260215506120479),
 ('clear water', 0.022531163291437),
 ('time Macondo', 0.022468625324399062),
 ('polished stone', 0.022324901110696762),
 ('father', 0.01347633614501534),
 ('thing', 0.012444923735098302)]

In [32]:
import spacy
import pytextrank

# NOTE: this rebinds the notebook-level `text` (and `doc` below) to the PyTextRank
# sample, replacing the passage the earlier cells worked on.
text = "Compatibility of systems of linear constraints over the set of natural numbers. Criteria of compatibility of a system of linear Diophantine equations, strict inequations, and nonstrict inequations are considered. Upper bounds for components of a minimal set of solutions and algorithms of construction of minimal generating sets of solutions for all types of systems are given. These criteria and the corresponding algorithms for constructing a minimal supporting set of solutions can be used in solving all the considered types systems and systems of mixed types."

# Load the small English pipeline and append the "textrank" component to it.
nlp = spacy.load("en_core_web_sm")
nlp.add_pipe("textrank")
doc = nlp(text)

# For every ranked phrase emit three lines: its text, "<rank> <count>", and the
# list of chunks it was found in.
for phrase in doc._.phrases:
    print(phrase.text, f"{phrase.rank!s} {phrase.count!s}", phrase.chunks, sep="\n")

mixed types
0.18359439311764025 1
[mixed types]
systems
0.1784796193107821 3
[systems, systems, systems]
minimal generating sets
0.15037838042245094 1
[minimal generating sets]
nonstrict inequations
0.14740065982407313 1
[nonstrict inequations]
strict inequations
0.13946027725597837 1
[strict inequations]
linear Diophantine equations
0.1195023546245721 1
[linear Diophantine equations]
natural numbers
0.11450088293222845 1
[natural numbers]
solutions
0.10780718173686318 3
[solutions, solutions, solutions]
linear constraints
0.10529828014583348 1
[linear constraints]
all the considered types systems
0.1036960590708142 1
[all the considered types systems]
a minimal supporting set
0.08812713074893187 1
[a minimal supporting set]
linear
0.08444534702772151 1
[linear]
a system
0.08243620500315359 1
[a system]
a minimal set
0.07944607954086784 1
[a minimal set]
algorithms
0.0763527926213032 1
[algorithms]
all types
0.07593126037016427 1
[all types]
Diophantine
0.07309361902551355 1
[Diophanti