In [1]:
import spacy
import numpy as np
import pandas as pd
from os import path
from PIL import Image
from wordcloud import WordCloud
import matplotlib.pyplot as plt
from collections import Counter
%matplotlib inline

Matplotlib is building the font cache; this may take a moment.


In [2]:
# Load the "en_core_web_sm" spaCy pipeline once; later cells call `nlp(text)` to parse the sample.
nlp = spacy.load("en_core_web_sm")

In [3]:
file = "sample.txt"
# Read the whole document inside a context manager so the file handle is
# closed deterministically instead of being left to the garbage collector.
with open(file, encoding='utf-8') as handle:
    text = handle.read()
# Parse the raw text; `doc` is the token sequence every later cell works on.
doc = nlp(text)

In [4]:
# Show every token of the parsed document as plain text.
print(list(tok.text for tok in doc))

['This', 'is', 'a', 'text', 'document', '.', '\n\n', 'It', 'is', 'simple', ',', 'so', 'I', 'can', 'easily', 'figure', 'out', 'its', 'stats', '.', '\n\n', '"', 'This', 'is', 'a', 'line', 'of', 'dialogue', '.', '"', '\n\n', 'This', 'is', 'not', '.', '\n\n', '"', 'This', 'is', 'a', 'more', 'complicated', 'line', 'of', 'dialogue', ',', '"', 'I', 'say', ',', '"', 'because', 'it', 'spans', 'a', 'speech', 'tag', '.', 'That', 'may', 'be', 'a', 'problem', 'for', 'my', 'program', '.', '"', '\n\n', 'Here', 'is', 'a', 'simple', 'sentence', '.', '\n\n', 'Here', ',', 'if', 'you', 'care', 'to', 'look', ',', 'is', 'what', 'you', 'might', 'call', ',', 'if', 'you', "'re", 'so', 'inclined', ',', 'a', 'complex', 'sentence', '.', 'It', 'has', 'many', 'more', 'commas', 'than', 'my', 'other', 'sentences', '.', 'Those', 'do', 'not', 'have', 'as', 'many', 'commas', 'as', 'that', 'one', '.', '\n\n', '"', 'Here', 'are', 'some', 'more', 'examples', 'of', 'dialogue', ',', '"', 'I', 'say', '.', '"', 'They', 'should

In [5]:
# Keep real words plus the straight double-quote tokens (the quotes are needed
# later to toggle "inside dialogue"). All other punctuation is dropped, and so
# is every whitespace token -- not only the exact "\n\n" paragraph break, so a
# lone "\n" or run of spaces is never miscounted as a word.
words_with_quotations = [
    token.text
    for token in doc
    if token.text == '"' or not (token.is_punct or token.is_space)
]

In [6]:
# Walk the filtered token texts: each '"' flips the "inside dialogue" flag and
# is not itself counted; every other entry is a word, and it is also a
# dialogue word when the flag is currently set.
word_count = 0
diag_count = 0
diag = False
for entry in words_with_quotations:
    if entry == '"':
        diag = not diag
    else:
        word_count += 1
        if diag:
            diag_count += 1

# Total words, dialogue words, and the share of words that are dialogue.
print(word_count, diag_count, diag_count / word_count)

129 41 0.3178294573643411


In [13]:
def get_dialogue_proportion(doc):
    """Count the words in a spaCy doc and measure how many are dialogue.

    Dialogue is whatever sits between straight double-quote tokens: each
    quote token toggles the "inside dialogue" state and is not counted
    itself. Punctuation and whitespace tokens are never counted as words.

    Args:
        doc: An iterable of spaCy tokens (objects exposing ``text``,
            ``is_punct`` and ``is_space``), e.g. a spaCy ``Doc``.

    Returns:
        A 3-tuple ``(word_count, diag_count, proportion)`` where
        ``proportion`` is ``diag_count / word_count``, or ``0.0`` when the
        doc contains no words.
    """
    word_count = 0
    diag_count = 0
    in_dialogue = False
    for token in doc:
        if token.text == '"':
            # A quote opens or closes a stretch of dialogue.
            in_dialogue = not in_dialogue
            continue
        if token.is_punct or token.is_space:
            # Not a word: skip other punctuation and newline/space tokens.
            continue
        word_count += 1
        if in_dialogue:
            diag_count += 1

    # Dialogue share of the total; guard the empty document against a
    # division by zero.
    proportion = diag_count / word_count if word_count else 0.0
    return (word_count, diag_count, proportion)

In [8]:
# Materialise the sentence spans so they can be counted, then list them.
sentences = [sent for sent in doc.sents]
print(len(sentences))
for sentence in sentences:
    print(sentence)

17
This is a text document.


It is simple, so I can easily figure out its stats.


"This is a line of dialogue."


This is not.


"This is a more complicated line of dialogue," I say, "because it spans a speech tag.
That may be a problem for my program."


Here is a simple sentence.


Here, if you care to look, is what you might call, if you're so inclined, a complex sentence.
It has many more commas than my other sentences.
Those do not have as many commas as that one.


"Here are some more examples of dialogue," I say.
"They should cover all the possibilities."


This... might be a problem.
How will it handle things...?
I don't know.


And this....


What does it think of it?


In [9]:
def partition_text_on_period(text, max_chunk=60000, min_chunk=50000):
    """Split a long string into consecutive chunks that end on a period.

    While the remaining text is longer than ``max_chunk`` characters, a chunk
    is cut just after the first "." found at or after index ``min_chunk``.
    A chunk may therefore be longer than ``max_chunk`` when the next period
    is far away. Whatever is left over becomes the final chunk, so
    concatenating the result always reproduces ``text``.

    Args:
        text: The string to split.
        max_chunk: Text longer than this many characters is split further.
        min_chunk: Index from which the search for a period starts, i.e. the
            minimum length of a chunk before its closing period.

    Returns:
        A list of strings; ``[text]`` when no split is needed.

    Raises:
        ValueError: If ``min_chunk`` is negative or greater than ``max_chunk``.
    """
    if not 0 <= min_chunk <= max_chunk:
        raise ValueError("min_chunk must satisfy 0 <= min_chunk <= max_chunk")

    chunks = []
    while len(text) > max_chunk:
        split_at = text.find(".", min_chunk)
        if split_at == -1:
            # No period left to split on: keep the remainder whole rather
            # than scanning past the end of the string.
            break
        chunks.append(text[:split_at + 1])
        text = text[split_at + 1:]
    chunks.append(text)
    return chunks

In [10]:
def chunk_test(text):
    """Small-scale version of the period-based chunker, for eyeballing output.

    While the remaining text is longer than 250 characters, a chunk is cut
    just after the first "." found at or after index 200; the leftover text
    becomes the final chunk.

    Args:
        text: The string to split.

    Returns:
        A list of strings whose concatenation equals ``text``.
    """
    chunks = []
    while len(text) > 250:
        split_at = text.find(".", 200)
        if split_at == -1:
            # No further period: stop splitting instead of indexing past the
            # end of the string.
            break
        chunks.append(text[:split_at + 1])
        text = text[split_at + 1:]
    chunks.append(text)
    return chunks

# Print each chunk under a banner, preceded by its length in characters.
for chunk in chunk_test(text):
    print("\n HERE'S A CHUNK \n")
    print(len(chunk))
    print(chunk)


 HERE'S A CHUNK 

209
This is a text document.

It is simple, so I can easily figure out its stats.

"This is a line of dialogue."

This is not.

"This is a more complicated line of dialogue," I say, "because it spans a speech tag.

 HERE'S A CHUNK 

211
 That may be a problem for my program."

Here is a simple sentence.

Here, if you care to look, is what you might call, if you're so inclined, a complex sentence. It has many more commas than my other sentences.

 HERE'S A CHUNK 

213
 Those do not have as many commas as that one.

"Here are some more examples of dialogue," I say. "They should cover all the possibilities."

This... might be a problem. How will it handle things...? I don't know.

 HERE'S A CHUNK 

41


And this....

What does it think of it?


In [11]:
# Paragraphs = newline tokens ("\n\n" or "\n") plus one for the first paragraph.
para_count = 1 + sum(1 for token in doc if token.text in ("\n\n", "\n"))
print(para_count)

11


In [12]:
# Lemmas of every token that is neither punctuation nor a newline token,
# followed by the number of distinct lemmas.
lemmas = [tok.lemma_ for tok in doc if not tok.is_punct and tok.text not in ["\n", "\n\n"]]
print(lemmas)
print(len(set(lemmas)))

['this', 'be', 'a', 'text', 'document', '-PRON-', 'be', 'simple', 'so', '-PRON-', 'can', 'easily', 'figure', 'out', '-PRON-', 'stat', 'this', 'be', 'a', 'line', 'of', 'dialogue', 'this', 'be', 'not', 'this', 'be', 'a', 'more', 'complicated', 'line', 'of', 'dialogue', '-PRON-', 'say', 'because', '-PRON-', 'span', 'a', 'speech', 'tag', 'that', 'may', 'be', 'a', 'problem', 'for', '-PRON-', 'program', 'here', 'be', 'a', 'simple', 'sentence', 'here', 'if', '-PRON-', 'care', 'to', 'look', 'be', 'what', '-PRON-', 'may', 'call', 'if', '-PRON-', 'be', 'so', 'inclined', 'a', 'complex', 'sentence', '-PRON-', 'have', 'many', 'more', 'comma', 'than', '-PRON-', 'other', 'sentence', 'those', 'do', 'not', 'have', 'as', 'many', 'comma', 'as', 'that', 'one', 'here', 'be', 'some', 'more', 'example', 'of', 'dialogue', '-PRON-', 'say', '-PRON-', 'should', 'cover', 'all', 'the', 'possibility', 'this', 'may', 'be', 'a', 'problem', 'how', 'will', '-PRON-', 'handle', 'thing', '-PRON-', 'do', 'not', 'know', 'an