In [68]:
import os
from functools import cached_property

import nltk
import numpy as np


In [69]:
# File names of the texts to analyse; each is joined onto the "corpora" directory
# when loading and is reused as the dictionary key for that text.
# NOTE(review): the 'aurthur' spelling presumably matches the file on disk —
# confirm before "correcting" it.
corpora = ['aurthur_conan_doyle', 'charles_dickens', 'h_g_wells', 'jane_austen', 'jonathan_swift']
# Words whose rate per 1000 words is reported by StyloAnalysis.common_words_per_1000.
# Lookups are lower-cased by word_per_1000, so 'I' is counted as 'i'.
common_words = ['the','be','to','of','and','a','in','that','have','I','it','for','not','on','with','he','as','you','do','at','this','but','his','by','from','they','we','say','her','she','or','an','will','my','one','all','would','there','their','what','so','up','out','if','about','who','get','which','go','me','when','make','can','like','time','no','just','him','know','take','people','into','year','your','good','some','could','them','see','other','than','then','now','look','only','come','its','over','think','also','back','after','use','two','how','our','work','first','well','way','even','new','want','because','any','these','give','day','most','us']
# Tokens treated as punctuation: they are filtered out of StyloAnalysis.words and
# StyloAnalysis.tags, and their rates are reported by StyloAnalysis.punct_per_1000.
# NOTE(review): only the opening curly double quote and the right single quote are
# listed (no closing ” or opening ‘) — confirm this is intentional.
punct = [".", "?", "!", ",", ";", ":", "-", "“", "’"]


In [70]:
# Read every corpus file from the local "corpora" directory into memory,
# keyed by its corpus name.
input_data = {}
for corpus in corpora:
  # Decode explicitly as UTF-8: without `encoding`, open() falls back to the
  # platform locale, so the same files can decode differently (or fail) on
  # another machine. The punctuation list includes curly quotes, so the texts
  # are not plain ASCII.
  # NOTE(review): assumes the corpus files are stored as UTF-8 — confirm.
  with open(os.path.join("corpora", corpus), 'r', encoding='utf-8') as corpus_file:
    input_data[corpus] = corpus_file.read()


In [71]:
class StyloAnalysis:
  """Stylometric measurements over a single lower-cased text.

  The expensive derived values (tokens, words, frequency distributions,
  sentences, POS tags) are computed on first access and then cached on the
  instance, so repeated queries such as `word_per_1000` do not re-tokenize or
  re-tag the whole text every time.

  Because of the caching, derived values do not follow later changes to
  `self.raw` or to the module-level `punct` list; build a new instance to
  recompute them. Cached lists are shared between accesses, so callers should
  treat them as read-only.

  Args:
    corpus: Name/label of the text (stored as `self.corpus`).
    raw: Full text; it is lower-cased before being stored as `self.raw`.
  """

  def __init__(self, corpus: str, raw: str) -> None:
    super().__init__()
    self.raw = raw.lower()
    self.corpus = corpus

  @cached_property
  def tokens(self):
    """All tokens of the text as produced by `nltk.word_tokenize`."""
    return nltk.word_tokenize(self.raw)

  @cached_property
  def words(self):
    """Tokens that are not listed in the module-level `punct` list."""
    return [token for token in self.tokens if token not in punct]

  @cached_property
  def unique_words(self):
    """Set of distinct entries of `words`."""
    return set(self.words)

  @cached_property
  def text(self):
    """`nltk.text.Text` built from all tokens (punctuation included)."""
    return nltk.text.Text(self.tokens)

  @cached_property
  def word_text(self):
    """`nltk.text.Text` built from `words` (punctuation excluded)."""
    return nltk.text.Text(self.words)

  @property
  def type_token_ratio(self):
    """Number of distinct words divided by the total number of words.

    Raises:
      ZeroDivisionError: if the text contains no words.
    """
    return len(self.unique_words) / len(self.words)

  @cached_property
  def token_freq(self):
    """Frequency distribution over all tokens."""
    return nltk.probability.FreqDist(self.text)

  @cached_property
  def word_freq(self):
    """Frequency distribution over `words`."""
    return nltk.probability.FreqDist(self.word_text)

  @property
  def av_word_len(self):
    """Mean length, in characters, of the entries of `words`."""
    return np.mean([len(word) for word in self.words])

  def token_per_1000(self, token: str):
    """Occurrences of `token` (lower-cased) per 1000 tokens."""
    return self.token_freq[token.lower()] / len(self.text) * 1000

  def word_per_1000(self, word: str):
    """Occurrences of `word` (lower-cased) per 1000 words."""
    return self.word_freq[word.lower()] / len(self.word_text) * 1000

  @property
  def common_words_per_1000(self):
    """List of `(word, rate per 1000 words)` for each entry of `common_words`."""
    return [(word, self.word_per_1000(word)) for word in common_words]

  @property
  def punct_per_1000(self):
    """List of `(mark, rate per 1000 tokens)` for each entry of `punct`."""
    return [(word, self.token_per_1000(word)) for word in punct]

  @cached_property
  def sentences(self):
    """Sentences of the text as produced by `nltk.sent_tokenize`."""
    return nltk.sent_tokenize(self.raw)

  @cached_property
  def av_words_per_sent(self):
    """Mean number of tokens per sentence.

    Note: each sentence is measured with `nltk.word_tokenize` without the
    `punct` filter, so punctuation tokens are included in the count (unlike
    `words`).
    """
    return np.mean([len(nltk.word_tokenize(sent)) for sent in self.sentences])

  @cached_property
  def tagged_tokens(self):
    """`(token, tag)` pairs from `nltk.pos_tag` over all tokens."""
    return nltk.pos_tag(self.tokens)

  @cached_property
  def tags(self):
    """POS tags of the tokens that are not listed in `punct`."""
    return [tag for (token, tag) in self.tagged_tokens if token not in punct]

  @cached_property
  def tag_freq(self):
    """Frequency distribution over `tags`."""
    return nltk.probability.FreqDist(self.tags)

  def tag_per_100(self, tag: str):
    """Occurrences of POS tag `tag` per 100 tagged (non-punctuation) tokens."""
    return self.tag_freq[tag] / len(self.tags) * 100



In [72]:
# Wrap each loaded text in a StyloAnalysis, keyed by its corpus name.
analyzed_data = {
  name: StyloAnalysis(name, input_data[name])
  for name in corpora
}


In [67]:
# Occurrences of "the" per 1000 words (punctuation tokens excluded) in the
# 'h_g_wells' text; the bare expression is shown as the cell's output.
analyzed_data['h_g_wells'].word_per_1000('the')


66.83921681405324