In [97]:
import os
import nltk
import numpy as np


In [103]:
corpora = ['aurthur_conan_doyle', 'charles_dickens', 'h_g_wells', 'jane_austen', 'jonathan_swift']
common_words = ['the','be','to','of','and','a','in','that','have','I','it','for','not','on','with','he','as','you','do','at','this','but','his','by','from','they','we','say','her','she','or','an','will','my','one','all','would','there','their','what','so','up','out','if','about','who','get','which','go','me','when','make','can','like','time','no','just','him','know','take','people','into','year','your','good','some','could','them','see','other','than','then','now','look','only','come','its','over','think','also','back','after','use','two','how','our','work','first','well','way','even','new','want','because','any','these','give','day','most','us']
punct = [".", "?", "!", ",", ";", ":", "-", "\""]


In [104]:
input_data = {}
for corpus in corpora:
  with open(os.path.join("corpora", corpus), 'r') as file:
    input_data[corpus] = file.read()


In [105]:
class StyloAnalysis(object):
  def __init__(self, corpus: str, raw: str) -> None:
    super().__init__()
    raw = raw.lower()
    self.corpus = corpus
    self.words = nltk.word_tokenize(raw)
    self.uniq = set(self.words)
    self.text = nltk.text.Text(self.words)
    self.freq = nltk.probability.FreqDist(self.text)
    self.ttr = len(self.uniq) / len(self.words)
    self.av_word_len = 0 # TODO
    self.sents = nltk.sent_tokenize(raw)
    self.av_words_per_sent = 0 # TODO

  def token_per_1000(self, token):
    return self.text.count(token.lower()) / len(self.text) * 1000
  
  def common_words_per_1000(self):
    return [(word, self.token_per_1000(word)) for word in common_words]

  def punct_per_1000(self):
    return [(word, self.token_per_1000(word)) for word in punct]




In [106]:
analyzed_data = {}
for corpus in corpora:
  analyzed_data[corpus] = StyloAnalysis(corpus, input_data[corpus])

print(analyzed_data)


{'aurthur_conan_doyle': <__main__.StyloAnalysis object at 0x7f0c544e9be0>, 'charles_dickens': <__main__.StyloAnalysis object at 0x7f0c544d99d0>, 'h_g_wells': <__main__.StyloAnalysis object at 0x7f0bdfc93970>, 'jane_austen': <__main__.StyloAnalysis object at 0x7f0c544e96a0>, 'jonathan_swift': <__main__.StyloAnalysis object at 0x7f0c544e9fd0>}


In [107]:
analyzed_data['h_g_wells'].common_words_per_1000()
analyzed_data['h_g_wells'].punct_per_1000()


[('.', 47.50370483928663),
 ('?', 3.8939138433236242),
 ('!', 4.895497981501355),
 (',', 59.73733966988604),
 (';', 2.401757882364965),
 (':', 0.511012315396801),
 ('-', 0.0),
 ('"', 0.0)]