In [1]:
#!/usr/bin/python
# -*- coding: utf-8 -*-
import re 
import codecs as cds 
import nltk as nk 
from nltk import bigrams 
import math 

In [2]:
class TextAnalyzer:
    def __init__(self, par):
        self.par = par
        self.dictionary = {
            "phrases": [],
            "tokens": [],
            "avg_phrases": [],
            "avg_words": [],
            "hapax": [],
            "ratio": [],
            "bigrams": [],
            "prob_cond": [],
            "LMI": []
        }

    def calculate_token(self):
        total_length = 0
        total_tokens = [] 
        for el in self.par:  
            tokenized = nk.word_tokenize(el) 
            total_tokens = total_tokens + tokenized 
        total_length = len(total_tokens)
        self.dictionary["tokens"].append(total_length)
        return total_length
        
    def quantity_sentences(self):
        counter = 0 
        for el in self.par:
            el = 1 
            counter += el 
        self.dictionary["phrases"].append(counter)
        return counter

    def sentences_avg(self):
        counter = 0
        for el in self.par: 
            counter += len(el) 
        counter = counter / len(self.par) 
        self.dictionary["avg_phrases"].append(counter)
        return counter

    def token_collection(self):
        total_tokens = [] 
        for el in self.par: 
            tokenized = nk.word_tokenize(el) 
            total_tokens = total_tokens + tokenized 
        return total_tokens

    def word_avg(self):
        total_length = 0 
        char_length = 0 
        tokens = self.token_collection()
        for el in tokens: 
            char_length = float(char_length + len(el)) 
            total_length = total_length + 1 
        average_length = float(char_length / total_length) 
        self.dictionary["avg_words"].append(average_length)
        return average_length 

    def hapax_collection(self):
        frequency = 1 
        tokens = self.token_collection()
        word_hapax = []
        vocabular = set(tokens)
        for el in vocabular:
            token_frequency = tokens.count(el)
            if token_frequency == frequency: 
                word_hapax.append(el) 
        self.dictionary["hapax"].append(len(word_hapax))
        return word_hapax

    def incremental_frequency(self):
        incremental = [1000, 2000, 3000, 4000, 5000, 6000, 7000, 8000, 9000, 10000]
        el = 0
        hapax = self.hapax_collection()
        while el < len(incremental):
            incremental_frequency = len(hapax) * 1.0 / incremental[el] 
            print("\tWith a corpus of length {}, the distribution of hapax is {}\r".format(incremental[el], incremental_frequency)) 
            el = el + 1 
        return "\tDistribution increment complete."

    def ratio_noun_verbs(self):
        tokens = self.token_collection()
        part_of_speech = nk.pos_tag(tokens)
        noun_count = 0
        verb_count = 0
        for tag in part_of_speech:
            if re.search(r'(NN|NNP|NNPS|NNS)', tag[1]):
                noun_count = noun_count + 1
            elif re.search(r'(VBD|VBG|VBN|VBP|VBZ)', tag[1]):
                verb_count = verb_count + 1
        ratio = float(noun_count) / verb_count
        self.dictionary["ratio"].append(ratio)
        return ratio

    def part_of_speech_collection(self):
        tokens = self.token_collection()
        part_of_speech = nk.pos_tag(tokens)
        list_feedback = []
        for el in part_of_speech:
            list_feedback.append(el)
        calculation_frequency = nk.FreqDist(list_feedback)
        frequency_feedback = calculation_frequency.most_common(10)
        self.dictionary["bigrams"].append(frequency_feedback)
        return frequency_feedback

    def conditional_bigrams(self):
        tokens = self.token_collection()
        bigrams_tokens = list(bigrams(tokens))
        iterable_bigrams = set(bigrams_tokens)
        list_feedback = []
        for el in iterable_bigrams:
            bigram_frequency = bigrams_tokens.count(el)
            first_element_frequency = tokens.count(el[0])
            conditional_probability = float(bigram_frequency / first_element_frequency)
            list_feedback.append((el, conditional_probability))
        tidy_list = sorted(list_feedback, key=lambda a: -a[1], reverse=False)
        self.dictionary["prob_cond"].append(tidy_list[:10])
        return tidy_list[:10]

    def lmi(self):
        tokens = self.token_collection()
        bigrams_tokens = list(bigrams(tokens))
        iterable_bigrams = set(bigrams_tokens)
        n = len(tokens)
        list_feedback = []
        for el in iterable_bigrams:
            bigram_frequency = bigrams_tokens.count(el)
            first_element_frequency = tokens.count(el[0])
            second_element_frequency = tokens.count(el[1])
            numerator = float(bigram_frequency * n)
            denominator = float(first_element_frequency * second_element_frequency)
            mutual_information = math.log(numerator / denominator, 2.0)
            local_mutual_information = bigram_frequency * mutual_information
            list_feedback.append((el, local_mutual_information))
        tidy_list = sorted(list_feedback, key=lambda a: -a[1], reverse=False)
        self.dictionary["LMI"].append(tidy_list[:10])
        return tidy_list[:10]

In [3]:
def main(first_par, second_par):
    input_1 = cds.open(first_par, "r", "utf-8")
    input_2 = cds.open(second_par, "r", "utf-8")
    raw_1 = input_1.read()
    raw_2 = input_2.read()
    splitter = nk.data.load('tokenizers/punkt/english.pickle')
    phrase_1 = splitter.tokenize(raw_1)
    phrase_2 = splitter.tokenize(raw_2)

    analyzer_1 = TextAnalyzer(phrase_1)
    analyzer_2 = TextAnalyzer(phrase_2)

    print("--"*60)
    print("Text: {}".format(first_par))
    print("--"*60)
    analyzer_1.calculate_token()
    print("--Number of Tokens equal to {}".format(analyzer_1.dictionary["tokens"][0]))
    analyzer_1.quantity_sentences()
    print("--Number of sentences equal to {}".format(analyzer_1.dictionary["phrases"][0]))
    analyzer_1.sentences_avg()
    print("--The average sentence length is {}".format(analyzer_1.dictionary["avg_phrases"][0])) 
    analyzer_1.word_avg()
    print("--The average word length is {}".format(analyzer_1.dictionary["avg_words"][0])) 
    analyzer_1.hapax_collection()
    print("--The number of hapax legomena is {}".format(analyzer_1.dictionary["hapax"][0])) 
    print("{}".format(analyzer_1.incremental_frequency()))
    analyzer_1.ratio_noun_verbs()
    print("--The noun-to-verb ratio is {}".format(analyzer_1.dictionary["ratio"][0]))
    analyzer_1.part_of_speech_collection()
    print("--List of the 10 most bigrams:\n {}".format(analyzer_1.dictionary["bigrams"]))
    analyzer_1.conditional_bigrams()
    print("--List of 10 bigrams with Conditional Probability:\n {}".format(analyzer_1.dictionary["prob_cond"]))
    analyzer_1.lmi()
    print("--List of 10 bigrams with associative strength in terms of LMI:\n {}".format(analyzer_1.dictionary["LMI"]))
    
    

    print("--"*60)
    print("Text: {}".format(second_par))
    print("--"*60)
    analyzer_2.calculate_token()
    print("--Number of Tokens equal to {}".format(analyzer_2.dictionary["tokens"][0]))
    analyzer_2.quantity_sentences()
    print("--Number of sentences equal to {}".format(analyzer_2.dictionary["phrases"][0]))
    analyzer_2.sentences_avg()
    print("--The average sentence length is {}".format(analyzer_2.dictionary["avg_phrases"][0])) 
    analyzer_2.word_avg()
    print("--The average word length is {}".format(analyzer_2.dictionary["avg_words"][0])) 
    analyzer_2.hapax_collection()
    print("--The number of hapax legomena is {}".format(analyzer_2.dictionary["hapax"][0])) 
    print("{}".format(analyzer_2.incremental_frequency()))
    analyzer_2.ratio_noun_verbs()
    print("--The noun-to-verb ratio is {}".format(analyzer_2.dictionary["ratio"][0]))
    analyzer_2.part_of_speech_collection()
    print("--List of the 10 most bigrams:\n {}".format(analyzer_2.dictionary["bigrams"]))
    analyzer_2.conditional_bigrams()
    print("--List of 10 bigrams with Conditional Probability:\n {}".format(analyzer_2.dictionary["prob_cond"]))
    analyzer_2.lmi()
    print("--List of 10 bigrams with associative strength in terms of LMI:\n {}".format(analyzer_2.dictionary["LMI"]))
    


In [4]:
if __name__ == "__main__":
    first_path = '../text/dracula-UTF8.txt'
    second_path = '../text/hydeandjack-UTF8.txt'
    main(first_path, second_path)

------------------------------------------------------------------------------------------------------------------------
Text: ../text/dracula-UTF8.txt
------------------------------------------------------------------------------------------------------------------------
--Number of Tokens equal to 189228
--Number of sentences equal to 8428
--The average sentence length is 99.87446606549597
--The average word length is 3.571929101401484
--The number of hapax legomena is 5164
	With a corpus of length 1000, the distribution of hapax is 5.164
	With a corpus of length 2000, the distribution of hapax is 2.582
	With a corpus of length 3000, the distribution of hapax is 1.7213333333333334
	With a corpus of length 4000, the distribution of hapax is 1.291
	With a corpus of length 5000, the distribution of hapax is 1.0328
	With a corpus of length 6000, the distribution of hapax is 0.8606666666666667
	With a corpus of length 7000, the distribution of hapax is 0.7377142857142858
	With a corpus of