In [None]:
#!/usr/bin/python
# -*- coding: utf-8 -*-
import codecs as cds
import math
import re
from collections import Counter

import nltk as nk
from nltk import bigrams

In [None]:
class TextAnalyzer:
    """Compute corpus statistics over a sequence of sentences.

    ``param`` is iterated sentence by sentence; each sentence is passed to
    ``nk.word_tokenize`` and to ``len()``, so sentences are expected to be
    strings (``main`` passes the output of the NLTK sentence splitter).

    Most metric methods append their result to the matching list in
    ``self.dictionary`` (repeated calls therefore accumulate entries) and
    also return it. Any failure inside a method is re-raised as a plain
    ``Exception`` with the message ``'Error: <original message>'``; the
    original exception is attached as ``__cause__``.
    """

    # Corpus prefix sizes (in tokens) reported by incremental_frequency().
    INCREMENTAL_SIZES = tuple(range(1000, 10001, 1000))

    # Tag patterns used by ratio_noun_verbs(); matched with ``search``.
    _NOUN_TAG = re.compile(r'(NN|NNP|NNPS|NNS)')
    # NOTE(review): base-form verbs tagged plain 'VB' do not match this
    # pattern and are therefore not counted -- confirm this is intended.
    _VERB_TAG = re.compile(r'(VBD|VBG|VBN|VBP|VBZ)')

    def __init__(self, param):
        """Store the sentences and create one empty result list per metric."""
        self.param = param
        self.dictionary = {
            'phrases': [],
            'tokens': [],
            'avg_phrases': [],
            'avg_words': [],
            'hapax': [],
            'ratio': [],
            # NOTE(review): despite the key name, part_of_speech_collection()
            # stores (token, POS tag) pair frequencies here, not bigrams.
            'bigrams': [],
            'prob_cond': [],
            'LMI': []
        }

    def calculate_token(self):
        """Return the total number of tokens and record it under 'tokens'."""
        try:
            total_length = len(self.token_collection())
            self.dictionary['tokens'].append(total_length)
            return total_length
        except Exception as e:
            raise Exception(f'Error: {e}') from e

    def quantity_sentences(self):
        """Return the number of sentences and record it under 'phrases'."""
        try:
            counter = len(self.param)
            self.dictionary['phrases'].append(counter)
            return counter
        except Exception as e:
            raise Exception(f'Error: {e}') from e

    def sentences_avg(self):
        """Return the mean sentence length and record it under 'avg_phrases'.

        Length is ``len()`` of each sentence, i.e. characters for string
        sentences, not tokens. Raises ``Exception`` when there are no
        sentences (division by zero).
        """
        try:
            total_length = sum(len(p) for p in self.param)
            avg_length = total_length / len(self.param)
            self.dictionary['avg_phrases'].append(avg_length)
            return avg_length
        except Exception as e:
            raise Exception(f'Error: {e}') from e

    def token_collection(self):
        """Return the flat list of tokens of all sentences, in order.

        The corpus is re-tokenized on every call; nothing is stored.
        """
        try:
            return [token for p in self.param for token in nk.word_tokenize(p)]
        except Exception as e:
            raise Exception(f'Error: {e}') from e

    def word_avg(self):
        """Return the mean token length in characters; record under 'avg_words'.

        Raises ``Exception`` when there are no tokens (division by zero).
        """
        try:
            tokens = self.token_collection()
            total_length = sum(len(tok) for tok in tokens)
            average_length = total_length / len(tokens)
            self.dictionary['avg_words'].append(average_length)
            return average_length
        except Exception as e:
            raise Exception(f'Error: {e}') from e

    def hapax_collection(self):
        """Return the tokens occurring exactly once; record their count under 'hapax'.

        The returned list is ordered by first occurrence in the corpus.
        """
        try:
            # One counting pass instead of calling list.count() per distinct word.
            counts = Counter(self.token_collection())
            word_hapax = [word for word, freq in counts.items() if freq == 1]
            self.dictionary['hapax'].append(len(word_hapax))
            return word_hapax
        except Exception as e:
            raise Exception(f'Error: {e}') from e

    @staticmethod
    def _hapax_ratios(tokens, sizes):
        """Return ``(size, hapax_in_first_size_tokens / size)`` for each usable size.

        Sizes that are not positive or that exceed ``len(tokens)`` are
        skipped, because no prefix of that length exists.
        """
        ratios = []
        for length in sizes:
            if not 0 < length <= len(tokens):
                continue
            prefix_counts = Counter(tokens[:length])
            hapax_in_prefix = sum(1 for freq in prefix_counts.values() if freq == 1)
            ratios.append((length, hapax_in_prefix / length))
        return ratios

    def incremental_frequency(self):
        """Print the hapax ratio for growing prefixes of the corpus.

        For every size in ``INCREMENTAL_SIZES`` that fits in the corpus, the
        hapax legomena are counted *within the first ``size`` tokens* and
        divided by ``size``. The figures are printed to standard output;
        ``self.dictionary`` is not modified. Returns a fixed completion
        message.
        """
        try:
            tokens = self.token_collection()
            for length, incremental_frequency in self._hapax_ratios(tokens, self.INCREMENTAL_SIZES):
                print(f'\tWith a corpus of length {length}, the distribution of hapax is {incremental_frequency}\r')
            return '\tDistribution increment complete.'
        except Exception as e:
            raise Exception(f'Error: {e}') from e

    def ratio_noun_verbs(self):
        """Return the noun/verb count ratio and record it under 'ratio'.

        Tokens are POS-tagged with ``nk.pos_tag``; tags are classified with
        the class-level noun and verb patterns. The result is a float, or
        the string ``'N/A'`` when no verb was counted.
        """
        try:
            part_of_speech = nk.pos_tag(self.token_collection())
            noun_count = sum(1 for _, tag in part_of_speech if self._NOUN_TAG.search(tag))
            verb_count = sum(1 for _, tag in part_of_speech if self._VERB_TAG.search(tag))

            ratio = noun_count / verb_count if verb_count else 'N/A'

            self.dictionary['ratio'].append(ratio)
            return ratio
        except Exception as e:
            raise Exception(f'Error: {e}') from e

    def part_of_speech_collection(self):
        """Return the 10 most frequent (token, POS tag) pairs with their counts.

        The list of ``((token, tag), count)`` entries is also recorded under
        the 'bigrams' key.
        """
        try:
            part_of_speech = nk.pos_tag(self.token_collection())
            frequency_feedback = nk.FreqDist(part_of_speech).most_common(10)
            self.dictionary['bigrams'].append(frequency_feedback)
            return frequency_feedback
        except Exception as e:
            raise Exception(f'Error: {e}') from e

    def conditional_bigrams(self):
        """Return the 10 bigrams with the highest conditional probability.

        For a bigram ``(a, b)`` the score is ``count(a, b) / count(a as the
        first element of any bigram)``. The list of ``(bigram, probability)``
        entries is sorted by descending probability (ties keep
        first-occurrence order) and recorded under 'prob_cond'.
        """
        try:
            tokens = self.token_collection()
            bigram_freq = Counter(zip(tokens, tokens[1:]))
            # Every token except the last one starts exactly one bigram.
            first_element_freq = Counter(tokens[:-1])

            list_feedback = [
                (bigram, freq / first_element_freq[bigram[0]])
                for bigram, freq in bigram_freq.items()
            ]

            # reverse=True keeps the sort stable, so ties stay in input order.
            tidy_list = sorted(list_feedback, key=lambda a: a[1], reverse=True)[:10]

            self.dictionary['prob_cond'].append(tidy_list)
            return tidy_list
        except Exception as e:
            raise Exception(f'Error: {e}') from e

    def lmi(self):
        """Return the 10 bigrams with the highest Local Mutual Information.

        With ``n`` the number of tokens, a bigram ``(a, b)`` scores
        ``f(a, b) * log2(f(a, b) * n / (f(a) * f(b)))``. The list of
        ``(bigram, score)`` entries is sorted by descending score (ties keep
        first-occurrence order) and recorded under 'LMI'.
        """
        try:
            tokens = self.token_collection()
            n = len(tokens)
            # Count tokens once instead of rescanning the corpus per bigram.
            token_freq = Counter(tokens)
            bigram_freq = Counter(zip(tokens, tokens[1:]))

            list_feedback = []
            for bigram, bigram_frequency in bigram_freq.items():
                # Both members occur in the corpus, so the denominator is >= 1.
                denominator = token_freq[bigram[0]] * token_freq[bigram[1]]
                mutual_information = math.log2(bigram_frequency * n / denominator)
                list_feedback.append((bigram, bigram_frequency * mutual_information))

            tidy_list = sorted(list_feedback, key=lambda a: a[1], reverse=True)[:10]
            self.dictionary['LMI'].append(tidy_list)
            return tidy_list
        except Exception as e:
            raise Exception(f'Error: {e}') from e


In [None]:
def _read_sentences(path):
    """Read a UTF-8 text file and return its content split into sentences."""
    # The context manager closes the file even if reading fails.
    with cds.open(path, 'r', 'utf-8') as source:
        raw_text = source.read()
    # sent_tokenize applies NLTK's English Punkt model without depending on
    # the on-disk resource format shipped by a particular NLTK release.
    return nk.sent_tokenize(raw_text)


def _write_report(report, text_path, analyzer):
    """Run every metric of ``analyzer`` and write the results to ``report``.

    ``report`` is a writable text file object, ``text_path`` is only used as
    the label in the section header. Each metric is computed before its line
    (or list header) is written.
    """
    rule = '--' * 40 + '\n'
    report.write(rule)
    report.write(f'Text: {text_path}\n')
    report.write(rule)

    report.write(f'--Number of Tokens equal to {analyzer.calculate_token()}\n')
    report.write(f'--Number of sentences equal to {analyzer.quantity_sentences()}\n')
    report.write(f'--The average sentence length is {analyzer.sentences_avg()}\n')
    report.write(f'--The average word length is {analyzer.word_avg()}\n')
    report.write(f'--The number of hapax legomena is {len(analyzer.hapax_collection())}\n')

    # incremental_frequency() prints its figures to standard output; only the
    # completion message it returns goes into the report.
    report.write(f'{analyzer.incremental_frequency()}\n')

    report.write(f'--The noun-to-verb ratio is {analyzer.ratio_noun_verbs()}\n')

    most_frequent = analyzer.part_of_speech_collection()
    report.write('--List of the 10 most bigrams:\n')
    for pair, frequency in most_frequent:
        report.write(f'{pair} with frequency {frequency}\n')

    by_probability = analyzer.conditional_bigrams()
    report.write('--List of 10 bigrams with Conditional Probability:\n')
    for pair, prob_cond in by_probability:
        report.write(f'{pair} with conditional probability {prob_cond}\n')

    by_lmi = analyzer.lmi()
    report.write('--List of 10 bigrams with associative strength in terms of LMI:\n')
    for pair, strength in by_lmi:
        report.write(f'{pair} with associative strength {strength}\n')


def main(first_par, second_par, output_file):
    """Analyze two UTF-8 text files and write a combined report.

    Args:
        first_par: path of the first input text.
        second_par: path of the second input text.
        output_file: path of the report; it is created or overwritten and
            written as UTF-8.

    Both inputs are read and sentence-split before the output file is
    opened, so a problem reading an input does not truncate an existing
    report. The two sections are separated by three newlines.
    """
    input_paths = (first_par, second_par)
    analyzers = [TextAnalyzer(_read_sentences(path)) for path in input_paths]

    with open(output_file, 'w', encoding='utf-8') as f:
        for index, (path, analyzer) in enumerate(zip(input_paths, analyzers)):
            if index:
                f.write('\n\n\n')
            _write_report(f, path, analyzer)


In [None]:
if __name__ == '__main__':
    # Relative paths: resolved against the current working directory.
    first_path = '../text/dracula-UTF8.txt'
    second_path = '../text/hydeandjack-UTF8.txt'
    output_file = '../output/output_program_1.out'

    # Run the full comparison and write the report.
    main(first_par=first_path, second_par=second_path, output_file=output_file)