In [1]:
#!/usr/bin/python
# -*- coding: utf-8 -*-
import re 
import codecs as cds 
import nltk as nk 
import math 
from nltk import FreqDist, ne_chunk

In [2]:
class TextAnalyzer:
    """Entity and frequency statistics over a list of sentences.

    ``param`` is the sequence of sentence strings to analyse. Every public
    method returns its result and also appends it to the matching list in
    ``self.dictionary``; because the methods call one another, a single
    call may append to more than one key.

    The corpus-wide PERSON ranking is the expensive step (POS tagging plus
    named-entity chunking of every sentence) and every other statistic
    depends on it, so it is memoized and only recomputed when ``param``
    changes.
    """

    def __init__(self, param):
        self.param = param
        self.dictionary = {
            'NNP': [],
            'phrases_NNP': [],
            'phrases_length': [],
            'GPE': [],
            'person': [],
            'nouns': [],
            'verbs': [],
            'dates': [],
            'markov_order': []
        }
        # Memo for the PERSON ranking: (tuple snapshot of param, ranking).
        self._person_cache = None

    # ------------------------------------------------------------------
    # Private helpers
    # ------------------------------------------------------------------

    @staticmethod
    def _entities(sentence, label):
        """Return the text of every chunk labelled ``label`` in ``sentence``."""
        tagged = nk.pos_tag(nk.word_tokenize(sentence))
        return [
            ' '.join(leaf[0] for leaf in node.leaves())
            for node in nk.ne_chunk(tagged)
            if isinstance(node, nk.Tree) and node.label() == label
        ]

    @staticmethod
    def _is_capitalised(text):
        """True when ``text`` starts with an uppercase alphabetic character."""
        first = text[:1]
        return first.isalpha() and first.isupper()

    def _compute_top_persons(self):
        """Run the full PERSON extraction over ``self.param`` (uncached)."""
        names = []
        for sentence in self.param:
            # Keep only PERSON entities that look like valid names.
            names.extend(
                name for name in self._entities(sentence, 'PERSON')
                if self._is_capitalised(name)
            )
        return FreqDist(names).most_common(10)

    def _top_persons(self):
        """Return the PERSON ranking, recomputing only if ``param`` changed."""
        snapshot = tuple(self.param)
        # getattr: tolerate instances created without running __init__.
        cached = getattr(self, '_person_cache', None)
        if cached is None or cached[0] != snapshot:
            cached = (snapshot, self._compute_top_persons())
            self._person_cache = cached
        # Hand out a copy so callers cannot corrupt the memo.
        return list(cached[1])

    def _tagged_matches(self, tag_pattern):
        """Return ``(word, tag)`` pairs from the selected phrases whose POS
        tag matches ``tag_pattern`` (searched anywhere in the tag)."""
        matcher = re.compile(tag_pattern)
        return [
            (word, tag)
            for phrase in self.phrase_searcher()
            for (word, tag) in nk.pos_tag(nk.word_tokenize(phrase))
            if matcher.search(tag)
        ]

    # ------------------------------------------------------------------
    # Public API
    # ------------------------------------------------------------------

    def token_collection(self):
        """Return a flat list with the word tokens of every sentence.

        Raises:
            Exception: wraps any error raised while tokenizing.
        """
        try:
            total_tokens = [token for p in self.param for token in nk.word_tokenize(p)]
            return total_tokens
        except Exception as e:
            raise Exception(f'Error: {e}') from e

    def noun_search(self):
        """Return the 10 most frequent PERSON entities as ``(name, count)``.

        The result is also appended to ``self.dictionary['NNP']``.

        Raises:
            Exception: wraps any error raised during extraction.
        """
        try:
            total_noun = self._top_persons()
            self.dictionary['NNP'].append(total_noun)
            return total_noun
        except Exception as e:
            raise Exception(f'Error: {e}') from e

    def phrase_searcher(self):
        """Return the sentences containing at least one top PERSON name.

        Matching is a plain substring test. The result is also appended to
        ``self.dictionary['phrases_NNP']``.

        Raises:
            Exception: wraps any error raised during the search.
        """
        try:
            noun_set = {entry[0] for entry in self.noun_search()}
            phrase_list = [p for p in self.param if any(noun in p for noun in noun_set)]

            self.dictionary['phrases_NNP'].append(phrase_list)
            return phrase_list
        except Exception as e:
            raise Exception(f'Error: {e}') from e

    def noun_set(self):
        """Return the top PERSON names as a set (counts dropped).

        Raises:
            Exception: wraps any error raised by ``noun_search``.
        """
        try:
            return {entry[0] for entry in self.noun_search()}
        except Exception as e:
            raise Exception(f'Error: {e}') from e

    def phrase_length(self):
        """Map each top PERSON name to its shortest and longest sentence.

        Length is measured in characters. Values are
        ``(shortest_message, longest_message)`` tuples of ready-to-print
        strings; names with no matching sentence are omitted. The mapping
        is also appended to ``self.dictionary['phrases_length']``.

        Raises:
            Exception: wraps any error raised during the computation.
        """
        try:
            noun_list = self.noun_search()
            phrase_list = self.phrase_searcher()

            temp_dict = {}
            for noun in noun_list:
                name = noun[0]
                # Stable sort: among equal lengths the first sentence is the
                # "shortest" pick and the last one is the "longest" pick.
                by_length = sorted(
                    (phrase for phrase in phrase_list if name in phrase),
                    key=len,
                )
                if by_length:
                    shortest = 'La frase più corta che contiene il nome ' + name + ' è: ' + by_length[0]
                    longest = 'La frase più lunga che contiene il nome ' + name + ' è: ' + by_length[-1]
                    temp_dict[name] = shortest, longest

            self.dictionary['phrases_length'].append(temp_dict)
            return temp_dict
        except Exception as e:
            raise Exception(f'Error: {e}') from e

    def geo_entity_search(self):
        """Return the 10 most frequent GPE entities in the selected phrases.

        Entities that are also top PERSON names are excluded. The result is
        also appended to ``self.dictionary['GPE']``.

        Raises:
            Exception: wraps any error raised during extraction.
        """
        try:
            phrase_list = self.phrase_searcher()
            noun_set = self.noun_set()

            gpe_list = []
            for phrase in phrase_list:
                gpe_list.extend(
                    gpe for gpe in self._entities(phrase, 'GPE')
                    if self._is_capitalised(gpe) and gpe not in noun_set
                )

            total_gpe = FreqDist(gpe_list).most_common(10)
            self.dictionary['GPE'].append(total_gpe)
            return total_gpe
        except Exception as e:
            raise Exception(f'Error: {e}') from e

    def proper_name_searcher(self):
        """Return the 10 most frequent PERSON entities in the selected phrases.

        Unlike ``noun_search`` this scans only the selected phrases and does
        not apply the capitalisation filter. The result is also appended to
        ``self.dictionary['person']``.

        Raises:
            Exception: wraps any error raised during extraction.
        """
        try:
            proper_list = []
            for phrase in self.phrase_searcher():
                proper_list.extend(self._entities(phrase, 'PERSON'))

            proper_name_list = FreqDist(proper_list).most_common(10)
            self.dictionary['person'].append(proper_name_list)
            return proper_name_list
        except Exception as e:
            raise Exception(f'Error: {e}') from e

    def noun_ratio(self):
        """Return the 10 most frequent ``((word, tag), count)`` noun entries
        found in the selected phrases.

        The result is also appended to ``self.dictionary['nouns']``.

        Raises:
            Exception: wraps any error raised during tagging.
        """
        try:
            substantive_list = self._tagged_matches(r'(NN|NNS|NNP|NNPS)')

            total_substantive = FreqDist(substantive_list).most_common(10)
            self.dictionary['nouns'].append(total_substantive)
            return total_substantive
        except Exception as e:
            raise Exception(f'Error: {e}') from e

    def verbs_ratio(self):
        """Return the 10 most frequent ``((word, tag), count)`` verb entries
        found in the selected phrases.

        The result is also appended to ``self.dictionary['verbs']``.

        Raises:
            Exception: wraps any error raised during tagging.
        """
        try:
            verbs_list = self._tagged_matches(r'(VBD|VBG|VBN|VBP|VBZ)')

            total_verbs = FreqDist(verbs_list).most_common(10)
            self.dictionary['verbs'].append(total_verbs)
            return total_verbs
        except Exception as e:
            raise Exception(f'Error: {e}') from e

    def data_searcher(self):
        """Return date-like expressions in the selected phrases with counts.

        Matches numeric dates (``d/m/y`` or ``d-m-y``), month names and
        weekday names, case-insensitively; matches are lowercased before
        counting. Returns ``(text, count)`` pairs sorted by descending
        count, also appended to ``self.dictionary['dates']``.

        Raises:
            Exception: wraps any error raised during the search.
        """
        try:
            phrase_list = self.phrase_searcher()

            # NOTE(review): with IGNORECASE the alternatives "may" and
            # "march" also match the ordinary English words.
            pattern = r'\b(?:\d{1,2}[-/]\d{1,2}[-/]\d{2,4}|january|february|march|april|may|june|july|august|september|october|november|december|monday|tuesday|wednesday|thursday|friday|saturday|sunday)\b'
            matcher = re.compile(pattern, re.IGNORECASE)

            date_list = []
            for phrase in phrase_list:
                date_list.extend(date.lower() for date in matcher.findall(phrase))

            # Stable sort keeps first-seen order among equal counts.
            frequency_dist = sorted(
                FreqDist(date_list).items(),
                key=lambda x: x[1],
                reverse=True,
            )

            self.dictionary['dates'].append(frequency_dist)
            return frequency_dist
        except Exception as e:
            raise Exception(f'Error: {e}') from e

    def markov_order_zero(self):
        """Return the most probable 8-12 token selected phrase.

        Probability is the product of each token's relative frequency over
        all tokens of the corpus. Returns a one-element list
        ``[(phrase, probability)]``; when no phrase qualifies the entry is
        ``(None, 0)``. Also appended to ``self.dictionary['markov_order']``.

        Raises:
            Exception: wraps any error raised during the computation.
        """
        try:
            total_tokens = self.token_collection()
            token_count = len(total_tokens)
            freq_distribution = FreqDist(total_tokens)
            # Already restricted to sentences mentioning the top names.
            phrase_list = self.phrase_searcher()

            max_prob = 0
            max_prob_phrase = None
            for phrase in phrase_list:
                tokens = nk.word_tokenize(phrase)
                if 8 <= len(tokens) <= 12:
                    prob = 1
                    for token in tokens:
                        prob *= freq_distribution[token] / token_count
                    if prob > max_prob:
                        max_prob = prob
                        max_prob_phrase = phrase

            final_sentence = [(max_prob_phrase, max_prob)]
            self.dictionary['markov_order'].append(final_sentence)
            return final_sentence
        except Exception as e:
            raise Exception(f'Error: {e}') from e



In [3]:
def _write_report(f, text_label, analyzer):
    """Write the full analysis report for one text to the open file ``f``.

    Args:
        f: writable text file object.
        text_label: string printed in the report header (the input path).
        analyzer: a ``TextAnalyzer`` built from the text's sentences.
    """
    rule = '--' * 40 + '\n'
    f.write(rule)
    f.write(f'Text: {text_label}\n')
    f.write(rule)

    # Most frequent proper nouns
    f.write('--Top 10 most frequent proper nouns in the text:\n')
    for noun, freq in analyzer.noun_search():
        f.write(f'\t{noun} with frequency {freq}\n')

    # Longest and shortest phrases for each proper noun
    f.write('--For each proper noun, I provide the longest and shortest phrases:\n')
    for noun, phrases in analyzer.phrase_length().items():
        # \033[1m ... \033[0m are ANSI bold on/off escapes, written verbatim.
        f.write(f'\tFor \033[1m{noun}\033[0m:\n')
        f.write(f'\t{phrases[0]}\n\t{phrases[1]}\n')

    # Top 10 GPE
    f.write('--For each sentence containing the most frequent nouns, I provide the 10 most frequent GPE:\n')
    for gpe in analyzer.geo_entity_search():
        f.write(f'\t{gpe[0]} with frequency {gpe[1]}\n')

    # Top 10 proper names
    f.write('--For each sentence containing the most frequent nouns, I provide the 10 most frequent proper names:\n')
    for propn in analyzer.proper_name_searcher():
        f.write(f'\t{propn[0]} with frequency {propn[1]}\n')

    # Top 10 nouns
    f.write('--For each sentence containing the most frequent nouns, I provide the 10 most frequent nouns:\n')
    for substantive in analyzer.noun_ratio():
        f.write(f'\t{substantive[0][0]} with tag {substantive[0][1]} with frequency {substantive[1]}\n')

    # Top 10 verbs
    f.write('--For each sentence containing the most frequent nouns, I provide the 10 most frequent verbs:\n')
    for verb in analyzer.verbs_ratio():
        f.write(f'\t{verb[0][0]} with tag {verb[0][1]} with frequency {verb[1]}\n')

    # Dates, months, days
    f.write('--For each sentence containing the most frequent nouns, I provide the dates, months, and days:\n')
    for data, frequency in analyzer.data_searcher():
        f.write(f'\t{data} with frequency {frequency}\n')

    # Markov model of order 0
    f.write('--The phrase containing the most frequent nouns, at least 8 tokens long and at most 12 tokens long, with the highest probability is:\n')
    for phrase_markov, prob_markov in analyzer.markov_order_zero():
        f.write(f'\t{phrase_markov} with probability {prob_markov}\n')


def main(first_par, second_par, output_file):
    """Analyse two UTF-8 text files and write both reports to ``output_file``.

    Args:
        first_par: path of the first input text.
        second_par: path of the second input text.
        output_file: path of the report file; it is overwritten.
    """
    # Read both inputs up front; ``with`` guarantees the handles are closed.
    raw_texts = []
    for path in (first_par, second_par):
        with cds.open(path, 'r', 'utf-8') as handle:
            raw_texts.append(handle.read())

    splitter = nk.data.load('tokenizers/punkt/english.pickle')
    analyzers = [TextAnalyzer(splitter.tokenize(raw)) for raw in raw_texts]

    with open(output_file, 'w', encoding='utf-8') as f:
        _write_report(f, first_par, analyzers[0])
        # Blank lines separate the two reports.
        f.write('\n\n\n')
        _write_report(f, second_par, analyzers[1])



In [4]:
if __name__ == '__main__':
    # Input corpora and report destination, relative to the working directory.
    first_path, second_path = (
        '../text/dracula-UTF8.txt',
        '../text/hydeandjack-UTF8.txt',
    )
    output_file = '../output/output_program_2.out'
    main(first_path, second_path, output_file)