In [18]:
from math import log2

class NgramAnalyzer:
    """Count the unigrams, bigrams and trigrams of a text file and score them by PMI.

    Tokens are whatever ``str.split()`` yields for each line (whitespace-separated,
    no case folding or punctuation stripping).

    Attributes:
        file_url: path handed to ``open()`` when ``run()`` reads the corpus.
        encoding: text encoding handed to ``open()``; ``None`` keeps ``open()``'s default.
        gram_dict: maps each n-gram (words joined by single spaces) to its count.
        total_unigrams / total_bigrams / total_trigrams: number of n-gram
            occurrences (not distinct n-grams) seen for each order.
    """

    def __init__(self, file_url, encoding=None):
        self.file_url = file_url
        self.encoding = encoding
        # State lives on the instance: a class-level dict would be shared by every
        # analyzer, so a second instance would silently add to the first one's counts.
        self._reset_counts()

    def _reset_counts(self):
        """Drop every collected n-gram count and total."""
        self.gram_dict = {}
        self.total_unigrams = 0
        self.total_bigrams = 0
        self.total_trigrams = 0

    def ngram_creator(self, wordlist, n):
        """Return every run of ``n`` consecutive words of ``wordlist`` as a list of lists.

        Runs that would extend past the end of ``wordlist`` are dropped, so a list
        shorter than ``n`` yields an empty result.
        """
        windows = (wordlist[start:start + n] for start in range(len(wordlist)))
        return [window for window in windows if len(window) == n]

    def run(self):
        """Read the file and (re)build ``gram_dict`` and the three totals.

        Counts are reset first, so calling ``run()`` again recounts the file
        instead of adding to the previous counts.
        """
        self._reset_counts()
        previous_words = []
        with open(self.file_url, encoding=self.encoding) as myfile:
            # Reading line by line avoids out-of-memory issues with large files.
            for line in myfile:
                words = line.split()

                # Prefix the last one/two words of the previous line so that
                # n-grams spanning a line break are captured. Only the immediately
                # preceding line is used, so a blank line resets this context.
                unigrams = self.ngram_creator(words, 1)
                bigrams = self.ngram_creator(previous_words[-1:] + words, 2)
                trigrams = self.ngram_creator(previous_words[-2:] + words, 3)

                self.total_unigrams += len(unigrams)
                self.total_bigrams += len(bigrams)
                self.total_trigrams += len(trigrams)

                # Store every n-gram under its space-joined text.
                for gram in unigrams + bigrams + trigrams:
                    key = " ".join(gram)
                    self.gram_dict[key] = self.gram_dict.get(key, 0) + 1

                previous_words = words

    def run_pmi(self, ocurrences_threshold, score_threshold=0):
        """Score the collected bigrams and trigrams by pointwise mutual information.

        The score is ``log2(P(w1..wn) / (P(w1) * ... * P(wn)))``, where the joint
        probability is the n-gram count divided by the total number of n-grams of
        that order, and each word probability is its count divided by the total
        number of unigrams.

        Args:
            ocurrences_threshold: an n-gram is considered only if every one of its
                words occurs strictly more than this many times.
            score_threshold: n-grams scoring below this value are discarded.

        Returns:
            A ``(bigrams_scores, trigrams_scores)`` tuple of dicts mapping n-gram
            text to its score.
        """
        # An explicit check instead of try/assert/except: asserts are removed under
        # ``python -O``, which would silently skip the automatic ``run()``.
        if not self.gram_dict:
            print ("******* internally it called run method")
            self.run()

        bigrams_scores = {}
        trigrams_scores = {}
        for gram, gram_frequency in self.gram_dict.items():
            words = gram.split()

            # Skip unigrams and n-grams containing a word that is too rare.
            if len(words) < 2:
                continue
            word_frequencies = [self.gram_dict[word] for word in words]
            if not all(frequency > ocurrences_threshold for frequency in word_frequencies):
                continue

            # Joint probability of the n-gram within its own order.
            total_ngrams = self.total_bigrams if len(words) == 2 else self.total_trigrams
            joint_probability = gram_frequency / total_ngrams

            # Probability of the words co-occurring if they were independent.
            independent_probability = 1
            for frequency in word_frequencies:
                independent_probability = independent_probability * (frequency / self.total_unigrams)

            coocurrence_score = log2(joint_probability / independent_probability)
            if coocurrence_score < score_threshold:
                continue

            # Report bigrams and trigrams in separate dictionaries.
            if len(words) == 2:
                bigrams_scores[gram] = coocurrence_score
            elif len(words) == 3:
                trigrams_scores[gram] = coocurrence_score
        return bigrams_scores, trigrams_scores

In [19]:
# ---- Main ----

# Build the analyzer for the corpus file and count its unigrams, bigrams and
# trigrams; the counts are stored in the analyzer's `gram_dict` attribute.
url = "C:/my_temp_files/peter_norvig_file.txt"
my_ngram_analyzer = NgramAnalyzer(url)
my_ngram_analyzer.run()

# Demonstration: peek at the first entries of the n-gram frequency dictionary.
dictionary = my_ngram_analyzer.gram_dict

print("Demostration: dictionary of ngrams frequencies")

# Show only the first eleven entries (indices 0..10), then stop.
for i, (gram, count) in enumerate(dictionary.items()):
    print(gram, count)
    if i >= 10:
        break
print ("... \nA total of", len(dictionary), "different ngrams where found.")

Demostration: dictionary of ngrams frequencies
The 6149
Project 205
Gutenberg 78
EBook 5
of 39169
Adventures 2
Sherlock 95
Holmes 198
The Project 13
Project Gutenberg 74
Gutenberg EBook 5
... 
A total of 1386757 different ngrams where found.


In [20]:
# Score bigrams and trigrams by PMI. An n-gram is kept only when each of its words
# occurs more than `occurences_threshold` times and its score is at least
# `score_threshold`.
occurences_threshold, score_threshold = 100, 5
bigram_pmi_scores, trigram_pmi_scores = my_ngram_analyzer.run_pmi(
    occurences_threshold,
    score_threshold=score_threshold,
)


# demonstration: bigram and trigram pmi_scores
from pandas import DataFrame as df

def gram_dataframe(ngram_dictionary, pmi_threshold = 0):
    """Turn an n-gram -> score mapping into a DataFrame sorted by descending score.

    Args:
        ngram_dictionary: mapping whose keys fill the "ngram" column and whose
            values fill the "pmi_score" column.
        pmi_threshold: accepted but not used; no filtering is applied here.

    Returns:
        A DataFrame with columns "ngram" and "pmi_score", ordered from the
        highest to the lowest "pmi_score".
    """
    columns = {
        "ngram": [ngram for ngram in ngram_dictionary],
        "pmi_score": [ngram_dictionary[ngram] for ngram in ngram_dictionary],
    }
    unsorted_df = df.from_dict(columns)
    return unsorted_df.sort_values(by='pmi_score', ascending=False)

# Demonstration: bigram and trigram PMI scores, highest score first. The banner
# reports the occurrence threshold actually used rather than a hard-coded 100, so
# it stays correct when `occurences_threshold` is changed.
print("demonstration: bigrams and trigrams pmi_scores. Showing for threshold =", occurences_threshold)
print(gram_dataframe(bigram_pmi_scores))
print(gram_dataframe(trigram_pmi_scores))


demonstration: bigrams and trigrams pmi_scores. Showing for threshold = 100
                     ngram  pmi_score
1496  Project Gutenberg-tm  12.412818
2792   [Illustration: FIG.  11.980397
1504              New York  11.195880
815          United States  11.008437
3748         Princess Mary  10.598099
4238        Princess Mary,  10.484811
7                   OF THE  10.212413
2916           vessel wall  10.181655
2740           takes place  10.133388
36                "My dear  10.047668
2932         lymph vessels  10.020624
347            Mr. Holmes,  10.007705
2884          lymph glands   9.943219
4925      presented itself   9.879802
3766         drawing room,   9.861783
3051            soft parts   9.849331
2946    treatment consists   9.825658
3760         drawing room.   9.823951
1737           paper money   9.784738
2901          takes place,   9.781307
2958          cold abscess   9.766711
1110         several times   9.762912
3203      observed chiefly   9.750122
1993  federa

In [21]:
# Written conclusions of the experiment, assembled line by line and printed as-is.
# The leading and trailing empty entries reproduce the blank line before and the
# newline after the text.
analysis_lines = [
    "",
    "Analysis:",
    "- A min_ocurrence_threshold = 100 seems to work pretty good for extract collocation of bigrams.",
    "- However at extracting collocation of trigrams a min_ocurrence_threshold = 360 leads to better results.",
    "- Posterior data cleaning like (case converter, remove punctuation, POS filtering, etc) can increase the gap for discriminate collocation and keywords.",
    "",
]
analysis = "\n".join(analysis_lines)
print(analysis)


Analysis:
- A min_ocurrence_threshold = 100 seems to work pretty good for extract collocation of bigrams.
- However at extracting collocation of trigrams a min_ocurrence_threshold = 360 leads to better results.
- Posterior data cleaning like (case converter, remove punctuation, POS filtering, etc) can increase the gap for discriminate collocation and keywords.

