# Installations required

In [2]:
#*********REQUIRED PACKAGES**********

#!pip install spacy

#!python -m spacy download en_core_web_sm
#or
#conda install -c conda-forge spacy-model-en_core_web_sm

#!pip install gensim

# Import libraries

In [3]:
import pandas as pd
import numpy as np

import warnings
warnings.filterwarnings('ignore')

import spacy
nlp = spacy.load('en_core_web_sm')

from nltk.tokenize import sent_tokenize, word_tokenize
from nltk.stem import WordNetLemmatizer 
from nltk.corpus import stopwords

import re
import gensim 
import logging
import pickle
import math

Deprecated in NumPy 1.20; for more details and guidance: https://numpy.org/devdocs/release/1.20.0-notes.html#deprecations
  dtype=np.int):
Deprecated in NumPy 1.20; for more details and guidance: https://numpy.org/devdocs/release/1.20.0-notes.html#deprecations
  method='lar', copy_X=True, eps=np.finfo(np.float).eps,
Deprecated in NumPy 1.20; for more details and guidance: https://numpy.org/devdocs/release/1.20.0-notes.html#deprecations
  method='lar', copy_X=True, eps=np.finfo(np.float).eps,
Deprecated in NumPy 1.20; for more details and guidance: https://numpy.org/devdocs/release/1.20.0-notes.html#deprecations
  eps=np.finfo(np.float).eps, copy_Gram=True, verbose=0,
Deprecated in NumPy 1.20; for more details and guidance: https://numpy.org/devdocs/release/1.20.0-notes.html#deprecations
  eps=np.finfo(np.float).eps, copy_X=True, fit_path=True,
Deprecated in NumPy 1.20; for more details and guidance: https://numpy.org/devdocs/release/1.20.0-notes.html#deprecations
  eps=np.finfo(np.floa

# Collocations

In [4]:
class Collocations():
    ''' this class contains methods to extract collocations from the corpus

    The methods rely on names defined at module level in the notebook:
    ``nlp`` (the loaded spaCy pipeline), ``sent_tokenize`` (NLTK), ``gensim``,
    ``spacy``, ``re`` and ``pd``.
    '''

    def _strip_non_alpha(self, text):
        ''' returns `text` re-assembled token by token, with every run of characters
        other than ASCII letters replaced by a blank space '''
        cleaned = ''
        for token in nlp(text):
            cleaned = cleaned + ' ' + re.sub(r"[^a-zA-Z]+", ' ', token.text)
        return cleaned

    #GENSIM TOKENIZATION (SIMPLE)
    def nltk_sentence_tokenizer(self, x):
        ''' returns sentence tokenization for a given paragraph'''
        return sent_tokenize(x)

    def getPOStags(self, sent):
        ''' returns the POS tags for all the words in a given sentence

        `sent` is an iterable of tokens exposing a `pos_` attribute.
        '''
        return [token.pos_ for token in sent]

    def read_input_token(self, corpus):
        ''' reads each review, replaces any characters other than alphabets with a blank
        space and returns one list of lowercase word tokens per review

        The tokens of ALL sentences of a review are collected, so the result always has
        exactly one entry per review (an empty list when the review has no sentences).
        '''
        result = []
        for i, para in enumerate(corpus):
            if i % 10 == 0:
                print("Gensim: Processing para = ", i)
            para_tokens = []
            for sent in self.nltk_sentence_tokenizer(para):
                # simple_preprocess lowercases and drops tokens that are too short or too long
                para_tokens.extend(gensim.utils.simple_preprocess(self._strip_non_alpha(sent)))
            result.append(para_tokens)
        return result

    # RETOKENIZATION USING SPACY
    def processNounPhrases(self, noun_phrases):
        ''' removes determiners ("the", "a", "an" etc.) and whitespace-only tokens from
        each phrase and returns the cleaned phrases in the same order '''
        processed_NounPhrases = []
        for noun_phrase in noun_phrases:
            kept = [
                token.text
                for token in nlp(noun_phrase)
                if token.pos_ != 'DET' and (token.pos_ == 'NUM' or not token.text.isspace())
            ]
            processed_NounPhrases.append(' '.join(kept).lstrip())
        return processed_NounPhrases

    def get_tokens_retokenization(self, sent):
        ''' return the ngram tokenization for a sentence

        Named entities and noun chunks are merged into single tokens, so a returned
        token may contain several space-separated words.
        '''
        # Lowercase, then remove extra (non-alphabetic) characters
        sentence_doc = nlp(self._strip_non_alpha(sent.lower()))

        spans = list(sentence_doc.ents) + list(sentence_doc.noun_chunks)  # collect nodes
        spans = spacy.util.filter_spans(spans)  # remove duplicates / overlaps

        with sentence_doc.retokenize() as retokenizer:
            for span in spans:
                retokenizer.merge(span)

        merged_tokens = [token.text for token in sentence_doc]
        return self.processNounPhrases(merged_tokens)

    def read_input_retokenization(self, corpus):
        ''' return ngram tokens for the corpus, one list per review

        The tokens of ALL sentences of a review are collected, so the result always has
        exactly one entry per review (an empty list when the review has no sentences).
        '''
        result = []
        for i, para in enumerate(corpus):
            if i % 10 == 0:
                print("Retokenization: Processing para = ", i)
            para_tokens = []
            for sent in self.nltk_sentence_tokenizer(para):  # sentence tokenization
                para_tokens.extend(self.get_tokens_retokenization(sent))
            result.append(para_tokens)
        return result

    def find_ngrams_dict(self, ngrams):
        '''returns a dict containing the ngrams (other than unigrams) of each document,
        keyed by the position of the document in `ngrams` '''
        return {
            doc_index: [token for token in doc_tokens if len(token.split()) > 1]
            for doc_index, doc_tokens in enumerate(ngrams)
        }

    def replace_ngrams(self, reviews,ngrams_dict):
        ''' replace the original text in the corpus with the ngrams joined by '_'

        Each review is lowercased first; `ngrams_dict[i]` lists the ngrams of `reviews[i]`.
        The replacement is a plain substring replacement.
        '''
        new_reviews = []
        for doc_index, review in enumerate(reviews):
            text = review.lower()
            for ngram in ngrams_dict[doc_index]:
                text = text.replace(ngram, ngram.replace(' ', '_'))
            new_reviews.append(text)
        return new_reviews

    def make_new_csv(self, new_reviews, filename, sentiment_class):
        ''' creates a .csv file with the ngrams replaced

        Writes the columns 'text' and 'sentiment_class' to `filename` and returns the
        combined DataFrame.
        '''
        new_reviews_df = pd.DataFrame(new_reviews)
        new_reviews_df.columns = ['text']
        sentiment_class_df = pd.DataFrame(sentiment_class)
        sentiment_class_df.columns = ['sentiment_class']
        new_reviews_df = pd.concat([new_reviews_df, sentiment_class_df], axis=1)

        new_reviews_df.to_csv(filename, index=False, header=True)

        return new_reviews_df

# Pointwise Mutual Information (PMI)

In [5]:
class PMI():
    ''' this class contains methods to compute PMI scores for each ngram in the corpus

    ngrams are the '_'-joined tokens produced by the collocation step.
    Relies on the module-level names ``word_tokenize`` (NLTK), ``math`` and ``pd``.
    '''

    def find_all_ngrams_from_corpus(self, reviews_list):
        ''' returns the distinct tokens containing '_' found in the reviews, sorted so
        that the result (and any file written from it) is reproducible '''
        ngrams = set()
        for review in reviews_list:
            for word in word_tokenize(review):
                if '_' in word:
                    ngrams.add(word)
        return sorted(ngrams)

    def get_frequency_of_all_ngrams(self, ngrams_list, reviews_list):
        ''' returns a dict mapping each ngram to the number of (non-overlapping
        substring) occurrences of it across all reviews '''
        return {
            ngram: sum(review.count(ngram) for review in reviews_list)
            for ngram in ngrams_list
        }

    def get_frequency_of_all_corpus_words(self, raw_data):
        ''' returns a dict mapping each word token of the corpus to its frequency '''
        freq_dict_word = {}
        for review in raw_data:
            for token in word_tokenize(review):
                freq_dict_word[token] = freq_dict_word.get(token, 0) + 1
        return freq_dict_word

    # score = log2( freq(ngram) * len(freq_dict_word) / (freq(word_1) * ... * freq(word_n)) )
    # e.g. for "the_future": log2( freq(the_future) * vocabulary size / (freq(the) * freq(future)) )

    def get_pmi_score_for_ngram(self, freq_dict_word,freq_dict_ngram, ngram):
        ''' computes PMI score for an n-gram

        Returns 0 when the score cannot be computed: the ngram is unknown, one of its
        words is unknown, or a frequency is zero (the logarithm / division would be
        undefined).
        '''
        if ngram not in freq_dict_ngram:
            return 0

        freq_ngram = freq_dict_ngram[ngram]
        if freq_ngram <= 0:
            return 0  # log of 0 is undefined

        product = 1
        for token in ngram.split('_'):
            if token not in freq_dict_word:
                return 0
            product *= freq_dict_word[token]

        if product <= 0:
            return 0  # avoid division by zero

        return math.log(freq_ngram * len(freq_dict_word) / product, 2)

    def get_pmi_scores_for_all_ngrams(self, freq_dict_ngram, freq_dict_word):
        ''' computes PMI scores for all the ngrams in the corpus

        Returns a dict {ngram: (frequency, PMI score)} ordered by decreasing PMI score.
        '''
        pmi_scores_dict = {
            ngram: (frequency, self.get_pmi_score_for_ngram(freq_dict_word, freq_dict_ngram, ngram))
            for ngram, frequency in freq_dict_ngram.items()
        }
        return dict(sorted(pmi_scores_dict.items(), key=lambda item: item[1][1], reverse=True))

    def filter_ngrams_by_pmi(self, pmi_scores_df, pmi_threshold):
        ''' return only those ngrams having PMI score above the mentioned PMI_threshold value

        The rows are returned sorted by decreasing 'Frequency'.
        '''
        by_frequency = pmi_scores_df.sort_values(by='Frequency', ascending=False)
        return by_frequency.loc[by_frequency['PMI'] > pmi_threshold, :]

    def create_csv_from_dict(self, data,filename,column_names):
        ''' creates .csv file for the given dictionary data

        The dict keys become the index, the values the columns named by `column_names`.
        Returns the DataFrame that was written.
        '''
        df = pd.DataFrame.from_dict(data, orient='index')
        df.columns = column_names
        df.to_csv(filename, header=True)
        return df

# Finding N-grams

In [6]:
class FindNgrams():
    ''' this class contains the flow of execution for finding ngrams in the corpus

    Constructing an instance runs the whole pipeline:
      1. reads the 'text' and 'sentiment_class' columns of the input CSV,
      2. extracts ngrams and writes the ngram result files (skipped when
         NgramProcessedData.csv already exists in the ngram results directory),
      3. computes PMI scores and writes the PMI result files (skipped when
         FilteredNgramsByPMI.csv already exists in the PMI results directory).

    `object_dict` must provide the keys 'input_data_path', 'output_data_path',
    'input_filename', 'encoding', 'PMI_threshold', 'ngram_results_dir',
    'pmi_results_dir' and 'ngram_processed_data'.
    '''
    def __init__(self,object_dict):
        import os.path

        self.input_data_path = object_dict['input_data_path']
        self.output_data_path = object_dict['output_data_path']

        self.filename = object_dict['input_filename']
        self.encoding = object_dict['encoding']

        # PMI scores are real numbers, so accept fractional thresholds such as '0.5'
        self.pmi_threshold = float(object_dict['PMI_threshold'])
        self.ngram_results_dir = object_dict['ngram_results_dir']
        self.pmi_results_dir = object_dict['pmi_results_dir']

        self.ngram_processed_data = object_dict['ngram_processed_data']

        # Read the input file once and take both columns from the same frame
        input_df = pd.read_csv(self.input_data_path + self.filename, encoding=self.encoding)
        self.reviews = list(input_df['text'])
        self.sentiment_class = list(input_df['sentiment_class'])

        # Work on the first 100 and the last 100 rows only. With 200 rows or fewer the
        # two slices would overlap and duplicate rows, so the whole corpus is used.
        if len(self.reviews) > 200:
            self.reviews = self.reviews[:100] + self.reviews[-100:]
            self.sentiment_class = self.sentiment_class[:100] + self.sentiment_class[-100:]

        ngram_dir = self.output_data_path + self.ngram_results_dir
        pmi_dir = self.output_data_path + self.pmi_results_dir

        #---------------------- Ngrams main function ------------------------

        if not os.path.exists(ngram_dir + '/NgramProcessedData.csv'):

            collocations = Collocations()

            self.unigrams = list(collocations.read_input_token(self.reviews))
            self.unigrams_df = pd.DataFrame(self.unigrams)
            self.unigrams_df.to_csv(ngram_dir + '/WordTokensForEachDoc.csv',index=False) #stores all unigrams in the corpus document wise (simple word tokenization)

            self.all_unigrams = []
            for doc_unigrams in self.unigrams:
                self.all_unigrams.extend(doc_unigrams)

            self.unique_unigrams = list(set(self.all_unigrams))
            self.unique_unigrams_df = pd.DataFrame(self.unique_unigrams)
            self.unique_unigrams_df.to_csv(ngram_dir + '/UnigramsOfCorpus.csv',index=False) #list of all the unigrams in the corpus (no duplicates)

            self.ngrams = list(collocations.read_input_retokenization(self.reviews))

            # drop empty tokens from every document
            self.new_ngrams = []
            for doc_ngrams in self.ngrams:
                self.new_ngrams.append(list(filter(None,doc_ngrams)))

            self.new_ngrams_df = pd.DataFrame([self.new_ngrams]).transpose()
            self.new_ngrams_df.columns=['Ngram tokens']
            self.new_ngrams_df.to_csv(ngram_dir + '/Ngrams.csv',index=False) #stores all ngrams document wise

            self.ngrams_dict = collocations.find_ngrams_dict(self.ngrams)
            self.ngrams_dict_df = pd.DataFrame.from_dict(self.ngrams_dict, orient='index')
            self.ngrams_dict_df.to_csv(ngram_dir + '/NgramsDictForEachDoc.csv',index=False) #stores all ngrams except unigrams (document wise)

            self.ngram_replaced_reviews = collocations.replace_ngrams(self.reviews, self.ngrams_dict)

            self.ngram_replaced_reviews_df = collocations.make_new_csv(self.ngram_replaced_reviews, ngram_dir + '/NgramProcessedData.csv', self.sentiment_class) #replaces ngrams in the original corpus with _

        else:
            # reuse the cached result of a previous run
            self.ngram_replaced_reviews_df = pd.read_csv(ngram_dir + '/NgramProcessedData.csv') #replaced ngrams in the original corpus with _

        #------------------------------ PMI main function -----------------------------

        if not os.path.exists(pmi_dir + '/FilteredNgramsByPMI.csv'):

            pmi = PMI()

            self.raw_data = self.reviews
            self.raw_data = [review.lower() for review in self.raw_data]

            self.reviews_list = list(pd.read_csv(ngram_dir + '/NgramProcessedData.csv')['text']) #reviews after finding ngrams

            self.ngrams_list = pmi.find_all_ngrams_from_corpus(self.reviews_list)
            # keep the DataFrame itself; DataFrame.to_csv returns None when given a path
            self.ngrams_list_df = pd.DataFrame(self.ngrams_list)
            self.ngrams_list_df.to_csv(pmi_dir + '/Ngrams.csv',index=False) #stores all the ngrams from corpus

            self.freq_dict_ngram = pmi.get_frequency_of_all_ngrams(self.ngrams_list, self.reviews_list)
            self.freq_dict_ngram_df = pmi.create_csv_from_dict(self.freq_dict_ngram, pmi_dir + '/NgramFrequencies.csv',['Frequency']) #stores the frequency of each ngram

            self.freq_sorted_dict = dict(sorted(self.freq_dict_ngram.items(), key=lambda item: item[1], reverse=True))

            self.freq_dict_word = pmi.get_frequency_of_all_corpus_words(self.raw_data)
            self.freq_dict_word_df = pmi.create_csv_from_dict(self.freq_dict_word, pmi_dir + '/WordFrequencies.csv',['Frequency']) #stores the frequency of each unigram in the corpus

            self.pmi_scores_dict = pmi.get_pmi_scores_for_all_ngrams(self.freq_dict_ngram, self.freq_dict_word)
            self.pmi_scores_df = pmi.create_csv_from_dict(self.pmi_scores_dict, pmi_dir + '/NgramPMIScores.csv',['Frequency','PMI']) #stores the frequency and PMI score for each ngram

            self.filtered_pmi_df = pmi.filter_ngrams_by_pmi(self.pmi_scores_df, self.pmi_threshold)
            self.filtered_pmi_df.to_csv(pmi_dir + '/FilteredNgramsByPMI.csv')

        else:
            # reuse the cached result of a previous run
            self.filtered_pmi_df = pd.read_csv(pmi_dir + '/FilteredNgramsByPMI.csv')
            

# Create Configuration files

In [7]:
# Run the configuration script (CollocationsConfiguration.py) from the ../conf directory.
# NOTE(review): presumably this script generates ../conf/collocations.ini, which the
# main cell reads — confirm against the script.
%run ../conf/CollocationsConfiguration.py

# Main function

In [8]:
if __name__ == '__main__':
    # Entry point: read the collocations configuration, make sure the result
    # directories exist and run the ngram / PMI pipeline.

    import os
    from configparser import ConfigParser

    config = ConfigParser()
    # ConfigParser.read() silently skips files it cannot open and returns the list of
    # files it parsed, so an empty result means the configuration file is missing.
    if not config.read('../conf/collocations.ini'): #read collocations configuration file
        raise FileNotFoundError("Configuration file '../conf/collocations.ini' could not be read")

    #read the values from configuration file and store them in a dictionary for further usage
    section = config['Collocations']
    object_dict = dict()

    object_dict['input_data_path'] = section['input_data_path']
    object_dict['output_data_path'] = section['output_data_path']
    object_dict['input_filename'] = section['input_filename']
    object_dict['encoding'] = section['encoding']
    object_dict['ngram_results_dir'] = section['ngram_results_dir']
    object_dict['pmi_results_dir'] = section['pmi_results_dir']

    object_dict['PMI_threshold'] = section['PMI_threshold']

    # e.g. 'IMDBtrain.csv' -> 'IMDBtrain_NgramsProcessed.csv'
    object_dict['ngram_processed_data'] = object_dict['input_filename'].split('.')[0] + '_NgramsProcessed.csv'

    print(object_dict)

    # Create every result directory independently: with exist_ok=True an already
    # existing directory no longer prevents the remaining ones from being created.
    result_dirs = [
        '../results/Collocations_Results',
        object_dict['output_data_path'] + object_dict['ngram_results_dir'],
        object_dict['output_data_path'] + object_dict['pmi_results_dir'],
    ]
    for result_dir in result_dirs:
        os.makedirs(result_dir, exist_ok=True)

    result = FindNgrams(object_dict) #Find ngrams from the corpus

{'input_data_path': '../data/', 'output_data_path': '../results/Collocations_Results/', 'input_filename': 'IMDBtrain.csv', 'encoding': 'latin', 'ngram_results_dir': 'Ngram_Results', 'pmi_results_dir': 'PMI_Results', 'PMI_threshold': '0', 'ngram_processed_data': 'IMDBtrain_NgramsProcessed.csv'}
Gensim: Processing para =  0
Gensim: Processing para =  10
Gensim: Processing para =  20
Gensim: Processing para =  30
Gensim: Processing para =  40
Gensim: Processing para =  50
Gensim: Processing para =  60
Gensim: Processing para =  70
Gensim: Processing para =  80
Gensim: Processing para =  90
Gensim: Processing para =  100
Gensim: Processing para =  110
Gensim: Processing para =  120
Gensim: Processing para =  130
Gensim: Processing para =  140
Gensim: Processing para =  150
Gensim: Processing para =  160
Gensim: Processing para =  170
Gensim: Processing para =  180
Gensim: Processing para =  190
Retokenization: Processing para =  0
Retokenization: Processing para =  10
Retokenization: Proce