In [1]:
from __future__ import division
import operator
import nltk
import string
import numpy as np
import pandas as pd

In [2]:
def isPunct(word):
    return len(word) == 1 and word in string.punctuation

def isNumeric(word):
    try:
        float(word) if '.' in word else int(word)
        return True
    except ValueError:
        return False

In [3]:
class RakeKeywordExtractor:

    def __init__(self):
        self.stopwords = set(nltk.corpus.stopwords.words())
        self.top_fraction = 1 # consider top third candidate keywords by score

    def _generate_candidate_keywords(self, sentences):
        phrase_list = []
        for sentence in sentences:
            words = map(lambda x: "|" if x in self.stopwords else x,nltk.word_tokenize(sentence.lower()))
            ##words => NONSTOPWORD | NONSTOPWORD NONSTOPWORD NONSTOPWORD | | NONSTOPWORD | NONSTOPWORD NONSTOPWORD |
            phrase = []
            for word in words:
                if word == "|" or isPunct(word):
                    if len(phrase) > 0:
                        phrase_list.append(phrase)#Got At least 1 NonStopWord
                    phrase = []#Prepare for Next Continous NonStopWords
                else:
                    phrase.append(word)#NonStopWord
        return phrase_list

    def _calculate_word_scores(self, phrase_list):
        word_freq = nltk.FreqDist()
        word_degree = nltk.FreqDist()
        for phrase in phrase_list:
            degree = len(list(filter(lambda x: not isNumeric(x), phrase))) #Number of Distinct Words in the Phrase
            for word in phrase:
                word_freq[word] += 1
                word_degree[word]+=degree
        # word score = deg(w) / freq(w)
        word_scores = {}
        for word in word_freq.keys():
            word_scores[word] = word_degree[word] / word_freq[word]
        return word_scores

    def _calculate_phrase_scores(self, phrase_list, word_scores):
        phrase_scores = {}
        for phrase in phrase_list:
            phrase_score = 0
            for word in phrase:
                phrase_score += word_scores[word]
                phrase_scores[" ".join(phrase)] = phrase_score
        return phrase_scores
    
    def extract(self, text, incl_scores=False):
        sentences = nltk.sent_tokenize(text)
        phrase_list = self._generate_candidate_keywords(sentences)
        word_scores = self._calculate_word_scores(phrase_list)
        phrase_scores = self._calculate_phrase_scores(phrase_list, word_scores)
        sorted_phrase_scores = sorted(phrase_scores.items(),
        key=operator.itemgetter(1), reverse=True)
        n_phrases = len(sorted_phrase_scores)
        if incl_scores:
            return sorted_phrase_scores[0:int(n_phrases/self.top_fraction)]
        else:
            return map(lambda x: x[0],sorted_phrase_scores[0:int(n_phrases/self.top_fraction)])

In [4]:
def test(feedback):
    rake = RakeKeywordExtractor()
    keywords = rake.extract(feedback, incl_scores=True)
    return keywords
f1="""
    Compatibility of systems of linear constraints over the set of natural 
    numbers. Criteria of compatibility of a system of linear Diophantine 
    equations, strict inequations, and nonstrict inequations are considered. 
    Upper bounds for components of a minimal set of solutions and algorithms 
    of construction of minimal generating sets of solutions for all types of 
    systems are given. These criteria and the corresponding algorithms for 
    constructing a minimal supporting set of solutions can be used in solving 
    all the considered types of systems and systems of mixed types.
    """ 
test(f1)

[('minimal generating sets', 8.666666666666666),
 ('linear diophantine equations', 8.5),
 ('minimal supporting set', 7.666666666666666),
 ('minimal set', 4.666666666666666),
 ('linear constraints', 4.5),
 ('strict inequations', 4.0),
 ('natural numbers', 4.0),
 ('upper bounds', 4.0),
 ('nonstrict inequations', 4.0),
 ('mixed types', 3.666666666666667),
 ('corresponding algorithms', 3.5),
 ('considered types', 3.166666666666667),
 ('set', 2.0),
 ('types', 1.6666666666666667),
 ('considered', 1.5),
 ('algorithms', 1.5),
 ('systems', 1.0),
 ('solutions', 1.0),
 ('solving', 1.0),
 ('system', 1.0),
 ('criteria', 1.0),
 ('construction', 1.0),
 ('given', 1.0),
 ('used', 1.0),
 ('constructing', 1.0),
 ('compatibility', 1.0),
 ('components', 1.0)]

In [5]:
import json
from pprint import pprint
def parse(filename):
    with open(filename) as data_file:
        data = json.load(data_file) 
        return data

In [6]:
parsedData=parse("Musical_Instruments_5.json")

In [7]:
commentText=[]
rakeTags=[]
amazonMusicReviewDF=pd.DataFrame(columns=["ReviewText","RakePhrases"])
for i in np.arange(10):
    amazonMusicReviewDF.loc[i]=[parsedData[i]["reviewText"],test(parsedData[i]["reviewText"])]

In [8]:
amazonMusicReviewDF.loc[3].ReviewText,amazonMusicReviewDF.loc[3].RakePhrases

('Nice windscreen protects my MXL mic and prevents pops. Only thing is that the gooseneck is only marginally able to hold the screen in position and requires careful positioning of the clamp to avoid sagging.',
 [('requires careful positioning', 9.0),
  ('nice windscreen protects', 9.0),
  ('avoid sagging', 4.0),
  ('marginally able', 4.0),
  ('mxl mic', 4.0),
  ('prevents pops', 4.0),
  ('thing', 1.0),
  ('gooseneck', 1.0),
  ('screen', 1.0),
  ('position', 1.0),
  ('clamp', 1.0),
  ('hold', 1.0)])

In [9]:
i=4
amazonMusicReviewDF.loc[i].ReviewText,amazonMusicReviewDF.loc[i].RakePhrases

("This pop filter is great. It looks and performs like a studio filter. If you're recording vocals this will eliminate the pops that gets recorded when you sing.",
 [("'re recording vocals", 9.0),
  ('pop filter', 4.0),
  ('studio filter', 4.0),
  ('performs like', 4.0),
  ('gets recorded', 4.0),
  ('eliminate', 1.0),
  ('sing', 1.0),
  ('looks', 1.0),
  ('pops', 1.0),
  ('great', 1.0)])

In [10]:
i=7
amazonMusicReviewDF.loc[i].ReviewText,amazonMusicReviewDF.loc[i].RakePhrases

("I now use this cable to run from the output of my pedal chain to the input of my Fender Amp. After I bought Monster Cable to hook up my pedal board I thought I would try another one and update my guitar. I had been using a high end Planet Waves cable that I bought in the 1980's... Once I found out the input jacks on the new Monster cable didn't fit into the Fender Strat jack I was a little disappointed... I didn't return it and as stated I use it for the output on the pedal board. Save your money... I went back to my Planet Waves Cable...I payed $30.00 back in the eighties for the Planet Waves which now comes in at around $50.00. What I'm getting at is you get what you pay for. I thought Waves was a lot of money back in the day...but I haven't bought a guitar cable since this one...20 plus years and still working...Planet Waves wins.",
 [('still working ... planet waves wins', 28.75),
  ('one ... 20 plus years', 19.6),
  ('would try another one', 16.0),
  ('planet waves cable ...', 1

In [107]:
class RakeWithPMIKeywordExtractor:

    def __init__(self):
        self.stopwords = set(nltk.corpus.stopwords.words())
        self.top_fraction = 1 # consider top third candidate keywords by score

    def _generate_candidate_keywords(self, sentences):
        phrase_list = []
        for sentence in sentences:
            words = map(lambda x: "|" if x in self.stopwords else x,nltk.word_tokenize(sentence.lower()))
            ##words => NONSTOPWORD | NONSTOPWORD NONSTOPWORD NONSTOPWORD | | NONSTOPWORD | NONSTOPWORD NONSTOPWORD |
            phrase = []
            for word in words:
                if word == "|" or isPunct(word):
                    if len(phrase) > 0:
                        phrase_list.append(phrase)#Got At least 1 NonStopWord
                    phrase = []#Prepare for Next Continous NonStopWords
                else:
                    phrase.append(word)#NonStopWord
        return phrase_list

    def _chkAndAddDictEntry(self,dictObj,key,value=1):
        if dictObj.get(key)==None:
            dictObj[key]=value
        else:
            dictObj[key]+=value
    def _chkAndInitiateDictEntry(self,dictOfDictObj,key,initialDict):
        if dictOfDictObj.get(key)==None:
            dictOfDictObj[key]=initialDict;
        return dictOfDictObj[key]
    #This implementation of PMI does ignore reapeating words in a KeyPhrase
    def _preparePMIDPMatrix(self,all_phrase_lists):
        corpusLevelDict={}#It is Symmetric corpusLevelDict["w1"]["w2"]=corpusLevelDict["w2"]["w1"]
        wordLevelDict={}
        for phrase_list_all in all_phrase_lists:
            phrase_list=set(phrase_list_all)
            for word in phrase_list:
                wordLevelDict=self._chkAndInitiateDictEntry(corpusLevelDict,word,{})
                for neighWord in phrase_list:
                    if word==neighWord:
                        continue
                    self._chkAndAddDictEntry(wordLevelDict,neighWord)
        corpusDictSz=len(corpusLevelDict.keys())         
        print("Corpus Dictionary Size ",corpusDictSz)
        print("Total KeyPhrases ",len(all_phrase_lists))
        return corpusLevelDict

In [108]:
def getListOfPharases(reviewDF,fieldName):
    listOfPhrases=[]
    rakePhrases=reviewDF[fieldName]
    for i in np.arange(len(rakePhrases)):
        rakePhrase=rakePhrases[i]
        for j in np.arange(len(rakePhrase)):
            listOfPhrases.append(rakePhrase[j][0].split(" "))
    return listOfPhrases
def testPMI():
    listOfPhrases=getListOfPharases(amazonMusicReviewDF,"RakePhrases")
    rake = RakeWithPMIKeywordExtractor()
    return rake._preparePMIDPMatrix(listOfPhrases)

In [109]:
topLevelDict=testPMI()

Corpus Dictionary Size  239
Total KeyPhrases  187


In [118]:
print(topLevelDict["lowest"])
print(topLevelDict["pop"])
print(topLevelDict["filters"])
print(topLevelDict["filter"])

{'prices': 1, 'filters': 1, 'pop': 1}
{'filters': 1, 'sounds': 1, 'prices': 1, 'filter': 3, 'lowest': 1, 'next': 1}
{'prices': 1, 'lowest': 1, 'pop': 1}
{'studio': 1, 'pop': 3, 'cloth': 1, 'double': 1, 'blocks': 1, 'next': 1}


In [None]:
listOfPhrases

In [None]:
corpusLevelDict

In [35]:
p1=[["ab","w2","w3"],["cd","w1","w2","cd"],[]]
for phrase_list_all in p1:
    phrase_list=set(phrase_list_all)
    print(phrase_list)

{'w2', 'ab', 'w3'}
{'w1', 'cd', 'w2'}
set()


In [20]:
topLevelDict

{'hj': 156, 'nota': {'notb': 45.1435}, 'pyth': {}}

In [24]:
len(topLevelDict.keys())

3

In [28]:
amazonMusicReviewDF.to_excel("AmazonMusicReview_RakeKeyPharses.xls")