# Parsing data files

NLTK already contains the required corpora. The simplest way is to reuse the NLTK reviews reader and the Review object:

In [1]:
import nltk
import numpy as np
from nltk import word_tokenize
from nltk.corpus import product_reviews_1
from textblob import TextBlob # for noun phrase extraction
from apyori import apriori # For the Apriori algorithm
from numpy import sign
from sklearn.feature_extraction.text import TfidfVectorizer
import string
import time

# Fetch the product review corpus (a no-op when it is already installed,
# as the cell output below shows).
nltk.download('product_reviews_1')
# Load every review of the Canon G3 file through the NLTK corpus reader.
camera_reviews = product_reviews_1.reviews('Canon_G3.txt')
r = camera_reviews[0]
# Displayed value: the (feature, score) pairs annotated on the first
# sentence of the first review.
r.review_lines[0].features


[nltk_data] Downloading package product_reviews_1 to
[nltk_data]     /Users/andrejwork/nltk_data...
[nltk_data]   Package product_reviews_1 is already up-to-date!


[('canon powershot g3', '+3')]

# Preprocessing

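The preprocessing step wraps every review sentence in a helper object and extracts its candidate nouns and noun phrases (with optional spelling correction, lemmatization and stemming). The same classes also hold the feature mining, sentiment and evaluation steps: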

In [25]:
class PSent:
    """A single review sentence plus everything the pipeline derives from it.

    The constructor only stores the raw tokens and the human-labelled
    features; the remaining attributes start empty and are filled in by
    later stages.
    """

    def __init__(self, s, f):
        self.raw = s    # sentence tokens exactly as received
        self.pp = []    # sentence after preprocessing
        self.ft = {}    # mined feature -> polarity assigned by the model
        self.op = []    # opinion words
        self.test = f   # features labelled by a human
        self.eval = {}  # evaluation scores

    def sentiment(self):
        """Give every mined feature the polarity sign of the whole sentence.

        Does nothing when no feature has been mined for this sentence.
        """
        if not self.ft:
            return

        polarity = TextBlob(str(self)).sentiment.polarity
        if polarity > 0:
            polarity = 1
        elif polarity < 0:
            polarity = -1

        # The base model does not distinguish between the features of one
        # sentence: they all receive the same value.
        self.ft.update(dict.fromkeys(self.ft, polarity))

    def __repr__(self):
        return "PSent"

    def __str__(self):
        # Raw tokens joined by single spaces.
        return ' '.join(self.raw)
        
class PRev:
    """One NLTK review together with its preprocessed sentences."""

    def __init__(self, r):
        self.NltkReview = r  # the wrapped NLTK review object
        self.review = []     # PSent objects, filled by preprocess()
        self.eval = {}       # evaluation scores

    def preprocess(self,
                   spelling=False,
                   stemming=False,
                   lemmatization=True):
        """Rebuild ``self.review`` with one ``PSent`` per review line.

        For every line, the nouns (POS tag starting with ``NN``, at least
        three characters long) and the TextBlob noun phrases are collected
        and stored as the sentence's ``pp`` list.

        Args:
            spelling: run TextBlob's ``correct()`` on the sentence first.
            stemming: stem each collected term with the English Snowball
                stemmer.
            lemmatization: call ``lemmatize()`` on each collected term.
        """
        es = nltk.stem.SnowballStemmer('english')

        self.review = []

        for rl in self.NltkReview.review_lines:
            # Keep the human-labelled features next to the sentence.
            ps = PSent(rl.sent, rl.features)

            a = TextBlob(' '.join(rl.sent))

            # Optional spelling correction.
            if spelling:
                a = a.correct()

            # Single nouns of at least three characters.
            nouns = [word for word, tag in a.tags
                     if tag[:2] == 'NN' and len(word) >= 3]

            # Noun phrases come first, followed by the single nouns.
            nouns = a.noun_phrases + nouns

            # Optional lemmatization.
            if lemmatization:
                nouns = [n.lemmatize() for n in nouns]

            # Optional stemming.
            if stemming:
                nouns = [es.stem(n) for n in nouns]

            ps.pp = nouns
            self.review.append(ps)

    def sents(self):
        """Return the preprocessed term list of every sentence."""
        return [s.pp for s in self.review]

    def sents_str(self):
        """Return all preprocessed terms of the review as one string.

        Terms are separated by single spaces, both within and between
        sentences, so that terms of neighbouring sentences are never
        glued into one token. Sentences without terms are skipped.
        """
        return ' '.join(' '.join(s.pp) for s in self.review if s.pp)

    def sentiment(self):
        """Run sentiment detection on every sentence of the review."""
        for s in self.review:
            s.sentiment()

    def __str__(self):
        # Title followed by one line per sentence of the NLTK review.
        body = ''.join(' '.join(s) + '\n' for s in self.NltkReview.sents())
        return self.NltkReview.title + body

    
class PReviews:
    
    def __init__(self, nltk_corpus):
        """Load one file of the product_reviews_1 corpus and wrap its reviews.

        Args:
            nltk_corpus: file id passed to ``product_reviews_1.reviews``;
                also kept as the product name.
        """
        self.name = nltk_corpus
        self.NltkCorpus = product_reviews_1.reviews(nltk_corpus)

        # Result containers, filled by the evaluation / report methods.
        self.eval = {}
        self.report = {}
        self.test_report = {}

        # One PRev wrapper per NLTK review.
        self.revs = [PRev(review) for review in self.NltkCorpus]

        print(len(self.revs))
        
    def preprocess(self, 
                         spelling = False,
                         stemming = False):
        
        start = time.time()

        for r in self.revs:
            r.preprocess(spelling = spelling,
                         stemming = stemming)
            
        end = time.time()

        print("Preprocessed {} reviews in {:.2f} seconds (spelling correction={}, stemming={})"
              .format(len(self.revs), end-start, spelling, stemming))
        
    def get_tfidf_top_features(self, documents, n_top=10):
        """Return the ``n_top`` terms with the largest summed TF-IDF weight.

        Args:
            documents: iterable of strings, one per document.
            n_top: number of terms to return.

        Returns:
            NumPy array of terms, ordered from highest to lowest total
            weight over all documents.
        """
        tfidf_vectorizer = TfidfVectorizer(max_df=0.95, min_df=2, stop_words='english')
        tfidf = tfidf_vectorizer.fit_transform(documents)

        # Column sums give one total weight per term; sort descending.
        importance = np.argsort(np.asarray(tfidf.sum(axis=0)).ravel())[::-1]

        # Newer scikit-learn releases only provide get_feature_names_out();
        # fall back to the older accessor when it is the only one available.
        get_names = getattr(tfidf_vectorizer, 'get_feature_names_out', None)
        if get_names is None:
            get_names = tfidf_vectorizer.get_feature_names
        tfidf_feature_names = np.asarray(get_names())

        return tfidf_feature_names[importance[:n_top]]

    def features(self,
                 apr = True,
                 min_support=0.005, 
                 min_confidence=0.2, 
                 min_lift=3, 
                 min_length=3):
        
        if not self.revs:
            self.preprocess()
        
        sentence_dict = []
        reviews_dict = []
        for r in self.revs:
            reviews_dict.append(r.sents_str()) # Each review as string (for TFIDF
            sentence_dict.extend(r.sents())            
            
        
        top_n = list(self.get_tfidf_top_features(reviews_dict, n_top=20))
        
                       
        # Applying Apriori algorithm to find potential features:
        self.association_rules = apriori(sentence_dict, 
                                    min_support=min_support, 
                                    min_confidence=min_confidence, 
                                    min_lift=min_lift, 
                                    min_length=min_length)
        
        association_results = list(self.association_rules)
        for f in association_results:
            top_n.extend(list(f.items))
        
        # print(top_n)
        # Find features in individual revies (per sentence)
        # Tag those sentences where features were found
        for r in self.revs:
            for s in r.review:
                found = False
                for word in top_n:
                    # print("Check if {} in {}".format(word, s.pp))
                    if word in s.pp:                        
                        # print("Found feature {} in {}".format(word, s.pp))
                        s.ft[word]=0
                        
                # print("Checking review for features")                
                # for f in association_results:
                #    feat = list(f.items)  
                #    # print(feat)
                #            
                #    found = False
                #    for word in feat:
                #        # print("Check if {} in {}".format(e, s))
                #        found = False
                #        for np in s.pp:
                #            if word == np or word in np:
                #                # print("Found feature {} in {}".format(e, np))
                #                found = True
                #                break
                #        if not found:
                #            break                
                #    if(found):                        
                #        featstr = feat[1]+' '+feat[0]
                #        # print("Found {} in {}".format(feat[::-1], s))
                #        # s.ft.append(feat[::-1])
                #        s.ft[featstr]=0                
        
        return top_n
    
    def opinions(self):
        # Extracting opinion words (adjectives and adverbs)
        # JJ adjective, JJR adj comparative, JJS adj superlative
        # RB adverb, RBR comparative, RBS seperlative
        for r in self.revs:
            for s in r.review:               
                for w in TextBlob(str(s)).tags:
                    if (w[1][:2] == 'JJ' or
                       w[1][:2] == 'RB'):
                            # print(w[0])
                            s.op.append(w[0])
    
    def sentiment(self):
        # Detecting sentiment polarity in the sentences
        # containing product features
        for r in self.revs:
            r.sentiment()
                        
    def gen_report(self):
        
        # Generate an product feature report
        # Collect sentiment per feature from each review and
        # store in tuple dict[feature] = (positive_count, negative_count)
        
        for r in self.revs:
            for s in r.review:               
                for feat, polarity in s.ft.items():
                    self.report.setdefault(feat, (0,0))
                    if polarity > 0:
                        self.report[feat] = (self.report[feat][0]+1,
                                             self.report[feat][1])
                    elif polarity < 0:                
                        self.report.setdefault(feat, (0,0))
                        self.report[feat] = (self.report[feat][0],
                                             self.report[feat][1]+1)
            
        return self.report
    
    def print_report(self):
        if not self.report:
            self.gen_report()
            
        print("\nProduct: ", self.name)
        for feat, score in self.report.items():
            print("\t\nFeature: ", feat)
            print("\t\tPositive: ", self.report[feat][0])
            print("\t\tNegative: ", self.report[feat][1])
    
    def extract_tagged_data(self):
        self.test_report = {}
        for r in self.NltkCorpus:
            # print("\n")
            for f in r.features():
                feat = f[0] # name of feature
                score_sign = f[1][0] #just the plus or minus sign
                self.test_report.setdefault(feat, (0,0))
                if score_sign == '+':
                    self.test_report[feat] = (self.test_report[feat][0]+1,
                                                 self.test_report[feat][1])
                elif score_sign == '-':
                    self.test_report[feat] = (self.test_report[feat][0],
                                                 self.test_report[feat][1]+1)
        #print("\nProduct: ", self.name)
        # for feat, score in test_report.items():
        #    print("\t\nFeature: ", feat)
        #    print("\t\tPositive: ", test_report[feat][0])
        #    print("\t\tNegative: ", test_report[feat][1])
        return self.test_report
    
    def fscore(self, tp, fn, fp):
        recall = 0
        prec = 0
        fscore = 0
        
        if(tp+fn)>0:
            recall = tp/(tp+fn) 
        
        if(tp+fp)>0:
            prec = tp/(tp+fp) 
            
        if(recall+prec)>0:
            fscore = 2*(recall*prec)/(recall+prec)
        
        return recall, prec, fscore
    
    def feat_evaluation(self):
        if not self.test_report:
            self.extract_tagged_data()
            
        if not self.report:
            self.gen_report()
            
        # Evaluation of feature extraction overall
        # Look at features that were picked up by mining and
        # those that were missed:
        # cft = len(test_report.keys()) # total features in test data
        # cfm = len(my_report.keys()) # total features in my model data
        # cfmatch = 0 # count of matching features
        f_tp = []
        f_fp = []
        f_fn = []
        
        for feat in self.report.keys():
            if feat in self.test_report:
                # cfmatch += 1
                f_tp.append(feat)               
            else:                
                f_fp.append(feat)                
                
        for feat in self.test_report.keys():
            if feat not in self.report:                
                f_fn.append(feat)
        
        recall, prec, fscore = self.fscore(len(f_tp),
                                           len(f_fn),
                                           len(f_fp))
    
        print("Looking at all product features together:")
        print("|\tTP\t|\tFN\t|\tFP\t|\tRecall\t|\tPrec\t|\tF score\t|")
        print("|\t{0}\t|\t{1}\t|\t{2}\t|\t{3:.2f}\t|\t{4:.2f}\t|\t{5:.2f}\t|".format(len(f_tp), 
                                                                                     len(f_fn), 
                                                                                     len(f_fp),
                                                                                    recall,
                                                                                    prec,
                                                                                    fscore))
        
        print("\nTP")
        print('%s' % ', '.join(map(str, f_tp)))

        print("\nFN")
        print('%s' % ', '.join(map(str, f_fn)))

        print("\nFP")
        print('%s' % ', '.join(map(str, f_fp)))

        
        # Calculate recall / precision and F1 score per sentence, review and product:
        self.eval["tp"] = 0 # true positive (labelled feature found in mined features)
        self.eval["fn"] = 0 # false negative (labelled feature not found in mined features)
        self.eval["fp"] = 0 # false positive, mined feature that is not present in labelled
        self.eval["recall"] = 0 
        self.eval["prec"] = 0 
        self.eval["fscore"] = 0 
        for r in self.revs:
            r.eval["tp"] = 0 # true positive (labelled feature found in mined features)
            r.eval["fn"] = 0 # false negative (labelled feature not found in mined features)
            r.eval["fp"] = 0 # false positive, mined feature that is not present in labelled
            r.eval["recall"] = 0 
            r.eval["prec"] = 0 
            r.eval["fscore"] = 0 
            for s in r.review: # iterate through each sentence
                s.eval["tp"] = 0 # true positive (labelled feature found in mined features)
                s.eval["fn"] = 0    # false negative (labelled feature not found in mined features)
                s.eval["fp"] = 0    # false positive, mined feature that is not present in labelled
                s.eval["recall"] = 0 
                s.eval["prec"] = 0 
                s.eval["fscore"] = 0 
                for lf in s.test: # iterate through labelled features
                    if lf[0] in s.ft.keys(): # comparing to mined features
                        s.eval["tp"] += 1
                    else:                        
                        s.eval["fn"] += 1
                for mf in s.ft.keys(): # iterate through labelled features
                    found = False
                    for lf in s.test:                        
                        if mf == lf[0]:
                            found = True
                            break
                    if not found:
                        s.eval["fp"] += 1                        
                        
                r.eval["tp"] += s.eval["tp"] # true positive (labelled feature found in mined features)
                r.eval["fn"] += s.eval["fn"] # false negative (labelled feature not found in mined features)
                r.eval["fp"] += s.eval["fp"] # false positive, mined feature that is not present in labelled
                
                # Recall / precision / F1 score per sentence:
                s.eval["recall"], s.eval["prec"], s.eval["fscore"] = self.fscore(s.eval["tp"],
                                                                               s.eval["fn"],
                                                                               s.eval["fp"])
                
            self.eval["tp"] += r.eval["tp"] # true positive (labelled feature found in mined features)
            self.eval["fn"] += r.eval["fn"] # false negative (labelled feature not found in mined features)
            self.eval["fp"] += r.eval["fp"] # false positive, mined feature that is not present in labelled
                
            # Recall / precision / F1 score per review:
            r.eval["recall"], r.eval["prec"], r.eval["fscore"] = self.fscore(r.eval["tp"],
                                                                           r.eval["fn"],
                                                                           r.eval["fp"])
            
        # Recall / precision / F1 score per product:
        self.eval["recall"], self.eval["prec"], self.eval["fscore"] = self.fscore(self.eval["tp"],
                                                                                   self.eval["fn"],
                                                                                   self.eval["fp"])
        
        print("\n\nLooking at product features per individual sentence:")
        print("|\tTP\t|\tFN\t|\tFP\t|\tRecall\t|\tPrec\t|\tF score\t|")
        print("|\t{0}\t|\t{1}\t|\t{2}\t|\t{3:.2f}\t|\t{4:.2f}\t|\t{5:.2f}\t|"
              .format(self.eval["tp"],
                    self.eval["fn"],
                    self.eval["fp"],
                    self.eval["recall"],
                    self.eval["prec"],
                    self.eval["fscore"]))
        
    def sent_evaluation(self):   
        
        if not self.test_report:
            self.extract_tagged_data()
            
        if not self.report:
            self.gen_report()
            
        # First collect features that were correctly mined
        f_tp = []        
        for feat in self.report.keys():
            if feat in self.test_report:
                f_tp.append(feat)

        # Calculate recall / precision and F1 score per sentence, review and product
        # on those sentences that contain features that were correctly mined:
        self.eval["stp"] = 0 # true positive (positive feature labelled positive)
        self.eval["sfn"] = 0 # false negative (positive feature not labelled positive)
        self.eval["sfp"] = 0 # false positive (negative feature labelled positive)
        self.eval["srecall"] = 0 
        self.eval["sprec"] = 0 
        self.eval["sfscore"] = 0 
        for r in self.revs:
            r.eval["stp"] = 0 # true positive (positive feature labelled positive)
            r.eval["sfn"] = 0 # false negative (positive feature not labelled positive)
            r.eval["sfp"] = 0 # false positive (negative feature labelled positive)
            r.eval["srecall"] = 0 
            r.eval["sprec"] = 0 
            r.eval["sfscore"] = 0 
            for s in r.review: # iterate through each sentence
                s.eval["stp"] = 0 # true positive (positive feature labelled positive)
                s.eval["sfn"] = 0    # false negative (positive feature not labelled positive)
                s.eval["sfp"] = 0    # false positive (negative feature labelled positive)
                s.eval["srecall"] = 0 
                s.eval["sprec"] = 0 
                s.eval["sfscore"] = 0 
                for lf in s.test: # iterate through labelled features
                    if lf[0] in s.ft: # comparing to mined features
                        if sign(int(lf[1])) == s.ft[lf[0]]:
                            s.eval["stp"] += 1
                            # print("TP {}:{}=={}:{}".format(lf[0],sign(int(lf[1])),lf[0], s.ft[lf[0]]))
                        elif sign(int(lf[1])) == 1.0:
                            s.eval["sfn"] += 1
                            # print("Falsely negative:")
                            # print(s)
                        else:
                            s.eval["sfp"] += 1
                            # print("Falsely positive:")
                            # print(s)
                        
                r.eval["stp"] += s.eval["stp"] # true positive (positive feature labelled positive)
                r.eval["sfn"] += s.eval["sfn"] # false negative (positive feature not labelled positive)
                r.eval["sfp"] += s.eval["sfp"] # false positive (negative feature labelled positive)
                
                # Recall / precision / F1 score per sentence:
                s.eval["srecall"], s.eval["sprec"], s.eval["sfscore"] = self.fscore(s.eval["stp"],
                                                                               s.eval["sfn"],
                                                                               s.eval["sfp"])
                
            self.eval["stp"] += r.eval["stp"] # true positive (positive feature labelled positive)
            self.eval["sfn"] += r.eval["sfn"] # false negative (positive feature not labelled positive)
            self.eval["sfp"] += r.eval["sfp"] # false positive (negative feature labelled positive)
                
            # Recall / precision / F1 score per review:
            r.eval["srecall"], r.eval["sprec"], r.eval["sfscore"] = self.fscore(r.eval["stp"],
                                                                           r.eval["sfn"],
                                                                           r.eval["sfp"])
            
        # Recall / precision / F1 score per product:
        self.eval["srecall"], self.eval["sprec"], self.eval["sfscore"] = self.fscore(self.eval["stp"],
                                                                                   self.eval["sfn"],
                                                                                   self.eval["sfp"])

        print("\n\nLooking at sentiment evaluation per individual sentence:")
        print("|\tTP\t|\tFN\t|\tFP\t|\tRecall\t|\tPrec\t|\tF score\t|")
        print("|\t{0}\t|\t{1}\t|\t{2}\t|\t{3:.2f}\t|\t{4:.2f}\t|\t{5:.2f}\t|"
              .format(self.eval["stp"],
                    self.eval["sfn"],
                    self.eval["sfp"],
                    self.eval["srecall"],
                    self.eval["sprec"],
                    self.eval["sfscore"]))
                    
                    
                    
        

In [26]:
# Run per product: execute the whole pipeline on the Canon G3 reviews.
c = PReviews("Canon_G3.txt")  # load the reviews; prints how many were found
c.preprocess()                # collect nouns / noun phrases per sentence
c.features()                  # mine candidate features and tag the sentences
c.opinions()                  # collect adjectives and adverbs per sentence
c.sentiment()                 # assign a polarity to every tagged feature
c.feat_evaluation()           # score the mined features against the labels
c.sent_evaluation()           # score the predicted polarity against the labels
# c.print_report()

45
Preprocessed 45 reviews in 0.81 seconds (spelling correction=False, stemming=False)
Looking at all product features together:
|	TP	|	FN	|	FP	|	Recall	|	Prec	|	F score	|
|	32	|	73	|	59	|	0.30	|	0.35	|	0.33	|

TP
canon, camera, picture, quality, picture quality, flash, feature, use, option, digital camera, software, lens, image, viewfinder, speed, canon g3, photo, lcd, design, focus, zoom, battery, shoot, lens cap, auto mode, battery life, color, shot, lag, raw image, image quality, strap

FN
canon powershot g3, dial, function, auto setting, photo quality, darn diopter adjustment dial, exposure control, metering option, spot metering, 4mp, size, weight, optical zoom, digital zoom, menu, button, control, lense, canera, print, manual mode, feel, four megapixel, product, night mode, lens cover, zooming lever, white balance, price, grain, flash photo, noise, g3, lag time, depth, external flash hot shoe, manual function, service, automode, raw format, shape, light auto correction, white of

# Manually parsing the corpora


In [19]:
def parse_tagged_reviews(path):
    """Parse a tagged review file into a list of reviews.

    Line handling:

    * lines starting with ``*`` are comments and are skipped;
    * a line starting with ``[t]`` starts a new review;
    * a line starting with ``##`` is a sentence without feature tags;
    * any other line containing ``##`` is ``<feature tags>##<sentence>``;
      only the sentence part is kept, split at the first ``##`` so that a
      later ``##`` inside the sentence is preserved.

    Args:
        path: path of the review file.

    Returns:
        A list with one entry per review; each entry is the list of that
        review's sentences (trailing newline kept). Reviews without any
        sentence are omitted, so a file without sentences yields ``[]``.
    """
    reviews = []
    text = []  # sentences of the review currently being read

    with open(path, 'r') as f:
        for line in f:
            if line.startswith("*"):
                continue

            if line.startswith("[t]"):
                # A new review begins: store the previous one if it has
                # any sentences.
                if text:
                    reviews.append(text)
                text = []
            elif line.startswith("##"):
                text.append(line[2:])
            elif "##" in line:
                text.append(line.split("##", 1)[1])

    # Store the last review.
    if text:
        reviews.append(text)

    return reviews


In [24]:
# Parse the raw Canon G3 file directly, without going through NLTK.
doc="Data/Customer_review_data/Canon G3.txt"
reviews = parse_tagged_reviews(doc)
# Displayed value: one list of sentences per review.
reviews

[['i recently purchased the canon powershot g3 and am extremely satisfied with the purchase . \n',
  'the camera is very easy to use , in fact on a recent trip this past week i was asked to take a picture of a vacationing elderly group . \n',
  'after i took their picture with their camera , they offered to take a picture of us . \n',
  'i just told them , press halfway , wait for the box to turn green and press the rest of the way . \n',
  'they fired away and the picture turned out quite nicely . ( as all of my pictures have thusfar ) . \n',
  'a few of my work constituants owned the g2 and highly recommended the canon for picture quality . \n',
  "i 'm easily enlarging pictures to 8 1/2 x 11 with no visable loss in picture quality and not even using the best possible setting as yet ( super fine ) . \n",
  "ensure you get a larger flash , 128 or 256 , some are selling with the larger flash , 32mb will do in a pinch but you 'll quickly want a larger flash card as with any of the 4mp c