## FN1: Read File

In [1]:
def nl_read_file(file_location):
    import pandas as pd
    import re
    if(file_location.endswith('csv')):
        rev_data = pd.read_csv( file_location , engine = 'python')
    elif (file_location.endswith('tsv')):
        rev_data = pd.read_csv( file_location , engine = 'python' ,sep = '\t')
    str_list = []
    for item in rev_data['Review']:
        item = item.lower()
        item= re.sub(r'[^\w\s\']','',item)
        str_list.append(item)
    return str_list

## FN2: Tokenizer

In [2]:
def nl_tokenize(str_list):
    import nltk
    from nltk.corpus import stopwords
    
    filtered_list = []
    filtered_tokens = []
    stop_words=set(stopwords.words("english"))
    for strings in str_list:
        temp_str = nltk.tokenize.word_tokenize(strings)
        temp_tokens = [token for token in temp_str if token not in stop_words]
        filtered_list.append( temp_tokens )
        filtered_tokens = filtered_tokens + temp_tokens
        
    return  {'filtered_list': filtered_list, 'filtered_tokens' : filtered_tokens}

## FN3 Pre-Processing

In [3]:
def nl_pre_process( unprocessed_data ):
    
    (filtered_list , filtered_tokens) = (unprocessed_data['filtered_list'] , unprocessed_data['filtered_tokens'])
    
    import nltk
    from nltk.corpus import stopwords
    from nltk.tag import StanfordPOSTagger
    from nltk.corpus import wordnet
    from nltk.stem.wordnet import WordNetLemmatizer
    
    stanford_dir = "D:\EnvVariables\stanford-postagger-full-2018-10-16"
    modelfile = stanford_dir+"\models\english-bidirectional-distsim.tagger"
    jarfile=stanford_dir+"\stanford-postagger.jar"
    
    tag_dictionary = {"J": wordnet.ADJ,
                      "N": wordnet.NOUN,
                      "V": wordnet.VERB,
                      "R": wordnet.ADV}
    def get_wordnet_pos(tag):
        if tag in tag_dictionary.keys():
            return tag_dictionary[tag]
        else:
            return tag_dictionary['N'] 

    tagger=StanfordPOSTagger( model_filename=modelfile, path_to_jar=jarfile )
    tagged_words = tagger.tag( filtered_tokens )
    
    lem = WordNetLemmatizer()

    lemmatized_words = []

    for w in tagged_words:
        x =  lem.lemmatize(w[0],get_wordnet_pos(w[1][0]))
        lemmatized_words.append((x,w[1]))

    tag_list = []
    index = 0
    for lists in filtered_list:
        length_l = len(lists) + index - 1
        temp = []
        for i in range(index,length_l + 1):
            temp.append(lemmatized_words[i])
        tag_list.append(temp)
        index = length_l + 1
        
        
    return { 'lemmatized_words' : lemmatized_words ,'tag_list' : tag_list }

## FN4: Feature Selector

In [4]:
def nl_feature_filter (prc_data , number_of_features):
    all_nouns = []
    for tuples in prc_data['lemmatized_words']:
        if tuples[1].startswith('NN'):
            all_nouns.append(tuples[0])
    features = []
    from nltk.probability import FreqDist
    fdist = FreqDist(all_nouns)
    for item in fdist.most_common(number_of_features):
        features.append(item[0])
    
    return features

## FN5: Orientation Word

In [5]:
def nl_orientation(input_word, pos_type):
    from nltk.corpus import wordnet
    from nltk.corpus import sentiwordnet
    
    word_synset = wordnet.synsets(input_word)
    orientation_score = 0
    if(len(word_synset) != 0):
        for element in word_synset:
            word=element.name()
            orientation = sentiwordnet.senti_synset(word)
            orientation_score = orientation_score + (orientation.pos_score() - orientation.neg_score())
    return orientation_score

## FN6: Orientation Group Of Words

In [6]:
def nl_orientation_phrase(input_words):
    negative_words = ['not','isn\'t','don\'t','n\'t']
    negator,orient,orient_exact = 0,0,0
    
    for words in input_words:
        word = words[0]
        pos_type = words[0][1]
        if word in negative_words:
            negator = 1
        else:
            temp_val = nl_orientation(word , pos_type.lower())
            orient_exact = orient_exact + temp_val
            if temp_val > 0:
                orient = orient + 1
            elif temp_val < 0:
                orient = orient - 1
        
    return_value = 0          
    if orient > 0:
        return_value = 1
    elif orient < 0:
        return_value = -1
    else:
        if orient_exact > 0:
            return_value = 1
        elif orient_exact < 0:
            return_value = -1
        else:
            return_value = 0
    
    if negator == 0:
        return return_value
    else:
        return return_value * -1
        

## FN7: Review Classifier

In [7]:
def nl_review_classifier( str_list, unprc_data, prc_data, features ):

    from nltk.corpus import wordnet

    filtered_list =  unprc_data['filtered_list']
    tag_list      =  prc_data['tag_list']


    opinion_dictionary = {}
    opinion_dictionary_tagged = {}
    feature_summary= {}

    for feature in features:
        opinion_dictionary[feature] = {}
        opinion_dictionary_tagged[feature] = []
        feature_summary[feature] = {}
        feature_summary[feature]['total_count'] = 0
        feature_summary[feature]['positive_count'] = 0
        feature_summary[feature]['negative_count'] = 0
        feature_summary[feature]['positive_reviews'] = []
        feature_summary[feature]['negative_reviews'] = []
        feature_summary[feature]['positive_reviews_tokens'] = []
        feature_summary[feature]['negative_reviews_tokens'] = []

    for i in range(len(filtered_list)):
        common_feature = list(set(filtered_list[i]).intersection(features))
        for ele in common_feature:

            index = filtered_list[i].index(ele)
            temp_str = ''
            temp_tuple = []
            feature_summary[ele]['total_count'] = feature_summary[ele]['total_count'] + 1

            for j in range(index-2, index+3):
                if j >= 0  and j < len(tag_list[i]):
                    if tag_list[i][j][1].startswith(('JJ','RB','VB')):
                        temp_str = temp_str + tag_list[i][j][0] + ' '
                        temp_tuple.append(tag_list[i][j])

            if temp_str != '':
                temp_orient = nl_orientation_phrase(temp_tuple)
                opinion_dictionary[ele][temp_str] = temp_orient

                if temp_orient == 1:
                    feature_summary[ele]['positive_count'] = feature_summary[ele]['positive_count'] + 1
                    feature_summary[ele]['positive_reviews'].append(str_list[i])
                    feature_summary[ele]['positive_reviews_tokens'].append(temp_tuple)

                if temp_orient == -1:
                    feature_summary[ele]['negative_count'] = feature_summary[ele]['negative_count'] + 1
                    feature_summary[ele]['negative_reviews'].append(str_list[i])
                    feature_summary[ele]['negative_reviews_tokens'].append(temp_tuple)

                opinion_dictionary_tagged[ele].append(temp_tuple)

    final_data = {'opinion_dictionary' : opinion_dictionary ,'opinion_dictionary_tagged' : opinion_dictionary_tagged 
                      ,'feature_summary' : feature_summary }
    return final_data

## FN8: Statistical Data Print

In [8]:
def stats_calc(data):
    feature_summary = data['feature_summary']
    for item in feature_summary:
        print('\nFEATURE : ',item)
        if feature_summary[item]['total_count'] != 0:
            pos_per = feature_summary[item]['positive_count']/feature_summary[item]['total_count']
            neg_per = feature_summary[item]['negative_count']/feature_summary[item]['total_count']
            miss_per = 1 - (pos_per + neg_per)
            print('    Total Reviews       : ', feature_summary[item]['total_count'])
            print('    Positive Reviews    : ', '%.3f'%pos_per ,'%')
            print('    Negative Reviews    : ', '%.3f'%neg_per ,'%')
            print('    Unclassified Reviews: ', '%.3f'%miss_per,'%')
        else:
            print('    Error in Feature Finding ')

## FN9: Function Executer

In [9]:
def review_classification(input):
    string_data         = nl_read_file(input)
    tokenized_data      = nl_tokenize(string_data)
    processed_data      = nl_pre_process(tokenized_data)
    selected_feature    = nl_feature_filter(processed_data,50)
    result              = nl_review_classifier( string_data, tokenized_data , processed_data ,selected_feature )
    return result

## Driver Code

In [13]:
input_add = "C:/Reviews/2amznokia.csv"
review = review_classification(input_add)

In [15]:
string_data         = nl_read_file(input_add)
tokenized_data      = nl_tokenize(string_data)
processed_data      = nl_pre_process(tokenized_data)

In [16]:
processed_data

{'lemmatized_words': [('build', 'VB'),
  ('quality', 'NN'),
  ('premium', 'NN'),
  ('look', 'VBG'),
  ('compact', 'JJ'),
  ('design', 'NN'),
  ('fingerprint', 'NN'),
  ('sensor', 'NN'),
  ('fast', 'RB'),
  ('responsive', 'JJ'),
  ('camera', 'NN'),
  ('performance', 'NN'),
  ('decent', 'JJ'),
  ('664', 'CD'),
  ('gb', 'NN'),
  ('variant', 'NN'),
  ('already', 'RB'),
  ('update', 'VBN'),
  ('android', 'NN'),
  ('pie', 'NN'),
  ('9', 'CD'),
  ('nice', 'JJ'),
  ('experience', 'NN'),
  ('ðÿðÿðÿðÿðÿðÿ', 'NN'),
  ('camera', 'NN'),
  ('mark', 'NN'),
  ('worry', 'VBP'),
  ('happy', 'JJ'),
  ('purchase', 'NN'),
  ('good', 'JJ'),
  ('brand', 'NN'),
  ('quality', 'NN'),
  ('good', 'JJ'),
  ('product', 'NN'),
  ('use', 'VBD'),
  ('almost', 'RB'),
  ('every', 'DT'),
  ('brand', 'NN'),
  ('last', 'JJ'),
  ('4', 'CD'),
  ('5', 'CD'),
  ('year', 'NNS'),
  ('last', 'JJ'),
  ('feel', 'NN'),
  ("'m", 'VBP'),
  ('back', 'JJ'),
  ('home', 'NN'),
  ('long', 'JJ'),
  ('journey', 'NN'),
  ('best', 'RB'),
  ('b

In [11]:
stats_calc(review)


FEATURE :  phone
    Total Reviews       :  123
    Positive Reviews    :  0.691 %
    Negative Reviews    :  0.138 %
    Unclassified Reviews:  0.171 %

FEATURE :  camera
    Total Reviews       :  66
    Positive Reviews    :  0.545 %
    Negative Reviews    :  0.258 %
    Unclassified Reviews:  0.197 %

FEATURE :  day
    Total Reviews       :  32
    Positive Reviews    :  0.531 %
    Negative Reviews    :  0.312 %
    Unclassified Reviews:  0.156 %

FEATURE :  nokia
    Total Reviews       :  59
    Positive Reviews    :  0.593 %
    Negative Reviews    :  0.102 %
    Unclassified Reviews:  0.305 %

FEATURE :  battery
    Total Reviews       :  50
    Positive Reviews    :  0.420 %
    Negative Reviews    :  0.400 %
    Unclassified Reviews:  0.180 %

FEATURE :  quality
    Total Reviews       :  47
    Positive Reviews    :  0.660 %
    Negative Reviews    :  0.213 %
    Unclassified Reviews:  0.128 %

FEATURE :  issue
    Total Reviews       :  22
    Positive Reviews    :  0.6

In [12]:
import json
out_file_name = input_add.split('.')[0] + '_summary.json'
with open(out_file_name, 'w') as json_file:
  json.dump(review['feature_summary'], json_file)

out_file_name = input_add.split('.')[0] + '_opinion_dictionary.json'
with open(out_file_name, 'w') as json_file:
  json.dump(review['opinion_dictionary'], json_file)

out_file_name = input_add.split('.')[0] + '_opinion_dictionary_tagged.json'
with open(out_file_name, 'w') as json_file:
  json.dump(review['opinion_dictionary_tagged'], json_file)