<h1>Table of Contents<span class="tocSkip"></span></h1>
<div class="toc"><ul class="toc-item"></ul></div>

In [1]:
# import libraries
import re

import gensim
import pandas as pd
import spacy
from nltk.classify import NaiveBayesClassifier
from nltk.corpus import subjectivity
from nltk.sentiment import SentimentIntensityAnalyzer
from spacy.language import Language
# kept last, as in the original cell, so the names it injects shadow exactly what they did before
from nltk.sentiment.util import *

nlp = spacy.load("en_core_web_sm")

In [2]:
# Load the four TripAdvisor review files, one DataFrame per file.
_batch_files = (
    '../datasets/tripadvisor_mbs_210328_2334',
    '../datasets/tripadvisor_mbs_210329_0953',
    '../datasets/tripadvisor_mbs_210329_1218',
    '../datasets/tripadvisor_mbs_210329_1342',
)
df1, df2, df3, df4 = (pd.read_csv(path) for path in _batch_files)

In [3]:
# Give the unnamed leading column of every batch the name 'rev_id'.
_id_rename = {'Unnamed: 0': 'rev_id'}
df1, df2, df3, df4 = (
    frame.rename(columns=_id_rename) for frame in (df1, df2, df3, df4)
)

In [4]:
# Stack the four batches into one frame, renumbering the rows from 0.
df = pd.concat(
    objs=(df1, df2, df3, df4),
    ignore_index=True,
)

In [5]:
# Count rows that are exact duplicates of an earlier row in the combined frame.
df.duplicated().sum()

1

In [19]:
# Keep the first occurrence of every fully duplicated row; surviving rows keep their index labels.
df = df.drop_duplicates()

In [12]:
# Display the combined review frame (pandas shows only the head and tail rows).
df

Unnamed: 0,rev_id,property,rev_source,rev_date,rev_location,rev_title,rev_content,rev_score,rev_visit_date
0,691636950,mbs,tripadvisor,2019-07-01,Australia,Disappointing,We splurged on our last night in Singapore to ...,6.0,2019-07-01
1,691610644,mbs,tripadvisor,2019-07-01,,High class shopping and dining - The Shopee,Located opposite of one iconic building in Sin...,8.0,2019-07-01
2,691452668,mbs,tripadvisor,2019-07-01,United Kingdom,Amazing!!!,We stayed for just one night in July 2019 and ...,10.0,2019-07-01
3,691335096,mbs,tripadvisor,2019-07-01,Belgium,Iconic but too massive for me,Clearly the building is a Singapore landmark. ...,6.0,2019-07-01
4,691325714,mbs,tripadvisor,2019-07-01,United Kingdom,A mazing,Stayed for 3 nights. Everything about the hote...,10.0,2019-07-01
...,...,...,...,...,...,...,...,...,...
4995,391583435,mbs,tripadvisor,2016-07-01,Romania,Nice visit,A short visit to Marina Bay on out way to Bali...,8.0,2016-07-01
4996,391560184,mbs,tripadvisor,2016-07-01,Australia,A Massive Surprise,We recently spent 5 nights (family of 5) at th...,10.0,2016-07-01
4997,391552729,mbs,tripadvisor,2016-07-01,Australia,"Great infinity pool, large scale hotel",Stayed for two night when we passed through Si...,6.0,2016-07-01
4998,391522920,mbs,tripadvisor,2016-07-01,Venezuela,Must go,"This hotel is just AMAZING, you have to stay h...",10.0,2016-07-01


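We will now break the review content into sentences. In addition to spaCy's normal sentence boundaries, we force a split at the word 'but', because the content before and after 'but' usually expresses contrasting opinions. We do this by adding a custom component to the spaCy pipeline, placed before the parser, that marks the token following each 'but' as the start of a new sentence.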

In [16]:
@Language.component("set_custom_boundaries")
def set_custom_boundaries(doc):
    '''
    spaCy pipeline component that forces a sentence boundary after every "but".

    For each token whose text is exactly "but" (lower-case only), the token
    that follows it is flagged as the start of a new sentence, so the "but"
    itself closes the preceding sentence.
    Returns the same doc, as spaCy components must.
    '''
    # The last token is skipped because it has no successor to flag.
    for token in doc[:-1]:
        if token.text == "but":
            doc[token.i + 1].is_sent_start = True
    return doc


# Guard the registration so this cell can be re-run on a live kernel:
# add_pipe raises if a component with this name is already in the pipeline.
if "set_custom_boundaries" not in nlp.pipe_names:
    nlp.add_pipe("set_custom_boundaries", before="parser")

<function __main__.set_custom_boundaries(doc)>

In [None]:
def decontracted(phrase):
    '''
    Function removes contractions from strings and replaces them with their un-contracted form to assist in tokenization.
    Handles both the straight (') and the typographic (’) apostrophe, and keeps the
    capitalisation of a leading "Won't" / "Can't" ("Won't" -> "Will not").
    Note: every "'s" is expanded to " is", so possessives are expanded as well.
    Returns a string.
    '''
    # Rules are applied in order. The irregular forms must come before the
    # generic n't rule, otherwise "won't" would become "wo not".
    rules = (
        (r"([Ww])on['’]t", r"\1ill not"),   # won't / Won't -> will not / Will not
        (r"([Cc])an['’]t", r"\1an not"),    # can't / Can't -> can not / Can not
        (r"n['’]t", " not"),                # remaining n't -> not
        (r"['’]re", " are"),                # 're -> are
        (r"['’]s", " is"),                  # 's  -> is
        (r"['’]d", " would"),               # 'd  -> would
        (r"['’]ll", " will"),               # 'll -> will
        (r"['’]t", " not"),                 # stray 't -> not
        (r"['’]ve", " have"),               # 've -> have
        (r"['’]m", " am"),                  # 'm  -> am
    )
    for pattern, replacement in rules:
        phrase = re.sub(pattern, replacement, phrase)
    return phrase


In [None]:
def _sentiment_from_compound(compound):
    '''
    Map a VADER compound score to an integer sentiment label.
    Returns 1 above 0.1, -1 below 0, and 0 for everything in between
    (exactly 0 and the small positive scores up to 0.1).
    '''
    if compound > 0.1:
        return 1
    if compound < 0:
        return -1
    return 0


def update_df_pp(raw_table):
    '''
    Build the sentence-level preprocessing table from a raw review table.

    raw_table: DataFrame that has at least the columns 'rev_id' and 'rev_content'.
    Every review is split into sentences by the spaCy pipeline `nlp`; each
    sentence becomes one row holding its text, the subject/object tokens, the
    adjective phrases, the not-have flag, the VADER scores and a sentiment label.
    Rows whose 'rev_content' is not a string (e.g. missing) are skipped.
    'category' is left empty so it can be entered manually for the training set.
    Returns a new DataFrame with a 0..n-1 index; raw_table is not modified.
    '''
    pp_columns = ['rev_id', # review id from raw_table
                  'sent_text', # sentence_text from spaCy .sents
#                   'sent_sw', # sentence with stopwords removed by spaCy
                  'objects', # list of found dobj and pobj and nsubj
#                   'contains_staff_terms', # contains words from list of words commonly associated with staff, indicating its talking about service
#                   'contains_names', # contains names, indicating that its probably talking about service
                  'contains_not_have', # contains the words no or not have, indicating possible indication of absence
                  'descriptive', # adjectives and adverbs -- indicating sentiment
                  'vader_neg', # vader sentiment negative score
                  'vader_neu', # vader sentiment neutral scores
                  'vader_pos', # vader sentiment positive score
                  'vader_comp', # vader compound score
                  'category', # category -- will be manually entered for training set
                  'sentiment', # overall sentence sentiment
                 ]

    # One analyzer serves every sentence; building it inside the loop is loop-invariant work.
    sid = SentimentIntensityAnalyzer()
    target_deps = ('dobj', 'pobj', 'nsubj')

    # Collect plain dicts and build the frame once, instead of enlarging it cell by cell.
    records = []
    for review_id, content in zip(raw_table['rev_id'], raw_table['rev_content']):
        if not isinstance(content, str):
            # missing review text (NaN) cannot be parsed by spaCy
            continue

        doc = nlp(content)
        for sentence in doc.sents:
            # contractions are expanded only for the not-have check
            sentence_mod = decontracted(sentence.text)

            # objects and subjects, for reference
            target = [token.text for token in sentence if token.dep_ in target_deps]

            # each adjective, prefixed by the adverbs attached to it
            descriptive_terms = [
                " ".join(
                    [child.text for child in token.children if child.pos_ == 'ADV']
                    + [token.text]
                )
                for token in sentence
                if token.pos_ == 'ADJ'
            ]

            # vader sentiment analysis on the original sentence text
            ss = sid.polarity_scores(sentence.text)

            records.append({
                'rev_id': review_id,
                'sent_text': sentence.text,
                'objects': ", ".join(target),
                'contains_not_have': contains_not_have(sentence_mod),
                'descriptive': ", ".join(descriptive_terms),
                'vader_neg': ss['neg'],
                'vader_neu': ss['neu'],
                'vader_pos': ss['pos'],
                'vader_comp': ss['compound'],
                'sentiment': _sentiment_from_compound(ss['compound']),
            })

    # 'category' is absent from the records, so it comes out as an all-missing column.
    return pd.DataFrame(records, columns=pp_columns)

In [None]:
# Preprocess only the first 1000 reviews (by position) as a test batch.
test_df = update_df_pp(df.iloc[0:1000])

In [None]:
# Preview the first sentence-level rows produced by update_df_pp.
test_df.head()

In [None]:
# (number of extracted sentences, number of columns)
test_df.shape

In [None]:
# Tag every row with the identifier of this processing batch.
test_df = test_df.assign(batch_date='210415')

In [None]:
# Distribution of the 'contains_not_have' flag across the extracted sentences.
test_df['contains_not_have'].value_counts()

In [None]:
# Save the batch to CSV; with to_csv defaults the row index is written as an unnamed first column.
test_df.to_csv('../testdata/test_df_210415.csv')