In [1]:
# Import Dependencies and modules
%matplotlib inline
import numpy as np
import pandas as pd
import scipy
import sklearn
import spacy
import matplotlib.pyplot as plt
import seaborn as sns
import re
import string
from string import punctuation
from collections import Counter
from io import StringIO
from nltk.corpus import stopwords
import nltk
import glob
import errno
import os
import json

# Load Data

In [2]:
# Load each json file
with open('iphonex_digtrends.json') as f:
    iphonex_digtrends = json.load(f)

with open('iphonex_gizmodo.json') as f:
    iphonex_gizmodo = json.load(f)

with open('iphonex_techradar.json') as f:
    iphonex_techradar = json.load(f)

with open('S9_digtrends.json') as f:
    S9_digtrends = json.load(f)

with open('S9_gizmodo.json') as f:
    S9_gizmodo = json.load(f)

with open('S9_techradar.json') as f:
    S9_techradar = json.load(f)

# Clean Data

In [3]:
# Utility function for standard text cleaning.
def text_cleaner(text):
    # Visual inspection identifies a form of punctuation spaCy does not
    # recognize: the double dash '--'.  Better get rid of it now!

    text = str(text).replace("\n", "")
    text = str(text).replace("\t", "")
    text = str(text).replace("\\n", "")
    text = str(text).replace("\\t", "")
    text = str(text).replace("\\", "")
    text = str(text).replace("xa0", " ")
    text = str(text).replace("\'", "")
    text = re.sub("<p>", "", str(text))
    text = re.sub("</p>", "", str(text))
    text = re.sub("</a>", "", str(text))
    text = re.sub('<[^>]+>', "", str(text))
    text = str(text).replace("\\u2019", "")
    text = str(text).replace("\\u2013", "")
    text = str(text).replace("\\u2018", "")
    text = str(text).replace("\\u00a0", "")
    text = str(text).replace("\\u00a3", "")
    text = str(text).replace("\u2014", "")
    text = str(text).replace("\u201d", "")
    text = str(text).replace("\u201c", "")
    return text


In [4]:
# Populate each JSON file into a data frame

iphonex_digtrends = pd.DataFrame.from_dict(iphonex_digtrends, orient='columns')
iphonex_gizmodo = pd.DataFrame.from_dict(iphonex_gizmodo, orient='columns')
iphonex_techradar = pd.DataFrame.from_dict(iphonex_techradar, orient='columns')
S9_digtrends = pd.DataFrame.from_dict(S9_digtrends, orient='columns')
S9_gizmodo = pd.DataFrame.from_dict(S9_gizmodo, orient='columns')
S9_techradar = pd.DataFrame.from_dict(S9_techradar, orient='columns')

In [5]:
# Define function to clean text
def clean_text(df):
    # Convert lists to strings and remove brackets
    df['text'] = df['text'].astype(str)
    df['author'] = df['author'].astype(str)

    df['text'] = df['text'].map(lambda x: x.strip('[]'))
    df['author'] = df['author'].map(lambda x: x.strip('[]'))

    # Clean text
    df['text'] = df['text'].apply(lambda x: text_cleaner(x))
    df['title'] = df['title'].apply(lambda x: text_cleaner(x))
    df['author'] = df['author'].apply(lambda x: text_cleaner(x))

    
# Put dataframes into a list to iterate through
dataframes = [iphonex_digtrends, iphonex_gizmodo, iphonex_techradar, S9_digtrends, S9_gizmodo, S9_techradar]

# Clean each Data Frame
for dataframe in dataframes:
    clean_text(dataframe)

In [6]:
# Label all the rows in the dataframe for the phone that the article is talking about

iphones = [iphonex_digtrends, iphonex_gizmodo, iphonex_techradar]
s9s = [S9_digtrends, S9_gizmodo, S9_techradar]

for dataframe in iphones:
    dataframe['phone'] = 'IPhone X'
    
for dataframe in s9s:
    dataframe['phone'] = 'Samsung Galaxy S9'


In [7]:
# Concat all the dataframes into one dataframe
all_frames = [iphonex_digtrends, iphonex_gizmodo, iphonex_techradar, S9_digtrends, S9_gizmodo, S9_techradar]
df = pd.concat(all_frames)


In [8]:
# Visualize dataframe
df.head()

Unnamed: 0,author,text,title,phone
0,Eric Brackett,The iPhone X launched to stellar reviews and e...,Shrinking demand forces Apple to slow down iPh...,IPhone X
1,Lucas Coll,"When it comes to high-quality devices, like th...",Looking to upgrade? These are the best iPhone ...,IPhone X
2,Simon Hill,The iPhone X is completely different from any ...,"The most common iPhone X problems, and how to ...",IPhone X
3,Trevor Mogg,"If you’re in the market for an iPhone X, and p...","This $4,600 solar charger comes with an iPhone...",IPhone X
4,Mark Jansen,", The initial estimates, set during the Novemb...",Apple will halve iPhone X production after lim...,IPhone X


# Pre-Process Data for NLP

In [9]:
# ### Processing Options for texts

# # Tokenize text
# df['text'] = df.apply(lambda row: nltk.word_tokenize(row['text']), axis=1)
# df['title'] = df.apply(lambda row: nltk.word_tokenize(row['title']), axis=1)

# # Remove Stopwords, or keep it, might be important for aspect based semantics
# stop = stopwords.words('english')
# df['text'] = df['text'].apply(lambda x: [item for item in x if item not in stop])
# df['title'] = df['title'].apply(lambda x: [item for item in x if item not in stop])

# # Lowercase everything
# df['text'] = df['text'].astype(str)
# df['text'] = df['text'].apply(lambda x: x.lower())

# df['title'] = df['title'].astype(str)
# df['title'] = df['title'].apply(lambda x: x.lower())

# # remove all punctuations
# df['text'] = df['text'].apply(lambda x: ''.join(c for c in x if c not in punctuation))
# df['title'] = df['title'].apply(lambda x: ''.join(c for c in x if c not in punctuation))

In [10]:
# # Download wordnet to find meaning of words, synonyms and antonyms
# nltk.download('wordnet')

In [11]:
# from nltk.corpus import wordnet as wn

# # Function to lemmatize and more words to their root
# def get_lemma(word):
#     lemma = wn.morphy(word)
#     if lemma is None:
#         return word
#     else:
#         return lemma
    
# # Compile set of stopwords
# nltk.download('stopwords')
# en_stop = set(nltk.corpus.stopwords.words('english'))

# def prepare_text_for_lda(text):
#     tokens = tokenize(text)
#     tokens = [token for token in tokens if len(token) > 4]
#     tokens = [token for token in tokens if token not in en_stop]
#     tokens = [get_lemma(token) for token in tokens]
#     return tokens

In [12]:
# ### Process Text Column

# text_data = []

# # Prepare training set for LDA
# tokens = df['text'].apply(lambda x: prepare_text_for_lda(x))

# # Prepare Dataframe for later
# df['text'] = df['text'].apply(lambda x: prepare_text_for_lda(x))

# # Append tokenized text to list of tokenized data
# null = tokens.apply(lambda x: text_data.append(x))

In [13]:
# ### Process Title Column

# # Tokenize text
# df['title'] = df.apply(lambda row: nltk.word_tokenize(row['title']), axis=1)

# # Remove Stopwords, or keep it, might be important for aspect based semantics
# stop = stopwords.words('english')
# df['title'] = df['title'].apply(lambda x: [item for item in x if item not in stop])

# # Lowercase everything
# df['title'] = df['title'].astype(str)
# df['title'] = df['title'].apply(lambda x: x.lower())

In [14]:
from spacy.lang.en import English

spacy.load('en')
parser = English()

# Function to tokenize text
def tokenize(text):
    lda_tokens = []
    tokens  = parser(text)
    for token in tokens:
        if token.orth_.isspace():
            continue
        elif token.like_url:
            lda_tokens.append('URL')
        elif token.orth_.startswith('@'):
            lda_tokens.append('SCREEN_NAME')
        else:
            lda_tokens.append(token.lower_)
    return lda_tokens

In [15]:
# Visualize dataframe
df.head()

Unnamed: 0,author,text,title,phone
0,Eric Brackett,The iPhone X launched to stellar reviews and e...,Shrinking demand forces Apple to slow down iPh...,IPhone X
1,Lucas Coll,"When it comes to high-quality devices, like th...",Looking to upgrade? These are the best iPhone ...,IPhone X
2,Simon Hill,The iPhone X is completely different from any ...,"The most common iPhone X problems, and how to ...",IPhone X
3,Trevor Mogg,"If you’re in the market for an iPhone X, and p...","This $4,600 solar charger comes with an iPhone...",IPhone X
4,Mark Jansen,", The initial estimates, set during the Novemb...",Apple will halve iPhone X production after lim...,IPhone X


In [16]:
# Load Opinion Lexicon

negatives = open('negative-words.txt', encoding = 'latin-1')
positives = open('positive-words.txt', encoding = 'latin-1')

# Read file
neg_unedit = [line.strip() for line in negatives.readlines()]
pos_unedit = [line.strip() for line in positives.readlines()]

# Extract only the list of words in the lexicon
neg = neg_unedit[31:]
pos = pos_unedit[30:]

# Compile opinion words
opinion_words = neg + pos

In [17]:
import en_core_web_md
from spacy import displacy
import gensim

# Load Neural Coreference to replace parse text and replace pronouns
nlp = en_core_web_md.load()

In [18]:
# Define feature_sentiment function
def feature_sentiment(sentence):
    '''
    input: dictionary and sentence
    function: appends dictionary with new features if the feature did not exist previously,
              then updates sentiment to each of the new or existing features
    output: updated dictionary
    '''
    
    sent_dict = Counter()
    sentence = nlp(sentence)
    debug = 0
    for token in sentence:
    #    print(token.text,token.dep_, token.head, token.head.dep_)
        # check if the word is an opinion word, then assign sentiment
        if token.text in opinion_words:
            sentiment = 1 if token.text in pos else -1
            # if target is an adverb modifier (i.e. pretty, highly, etc.)
            # but happens to be an opinion word, ignore and pass
            if (token.dep_ == "advmod"):
                continue
            elif (token.dep_ == "amod"):
                sent_dict[token.head.text] += sentiment
            # for opinion words that are adjectives, adverbs, verbs...
            else:
                for child in token.children:
                    # if there's a adj modifier (i.e. very, pretty, etc.) add more weight to sentiment
                    # This could be better updated for modifiers that either positively or negatively emphasize
                    if ((child.dep_ == "amod") or (child.dep_ == "advmod")) and (child.text in opinion_words):
                        sentiment *= 1.5
                    # check for negation words and flip the sign of sentiment
                    if child.dep_ == "neg":
                        sentiment *= -1
                for child in token.children:
                    # if verb, check if there's a direct object
                    if (token.pos_ == "VERB") & (child.dep_ == "dobj"):                        
                        sent_dict[child.text] += sentiment
                        # check for conjugates (a AND b), then add both to dictionary
                        subchildren = []
                        conj = 0
                        for subchild in child.children:
                            if subchild.text == "and":
                                conj=1
                            if (conj == 1) and (subchild.text != "and"):
                                subchildren.append(subchild.text)
                                conj = 0
                        for subchild in subchildren:
                            sent_dict[subchild] += sentiment

                # check for negation
                for child in token.head.children:
                    noun = ""
                    if ((child.dep_ == "amod") or (child.dep_ == "advmod")) and (child.text in opinion_words):
                        sentiment *= 1.5
                    # check for negation words and flip the sign of sentiment
                    if (child.dep_ == "neg"): 
                        sentiment *= -1
                
                # check for nouns
                for child in token.head.children:
                    noun = ""
                    if (child.pos_ == "NOUN") and (child.text not in sent_dict):
                        noun = child.text
                        # Check for compound nouns
                        for subchild in child.children:
                            if subchild.dep_ == "compound":
                                noun = subchild.text + " " + noun
                        sent_dict[noun] += sentiment
                    debug += 1
    return sent_dict

In [19]:
# Create list of texts

iphone_reviews = df[df['phone']=='IPhone X']
iphonex = list(iphone_reviews.text)

s9_reviews = df[df['phone']=='Samsung Galaxy S9']
s9 = list(s9_reviews.text)

# Join the list into one string of texts
iphonex = ' '.join(iphonex)
s9 = ' '.join(s9)

iphonex_aspect_sentiment_scores = dict(feature_sentiment(iphonex))
s9_ascpect_sentiment_scores = dict(feature_sentiment(s9))

In [20]:
# Create sorted dataframe of phone aspects and their sentiment scores
iphonex = pd.DataFrame.from_dict(iphonex_aspect_sentiment_scores, orient='index')
iphonex['score'] = iphonex[0]
iphonex['aspects'] = iphonex.index
iphonex = iphonex.sort_values(by=['score'], ascending=False)
iphonex = iphonex.reset_index()
iphonex = iphonex.drop([0, 'index'], axis=1)
iphonex = iphonex.reindex(sorted(iphonex.columns), axis=1)

s9 = pd.DataFrame.from_dict(s9_ascpect_sentiment_scores, orient='index')
s9['score'] = s9[0]
s9['aspects'] = s9.index
s9 = s9.sort_values(by=['score'], ascending=False)
s9 = s9.reset_index()
s9 = s9.drop([0, 'index'], axis=1)
s9 = s9.reindex(sorted(s9.columns), axis=1)

In [21]:
# Separate dataframes from positive and negative aspects for each phones

# IPhoneX
iphonex_negative = iphonex[iphonex['score']<0]
iphonex_negative['negative aspects'] = iphonex_negative['aspects']
iphonex_negative = iphonex_negative.sort_values(by=['score'], ascending=True)
iphonex_negative = iphonex_negative.reset_index()
iphonex_negative = iphonex_negative.drop(['index', 'aspects'], axis=1)
iphonex_negative = iphonex_negative.reindex(sorted(iphonex_negative), axis=1)

iphonex_positive = iphonex[iphonex['score']>0]
iphonex_positive['negative aspects'] = iphonex_positive['aspects']
iphonex_positive = iphonex_positive.reset_index()
iphonex_positive = iphonex_positive.drop(['index', 'aspects'], axis=1)
iphonex_positive = iphonex_positive.reindex(sorted(iphonex_positive), axis=1)


# s9
s9_negative = s9[s9['score']<0]
s9_negative['negative aspects'] = s9_negative['aspects']
s9_negative = s9_negative.sort_values(by=['score'], ascending=True)
s9_negative = s9_negative.reset_index()
s9_negative = s9_negative.drop(['index', 'aspects'], axis=1)
s9_negative = s9_negative.reindex(sorted(s9_negative), axis=1)

s9_positive = s9[s9['score']>0]
s9_positive['negative aspects'] = s9_positive['aspects']
s9_positive = s9_positive.reset_index()
s9_positive = s9_positive.drop(['index', 'aspects'], axis=1)
s9_positive = s9_positive.reindex(sorted(s9_positive), axis=1)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  """
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  if sys.path[0] == '':
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-vie

In [22]:
# USE
    ## Peter Min's review_pipe function with only the argument for the aspect terms
    ## Testing of review_pipe function to see sentiment scores

In [23]:
### MAY NEED TO SCRAPE TEST DATA. 