In [1]:
# Import Dependencies and modules
%matplotlib inline
import numpy as np
import pandas as pd
import scipy
import sklearn
import spacy
import matplotlib.pyplot as plt
import seaborn as sns
import re
import string
from string import punctuation
from collections import Counter
from io import StringIO
from nltk.corpus import stopwords
import nltk
import glob
import errno
import os
import json

# Load Data

In [2]:
# Load each json file
with open('iphonex_digtrends.json') as f:
    iphonex_digtrends = json.load(f)

with open('iphonex_gizmodo.json') as f:
    iphonex_gizmodo = json.load(f)

with open('iphonex_techradar.json') as f:
    iphonex_techradar = json.load(f)

with open('S9_digtrends.json') as f:
    S9_digtrends = json.load(f)

with open('S9_gizmodo.json') as f:
    S9_gizmodo = json.load(f)

with open('S9_techradar.json') as f:
    S9_techradar = json.load(f)

# Clean Data

In [3]:
# Utility function for standard text cleaning.
def text_cleaner(text):
    # Visual inspection identifies a form of punctuation spaCy does not
    # recognize: the double dash '--'.  Better get rid of it now!

    text = str(text).replace("\n", "")
    text = str(text).replace("\t", "")
    text = str(text).replace("\\n", "")
    text = str(text).replace("\\t", "")
    text = str(text).replace("\\", "")
    text = str(text).replace("xa0", " ")
    text = str(text).replace("\'", "")
    text = re.sub("<p>", "", str(text))
    text = re.sub("</p>", "", str(text))
    text = re.sub("</a>", "", str(text))
    text = re.sub('<[^>]+>', "", str(text))
    text = str(text).replace("\\u2019", "")
    text = str(text).replace("\\u2013", "")
    text = str(text).replace("\\u2018", "")
    text = str(text).replace("\\u00a0", "")
    text = str(text).replace("\\u00a3", "")
    text = str(text).replace("\u2014", "")
    text = str(text).replace("\u201d", "")
    text = str(text).replace("\u201c", "")
    return text


In [4]:
# Populate each JSON file into a data frame

iphonex_digtrends = pd.DataFrame.from_dict(iphonex_digtrends, orient='columns')
iphonex_gizmodo = pd.DataFrame.from_dict(iphonex_gizmodo, orient='columns')
iphonex_techradar = pd.DataFrame.from_dict(iphonex_techradar, orient='columns')
S9_digtrends = pd.DataFrame.from_dict(S9_digtrends, orient='columns')
S9_gizmodo = pd.DataFrame.from_dict(S9_gizmodo, orient='columns')
S9_techradar = pd.DataFrame.from_dict(S9_techradar, orient='columns')

In [5]:
# Define function to clean text
def clean_text(df):
    # Convert lists to strings and remove brackets
    df['text'] = df['text'].astype(str)
    df['author'] = df['author'].astype(str)

    df['text'] = df['text'].map(lambda x: x.strip('[]'))
    df['author'] = df['author'].map(lambda x: x.strip('[]'))

    # Clean text
    df['text'] = df['text'].apply(lambda x: text_cleaner(x))
    df['title'] = df['title'].apply(lambda x: text_cleaner(x))
    df['author'] = df['author'].apply(lambda x: text_cleaner(x))

    
# Put dataframes into a list to iterate through
dataframes = [iphonex_digtrends, iphonex_gizmodo, iphonex_techradar, S9_digtrends, S9_gizmodo, S9_techradar]

# Clean each Data Frame
for dataframe in dataframes:
    clean_text(dataframe)

In [6]:
# Label all the rows in the dataframe for the phone that the article is talking about

iphones = [iphonex_digtrends, iphonex_gizmodo, iphonex_techradar]
s9s = [S9_digtrends, S9_gizmodo, S9_techradar]

for dataframe in iphones:
    dataframe['phone'] = 'IPhone X'
    
for dataframe in s9s:
    dataframe['phone'] = 'Samsung Galaxy S9'


In [7]:
# Concat all the dataframes into one dataframe
all_frames = [iphonex_digtrends, iphonex_gizmodo, iphonex_techradar, S9_digtrends, S9_gizmodo, S9_techradar]
df = pd.concat(all_frames)


In [8]:
# Visualize dataframe
df.head()

Unnamed: 0,author,text,title,phone
0,Eric Brackett,The iPhone X launched to stellar reviews and e...,Shrinking demand forces Apple to slow down iPh...,IPhone X
1,Lucas Coll,"When it comes to high-quality devices, like th...",Looking to upgrade? These are the best iPhone ...,IPhone X
2,Simon Hill,The iPhone X is completely different from any ...,"The most common iPhone X problems, and how to ...",IPhone X
3,Trevor Mogg,"If you’re in the market for an iPhone X, and p...","This $4,600 solar charger comes with an iPhone...",IPhone X
4,Mark Jansen,", The initial estimates, set during the Novemb...",Apple will halve iPhone X production after lim...,IPhone X


# Pre-Process Data for NLP

In [9]:
# # Tokenize text
# df['text'] = df.apply(lambda row: nltk.word_tokenize(row['text']), axis=1)
# df['title'] = df.apply(lambda row: nltk.word_tokenize(row['title']), axis=1)

# # Remove Stopwords, or keep it, might be important for aspect based semantics
# stop = stopwords.words('english')
# df['text'] = df['text'].apply(lambda x: [item for item in x if item not in stop])
# df['title'] = df['title'].apply(lambda x: [item for item in x if item not in stop])

# # Lowercase everything
# df['text'] = df['text'].astype(str)
# df['text'] = df['text'].apply(lambda x: x.lower())

# df['title'] = df['title'].astype(str)
# df['title'] = df['title'].apply(lambda x: x.lower())

# # remove all punctuations
# df['text'] = df['text'].apply(lambda x: ''.join(c for c in x if c not in punctuation))
# df['title'] = df['title'].apply(lambda x: ''.join(c for c in x if c not in punctuation))

In [10]:
from spacy.lang.en import English

spacy.load('en')
parser = English()

# Function to tokenize text
def tokenize(text):
    lda_tokens = []
    tokens  = parser(text)
    for token in tokens:
        if token.orth_.isspace():
            continue
        elif token.like_url:
            lda_tokens.append('URL')
        elif token.orth_.startswith('@'):
            lda_tokens.append('SCREEN_NAME')
        else:
            lda_tokens.append(token.lower_)
    return lda_tokens

In [11]:
# Download wordnet to find meaning of words, synonyms and antonyms
nltk.download('wordnet')

[nltk_data] Downloading package wordnet to
[nltk_data]     /Users/ayankarim/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [12]:
from nltk.corpus import wordnet as wn

# Function to lemmatize and more words to their root
def get_lemma(word):
    lemma = wn.morphy(word)
    if lemma is None:
        return word
    else:
        return lemma
    
# Compile set of stopwords
nltk.download('stopwords')
en_stop = set(nltk.corpus.stopwords.words('english'))

def prepare_text_for_lda(text):
    tokens = tokenize(text)
    tokens = [token for token in tokens if len(token) > 4]
    tokens = [token for token in tokens if token not in en_stop]
    tokens = [get_lemma(token) for token in tokens]
    return tokens

[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/ayankarim/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [13]:
text_data = []

# Prepare training set for LDA
tokens = df['text'].apply(lambda x: prepare_text_for_lda(x))

# Prepare Dataframe for later
df['text'] = df['text'].apply(lambda x: prepare_text_for_lda(x))

# Append tokenized text to list of tokenized data
null = tokens.apply(lambda x: text_data.append(x))

In [14]:
### Clean Title Column

# Tokenize text
df['title'] = df.apply(lambda row: nltk.word_tokenize(row['title']), axis=1)

# Remove Stopwords, or keep it, might be important for aspect based semantics
stop = stopwords.words('english')
df['title'] = df['title'].apply(lambda x: [item for item in x if item not in stop])

# Lowercase everything
df['title'] = df['title'].astype(str)
df['title'] = df['title'].apply(lambda x: x.lower())

In [15]:
# Visualize dataframe
df.head()

Unnamed: 0,author,text,title,phone
0,Eric Brackett,"[iphone, launch, stellar, review, equally, str...","['shrinking', 'demand', 'forces', 'apple', 'sl...",IPhone X
1,Lucas Coll,"[come, quality, devices, macbook, homepod, app...","['looking', 'upgrade', '?', 'these', 'best', '...",IPhone X
2,Simon Hill,"[iphone, completely, different, predecessor, f...","['the', 'common', 'iphone', 'x', 'problems', '...",IPhone X
3,Trevor Mogg,"[market, iphone, pricey, battery, pack, happen...","['this', '$', '4,600', 'solar', 'charger', 'co...",IPhone X
4,Mark Jansen,"[initial, estimate, november, launch, window, ...","['apple', 'halve', 'iphone', 'x', 'production'...",IPhone X


In [16]:
from gensim import corpora

# Assemble tokenized text data into a dictionary
dictionary = corpora.Dictionary(text_data)

In [17]:
# Create Bag of Words corpus from text data
corpus = [dictionary.doc2bow(text) for text in text_data]

In [18]:
from gensim import models

# Create TF-IDF vectors from our bag of words
tfidf = models.TfidfModel(corpus)
corpus_tfidf = tfidf[corpus]

from pprint import pprint

for doc in corpus_tfidf:
    pprint(doc)
    break

[(0, 0.03679786889970594),
 (1, 0.07617342592283681),
 (2, 0.1325071123169123),
 (3, 0.05375599827065056),
 (4, 0.14142825528319036),
 (5, 0.08927882787942588),
 (6, 0.08321168552940078),
 (7, 0.11008968466472606),
 (8, 0.031338570618464304),
 (9, 0.18744659011894513),
 (10, 0.11008968466472606),
 (11, 0.08321168552940078),
 (12, 0.1249643934126301),
 (13, 0.04552406733537042),
 (14, 0.11903843717256786),
 (15, 0.05375599827065056),
 (16, 0.16642337105880156),
 (17, 0.06248219670631505),
 (18, 0.016420406425483286),
 (19, 0.22017936932945212),
 (20, 0.16900105918222652),
 (21, 0.09313155529378143),
 (22, 0.016958129370944625),
 (23, 0.0563336863940755),
 (24, 0.11008968466472606),
 (25, 0.11008968466472606),
 (26, 0.08321168552940078),
 (27, 0.08321168552940078),
 (28, 0.15234685184567362),
 (29, 0.031338570618464304),
 (30, 0.10430313081651559),
 (31, 0.0563336863940755),
 (32, 0.09313155529378143),
 (33, 0.08321168552940078),
 (34, 0.02321561718696235),
 (35, 0.09313155529378143),
 (

In [19]:
import pickle

pickle.dump(corpus_tfidf, open('corpus_tfidf.pkl', 'wb'))
dictionary.save('dictionary.gensim')

# 3 Topics

In [20]:
import gensim

NUM_TOPICS = 3

# Extract Topics
ldamodel = gensim.models.ldamodel.LdaModel(corpus_tfidf, num_topics = NUM_TOPICS, id2word = dictionary, passes=50)
ldamodel.save('model3.gensim')

# Print terms for topics
topics = ldamodel.print_topics(num_words=7)
for topic in topics:
    print(topic)

(0, '0.001*"tesla" + 0.000*"caviar" + 0.000*"solar" + 0.000*"4,600" + 0.000*"ruble" + 0.000*"knocking" + 0.000*"299,000"')
(1, '0.000*"reuters" + 0.000*"hack" + 0.000*"employer" + 0.000*"conference" + 0.000*"researcher" + 0.000*"withdraw" + 0.000*"corephotonics"')
(2, '0.002*"iphone" + 0.002*"galaxy" + 0.002*"samsung" + 0.002*"apple" + 0.001*"battery" + 0.001*"camera" + 0.001*"screen"')


# 4 Topics

In [21]:
import gensim

NUM_TOPICS = 4

# Extract Topics
ldamodel = gensim.models.ldamodel.LdaModel(corpus_tfidf, num_topics = NUM_TOPICS, id2word = dictionary, passes=50)
ldamodel.save('model4.gensim')

# Print terms for topics
topics = ldamodel.print_topics(num_words=7)
for topic in topics:
    print(topic)

(0, '0.000*"court" + 0.000*"injunction" + 0.000*"legal" + 0.000*"license" + 0.000*"illegal" + 0.000*"ossia" + 0.000*"infringe"')
(1, '0.001*"woman" + 0.001*"pocket" + 0.001*"hillard" + 0.001*"pants" + 0.000*"idrop" + 0.000*"explode" + 0.000*"quarter"')
(2, '0.003*"iphone" + 0.002*"galaxy" + 0.002*"samsung" + 0.002*"apple" + 0.002*"battery" + 0.001*"camera" + 0.001*"screen"')
(3, '0.000*"microsoft" + 0.000*"ubreakifix" + 0.000*"warranty" + 0.000*"cursor" + 0.000*"impair" + 0.000*"repair" + 0.000*"accessibility"')


# 5 Topics

In [22]:
import gensim

NUM_TOPICS = 5

# Extract Topics
ldamodel = gensim.models.ldamodel.LdaModel(corpus_tfidf, num_topics = NUM_TOPICS, id2word = dictionary, passes=50)
ldamodel.save('model5.gensim')

# Print terms for topics
topics = ldamodel.print_topics(num_words=7)
for topic in topics:
    print(topic)

(0, '0.000*"protector" + 0.000*"leather" + 0.000*"armor" + 0.000*"protection" + 0.000*"disable" + 0.000*"protect" + 0.000*"scratch"')
(1, '0.001*"woman" + 0.001*"ossia" + 0.001*"pocket" + 0.001*"pants" + 0.001*"hillard" + 0.001*"idrop" + 0.000*"explode"')
(2, '0.001*"microsoft" + 0.001*"reuters" + 0.000*"hack" + 0.000*"employer" + 0.000*"researcher" + 0.000*"withdraw" + 0.000*"impair"')
(3, '0.001*"browser" + 0.001*"reboot" + 0.001*"driver" + 0.001*"shader" + 0.000*"graphicsfuzz" + 0.000*"deploy" + 0.000*"reproducible"')
(4, '0.003*"iphone" + 0.002*"galaxy" + 0.002*"samsung" + 0.002*"apple" + 0.002*"battery" + 0.002*"camera" + 0.001*"screen"')


# 7 Topics

In [23]:
import gensim

NUM_TOPICS = 7

# Extract Topics
ldamodel = gensim.models.ldamodel.LdaModel(corpus_tfidf, num_topics = NUM_TOPICS, id2word = dictionary, passes=50)
ldamodel.save('model7.gensim')

# Print terms for topics
topics = ldamodel.print_topics(num_words=7)
for topic in topics:
    print(topic)

(0, '0.001*"apparent" + 0.001*"tier" + 0.001*"brother" + 0.001*"bragging" + 0.001*"blimp" + 0.001*"dusty" + 0.001*"lucky"')
(1, '0.002*"quarter" + 0.001*"repair" + 0.001*"analytics" + 0.001*"corephotonics" + 0.001*"disable" + 0.001*"13-inch" + 0.001*"profit"')
(2, '0.002*"microsoft" + 0.001*"reboot" + 0.001*"browser" + 0.001*"woman" + 0.001*"pocket" + 0.001*"hillard" + 0.001*"pants"')
(3, '0.004*"iphone" + 0.003*"galaxy" + 0.003*"samsung" + 0.002*"apple" + 0.002*"battery" + 0.002*"camera" + 0.002*"screen"')
(4, '0.001*"tesla" + 0.001*"caviar" + 0.001*"solar" + 0.001*"commendable" + 0.001*"positive" + 0.001*"swift" + 0.001*"unveiling"')
(5, '0.001*"repair" + 0.001*"ubreakifix" + 0.001*"warranty" + 0.001*"authorize" + 0.000*"lawder" + 0.000*"location" + 0.000*"partner"')
(6, '0.001*"protector" + 0.001*"reuters" + 0.001*"leather" + 0.001*"armor" + 0.001*"employer" + 0.001*"hack" + 0.000*"beneath"')


# 10 Topics

In [24]:
import gensim

NUM_TOPICS = 10

# Extract Topics
ldamodel = gensim.models.ldamodel.LdaModel(corpus_tfidf, num_topics = NUM_TOPICS, id2word = dictionary, passes=50)
ldamodel.save('model10.gensim')

# Print terms for topics
topics = ldamodel.print_topics(num_words=7)
for topic in topics:
    print(topic)

(0, '0.002*"repair" + 0.002*"ossia" + 0.001*"beyond" + 0.001*"protector" + 0.001*"gaming" + 0.001*"provider" + 0.001*"lease"')
(1, '0.001*"repair" + 0.001*"study" + 0.001*"ubreakifix" + 0.001*"update" + 0.001*"strategy" + 0.001*"analytics" + 0.001*"warranty"')
(2, '0.001*"update" + 0.001*"august" + 0.001*"tweet" + 0.001*"suggestion" + 0.001*"vanilla" + 0.001*"lockscreen" + 0.001*"phablet"')
(3, '0.001*"deadspots" + 0.001*"etnews" + 0.001*"input" + 0.001*"disable" + 0.001*"brother" + 0.001*"tier" + 0.001*"management"')
(4, '0.005*"iphone" + 0.004*"galaxy" + 0.004*"samsung" + 0.003*"apple" + 0.003*"battery" + 0.003*"camera" + 0.002*"screen"')
(5, '0.001*"lense" + 0.001*"journal" + 0.001*"monstrous" + 0.001*"nokia" + 0.001*"cram" + 0.001*"beyond" + 0.001*"biometric"')
(6, '0.002*"microsoft" + 0.002*"nokia" + 0.001*"policy" + 0.001*"interface" + 0.001*"14-day" + 0.001*"stereo" + 0.001*"conference"')
(7, '0.002*"quarter" + 0.001*"lawsuit" + 0.001*"movement" + 0.001*"refurbish" + 0.001*"chip

# pyLDAvis

In [25]:
# Create Visualizations of topic clusters using pyLDAvis

dictionary = gensim.corpora.Dictionary.load('dictionary.gensim')
corpus = pickle.load(open('/Users/ayankarim/Documents/Thinkful/Bootcamp/Final Capstone Opinion Mining/opinion_mining/Notebook/corpus_tfidf.pkl', 'rb'))


In [26]:
import pyLDAvis.gensim

# Display 3 topics
lda3 = gensim.models.ldamodel.LdaModel.load('model3.gensim')
lda_display3 = pyLDAvis.gensim.prepare(lda3, corpus, dictionary, sort_topics=False)
pyLDAvis.display(lda_display3)

of pandas will change to not sort by default.

To accept the future behavior, pass 'sort=False'.


  return pd.concat([default_term_info] + list(topic_dfs))


In [27]:
# Display 4 topics
lda4 = gensim.models.ldamodel.LdaModel.load('model4.gensim')
lda_display4 = pyLDAvis.gensim.prepare(lda4, corpus, dictionary, sort_topics=False)
pyLDAvis.display(lda_display4)

of pandas will change to not sort by default.

To accept the future behavior, pass 'sort=False'.


  return pd.concat([default_term_info] + list(topic_dfs))


In [28]:
# Display 5 topics
lda5 = gensim.models.ldamodel.LdaModel.load('model5.gensim')
lda_display5 = pyLDAvis.gensim.prepare(lda5, corpus, dictionary, sort_topics=False)
pyLDAvis.display(lda_display5)

of pandas will change to not sort by default.

To accept the future behavior, pass 'sort=False'.


  return pd.concat([default_term_info] + list(topic_dfs))


In [None]:
### SEEMS LIKE TOPIC MODELLING WONT WORK
### GO STRAIGHT TO ASPECT BASED SEMANTIC ANALYS

### TOPIC MODELLING MIGHT WORK WITH MUCH LARGER CORPUS OF DATA

### CREATE NEW JUPYTER NOTEBOOK FOR THIS TOPIC MODELLING CODE
### COPY JUST WHATS NEEDED FOR ABSA

### BRAINSTORM WHAT YOU WANT YOUR OUTPUT TO BE NOW, SHOULD BE EASIER TECHNICALLY
### RESEARCH HOW TO DO ABSA WITHOUT TOPICS

# MAY NEED TO SCRAPE TEST DATA. PERHAPS TWO DIFFERENT PHONES!!!!! TEST MODEL ON TEST DATA TO SEE HOW IT ANALYSES SENTIMENT AND OUTPUTS OUR RESULT.