In [1]:
# Import Dependencies and modules
%matplotlib inline
import numpy as np
import pandas as pd
import scipy
import sklearn
import spacy
import matplotlib.pyplot as plt
import seaborn as sns
import re
import string
from string import punctuation
from collections import Counter
from io import StringIO
import nltk
import glob
import errno
import os
import json

In [2]:
# Load each scraped-article JSON file (one file per phone/site pair).
def _load_json(path):
    """Parse the JSON document stored at ``path`` and return the decoded object.

    The file is opened explicitly as UTF-8. Without an ``encoding`` argument
    ``open()`` falls back to the platform's locale encoding, which would
    mis-decode non-ASCII characters (curly quotes, dashes, currency signs)
    on systems whose default is not UTF-8.
    """
    with open(path, encoding='utf-8') as f:
        return json.load(f)


iphonex_digtrends = _load_json('iphonex_digtrends.json')
iphonex_gizmodo = _load_json('iphonex_gizmodo.json')
iphonex_techradar = _load_json('iphonex_techradar.json')

pixel3_digtrends = _load_json('pixel3_digtrends.json')
pixel3_gizmodo = _load_json('pixel3_gizmodo.json')
pixel3_techradar = _load_json('pixel3_techradar.json')

In [3]:
# Utility function for standard text cleaning.
def text_cleaner(text):
    """Return ``text`` with markup, escape residue and selected punctuation removed.

    ``text`` is coerced with ``str()``, so any object is accepted (``None``
    becomes the string ``"None"``).

    The steps, in order:

    1. Drop real newlines/tabs and their escaped spellings (``\\n``, ``\\t``).
    2. Drop the escaped code points ``\\u2019``, ``\\u2013``, ``\\u2018`` and
       ``\\u00a3``; turn an escaped non-breaking space (``\\u00a0`` or
       ``\\xa0``) into a regular space.
    3. Drop every remaining backslash and every apostrophe.
    4. Drop HTML tags.
    5. Drop the literal characters U+2014, U+201D and U+201C.

    Step 2 has to run before step 3: once the backslashes are gone the
    escaped sequences can no longer be recognised and would leave residue
    such as ``u2019`` in the output.

    Parameters
    ----------
    text : object
        Value to clean.

    Returns
    -------
    str
        The cleaned string.
    """
    text = str(text)

    # Real control characters first, then their escaped (repr-style) forms.
    for token in ("\n", "\t", "\\n", "\\t"):
        text = text.replace(token, "")

    # Escaped code points -- must be matched while the backslash is still there.
    for token in ("\\u2019", "\\u2013", "\\u2018", "\\u00a3"):
        text = text.replace(token, "")

    # An escaped non-breaking space separates words, so it becomes a plain
    # space.  Matching the backslash as part of the token keeps ordinary text
    # that merely contains "xa0" (e.g. the hex literal 0xa0) intact.
    for token in ("\\u00a0", "\\xa0"):
        text = text.replace(token, " ")

    # Whatever backslashes are left are stray escape residue.
    text = text.replace("\\", "")
    # Removes every apostrophe, including the quotes left by str(list).
    text = text.replace("'", "")

    # Common tags first (plain literals), then any remaining tag.
    for tag in ("<p>", "</p>", "</a>"):
        text = text.replace(tag, "")
    text = re.sub('<[^>]+>', "", text)

    # NOTE(review): only these three characters are removed in their literal
    # form; the code points handled above are only removed when escaped.
    for char in ("\u2014", "\u201d", "\u201c"):
        text = text.replace(char, "")

    return text


In [4]:
# Turn every parsed JSON payload into a DataFrame.  Each name is rebound from
# the raw parsed object to the frame built from it, in the order listed.
(
    iphonex_digtrends,
    iphonex_gizmodo,
    iphonex_techradar,
    pixel3_digtrends,
    pixel3_gizmodo,
    pixel3_techradar,
) = (
    pd.DataFrame.from_dict(payload, orient='columns')
    for payload in (
        iphonex_digtrends,
        iphonex_gizmodo,
        iphonex_techradar,
        pixel3_digtrends,
        pixel3_gizmodo,
        pixel3_techradar,
    )
)

In [5]:
# Define function to clean text
def clean_text(df):
    """Clean the ``text``, ``title`` and ``author`` columns of ``df`` in place.

    ``text`` and ``author`` are first cast to ``str`` and have any leading or
    trailing ``[`` / ``]`` characters stripped; afterwards all three columns
    are passed through ``text_cleaner``.  The frame is modified directly and
    nothing is returned.
    """
    bracketed_columns = ('text', 'author')

    # Cast to string, then trim the surrounding square brackets.
    for column in bracketed_columns:
        df[column] = df[column].astype(str)
    for column in bracketed_columns:
        df[column] = df[column].str.strip('[]')

    # Run the shared cleaner over every textual column.
    for column in ('text', 'title', 'author'):
        df[column] = df[column].apply(text_cleaner)
    
# Collect the six per-site frames so they can be processed uniformly.
dataframes = [
    iphonex_digtrends,
    iphonex_gizmodo,
    iphonex_techradar,
    pixel3_digtrends,
    pixel3_gizmodo,
    pixel3_techradar,
]

# clean_text works in place, so its return value is not used.
for dataframe in dataframes:
    clean_text(dataframe)

In [6]:
# Tag every row with the phone its article is about.

iphones = [iphonex_digtrends, iphonex_gizmodo, iphonex_techradar]
pixel3s = [pixel3_digtrends, pixel3_gizmodo, pixel3_techradar]

# Pair each group of frames with its label and add the 'phone' column in place.
for frames, label in ((iphones, 'IPhone X'), (pixel3s, 'Google Pixel 3')):
    for dataframe in frames:
        dataframe['phone'] = label


In [7]:
# Stack the six per-site frames row-wise into a single corpus frame.
# Each frame keeps its own row labels, so index values repeat across sources.
all_frames = [
    iphonex_digtrends,
    iphonex_gizmodo,
    iphonex_techradar,
    pixel3_digtrends,
    pixel3_gizmodo,
    pixel3_techradar,
]
df = pd.concat(all_frames, axis=0)


In [8]:
# Preview the combined frame.  Only the first rows are shown: rendering the
# whole corpus bloats the notebook and hides the narrative.
df.head(10)

Unnamed: 0,author,text,title,phone
0,Eric Brackett,The iPhone X launched to stellar reviews and e...,Shrinking demand forces Apple to slow down iPh...,IPhone X
1,Lucas Coll,"When it comes to high-quality devices, like th...",Looking to upgrade? These are the best iPhone ...,IPhone X
2,Simon Hill,The iPhone X is completely different from any ...,"The most common iPhone X problems, and how to ...",IPhone X
3,Trevor Mogg,"If you’re in the market for an iPhone X, and p...","This $4,600 solar charger comes with an iPhone...",IPhone X
4,Mark Jansen,", The initial estimates, set during the Novemb...",Apple will halve iPhone X production after lim...,IPhone X
5,Lucas Coll,This year has been a pretty good one for flags...,"Buy any iPhone X, XS, or XS Max from Verizon a...",IPhone X
6,Mark Jansen,"The iPhone X has an incredible OLED display, g...",The best iPhone X battery cases,IPhone X
7,Brenda Stolyar,Last year’s iOS 11 update added a little more ...,Apple iOS 12 review,IPhone X
8,Christian de Looper,Reports may suggest that Apple wasn’t all that...,Apple raked in more than half of all global sm...,IPhone X
9,Eric Brackett,Apple and Samsung are major rivals in the smar...,High cost of OLED displays spells trouble for ...,IPhone X


In [9]:
# TODO: Preprocess for NLP:
    # Tokenize the text
    # Remove stopwords -- or keep them, since they might be important for aspect-based semantics
    # Lowercase everything
    # Remove all punctuation

# TODO: Read through Peter Min's Medium post and take notes on ideas that are useful to this project
# and should be implemented