Data Preprocessing

In [1]:
# Mount Google Drive inside the Colab filesystem; the final cell writes its CSV under this mount.
from google.colab import drive

DRIVE_MOUNT_POINT = '/content/drive'
drive.mount(DRIVE_MOUNT_POINT)

Mounted at /content/drive


In [2]:
import pandas as pd
import gzip
import json
import os
import matplotlib.pyplot as plt
import string
import nltk
nltk.download('punkt')
nltk.download('stopwords')
nltk.download('wordnet')
from nltk.corpus import stopwords
from bs4 import BeautifulSoup
import unicodedata
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import word_tokenize, sent_tokenize, regexp_tokenize 
from nltk.stem import PorterStemmer, WordNetLemmatizer
import re

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Unzipping corpora/wordnet.zip.


In [3]:
!wget http://snap.stanford.edu/data/amazon/productGraph/categoryFiles/reviews_Electronics_5.json.gz

--2021-10-02 00:06:15--  http://snap.stanford.edu/data/amazon/productGraph/categoryFiles/reviews_Electronics_5.json.gz
Resolving snap.stanford.edu (snap.stanford.edu)... 171.64.75.80
Connecting to snap.stanford.edu (snap.stanford.edu)|171.64.75.80|:80... connected.
HTTP request sent, awaiting response... 200 OK
Length: 495854086 (473M) [application/x-gzip]
Saving to: ‘reviews_Electronics_5.json.gz’


2021-10-02 00:08:39 (3.29 MB/s) - ‘reviews_Electronics_5.json.gz’ saved [495854086/495854086]



In [4]:
### Load the review data

# Every line of the gzip archive is parsed as one JSON document.
with gzip.open('reviews_Electronics_5.json.gz') as f:
    data = [json.loads(raw_line.strip()) for raw_line in f]

# Turn the list of parsed records into a pandas DataFrame
df = pd.DataFrame(data)

Checking for null values

In [6]:
# Number of missing values in each column
df.isna().sum()

reviewerID            0
asin                  0
reviewerName      24730
helpful               0
reviewText            0
overall               0
summary               0
unixReviewTime        0
reviewTime            0
dtype: int64

In [5]:
# Discard rows whose 'reviewText' is missing, then show the (rows, columns) that remain
df = df.dropna(axis = 'index', subset = ['reviewText'])
df.shape

(1689188, 9)

Concatenate review text and summary 

In [7]:
def _join_text_fields(row):
    """Join the non-missing values of one row into a single space-separated string."""
    # pd.notna() detects real missing values (NaN / None). Comparing str(value)
    # with 'nan' would also discard a genuine field whose text is literally "nan"
    # and would let None through as the string "None".
    return " ".join(str(value) for value in row if pd.notna(value))


# Combine the summary (title) and the review body into one 'review_text' column
df['review_text'] = df[['summary', 'reviewText']].apply(_join_text_fields, axis = 1)

# The two source columns are now redundant
df = df.drop(['reviewText', 'summary'], axis = 1)
df.head()

Unnamed: 0,reviewerID,asin,reviewerName,helpful,overall,unixReviewTime,reviewTime,review_text
0,AO94DHGC771SJ,0528881469,amazdnu,"[0, 0]",5.0,1370131200,"06 2, 2013",Gotta have GPS! We got this GPS for my husband...
1,AMO214LNFCEI4,0528881469,Amazon Customer,"[12, 15]",1.0,1290643200,"11 25, 2010",Very Disappointed I'm a professional OTR truck...
2,A3N7T0DY83Y4IG,0528881469,C. A. Freeman,"[43, 45]",3.0,1283990400,"09 9, 2010","1st impression Well, what can I say. I've had..."
3,A1H8PY3QHMQQA0,0528881469,"Dave M. Shaw ""mack dave""","[9, 10]",2.0,1290556800,"11 24, 2010","Great grafics, POOR GPS Not going to write a l..."
4,A24EV6RXELQZ63,0528881469,Wayne Smith,"[0, 0]",1.0,1317254400,"09 29, 2011","Major issues, only excuses for support I've ha..."
...,...,...,...,...,...,...,...,...
1689183,A34BZM6S9L7QI4,B00LGQ6HL8,"Candy Cane ""Is it just me?""","[1, 1]",5.0,1405555200,"07 17, 2014",Boom -- Pop -- Pow. These deliver. Burned the...
1689184,A1G650TTTHEAL5,B00LGQ6HL8,"Charles Spanky ""Zumina Reviews""","[0, 0]",5.0,1405382400,"07 15, 2014","Thin and light, without compromising on sound ..."
1689185,A25C2M3QF9G7OQ,B00LGQ6HL8,Comdet,"[0, 0]",5.0,1405555200,"07 17, 2014",Same form factor and durability as the S1 with...
1689186,A1E1LEVQ9VQNK,B00LGQ6HL8,J. Chambers,"[0, 0]",5.0,1405641600,"07 18, 2014",Superb audio quality in a very comfortable set...


Remove duplicate data based on **reviewerName**, **asin** and **review_text** columns

In [14]:
# Rows that share the same product id, reviewer name and review text count as
# duplicates; only the first occurrence of each is kept.
# NOTE(review): 'reviewerName' is a display name and contains missing values,
# while 'reviewerID' has none — confirm that the name is the intended key.
dedup_keys = ['asin', 'reviewerName', 'review_text']
df = df.drop_duplicates(subset = dedup_keys, keep = 'first')
df

Unnamed: 0,reviewerID,asin,reviewerName,helpful,overall,unixReviewTime,reviewTime,review_text
0,AO94DHGC771SJ,0528881469,amazdnu,"[0, 0]",5.0,1370131200,"06 2, 2013",Gotta have GPS! We got this GPS for my husband...
1,AMO214LNFCEI4,0528881469,Amazon Customer,"[12, 15]",1.0,1290643200,"11 25, 2010",Very Disappointed I'm a professional OTR truck...
2,A3N7T0DY83Y4IG,0528881469,C. A. Freeman,"[43, 45]",3.0,1283990400,"09 9, 2010","1st impression Well, what can I say. I've had..."
3,A1H8PY3QHMQQA0,0528881469,"Dave M. Shaw ""mack dave""","[9, 10]",2.0,1290556800,"11 24, 2010","Great grafics, POOR GPS Not going to write a l..."
4,A24EV6RXELQZ63,0528881469,Wayne Smith,"[0, 0]",1.0,1317254400,"09 29, 2011","Major issues, only excuses for support I've ha..."
...,...,...,...,...,...,...,...,...
1689183,A34BZM6S9L7QI4,B00LGQ6HL8,"Candy Cane ""Is it just me?""","[1, 1]",5.0,1405555200,"07 17, 2014",Boom -- Pop -- Pow. These deliver. Burned the...
1689184,A1G650TTTHEAL5,B00LGQ6HL8,"Charles Spanky ""Zumina Reviews""","[0, 0]",5.0,1405382400,"07 15, 2014","Thin and light, without compromising on sound ..."
1689185,A25C2M3QF9G7OQ,B00LGQ6HL8,Comdet,"[0, 0]",5.0,1405555200,"07 17, 2014",Same form factor and durability as the S1 with...
1689186,A1E1LEVQ9VQNK,B00LGQ6HL8,J. Chambers,"[0, 0]",5.0,1405641600,"07 18, 2014",Superb audio quality in a very comfortable set...


In [16]:
##########################################
# Parse the 'reviewTime' strings into a datetime column named 'review_time'
##########################################
# Remove the comma so a value such as "06 2, 2013" matches the '%m %d %Y' format
date_strings = df['reviewTime'].str.replace(',', "")
df['review_time'] = pd.to_datetime(date_strings, format = '%m %d %Y')

# The original 'reviewTime' string column is now redundant
df = df.drop(columns = ['reviewTime'])
df.head()

Unnamed: 0,reviewerID,asin,reviewerName,helpful,overall,unixReviewTime,review_text,review_time
0,AO94DHGC771SJ,528881469,amazdnu,"[0, 0]",5.0,1370131200,Gotta have GPS! We got this GPS for my husband...,2013-06-02
1,AMO214LNFCEI4,528881469,Amazon Customer,"[12, 15]",1.0,1290643200,Very Disappointed I'm a professional OTR truck...,2010-11-25
2,A3N7T0DY83Y4IG,528881469,C. A. Freeman,"[43, 45]",3.0,1283990400,"1st impression Well, what can I say. I've had...",2010-09-09
3,A1H8PY3QHMQQA0,528881469,"Dave M. Shaw ""mack dave""","[9, 10]",2.0,1290556800,"Great grafics, POOR GPS Not going to write a l...",2010-11-24
4,A24EV6RXELQZ63,528881469,Wayne Smith,"[0, 0]",1.0,1317254400,"Major issues, only excuses for support I've ha...",2011-09-29


In [17]:
# Objects that never change between calls are built once instead of once per review.
_MARKUP_PATTERN = re.compile("<.*?>|&([a-z0-9]+|#[0-9]{1,6}|#x[0-9a-f]{1,6});")
_PUNCTUATION_TABLE = str.maketrans('', '', string.punctuation)
_NLTK_RESOURCES = {}


def _nltk_resources():
  """Return the (stop-word set, lemmatizer) pair, creating both on first use."""
  if not _NLTK_RESOURCES:
    # You can add more stop words here, specific for texts
    _NLTK_RESOURCES['stop_words'] = frozenset(stopwords.words('english'))
    _NLTK_RESOURCES['lemmatizer'] = WordNetLemmatizer()
  return _NLTK_RESOURCES['stop_words'], _NLTK_RESOURCES['lemmatizer']


def clean_text(text):
  """Normalise one piece of review text into a string of lemmatized words.

  Steps, in order:
    1. replace HTML-like tags and character entities with a space and
       collapse runs of whitespace;
    2. tokenize with NLTK's ``word_tokenize``;
    3. lower-case each token and delete punctuation characters from it;
    4. keep only purely alphabetic tokens that are not English stop words;
    5. lemmatize each remaining word with ``WordNetLemmatizer``.

  Args:
    text: the raw text. Anything that is not a ``str`` (for example a
      missing value such as NaN or None) is treated as empty.

  Returns:
    The cleaned words joined by single spaces; ``''`` when nothing is left.
  """
  # Missing values reach this function as NaN/None when it is used with
  # Series.apply; return an empty result instead of failing inside the regex.
  if not isinstance(text, str):
    return ''

  # Markup is replaced by a space, not deleted, so the words on either side
  # of a tag or entity ("great<br>product") are not fused into one token.
  text = ' '.join(_MARKUP_PATTERN.sub(" ", text).split())

  stop_words, lemmatizer = _nltk_resources()

  words = []
  for token in word_tokenize(text):
    # lower-case, then delete punctuation characters from the token
    word = token.lower().translate(_PUNCTUATION_TABLE)
    # drop tokens that are not alphabetic (numbers, now-empty strings) and stop words
    if word.isalpha() and word not in stop_words:
      # lemmatization (dictionary base form), not stemming
      words.append(lemmatizer.lemmatize(word))

  # Convert from list to a sentence again
  return ' '.join(words)

In [18]:
# Run every review through clean_text, replacing the raw text with its cleaned form
df['review_text'] = df['review_text'].map(clean_text)
df.head()

Unnamed: 0,reviewerID,asin,reviewerName,helpful,overall,unixReviewTime,review_text,review_time
0,AO94DHGC771SJ,528881469,amazdnu,"[0, 0]",5.0,1370131200,got ta gps got gps husband otr road trucker im...,2013-06-02
1,AMO214LNFCEI4,528881469,Amazon Customer,"[12, 15]",1.0,1290643200,disappointed professional otr truck driver bou...,2010-11-25
2,A3N7T0DY83Y4IG,528881469,C. A. Freeman,"[43, 45]",3.0,1283990400,impression well say unit truck four day prior ...,2010-09-09
3,A1H8PY3QHMQQA0,528881469,"Dave M. Shaw ""mack dave""","[9, 10]",2.0,1290556800,great grafics poor gps going write long review...,2010-11-24
4,A24EV6RXELQZ63,528881469,Wayne Smith,"[0, 0]",1.0,1317254400,major issue excuse support mine year got try r...,2011-09-29


In [19]:
# Write the processed reviews to a CSV file under the mounted Google Drive
processed_csv_path = "/content/drive/MyDrive/Dataset/Electronics_processed.csv"
df.to_csv(processed_csv_path)