DSS: Mobile Phone Reviews - Data Cleanup and Vectorization

In [4]:
# Import modules
import seaborn as sns
import csv
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import string
# import tensorflow as tf
import zipfile
from pathlib import Path

In [5]:
# Load the raw CSV once; `data_static` is left untouched and `data` is an
# independent deep copy used as the working frame.
data_static = pd.read_csv(filepath_or_buffer="../../Data/Amazon_Unlocked_Mobile.csv")
data = data_static.copy()

In [6]:
# Preview the first five rows to see the column layout
data.head(n=5)

Unnamed: 0,Product Name,Brand Name,Price,Rating,Reviews,Review Votes
0,"""CLEAR CLEAN ESN"" Sprint EPIC 4G Galaxy SPH-D7...",Samsung,199.99,5,I feel so LUCKY to have found this used (phone...,1.0
1,"""CLEAR CLEAN ESN"" Sprint EPIC 4G Galaxy SPH-D7...",Samsung,199.99,4,"nice phone, nice up grade from my pantach revu...",0.0
2,"""CLEAR CLEAN ESN"" Sprint EPIC 4G Galaxy SPH-D7...",Samsung,199.99,5,Very pleased,0.0
3,"""CLEAR CLEAN ESN"" Sprint EPIC 4G Galaxy SPH-D7...",Samsung,199.99,4,It works good but it goes slow sometimes but i...,0.0
4,"""CLEAR CLEAN ESN"" Sprint EPIC 4G Galaxy SPH-D7...",Samsung,199.99,4,Great phone to replace my lost phone. The only...,0.0


In [7]:
# Total number of rows in the working frame
# We have 413840 total rows
len(data)

413840

In [8]:
# Count the missing values in each column. The generator yields the counts in
# the same order as the names on the left-hand side.
(
    num_null_Brand_Name,
    num_null_Product_Name,
    num_null_Price,
    num_null_Rating,
    num_null_Reviews,
    num_null_Review_Votes,
) = (
    data[column].isna().sum()
    for column in ('Brand Name', 'Product Name', 'Price', 'Rating', 'Reviews', 'Review Votes')
)

In [9]:
# Rows in which at least one of Rating, Reviews, Review Votes or Price is null
num_remove = len(
    data[data[['Rating', 'Reviews', 'Review Votes', 'Price']].isna().any(axis=1)]
)
num_remove
# There are 18115 rows to remove

18115

In [10]:
# Drop the Product Name and Brand Name columns
data_dropped = data.drop(columns=['Product Name', 'Brand Name'])
# Discard every row that is missing a rating, review text, vote count or price
cleaned_data = data_dropped.dropna(subset=['Rating', 'Reviews', 'Review Votes', 'Price'])
cleaned_data  # Product and Brand Name removed, and all rows with null values removed

Unnamed: 0,Price,Rating,Reviews,Review Votes
0,199.99,5,I feel so LUCKY to have found this used (phone...,1.0
1,199.99,4,"nice phone, nice up grade from my pantach revu...",0.0
2,199.99,5,Very pleased,0.0
3,199.99,4,It works good but it goes slow sometimes but i...,0.0
4,199.99,4,Great phone to replace my lost phone. The only...,0.0
5,199.99,1,I already had a phone with problems... I know ...,1.0
6,199.99,2,The charging port was loose. I got that solder...,0.0
7,199.99,2,"Phone looks good but wouldn't stay charged, ha...",0.0
8,199.99,5,I originally was using the Samsung S2 Galaxy f...,0.0
9,199.99,3,It's battery life is great. It's very responsi...,0.0


Here I use Python's `str.translate` method to remove punctuation from the reviews. Because our dataset is so large, `str.replace` would take too long and could potentially crash the kernel. `str.translate` is a built-in Python method implemented in C, so it runs very fast. I first join all the reviews with a character sequence that does not exist in the data: '|aa'. I then delete every character listed in the `punctuations` variable with a single `translate` call, and finally split the result on '|aa' to recover the individual reviews.

In [12]:
# Translation table that deletes every ASCII punctuation character.
# '|' is included: it was previously left out only to protect a '|aa' join
# sentinel, which is no longer used.
punctuations = '!"#$%&\'()*+,-./:;<=>?@[\\]^_`{|}~'
translations = str.maketrans(dict.fromkeys(punctuations, ''))

# Translate each review on its own rather than joining all reviews with a
# '|aa' sentinel and splitting afterwards. With the sentinel approach, a review
# that contains '|aa' (or produces it once punctuation is deleted, e.g. '|.aa')
# makes the split return more pieces than there are rows, so the assignment
# raises ValueError. Series.str.translate applies the same str.translate call
# to every element and keeps the result aligned with the frame's index.
cleaned_data['Reviews'] = cleaned_data['Reviews'].str.translate(translations)

In [13]:
# Display the frame again to check the 'Reviews' column after the translate step
cleaned_data

Unnamed: 0,Price,Rating,Reviews,Review Votes
0,199.99,5,I feel so LUCKY to have found this used phone ...,1.0
1,199.99,4,nice phone nice up grade from my pantach revue...,0.0
2,199.99,5,Very pleased,0.0
3,199.99,4,It works good but it goes slow sometimes but i...,0.0
4,199.99,4,Great phone to replace my lost phone The only ...,0.0
5,199.99,1,I already had a phone with problems I know it ...,1.0
6,199.99,2,The charging port was loose I got that soldere...,0.0
7,199.99,2,Phone looks good but wouldnt stay charged had ...,0.0
8,199.99,5,I originally was using the Samsung S2 Galaxy f...,0.0
9,199.99,3,Its battery life is great Its very responsive ...,0.0


In [None]:
# Words flagged as unnecessary (presumably to be filtered out of the reviews
# in a later step; TODO confirm, no usage is visible here)
unnecessary_words = [
    'it',
    'I',
    'the',
]