In [1]:
import pandas as pd
import numpy as np
import seaborn as sb
import matplotlib.pyplot as plt
from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = "all"
import nltk
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer
nltk.download('stopwords')
nltk.download('punkt')
import string

[nltk_data] Downloading package stopwords to
[nltk_data]     /home/wilson289296/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     /home/wilson289296/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [2]:
clean_data = pd.read_csv('Reviews.csv')

In [3]:
clean_data

Unnamed: 0,Id,ProductId,UserId,ProfileName,HelpfulnessNumerator,HelpfulnessDenominator,Score,Time,Summary,Text
0,1,B001E4KFG0,A3SGXH7AUHU8GW,delmartian,1,1,5,1303862400,Good Quality Dog Food,I have bought several of the Vitality canned d...
1,2,B00813GRG4,A1D87F6ZCVE5NK,dll pa,0,0,1,1346976000,Not as Advertised,Product arrived labeled as Jumbo Salted Peanut...
2,3,B000LQOCH0,ABXLMWJIXXAIN,"Natalia Corres ""Natalia Corres""",1,1,4,1219017600,"""Delight"" says it all",This is a confection that has been around a fe...
3,4,B000UA0QIQ,A395BORC6FGVXV,Karl,3,3,2,1307923200,Cough Medicine,If you are looking for the secret ingredient i...
4,5,B006K2ZZ7K,A1UQRSCLF8GW1T,"Michael D. Bigham ""M. Wassir""",0,0,5,1350777600,Great taffy,Great taffy at a great price. There was a wid...
...,...,...,...,...,...,...,...,...,...,...
568449,568450,B001EO7N10,A28KG5XORO54AY,Lettie D. Carter,0,0,5,1299628800,Will not do without,Great for sesame chicken..this is a good if no...
568450,568451,B003S1WTCU,A3I8AFVPEE8KI5,R. Sawyer,0,0,2,1331251200,disappointed,I'm disappointed with the flavor. The chocolat...
568451,568452,B004I613EE,A121AA1GQV751Z,"pksd ""pk_007""",2,2,5,1329782400,Perfect for our maltipoo,"These stars are small, so you can give 10-15 o..."
568452,568453,B004I613EE,A3IBEVCTXKNOH,"Kathy A. Welch ""katwel""",1,1,5,1331596800,Favorite Training and reward treat,These are the BEST treats for training and rew...


# CF is infeasible with this dataset

In [6]:
print(f"Number of unique products: {len(clean_data['ProductId'].unique())}")
print(f"NUmber of unique users: {len(clean_data['UserId'].unique())}")

Number of unique products: 74258
NUmber of unique users: 256059


# So we'll do NLP to see if we can predict the score and helpfulness

In [11]:
#misc feature engineering
data = clean_data.copy()
# data['HelpRatio'] = data['HelpfulnessNumerator']/data['HelpfulnessDenominator']
# data['HelpRatio'] = data['HelpRatio'].apply(lambda x: 0 if np.isnan(x) else x)
data['Time'] = (data['Time'] - data['Time'].min())/(data['Time'].max() - data['Time'].min())

In [11]:
#convert all strings to lower
data['CleanText'] = data['CleanText'].apply(lambda x: x.lower())

In [11]:
#remove punctuation
data['CleanText'] = data['CleanText'].apply(lambda x: "".join([char for char in x if char not in string.punctuation]))

In [11]:
#tokenization
data['CleanText'] = data['CleanText'].apply(lambda x: nltk.word_tokenize(x))

In [12]:
#stopword removal
stop_words = stopwords.words('english')
data['CleanText'] = data['CleanText'].apply(lambda x: [word for word in x if word not in stop_words])

In [13]:
#stemming
porter = PorterStemmer()
data['CleanText'] = data['CleanText'].apply(lambda x: [porter.stem(word) for word in x])

In [20]:
for i in range(3):
    data.loc[i]['CleanText']

['0',
 'good',
 'qualiti',
 'dog',
 'food',
 'bought',
 'sever',
 '1',
 'advertis',
 'product',
 'arriv',
 'label',
 'j',
 '2',
 'delight',
 'say',
 'confect',
 'tha',
 '3',
 'cough',
 'medicin',
 'look',
 'secr',
 '4',
 'great',
 'taffi',
 'great',
 'taffi',
 'great',
 'price',
 '568449',
 'without',
 'great',
 'sesam',
 'chicken',
 '568450',
 'disappoint',
 'im',
 'disappoint',
 'flavor',
 '568451',
 'perfect',
 'maltipoo',
 'star',
 'small',
 '568452',
 'favorit',
 'train',
 'reward',
 'treat',
 '568453',
 'great',
 'honey',
 'satisfi',
 'product',
 'length',
 '568454',
 'dtype',
 'object']

['0',
 'good',
 'qualiti',
 'dog',
 'food',
 'bought',
 'sever',
 '1',
 'advertis',
 'product',
 'arriv',
 'label',
 'j',
 '2',
 'delight',
 'say',
 'confect',
 'tha',
 '3',
 'cough',
 'medicin',
 'look',
 'secr',
 '4',
 'great',
 'taffi',
 'great',
 'taffi',
 'great',
 'price',
 '568449',
 'without',
 'great',
 'sesam',
 'chicken',
 '568450',
 'disappoint',
 'im',
 'disappoint',
 'flavor',
 '568451',
 'perfect',
 'maltipoo',
 'star',
 'small',
 '568452',
 'favorit',
 'train',
 'reward',
 'treat',
 '568453',
 'great',
 'honey',
 'satisfi',
 'product',
 'length',
 '568454',
 'dtype',
 'object']

['0',
 'good',
 'qualiti',
 'dog',
 'food',
 'bought',
 'sever',
 '1',
 'advertis',
 'product',
 'arriv',
 'label',
 'j',
 '2',
 'delight',
 'say',
 'confect',
 'tha',
 '3',
 'cough',
 'medicin',
 'look',
 'secr',
 '4',
 'great',
 'taffi',
 'great',
 'taffi',
 'great',
 'price',
 '568449',
 'without',
 'great',
 'sesam',
 'chicken',
 '568450',
 'disappoint',
 'im',
 'disappoint',
 'flavor',
 '568451',
 'perfect',
 'maltipoo',
 'star',
 'small',
 '568452',
 'favorit',
 'train',
 'reward',
 'treat',
 '568453',
 'great',
 'honey',
 'satisfi',
 'product',
 'length',
 '568454',
 'dtype',
 'object']

In [16]:
data.to_csv('preproc_Reviews.csv')