In [1]:
import matplotlib
matplotlib.use('Agg')

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import nltk
import re
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.naive_bayes import GaussianNB
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix

In [6]:
nltk.download('punkt')
nltk.download('averaged_perceptron_tagger')
nltk.download('maxent_ne_chunker')
nltk.download('words')
nltk.download('stopwords')

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\Andy\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     C:\Users\Andy\AppData\Roaming\nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!
[nltk_data] Downloading package maxent_ne_chunker to
[nltk_data]     C:\Users\Andy\AppData\Roaming\nltk_data...
[nltk_data]   Unzipping chunkers\maxent_ne_chunker.zip.
[nltk_data] Downloading package words to
[nltk_data]     C:\Users\Andy\AppData\Roaming\nltk_data...
[nltk_data]   Unzipping corpora\words.zip.
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Andy\AppData\Roaming\nltk_data...
[nltk_data]   Unzipping corpora\stopwords.zip.


True

In [74]:
def clean(review):
    if (isNaN(review)):
        return ''
    review = re.sub("[^a-zA-z]", ' ', review)
    review = review.lower()
    review = review.split()
    ps = PorterStemmer()
    review = [ps.stem(word) for word in review if not word in set(stopwords.words('english'))]
    review = " ".join(review)
    return review

def isNaN(string):
    return string != string

In [90]:
def clean(review):
    if (isNaN(review)):
        return ''
    # function to test if something is a noun
    is_noun = lambda pos: pos[:2] == 'NN'
    # do the nlp stuff
    review = re.sub("[^a-zA-z]", ' ', review)
    review = review.lower()
    tokenized = nltk.word_tokenize(review)
    ps = PorterStemmer()
    nouns = [ps.stem(word) for (word, pos) in nltk.pos_tag(tokenized) if is_noun(pos)]
    return nouns

In [91]:
clean2('Its A pretty good game and it came on time but')

['game', 'time']

In [92]:
train_df = pd.read_csv('train_raw.csv', index_col=0)
train_df['positive'] = train_df['star_rating'].apply(lambda x: 1 if x > 3 else 0)
train_df['clean_review'] = train_df['review_body'].apply(clean)
train_df.head()

Unnamed: 0,marketplace,customer_id,review_id,product_id,product_parent,product_title,product_category,star_rating,helpful_votes,total_votes,vine,verified_purchase,review_headline,review_body,review_date,positive,clean_review
267779,US,3387522,R2JEMKCQT9QTVM,B008SBZD5U,150118808,Kingdom Hearts HD 1.5 Remix,Video Games,4,0,0,N,Y,KH!>% RULE!,Its A pretty good game and it came on time but...,2015-02-01,1,"[game, time, year, combo, ps]"
267796,US,15137367,R3IM5TXYMA7DCQ,B00IXMF5CU,292308774,Terraria - Xbox 360,Video Games,5,1,2,N,Y,Five Stars,Excellent,2015-02-01,1,[excel]
267818,US,15137367,R2PFXXB4EP23B,B00EM5UFEK,461081395,Plants vs. Zombies Garden Warfare,Video Games,4,0,0,N,Y,Four Stars,Excellent,2015-02-01,1,[excel]
267871,US,4354770,R1A9DB178PFXLH,B004OPYLTS,245894499,Fishdom - Nintendo DS,Video Games,5,0,0,N,Y,Fun Game,"Fun game, levels get harder as you progress.",2015-02-01,1,"[game, level]"
267892,US,6698937,R1RS2GAXENP9LF,B00BGA9X9W,943154724,DualShock 4 Wireless Controller for PlayStatio...,Video Games,4,0,0,N,Y,Four Stars,Great controller,2015-02-01,1,[control]


In [130]:
def jaccard_similarity(query, document):
    intersection = set(query).intersection(set(document))
    union = set(query).union(set(document))
    return len(intersection)/len(union)

In [131]:
def product_sim(prod1, prod2):
    return jaccard_similarity([word for review in prod1.tolist() for word in review], [word for review in prod2.tolist() for word in review])

In [134]:
kingdom_hearts = train_df[train_df.product_parent == 150118808]['clean_review']
terraria = train_df[train_df.product_parent == 292308774]['clean_review']
controller = train_df[train_df.product_parent == 943154724]['clean_review']

In [135]:
print('p1: {} p2: {} sim: {}'.format('kingdom_hearts', 'terraria', product_sim(kingdom_hearts, terraria)))
print('p1: {} p2: {} sim: {}'.format('kingdom_hearts', 'controller', product_sim(kingdom_hearts, controller)))
print('p1: {} p2: {} sim: {}'.format('terraria', 'controller', product_sim(terraria, controller)))

p1: kingdom_hearts p2: terraria sim: 0.1018957345971564
p1: kingdom_hearts p2: controller sim: 0.17600700525394045
p1: terraria p2: controller sim: 0.052313883299798795


In [138]:
kingdom_hearts = train_df[train_df.product_parent == 150118808]['review_body'].apply(lambda x: x.split())
terraria = train_df[train_df.product_parent == 292308774]['review_body'].apply(lambda x: x.split())
controller = train_df[train_df.product_parent == 943154724]['review_body'].apply(lambda x: x.split())

In [139]:
print('p1: {} p2: {} sim: {}'.format('kingdom_hearts', 'terraria', product_sim(kingdom_hearts, terraria)))
print('p1: {} p2: {} sim: {}'.format('kingdom_hearts', 'controller', product_sim(kingdom_hearts, controller)))
print('p1: {} p2: {} sim: {}'.format('terraria', 'controller', product_sim(terraria, controller)))

p1: kingdom_hearts p2: terraria sim: 0.10227936879018118
p1: kingdom_hearts p2: controller sim: 0.1579828415986608
p1: terraria p2: controller sim: 0.05120630231413097


In [94]:
myId = 4354770
words = [item for l in train_df.query('customer_id == {} & positive == 0'.format(myId)).clean_review for item in l]
{w : words.count(w) for w in words}

{}

In [100]:
myId = 4354770
words = [w for review in train_df.query('customer_id == {} & positive == 1'.format(myId)).clean_review for w in review]
{w : words.count(w) for w in words}

{'game': 3,
 'level': 1,
 'i': 1,
 'ds': 1,
 'nintendo': 1,
 'problem': 1,
 'favorit': 1,
 'save': 1,
 'scratch': 1,
 'work': 1,
 'play': 1,
 'day': 1}

In [105]:
myId = 245894499
words = [item for l in train_df.query('product_parent == {} & positive == 1'.format(myId)).clean_review for item in l]
{w : words.count(w) for w in words}

{'game': 9,
 'level': 2,
 'fun': 2,
 'sister': 1,
 'type': 1,
 'addit': 1,
 'aquarium': 1,
 'color': 1,
 'i': 4,
 'version': 1,
 'line': 1,
 'match': 1,
 'part': 2,
 'par': 1,
 'graffic': 1,
 'tank': 1}

In [54]:
myId = 245894499
words = [item for l in train_df.query('product_parent == {} & positive == 1'.format(myId)).clean_review.apply(lambda x: x.split()) for item in l]
pos = {w : words.count(w) for w in words}

In [55]:
words = [item for l in train_df.query('product_parent == {} & positive == 0'.format(myId)).clean_review.apply(lambda x: x.split()) for item in l]
neg = {w : words.count(w) for w in words}

In [67]:
import heapq
pos_n = heapq.nlargest(5, pos, key=pos.get)
neg_n = heapq.nlargest(5, neg, key=neg.get)

In [68]:
pos_n

['game', 'fun', 'get', 'level', 'realli']

In [66]:
pos_n.difference(neg_n)

{'fun', 'game', 'get', 'realli'}

In [65]:
neg_n.difference(pos_n)

{'aquarium', 'chang', 'go', 'item'}