In [99]:
## Imports
import pandas as pd
import numpy as np

import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns

import pickle

from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.utils import to_categorical
import re
import codecs

from scipy.sparse import csr_matrix
import sklearn
import spacy
from sklearn.decomposition import NMF, LatentDirichletAllocation
from sklearn.decomposition import TruncatedSVD
from sklearn.neighbors import NearestNeighbors
from sklearn.utils.extmath import randomized_svd
from sklearn import cross_validation as cv
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.metrics.pairwise import manhattan_distances
from sklearn.metrics.pairwise import euclidean_distances
from sklearn.ensemble import RandomForestClassifier


## EDA

In [2]:
yelp_business = pd.read_csv('/Users/iris/Downloads/yelp_business.csv')
yelp_attributes = pd.read_csv('/Users/iris/Downloads/yelp_business_attributes.csv')
yelp_tip = pd.read_csv('/Users/iris/Downloads/yelp_tip.csv')
yelp_users = pd.read_csv('/Users/iris/Downloads/yelp_user.csv')
yelp_review = pd.read_csv('/Users/iris/Downloads/yelp_review.csv')

In [3]:
yelp_business.head(2)

Unnamed: 0,business_id,name,neighborhood,address,city,state,postal_code,latitude,longitude,stars,review_count,is_open,categories
0,FYWN1wneV18bWNgQjJ2GNg,"""Dental by Design""",,"""4855 E Warner Rd, Ste B9""",Ahwatukee,AZ,85044,33.33069,-111.978599,4.0,22,1,Dentists;General Dentistry;Health & Medical;Or...
1,He-G7vWjzVUysIKrfNbPUQ,"""Stephen Szabo Salon""",,"""3101 Washington Rd""",McMurray,PA,15317,40.291685,-80.1049,3.0,11,1,Hair Stylists;Hair Salons;Men's Hair Salons;Bl...


In [4]:
yelp_business.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 174567 entries, 0 to 174566
Data columns (total 13 columns):
business_id     174567 non-null object
name            174567 non-null object
neighborhood    68015 non-null object
address         174567 non-null object
city            174566 non-null object
state           174566 non-null object
postal_code     173944 non-null object
latitude        174566 non-null float64
longitude       174566 non-null float64
stars           174567 non-null float64
review_count    174567 non-null int64
is_open         174567 non-null int64
categories      174567 non-null object
dtypes: float64(3), int64(2), object(8)
memory usage: 17.3+ MB


In [5]:
#Limit businesses to resturants only
restaurants = yelp_business[yelp_business['categories'].str.contains('Restaurants')]
restaurants.describe()

Unnamed: 0,latitude,longitude,stars,review_count,is_open
count,54618.0,54618.0,54618.0,54618.0,54618.0
mean,40.333631,-85.102962,3.456187,58.973818,0.739573
std,5.644022,28.368167,0.793384,150.760885,0.438872
min,-34.515952,-142.46665,1.0,3.0,0.0
25%,35.298741,-111.927162,3.0,7.0,0.0
50%,41.362593,-80.804737,3.5,18.0,1.0
75%,43.72598,-79.386494,4.0,54.0,1.0
max,89.999314,115.086769,5.0,7361.0,1.0


In [6]:
#Restaurants with 20 or more reviews
restaurants = restaurants[restaurants['review_count'] > 20]

In [7]:
restaurants.groupby('is_open')['business_id'].nunique()

is_open
0     4905
1    20646
Name: business_id, dtype: int64

In [8]:
#get rid of closed restaurants
#restaurants = restaurants[restaurants['is_open'] == 1]

In [9]:
restaurants.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 25551 entries, 4 to 174558
Data columns (total 13 columns):
business_id     25551 non-null object
name            25551 non-null object
neighborhood    11913 non-null object
address         25551 non-null object
city            25551 non-null object
state           25551 non-null object
postal_code     25536 non-null object
latitude        25551 non-null float64
longitude       25551 non-null float64
stars           25551 non-null float64
review_count    25551 non-null int64
is_open         25551 non-null int64
categories      25551 non-null object
dtypes: float64(3), int64(2), object(8)
memory usage: 2.7+ MB


In [10]:
restaurants = restaurants.drop(['neighborhood', 'address','postal_code','longitude','latitude','is_open'], axis = 1)

In [11]:
restaurants.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 25551 entries, 4 to 174558
Data columns (total 7 columns):
business_id     25551 non-null object
name            25551 non-null object
city            25551 non-null object
state           25551 non-null object
stars           25551 non-null float64
review_count    25551 non-null int64
categories      25551 non-null object
dtypes: float64(1), int64(1), object(5)
memory usage: 1.6+ MB


In [12]:
#Too many NAN to be useful
yelp_attributes.head()

Unnamed: 0,business_id,AcceptsInsurance,ByAppointmentOnly,BusinessAcceptsCreditCards,BusinessParking_garage,BusinessParking_street,BusinessParking_validated,BusinessParking_lot,BusinessParking_valet,HairSpecializesIn_coloring,...,Corkage,DietaryRestrictions_dairy-free,DietaryRestrictions_gluten-free,DietaryRestrictions_vegan,DietaryRestrictions_kosher,DietaryRestrictions_halal,DietaryRestrictions_soy-free,DietaryRestrictions_vegetarian,AgesAllowed,RestaurantsCounterService
0,FYWN1wneV18bWNgQjJ2GNg,Na,Na,Na,True,Na,Na,Na,Na,Na,...,Na,Na,Na,Na,Na,Na,Na,Na,Na,Na
1,He-G7vWjzVUysIKrfNbPUQ,Na,Na,Na,Na,Na,Na,Na,Na,Na,...,Na,Na,Na,Na,Na,Na,Na,Na,Na,Na
2,8DShNS-LuFqpEWIp0HxijA,Na,Na,Na,Na,Na,Na,Na,Na,Na,...,Na,Na,Na,Na,Na,Na,Na,Na,Na,Na
3,PfOCPjBrlQAnz__NXj9h_w,Na,Na,Na,Na,Na,Na,Na,Na,Na,...,Na,Na,Na,Na,Na,Na,Na,Na,Na,Na
4,o9eMRCWt5PkpLDE0gOPtcQ,Na,Na,Na,Na,False,False,False,False,False,...,Na,Na,Na,Na,Na,Na,Na,Na,Na,Na


In [13]:
restaurants[restaurants['categories'].str.contains('Italian')]

Unnamed: 0,business_id,name,city,state,stars,review_count,categories
44,BnuzcebyB1AfxH0kjNWqSg,"""Carrabba's Italian Grill""",Frazer,PA,3.5,25,Restaurants;Italian;Seafood
90,aFBCmJUYrPeol_P75QX7Jw,"""Fraticelli's Authentic Italian Grill""",Richmond Hill,ON,3.0,88,Restaurants;Italian
178,mLP-jIIRdoaYM7fJqqFurw,"""Napoli Pizzeria""",Pittsburgh,PA,4.0,53,Italian;Pizza;Restaurants;Salad
220,Qv0OEziLJwyAqcgtrTsA4w,"""Luke's Of Chicago""",Scottsdale,AZ,3.5,50,Italian;Restaurants;Sandwiches;Hot Dogs
243,XguKrY0dAuaK1W6HUlUQ1Q,"""Retz's Laconi's II""",Cuyahoga Falls,OH,3.5,29,Italian;Restaurants;Pizza
273,3b4efqz06QrLQ_w2xLc4pA,"""Olive Garden Italian Restaurant""",Phoenix,AZ,3.0,40,Restaurants;Italian
274,dPxZI9lrKTl5dvFfnb1_Ig,"""Trattoria Italia""",Las Vegas,NV,4.5,210,Seafood;Italian;Pizza;Restaurants
665,LhWwwDJ2l79a1tHrpcdK6A,"""Scaddabush Woodbridge""",Vaughan,ON,3.0,22,Nightlife;Restaurants;Bars;Italian
888,l0WauZ92kRg9bu2oxHj4TQ,"""Aiello's East Coast Italian""",Phoenix,AZ,4.0,130,Restaurants;Italian
1075,NlBifQZCNUAnbuo_wKIVpg,"""Bada Bing Pizza""",Painesville,OH,4.5,31,Pizza;Italian;Restaurants


In [14]:
yelp_tip.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1098324 entries, 0 to 1098323
Data columns (total 5 columns):
text           1098322 non-null object
date           1098324 non-null object
likes          1098324 non-null int64
business_id    1098324 non-null object
user_id        1098324 non-null object
dtypes: int64(1), object(4)
memory usage: 41.9+ MB


In [15]:
#df1 = df1.merge(df2[['weeknum', 'datetime']], on=['weeknum'])
restaurant_tips = restaurants.merge(yelp_tip[['text','business_id','user_id']], on = ['business_id'])

In [16]:
restaurant_tips.head()

Unnamed: 0,business_id,name,city,state,stars,review_count,categories,text,user_id
0,PfOCPjBrlQAnz__NXj9h_w,"""Brick House Tavern + Tap""",Cuyahoga Falls,OH,3.5,116,American (New);Nightlife;Bars;Sandwiches;Ameri...,People are giving this place 1 star because of...,8v1rpTH0d8YjjGWA892l1Q
1,PfOCPjBrlQAnz__NXj9h_w,"""Brick House Tavern + Tap""",Cuyahoga Falls,OH,3.5,116,American (New);Nightlife;Bars;Sandwiches;Ameri...,"Excellent selection of beers on tap, many scre...",y_RCzjU-K_KrMBko3mMhsg
2,PfOCPjBrlQAnz__NXj9h_w,"""Brick House Tavern + Tap""",Cuyahoga Falls,OH,3.5,116,American (New);Nightlife;Bars;Sandwiches;Ameri...,Great event held by yelp!! Will be back!!,3OhkexT0E78zylIH426EOg
3,PfOCPjBrlQAnz__NXj9h_w,"""Brick House Tavern + Tap""",Cuyahoga Falls,OH,3.5,116,American (New);Nightlife;Bars;Sandwiches;Ameri...,I love this place. They always take good care ...,RgTByLRNRTJNmeWgp6Kg4Q
4,PfOCPjBrlQAnz__NXj9h_w,"""Brick House Tavern + Tap""",Cuyahoga Falls,OH,3.5,116,American (New);Nightlife;Bars;Sandwiches;Ameri...,Chop Steak was surprisingly good. Love love lo...,RgTByLRNRTJNmeWgp6Kg4Q


In [17]:
restaurant_tips.business_id.nunique()

25191

In [18]:
restaurant_tips.categories.unique()

array(['American (New);Nightlife;Bars;Sandwiches;American (Traditional);Burgers;Restaurants',
       'French;Food;Bakeries;Restaurants',
       'Tiki Bars;Nightlife;Mexican;Restaurants;Bars', ...,
       'Bars;Gastropubs;Nightlife;Restaurants;Pubs;Sandwiches',
       'Nightlife;Restaurants;Sandwiches;Bars;Canadian (New);Asian Fusion;Pubs',
       'Pizza;Event Planning & Services;Italian;Caterers;Restaurants'],
      dtype=object)

In [19]:
all_cat = restaurant_tips.categories

In [20]:
all_cat = list(all_cat)
print(all_cat[:10])

['American (New);Nightlife;Bars;Sandwiches;American (Traditional);Burgers;Restaurants', 'American (New);Nightlife;Bars;Sandwiches;American (Traditional);Burgers;Restaurants', 'American (New);Nightlife;Bars;Sandwiches;American (Traditional);Burgers;Restaurants', 'American (New);Nightlife;Bars;Sandwiches;American (Traditional);Burgers;Restaurants', 'American (New);Nightlife;Bars;Sandwiches;American (Traditional);Burgers;Restaurants', 'American (New);Nightlife;Bars;Sandwiches;American (Traditional);Burgers;Restaurants', 'American (New);Nightlife;Bars;Sandwiches;American (Traditional);Burgers;Restaurants', 'American (New);Nightlife;Bars;Sandwiches;American (Traditional);Burgers;Restaurants', 'American (New);Nightlife;Bars;Sandwiches;American (Traditional);Burgers;Restaurants', 'American (New);Nightlife;Bars;Sandwiches;American (Traditional);Burgers;Restaurants']


In [21]:
##How many unique restaurant categories are there?
k = set(all_cat)

In [22]:
list(k)

['Restaurants;Bars;Nightlife;Breakfast & Brunch;American (Traditional)',
 'Chinese;Mongolian;Restaurants;Hot Pot',
 'Restaurants;Breakfast & Brunch;American (Traditional);Diners',
 'Vietnamese;Bubble Tea;Sandwiches;Food;Coffee & Tea;Restaurants',
 'Restaurants;Japanese;Sushi Bars;Thai',
 'American (New);Breakfast & Brunch;Cafes;Restaurants',
 'Pizza;Food;Restaurants;Italian;Desserts',
 'Cafes;Bakeries;Food;Custom Cakes;Desserts;Restaurants',
 'Korean;Restaurants;Asian Fusion;Barbeque',
 'Buffets;Indian;Pakistani;Restaurants',
 'Coffee & Tea;Nightlife;Food;American (New);Tapas/Small Plates;Cocktail Bars;Desserts;Gastropubs;Restaurants;Bars;Breakfast & Brunch;American (Traditional)',
 'Seafood;Restaurants;American (New);Burgers',
 'Ramen;Poke;Food;Japanese;Asian Fusion;Gluten-Free;Restaurants;Sushi Bars',
 'American (New);Nightlife;Bars;Beer Bar;Cocktail Bars;Gastropubs;Restaurants;British',
 'Restaurants;Caterers;Event Planning & Services;Pizza;Mediterranean',
 'Scottish;Restaurants;Gas

### collaborative is peer to peer. look at peers who have liked the same restaurants (likes == greater than the predicted base line for each

In [23]:
restaurants.head()

Unnamed: 0,business_id,name,city,state,stars,review_count,categories
4,PfOCPjBrlQAnz__NXj9h_w,"""Brick House Tavern + Tap""",Cuyahoga Falls,OH,3.5,116,American (New);Nightlife;Bars;Sandwiches;Ameri...
19,Gu-xs3NIQTj3Mj2xYoN2aw,"""Maxim Bakery & Restaurant""",Richmond Hill,ON,3.5,34,French;Food;Bakeries;Restaurants
25,1K4qrnfyzKzGgJPBEcJaNQ,"""Chula Taberna Mexicana""",Toronto,ON,3.5,39,Tiki Bars;Nightlife;Mexican;Restaurants;Bars
33,FXHfcFVEfI1vVngW2gVOpw,"""Bampot House of Tea & Board Games""",Toronto,ON,4.0,55,Coffee & Tea;Restaurants;Food;Mediterranean;Te...
40,tRVx2c89coruPRwYhGTcTw,"""Yuzu""",Lakewood,OH,3.5,78,Nightlife;Izakaya;Comfort Food;Cocktail Bars;A...


In [24]:
restaurants.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 25551 entries, 4 to 174558
Data columns (total 7 columns):
business_id     25551 non-null object
name            25551 non-null object
city            25551 non-null object
state           25551 non-null object
stars           25551 non-null float64
review_count    25551 non-null int64
categories      25551 non-null object
dtypes: float64(1), int64(1), object(5)
memory usage: 1.6+ MB


In [25]:
yelp_users.head()

Unnamed: 0,user_id,name,review_count,yelping_since,friends,useful,funny,cool,fans,elite,...,compliment_more,compliment_profile,compliment_cute,compliment_list,compliment_note,compliment_plain,compliment_cool,compliment_funny,compliment_writer,compliment_photos
0,JJ-aSuM4pCFPdkfoZ34q0Q,Chris,10,2013-09-24,"0njfJmB-7n84DlIgUByCNw, rFn3Xe3RqHxRSxWOU19Gpg...",0,0,0,0,,...,0,0,0,0,0,0,0,0,0,0
1,uUzsFQn_6cXDh6rPNGbIFA,Tiffy,1,2017-03-02,,0,0,0,0,,...,0,0,0,0,0,0,0,0,0,0
2,mBneaEEH5EMyxaVyqS-72A,Mark,6,2015-03-13,,0,0,0,0,,...,0,0,0,0,0,0,0,0,0,0
3,W5mJGs-dcDWRGEhAzUYtoA,Evelyn,3,2016-09-08,,0,0,0,0,,...,0,0,0,0,0,0,0,0,0,0
4,4E8--zUZO1Rr1IBK4_83fg,Lisa,11,2012-07-16,,4,0,0,0,,...,0,0,0,0,0,0,0,0,1,0


In [26]:
yelp_users.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1326100 entries, 0 to 1326099
Data columns (total 22 columns):
user_id               1326100 non-null object
name                  1325603 non-null object
review_count          1326100 non-null int64
yelping_since         1326100 non-null object
friends               1326100 non-null object
useful                1326100 non-null int64
funny                 1326100 non-null int64
cool                  1326100 non-null int64
fans                  1326100 non-null int64
elite                 1326100 non-null object
average_stars         1326100 non-null float64
compliment_hot        1326100 non-null int64
compliment_more       1326100 non-null int64
compliment_profile    1326100 non-null int64
compliment_cute       1326100 non-null int64
compliment_list       1326100 non-null int64
compliment_note       1326100 non-null int64
compliment_plain      1326100 non-null int64
compliment_cool       1326100 non-null int64
compliment_funny      132

In [27]:
users = yelp_users.filter(['user_id','review_count','business_id','average_stars'], axis = 1)
users_15 = users[users['review_count']> 15]
users_15.head()

Unnamed: 0,user_id,review_count,average_stars
18,jYnkJR3T8yCERXywoVhWYA,48,3.73
44,fV8Yr0c5tFQTQ2SRRJHXHw,50,3.96
52,bZkZgll3Fii18x3WRtB5Lg,62,3.3
73,gkMSWgpoBcc96JqhMFbKJg,19,4.26
74,aw973Pm1nrTbRjP4zY9B9g,762,3.6


In [28]:
users_15.user_id.nunique()

330694

In [29]:
res_20 = restaurants[restaurants['review_count'] > 19]
res_20.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 25551 entries, 4 to 174558
Data columns (total 7 columns):
business_id     25551 non-null object
name            25551 non-null object
city            25551 non-null object
state           25551 non-null object
stars           25551 non-null float64
review_count    25551 non-null int64
categories      25551 non-null object
dtypes: float64(1), int64(1), object(5)
memory usage: 1.6+ MB


In [30]:
yelp_review.head(2)

Unnamed: 0,review_id,user_id,business_id,stars,date,text,useful,funny,cool
0,vkVSCC7xljjrAI4UGfnKEQ,bv2nCi5Qv5vroFiqKGopiw,AEx2SYEUJmTxVVB18LlCwA,5,2016-05-28,Super simple place but amazing nonetheless. It...,0,0,0
1,n6QzIUObkYshz4dz2QRJTw,bv2nCi5Qv5vroFiqKGopiw,VR6GpWIda3SfvPC-lg9H3w,5,2016-05-28,Small unassuming place that changes their menu...,0,0,0


In [31]:
yelp_review['user_id'].nunique()

1326101

In [32]:
yelp_review_subset = yelp_review[:100]
yelp_review_subset.head()

Unnamed: 0,review_id,user_id,business_id,stars,date,text,useful,funny,cool
0,vkVSCC7xljjrAI4UGfnKEQ,bv2nCi5Qv5vroFiqKGopiw,AEx2SYEUJmTxVVB18LlCwA,5,2016-05-28,Super simple place but amazing nonetheless. It...,0,0,0
1,n6QzIUObkYshz4dz2QRJTw,bv2nCi5Qv5vroFiqKGopiw,VR6GpWIda3SfvPC-lg9H3w,5,2016-05-28,Small unassuming place that changes their menu...,0,0,0
2,MV3CcKScW05u5LVfF6ok0g,bv2nCi5Qv5vroFiqKGopiw,CKC0-MOWMqoeWf6s-szl8g,5,2016-05-28,Lester's is located in a beautiful neighborhoo...,0,0,0
3,IXvOzsEMYtiJI0CARmj77Q,bv2nCi5Qv5vroFiqKGopiw,ACFtxLv8pGrrxMm6EgjreA,4,2016-05-28,Love coming here. Yes the place always needs t...,0,0,0
4,L_9BTb55X0GDtThi6GlZ6w,bv2nCi5Qv5vroFiqKGopiw,s2I_Ni76bjJNK9yG60iD-Q,4,2016-05-28,Had their chocolate almond croissant and it wa...,0,0,0


In [33]:
yelp_review_subset.groupby('user_id')['review_id'].count()

user_id
_4iMDXbXZ1p1ONG297YEAQ     1
_L2SZSwf7A6YSrIHy_q4cw     5
bv2nCi5Qv5vroFiqKGopiw     5
nOTl4aPC4tKHK35T3bNauQ     3
nsOf58RZjMTn8V94EQYJog     1
r9HwWzRRd0EvcwnDX9TLTg     1
ssuXFjkH4neiBgwv-oN4IA     3
u0LXt3Uea_GidxRW1xcsfg    80
xJbxpra6iyCsph35Pt0cZg     1
Name: review_id, dtype: int64

In [34]:
review_count = yelp_review.groupby('user_id')['review_id'].count()
review_count = review_count.to_frame()

In [35]:
review_count.head()

Unnamed: 0_level_0,review_id
user_id,Unnamed: 1_level_1
---1lKK3aKOuomHnwAkAow,119
---94vtJ_5o_nikEs6hUjg,1
---PLwSf5gKdIoVnyRHgBA,2
---cu1hq55BP9DWVXXKHZg,3
---fhiwiwBYrvqhpXgcWDQ,1


In [36]:
review_count.info()

<class 'pandas.core.frame.DataFrame'>
Index: 1326101 entries, ---1lKK3aKOuomHnwAkAow to zzzmshdEWLFCApxETl1TGQ
Data columns (total 1 columns):
review_id    1326101 non-null int64
dtypes: int64(1)
memory usage: 20.2+ MB


In [37]:
review_count['review_id'].max()

3569

In [38]:
## Selec only reviewers with over 15 reviews
review_count_15 = review_count[review_count['review_id'] > 15]

In [39]:
review_count_15.head()

Unnamed: 0_level_0,review_id
user_id,Unnamed: 1_level_1
---1lKK3aKOuomHnwAkAow,119
--2HUmLkcNHZp0xw6AMBPg,27
--2vR0DIsmQ6WfcSzKWigw,37
--4rAAfZnEIAKJE80aIiYg,26
--BumyUHiO_7YsHurb9Hkw,46


In [40]:
users_15.user_id.nunique()

330694

In [41]:
t = ['bv2nCi5Qv5vroFiqKGopiw']
yelp_review_subset[yelp_review_subset['user_id'].isin(t)]

Unnamed: 0,review_id,user_id,business_id,stars,date,text,useful,funny,cool
0,vkVSCC7xljjrAI4UGfnKEQ,bv2nCi5Qv5vroFiqKGopiw,AEx2SYEUJmTxVVB18LlCwA,5,2016-05-28,Super simple place but amazing nonetheless. It...,0,0,0
1,n6QzIUObkYshz4dz2QRJTw,bv2nCi5Qv5vroFiqKGopiw,VR6GpWIda3SfvPC-lg9H3w,5,2016-05-28,Small unassuming place that changes their menu...,0,0,0
2,MV3CcKScW05u5LVfF6ok0g,bv2nCi5Qv5vroFiqKGopiw,CKC0-MOWMqoeWf6s-szl8g,5,2016-05-28,Lester's is located in a beautiful neighborhoo...,0,0,0
3,IXvOzsEMYtiJI0CARmj77Q,bv2nCi5Qv5vroFiqKGopiw,ACFtxLv8pGrrxMm6EgjreA,4,2016-05-28,Love coming here. Yes the place always needs t...,0,0,0
4,L_9BTb55X0GDtThi6GlZ6w,bv2nCi5Qv5vroFiqKGopiw,s2I_Ni76bjJNK9yG60iD-Q,4,2016-05-28,Had their chocolate almond croissant and it wa...,0,0,0


In [42]:
## get ratings from users who wrote more than 15 of them
super_users = list(users_15.user_id)
yelp_review_super_users = yelp_review[yelp_review['user_id'].isin(super_users)]

In [43]:
yelp_review_super_users.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 3113278 entries, 6 to 5261667
Data columns (total 9 columns):
review_id      object
user_id        object
business_id    object
stars          int64
date           object
text           object
useful         int64
funny          int64
cool           int64
dtypes: int64(4), object(5)
memory usage: 237.5+ MB


In [44]:
yelp_review_super_users = yelp_review_super_users.drop(['date','text','useful','funny','cool'], axis = 1)

In [45]:
yelp_review_super_users.user_id.value_counts()

CxDOIDnH8gp9KXzpBHJYXw    3569
bLbSNkLggFnqwNNzzq-Ijw    2077
PKEzKWv_FktMm2mGPjwd0Q    1611
DK57YibC5ShBmqQl97CKog    1463
QJI9OSEn6ujRCtrX06vs1w    1322
d_TBs6J3twMy9GChqUEXkg    1184
ELcQDlf69kb-ihJfxZyL0A    1159
cMEtAiW60I5wE_vLfTxoJQ    1126
hWDybu_KvYLSdEFzGrniTw    1117
U4INQZOPSUaj8hMjLlZ3KA    1101
UYcmGbelzRa0Q6JqzLoguw    1092
62GNFh5FySkA3MbrQmnqvg    1051
dIIKEfOgo0KqUfGQvGikPg     989
n86B7IkbU20AkxlFX_5aew     955
iDlkZO2iILS8Jwfdy7DP9A     947
N3oNEwh0qgPqPP3Em6wJXw     926
rCWrxuRC8_pfagpchtHp6A     915
Ry1O_KXZHGRI8g5zBR3IcQ     896
0BBUmH7Krcax1RZgbH4fSA     874
pMefTWo6gMdx8WhYSA2u3w     838
3nDUQBjKyVor5wV0reJChg     833
WeVkkF5L39888IPPlRhNpg     830
YMgZqBUAddmFErxLtCfK_w     823
U5YQX_vMl_xQy8EQDqlNQQ     810
Q9mA60HnY87C1TW5kjAZ6Q     803
dt9IHwfuZs9D9LOH7gjNew     798
fiGqQ7pIGKyZ9G0RqWLMpg     787
PeLGa5vUR8_mcsn-fn42Jg     775
Wc5L6iuvSNF5WGBlqIO8nw     772
Xxvz5g67eaCr3emnkY5M6w     772
                          ... 
nC-7df8T_4LeI6WOqd1p3Q       1
eMy1RwoE

In [46]:
pop_business_id_list = list(res_20.business_id)
yelp_review_super_users = yelp_review_super_users[yelp_review_super_users['business_id'].isin(pop_business_id_list)]
yelp_review_super_users.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 1882930 entries, 6 to 5261667
Data columns (total 4 columns):
review_id      object
user_id        object
business_id    object
stars          int64
dtypes: int64(1), object(3)
memory usage: 71.8+ MB


In [48]:
print(len(pop_business_id_list))
yelp_review_super_users.user_id.nunique()

25551


271728

In [49]:
yelp_review_super_users.head(2)

Unnamed: 0,review_id,user_id,business_id,stars
6,ymAUG8DZfQcFTBSOiaNN4w,u0LXt3Uea_GidxRW1xcsfg,9_CGhHMz8698M9-PkVf0CQ,4
8,w41ZS9shepfO3uEyhXEWuQ,u0LXt3Uea_GidxRW1xcsfg,5r6-G9C4YLbC7Ziz57l3rQ,3


In [50]:
yelp_review_super_users = yelp_review_super_users.drop(['date','text','useful','funny','cool'], axis = 1)
yelp_review_super_users.head(2)

KeyError: "labels ['date' 'text' 'useful' 'funny' 'cool'] not contained in axis"

In [51]:
model = yelp_review_super_users.drop(['review_id'],axis = 1)
model.head(2)

Unnamed: 0,user_id,business_id,stars
6,u0LXt3Uea_GidxRW1xcsfg,9_CGhHMz8698M9-PkVf0CQ,4
8,u0LXt3Uea_GidxRW1xcsfg,5r6-G9C4YLbC7Ziz57l3rQ,3


In [52]:
model.business_id.nunique()

25551

In [53]:
model.head(10)

Unnamed: 0,user_id,business_id,stars
6,u0LXt3Uea_GidxRW1xcsfg,9_CGhHMz8698M9-PkVf0CQ,4
8,u0LXt3Uea_GidxRW1xcsfg,5r6-G9C4YLbC7Ziz57l3rQ,3
10,u0LXt3Uea_GidxRW1xcsfg,z8oIoCT1cXz7gZP5GeU5OA,4
11,u0LXt3Uea_GidxRW1xcsfg,XWTPNfskXoUL-Lf32wSk0Q,3
13,u0LXt3Uea_GidxRW1xcsfg,RtUvSWO_UZ8V3Wpj0n077w,3
14,u0LXt3Uea_GidxRW1xcsfg,Aov96CM4FZAXeZvKtsStdA,5
15,u0LXt3Uea_GidxRW1xcsfg,0W4lkclzZThpx3V65bVgig,4
17,u0LXt3Uea_GidxRW1xcsfg,PFPUMF38-lraKzLcTiz5gQ,3
18,u0LXt3Uea_GidxRW1xcsfg,oWTn2IzrprsRkPfULtjZtQ,3
19,u0LXt3Uea_GidxRW1xcsfg,zgQHtqX0gqMw1nlBZl2VnQ,1


In [54]:
model_1 = model[model.business_id.isin(pop_business_id_list[:100])]
type(model_1)

pandas.core.frame.DataFrame

## User-Item Collaborative Filtering

In [55]:
n_users = model_1.user_id.unique().shape[0]
n_items = model_1.business_id.unique().shape[0]
print ('Number of users = ' + str(n_users) + ' | Number of restaurants = ' + str(n_items))

Number of users = 4999 | Number of restaurants = 100


In [56]:
from sklearn import cross_validation as cv
train_data, test_data = cv.train_test_split(model_1, test_size=0.25)

In [57]:
#rp = rating.pivot_table(cols=['critic'],rows=['title'],values='rating')
rp = model_1.pivot_table(columns=['business_id'], index = 'user_id', values = 'stars')

In [58]:
rp.head()

business_id,-DtaPJJ-beogNkDKusGhlA,-Ut87cwGFsO3444Rd11p0Q,0s0Xthk1JWrBUu74M81Xyg,0v984vlPQfb4VZWvBJSsfg,1K4qrnfyzKzGgJPBEcJaNQ,1Nq7GxjvEDgAJxBeOjR_9Q,1WBkAuQg81kokZIPMpn9Zg,3b4efqz06QrLQ_w2xLc4pA,4-YKQyhDR2bZ26AYgZ7aFg,575BoZY8tDPcfSRcdsUNLw,...,t8yi2l7pZF43Rlf9_lHdDA,tCJGEQVqjELLYv3fPNX5Rw,tRVx2c89coruPRwYhGTcTw,u29lf2yPd-qK5ThAS9FRQQ,v0byOL8VL6v6muGa1anxFA,veXxt8rGY_RJPpA5QkHT9Q,wtazPNyIbsRMHmrpEYbqZA,ww_MXl-qDLeYdYTZZ9KWbA,wztnDEbXD1VNvHMMPj2tMw,yHCoJZh5nx0Onr5xgMaUHg
user_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
--ty7Z9fEt08E3dS3_qoSA,,,,,,,,,,,...,,,,,,,,,,
-0-R267zNIQDPI9rxJlprQ,,,,,,,,,,,...,,,,,,,,,,
-2TWjxYpKd_2N2cdwSg-ZQ,,,4.0,,,,,,,,...,,,,,,,,,,
-2kCxY7_aw5hOz7fJnGMbQ,,,,,,,,,,,...,,,,,,,,,,
-3PTUP443q6hQESLKSu95w,,,,,,,,,,,...,,,,,,,,,,


In [59]:
rp_wide = rp.fillna(0)
rp_wide.head()

business_id,-DtaPJJ-beogNkDKusGhlA,-Ut87cwGFsO3444Rd11p0Q,0s0Xthk1JWrBUu74M81Xyg,0v984vlPQfb4VZWvBJSsfg,1K4qrnfyzKzGgJPBEcJaNQ,1Nq7GxjvEDgAJxBeOjR_9Q,1WBkAuQg81kokZIPMpn9Zg,3b4efqz06QrLQ_w2xLc4pA,4-YKQyhDR2bZ26AYgZ7aFg,575BoZY8tDPcfSRcdsUNLw,...,t8yi2l7pZF43Rlf9_lHdDA,tCJGEQVqjELLYv3fPNX5Rw,tRVx2c89coruPRwYhGTcTw,u29lf2yPd-qK5ThAS9FRQQ,v0byOL8VL6v6muGa1anxFA,veXxt8rGY_RJPpA5QkHT9Q,wtazPNyIbsRMHmrpEYbqZA,ww_MXl-qDLeYdYTZZ9KWbA,wztnDEbXD1VNvHMMPj2tMw,yHCoJZh5nx0Onr5xgMaUHg
user_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
--ty7Z9fEt08E3dS3_qoSA,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
-0-R267zNIQDPI9rxJlprQ,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
-2TWjxYpKd_2N2cdwSg-ZQ,0.0,0.0,4.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
-2kCxY7_aw5hOz7fJnGMbQ,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
-3PTUP443q6hQESLKSu95w,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [60]:
##In this case users represent the rows of the dataframe
pd.Series(rp_wide.index[:10])

0    --ty7Z9fEt08E3dS3_qoSA
1    -0-R267zNIQDPI9rxJlprQ
2    -2TWjxYpKd_2N2cdwSg-ZQ
3    -2kCxY7_aw5hOz7fJnGMbQ
4    -3PTUP443q6hQESLKSu95w
5    -3a9i4F7Xtx63kbXIhtUOw
6    -50XWnmQGqBgEI-9ANvLlg
7    -6559fkJ6rCWIZDbqVUomA
8    -8mGZ-pJi-NcjZckuz1M7A
9    -9JqNce6qGY2fTepA8XpUg
Name: user_id, dtype: object

In [61]:
dists = cosine_similarity(rp_wide)
dists

array([[1., 0., 0., ..., 0., 0., 0.],
       [0., 1., 0., ..., 0., 0., 0.],
       [0., 0., 1., ..., 0., 0., 0.],
       ...,
       [0., 0., 0., ..., 1., 0., 0.],
       [0., 0., 0., ..., 0., 1., 0.],
       [0., 0., 0., ..., 0., 0., 1.]])

In [62]:
from sklearn.metrics.pairwise import pairwise_distances
user_similarity = pairwise_distances(train_data_matrix, metric='cosine')
item_similarity = pairwise_distances(train_data_matrix.T, metric='cosine')

user_prediction = predict(train_data_matrix, user_similarity, type='user')

NameError: name 'train_data_matrix' is not defined

In [63]:
def predict(ratings, similarity, type='user'):
    if type == 'user':
        mean_user_rating = ratings.mean(axis=1)
        #You use np.newaxis so that mean_user_rating has same format as ratings
        ratings_diff = (ratings - mean_user_rating[:, np.newaxis])
        pred = mean_user_rating[:, np.newaxis] + similarity.dot(ratings_diff) / np.array([np.abs(similarity).sum(axis=1)]).T
    elif type == 'item':
        pred = ratings.dot(similarity) / np.array([np.abs(similarity).sum(axis=1)])
    return pred
item_prediction = predict(train_data_matrix, item_similarity, type='item')
print(item_prediction)

NameError: name 'train_data_matrix' is not defined

In [64]:
sparsity=round(1.0-len(rp)/float(n_users*n_items),3)
print ('The sparsity level of Yelp Data set is ' +  str(sparsity*100) + '%')

The sparsity level of Yelp Data set is 99.0%


In [65]:
rp_wide.head()

business_id,-DtaPJJ-beogNkDKusGhlA,-Ut87cwGFsO3444Rd11p0Q,0s0Xthk1JWrBUu74M81Xyg,0v984vlPQfb4VZWvBJSsfg,1K4qrnfyzKzGgJPBEcJaNQ,1Nq7GxjvEDgAJxBeOjR_9Q,1WBkAuQg81kokZIPMpn9Zg,3b4efqz06QrLQ_w2xLc4pA,4-YKQyhDR2bZ26AYgZ7aFg,575BoZY8tDPcfSRcdsUNLw,...,t8yi2l7pZF43Rlf9_lHdDA,tCJGEQVqjELLYv3fPNX5Rw,tRVx2c89coruPRwYhGTcTw,u29lf2yPd-qK5ThAS9FRQQ,v0byOL8VL6v6muGa1anxFA,veXxt8rGY_RJPpA5QkHT9Q,wtazPNyIbsRMHmrpEYbqZA,ww_MXl-qDLeYdYTZZ9KWbA,wztnDEbXD1VNvHMMPj2tMw,yHCoJZh5nx0Onr5xgMaUHg
user_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
--ty7Z9fEt08E3dS3_qoSA,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
-0-R267zNIQDPI9rxJlprQ,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
-2TWjxYpKd_2N2cdwSg-ZQ,0.0,0.0,4.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
-2kCxY7_aw5hOz7fJnGMbQ,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
-3PTUP443q6hQESLKSu95w,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [66]:
X = rp_wide

In [67]:
from sklearn.decomposition import TruncatedSVD

SVD = TruncatedSVD(n_components = 12, random_state = 42)
matrix = SVD.fit_transform(X)
matrix.shape

(4999, 12)

In [68]:
matrix[:1]

array([[ 1.18230555e-04,  2.83942065e-04, -3.43248378e-03,
        -7.22734578e-03,  1.16273869e-01,  7.66020248e-03,
        -1.71874229e-02,  3.36245781e-02,  1.40011281e-01,
        -1.55246962e-02,  2.90346972e-02,  2.10303836e-02]])

In [69]:
pickle.dump(rp_wide, open('wide_test_matrix.pkl', 'wb'))

In [70]:
pickle.dump(model_1, open('model_before_pivot.pkl', 'wb'))

In [71]:
model_svd = model[model.business_id.isin(pop_business_id_list[:5000])]

In [72]:
pickle.dump(model_svd, open('model_svd.pkl', 'wb'))

## Simlarities, KNN, SVD

In [73]:
from scipy.sparse import csr_matrix
import sklearn
from sklearn.decomposition import TruncatedSVD
from sklearn.neighbors import NearestNeighbors
from sklearn.utils.extmath import randomized_svd

In [75]:
wide_m = pickle.load(open("wide_test_matrix.pkl", "rb"))
df = pickle.load(open("model_before_pivot.pkl", "rb"))
df_pivot = df.pivot(index = 'business_id', columns = 'user_id', values = 'stars').fillna(0)

In [76]:
model_knn = NearestNeighbors(metric = 'cosine', algorithm = 'brute')
model_knn.fit(df_pivot)

NearestNeighbors(algorithm='brute', leaf_size=30, metric='cosine',
         metric_params=None, n_jobs=1, n_neighbors=5, p=2, radius=1.0)

In [77]:
type(df_pivot)

pandas.core.frame.DataFrame

In [78]:
query_index = np.random.choice(df_pivot.shape[0])
distances, indices = model_knn.kneighbors(df_pivot.iloc[query_index, :].values.reshape(1, -1), n_neighbors=5)

In [79]:
for i in range(0, len(distances.flatten())):
    if 1 == 0:
        print('Recomendation for {0}:\n'.format(df_pivot.index[query_index]))
    else:
        print('{0}: {1} at a distance of {2}'.format(i, df_pivot.index[indices.flatten()[i]], distances.flatten()))

0: v0byOL8VL6v6muGa1anxFA at a distance of [0.         0.94599591 0.96551044 0.98437746 0.984463  ]
1: u29lf2yPd-qK5ThAS9FRQQ at a distance of [0.         0.94599591 0.96551044 0.98437746 0.984463  ]
2: W1Yr6c2XDx_RBjb6WsV-aQ at a distance of [0.         0.94599591 0.96551044 0.98437746 0.984463  ]
3: _ewxwEwJM-IYfIYnKpQOZw at a distance of [0.         0.94599591 0.96551044 0.98437746 0.984463  ]
4: dPxZI9lrKTl5dvFfnb1_Ig at a distance of [0.         0.94599591 0.96551044 0.98437746 0.984463  ]


In [80]:
SVD = TruncatedSVD(n_components=12, random_state=17)
matrix = SVD.fit_transform(X)

In [81]:
corr = np.corrcoef(matrix)

In [82]:
test_res = 10
corr_test_case = corr[test_res]
list(wide_m[(corr_test_case > .99)])

['-DtaPJJ-beogNkDKusGhlA',
 '-Ut87cwGFsO3444Rd11p0Q',
 '0s0Xthk1JWrBUu74M81Xyg',
 '0v984vlPQfb4VZWvBJSsfg',
 '1K4qrnfyzKzGgJPBEcJaNQ',
 '1Nq7GxjvEDgAJxBeOjR_9Q',
 '1WBkAuQg81kokZIPMpn9Zg',
 '3b4efqz06QrLQ_w2xLc4pA',
 '4-YKQyhDR2bZ26AYgZ7aFg',
 '575BoZY8tDPcfSRcdsUNLw',
 '5BjhQ_Tmm9ppK4UGrMr09g',
 '5XejqzaFmtkZMstJS5Iy-w',
 '5q6Xh-UcJa78bp6dzyaE7w',
 '6PwKC0dqP9uI85FEP86iPQ',
 '75RP4HSsSJOe_e7e2e3jQQ',
 '7J_8O6w79raFRSORc3bRew',
 'A029GQG1S3ekPit6cObcBA',
 'AXb5gCwqHl-_v6ZIMb1mXQ',
 'B5EZlEDH6AVDk8tQGHAGqg',
 'BnuzcebyB1AfxH0kjNWqSg',
 'CyKi4WgsRNpp3jgsxBEcDg',
 'ECqc9Xd0dTdKmcNrbIizAw',
 'EEePiBwchl-TIBVcTdE0RA',
 'FXHfcFVEfI1vVngW2gVOpw',
 'FuO6d7IR1ee-3Bfo-gu1HQ',
 'Gu-xs3NIQTj3Mj2xYoN2aw',
 'IfUkm42SB9D0m_ZbkQ711A',
 'J9f-9Prw2YVM-fiZqv2fmQ',
 'JxSmlL_MckX0dvS5XBg0Cg',
 'KYEZATGRY5aD69ZR6VvyWQ',
 'KhWl4Mwhm_Oqq0zIIz-0wQ',
 'LhWwwDJ2l79a1tHrpcdK6A',
 'LrYSnxLKarkzeNHqq50r-A',
 'M-rqZ56wTGSNoIIsBSlhDA',
 'M3uV9Y3EDSpy9d4YwyNSAQ',
 'MTH-AcNyWfsBa9sXp04HcQ',
 'Mmh4w2g2bSAkdSAFd_MH_g',
 

In [83]:
user_item_mat = wide_m.T

In [84]:
U, Sigma, VT = randomized_svd(user_item_mat.values, 
                              n_components=3,
                              n_iter=5,
                              random_state=None)

In [85]:
pd.DataFrame(VT)

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,4989,4990,4991,4992,4993,4994,4995,4996,4997,4998
0,4.1e-05,-4.8e-05,-8.702115e-07,-5.23815e-07,3e-06,0.0005101376,1.672934e-07,0.0003060826,8.189151e-05,-2e-06,...,-2e-06,-3e-06,-8.702115e-07,2e-06,-2.76594e-09,-1.5e-05,1.059861e-06,-9.798911e-10,0.05670808,6.2e-05
1,-0.000137,0.000815,4.774746e-06,0.0004549683,0.000111,2.068085e-06,8.739036e-09,1.240851e-06,3.630044e-08,0.000315,...,0.000419,7.2e-05,4.774746e-06,7.4e-05,3.434876e-07,-1e-05,3.904805e-08,-1.501443e-08,5.759196e-07,0.000352
2,-0.000492,0.003325,-0.0001563978,7.780621e-05,2.9e-05,8.613505e-07,0.0001887715,5.168103e-07,3.632428e-07,2.9e-05,...,3.9e-05,5.4e-05,-0.0001563978,2e-05,1.525609e-06,0.000619,-1.882637e-07,-5.128331e-07,6.58201e-07,0.009303


In [86]:
pd.DataFrame(VT.T)

Unnamed: 0,0,1,2
0,4.051291e-05,-1.374908e-04,-4.919943e-04
1,-4.803433e-05,8.150145e-04,3.324754e-03
2,-8.702115e-07,4.774746e-06,-1.563978e-04
3,-5.238150e-07,4.549683e-04,7.780621e-05
4,2.730829e-06,1.106769e-04,2.944686e-05
5,5.101376e-04,2.068085e-06,8.613505e-07
6,1.672934e-07,8.739036e-09,1.887715e-04
7,3.060826e-04,1.240851e-06,5.168103e-07
8,8.189151e-05,3.630044e-08,3.632428e-07
9,-1.657524e-06,3.145367e-04,2.946904e-05


In [87]:
pd.DataFrame(Sigma)

Unnamed: 0,0
0,70.519597
1,62.163824
2,50.030878


In [88]:
def get_recommends(itemID, VT, num_recom=2):
    recs = []
    for item in range(VT.T.shape[0]):
        if item != itemID:
            recs.append([item,np.dot(VT.T[itemID],VT.T[item])])
    final_rec = [i[0] for i in sorted(recs,key=lambda x: x[1],reverse=True)]
    return final_rec[:num_recom]
print(get_recommends(0,VT,num_recom=2))

[1874, 2231]


In [89]:
def get_recommends_user(userID, U, df):
    userrecs = []
    for user in range(U.shape[0]):
        if user!= userID:
            userrecs.append([user,np.dot(U[userID],U[user])])
    final_rec = [i[0] for i in sorted(userrecs,key=lambda x: x[1],reverse=True)]
    comp_user = final_rec[0]
    print("User #%s's most similar user is User #%s "% (userID, comp_user))
    rec_likes = df.iloc[comp_user]
    current = df.iloc[userID]
    recs = []
    for i,item in enumerate(current):
        if item != rec_likes[i] and rec_likes[i]!=0:
            recs.append(i)
    return recs

user_to_rec = 9
print("Items for User %s to check out: "% user_to_rec, get_recommends_user(user_to_rec,U,user_item_mat))

User #9's most similar user is User #82 
Items for User 9 to check out:  [12, 58, 65, 84, 87, 101, 111, 122, 174, 210, 217, 269, 295, 318, 337, 347, 367, 373, 379, 387, 417, 459, 461, 526, 590, 606, 627, 637, 665, 696, 697, 718, 740, 808, 810, 834, 844, 894, 958, 968, 992, 1011, 1048, 1070, 1071, 1072, 1143, 1144, 1208, 1228, 1237, 1238, 1252, 1294, 1299, 1349, 1370, 1384, 1509, 1559, 1560, 1566, 1586, 1618, 1653, 1670, 1685, 1698, 1722, 1743, 1758, 1805, 1817, 1842, 1880, 1899, 1921, 1939, 1968, 1975, 1980, 1987, 2011, 2088, 2089, 2115, 2119, 2178, 2183, 2297, 2331, 2354, 2372, 2385, 2394, 2427, 2433, 2451, 2459, 2511, 2533, 2541, 2556, 2574, 2591, 2648, 2666, 2682, 2687, 2699, 2709, 2735, 2741, 2742, 2783, 2886, 2893, 2895, 2897, 2927, 2931, 2945, 2956, 2998, 3018, 3048, 3076, 3104, 3109, 3133, 3188, 3202, 3228, 3255, 3271, 3288, 3298, 3304, 3315, 3340, 3392, 3415, 3477, 3545, 3546, 3563, 3603, 3629, 3637, 3650, 3726, 3732, 3733, 3799, 3807, 3844, 3859, 3887, 3906, 3911, 3924, 3931, 

In [91]:
model_svd_load = pickle.load(open("model_svd.pkl", "rb"))
model_svd_load.head(1)

Unnamed: 0,user_id,business_id,stars
13,u0LXt3Uea_GidxRW1xcsfg,RtUvSWO_UZ8V3Wpj0n077w,3


In [93]:
model_svd_load_50k = model_svd_load[:50000]
model_svd = model_svd_load_50k.pivot(index = 'user_id', columns = 'business_id', values = 'stars').fillna(0)

In [94]:
U1, Sigma1, VT1 = randomized_svd(model_svd.values, 
                              n_components=3,
                              n_iter=5,
                              random_state=None)

In [95]:
pd.DataFrame(VT1)

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,4760,4761,4762,4763,4764,4765,4766,4767,4768,4769
0,2.744111e-08,1.7e-05,3.8e-05,0.000638,0.022158,4.265612e-05,0.001931,0.00034,0.00105,3.8e-05,...,5.7e-05,0.004302,-1.3562700000000001e-17,0.000344,6.530604e-07,0.036985,0.000198,3.4e-05,1.556516e-06,1.7e-05
1,-5.067108e-08,-8e-06,8e-06,0.000435,0.012749,-9.38607e-07,0.001222,0.000205,0.000537,7e-06,...,-2e-05,0.002608,-6.272543e-16,0.000149,4.369486e-07,0.020489,0.000131,3.5e-05,-1.512444e-06,5e-06
2,-1.152038e-07,-2.7e-05,-5.2e-05,4e-05,-0.013209,4.700205e-05,-0.000371,-9e-06,0.000353,3.9e-05,...,-0.000112,-0.000134,2.122177e-17,0.000145,2.799379e-08,0.019072,-2.1e-05,4.2e-05,-1.738276e-07,-4e-05


In [96]:
def get_recommends(itemID, VT1, num_recom = 3):
    recs = []
    for item in range(VT1.T.shape[0]):
        if item != itemID:
            recs.append([item,np.dot(VT1.T[itemID],VT1.T[item])])
    final_rec = [i[0] for i in sorted(recs,key=lambda x: x[1],reverse=True)]
    return final_rec[:num_recom]
print(get_recommends(0,VT1,num_recom=3))

[4150, 3225, 2880]


In [97]:
def get_recommends_user(userID, U1, df):
    userrecs = []
    for user in range(U1.shape[0]):
        if user!= userID:
            userrecs.append([user,np.dot(U1[userID],U1[user])])
    final_rec = [i[0] for i in sorted(userrecs,key=lambda x: x[1],reverse=True)]
    comp_user = final_rec[0]
    print("User #%s's most similar user is User #%s "% (userID, comp_user))
    rec_likes = df.iloc[comp_user]
    current = df.iloc[userID]
    recs = []
    for i,item in enumerate(current):
        if item != rec_likes[i] and rec_likes[i]!=0:
            recs.append(i)
    return recs

user_to_rec = 3
print("Items for User %s to check out: "% user_to_rec, get_recommends_user(user_to_rec,U1,user_item_mat))

User #3's most similar user is User #15020 


IndexError: single positional indexer is out-of-bounds

In [102]:
##forest
random_forest = RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=150, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=200, n_jobs=-1,
            oob_score=False, random_state=None, verbose=0,
            warm_start=False)


random_forest.fit(x_train, y_train)
rf_pred = random_forest.predict(X_test)
print('Accuracy of random forest classifier on test set: {:.2f}'.format(random_forest.score(X_test, y_test)))

rf_conf = confusion_matrix(y_test, rf_pred)
print()

print(rf_conf)
print()
print(classification_report(y_test, rf_pred))

NameError: name 'train_data_matrix' is not defined