<h1>Using Collabortive Filtering Methods to Predict a User's Next Foursquare Check-In to a New Location</h1>
<h2>MS&E 234</h2>

<h3>Setup</h3>

In [1]:
import pandas as pd
import numpy as np
from matplotlib import pyplot as plt

Note: The Check-In dataset below was already processed from the 'Global-scale Check-in Dataset with User Social Networks', in order to 1) only consider data points of users who we know the gender of, 2) identify the category of venues using another dataset.

In [2]:
# Check-In Data
check_ins = pd.read_csv('gender-checkins-complete.csv', delimiter='\t', index_col=0)

In [3]:
check_ins.head()

Unnamed: 0,User_ID,Venue_ID,UTC_Time,Timezone_Offset,Lat,Long,Category,Country_Code
0,21939,4dd53b151f6ec4e0bb8c0480,Tue Apr 03 18:00:49 +0000 2012,-240,39.2856,-76.612047,Clothing Store,US
1,163646,4b70040ff964a52080032de3,Tue Apr 03 18:01:31 +0000 2012,-240,25.716845,-80.281378,College Cafeteria,US
2,256534,4b29929cf964a5200fa124e3,Tue Apr 03 18:01:37 +0000 2012,-360,40.726135,-111.852087,American Restaurant,US
3,176836,4b66f88ff964a520eb322be3,Tue Apr 03 18:01:40 +0000 2012,-300,29.661129,-95.115077,Community College,US
4,181560,4bc7086715a7ef3bef9878da,Tue Apr 03 18:02:41 +0000 2012,-240,40.745164,-73.982519,Medical Center,US


### Preprocessing

The following functions reduce the data to users who have made 80+ check-ins in the past.

In [4]:
threshold = 80 # Any user that has less than (threshold) check-ins will be removed.

value_counts = check_ins['User_ID'].value_counts() # Specific column 
to_remove = value_counts[value_counts <= threshold].index
check_ins = check_ins[~check_ins['User_ID'].isin(to_remove)]

In [5]:
# threshold = 2000 # Any user that has more than (threshold) check-ins will be removed.

# value_counts = check_ins[0].value_counts() # Specific column 
# to_remove = value_counts[value_counts >= threshold].index
# check_ins = check_ins[~check_ins[0].isin(to_remove)]

In [6]:
"There are %d check-ins, made by %d users." %(len(check_ins), check_ins['User_ID'].nunique())

'There are 465613 check-ins, made by 2290 users.'

In [7]:
check_ins['Venue_ID'].nunique()

184838

In [8]:
#split into train/test
split = 0.8

X_train = (check_ins.groupby('User_ID',group_keys=False)
        .apply(lambda x: x.nlargest(int(len(x) * split), 'User_ID')))
X_train

Unnamed: 0,User_ID,Venue_ID,UTC_Time,Timezone_Offset,Lat,Long,Category,Country_Code
8621,54,4bb3a86c4019a593e14138b8,Sat Apr 07 23:51:17 +0000 2012,-600,20.648591,-156.442308,Surf Spot,US
14826,54,4c3bf087b36ac928e4850386,Mon Apr 09 23:58:24 +0000 2012,-600,21.008267,-156.556955,Lake,US
21779,54,4c0f13fcd64c0f47b055295d,Thu Apr 12 07:16:01 +0000 2012,-600,20.926547,-156.694711,Japanese Restaurant,US
28025,54,4de0117c45dd3eae8764d6ac,Fri Apr 13 22:48:37 +0000 2012,-420,37.781213,-122.402973,Tech Startup,US
28446,54,49ca8f4df964a520b9581fe3,Sat Apr 14 00:23:43 +0000 2012,-420,37.782464,-122.407823,Coffee Shop,US
38400,54,4de0117c45dd3eae8764d6ac,Mon Apr 16 18:25:45 +0000 2012,-420,37.781213,-122.402973,Tech Startup,US
48284,54,4b54afd9f964a52016c727e3,Fri Apr 20 02:22:17 +0000 2012,-420,37.751640,-122.418508,Hot Dog Joint,US
69073,54,4ef0e7cf7beb5932d5bdeb4e,Wed Apr 25 20:19:38 +0000 2012,-240,40.724169,-73.997211,Tech Startup,US
69999,54,470f3ff0f964a5208e4b1fe3,Thu Apr 26 01:39:01 +0000 2012,-240,40.769519,-73.992584,Concert Hall,US
71365,54,49c224c6f964a520e2551fe3,Thu Apr 26 12:34:15 +0000 2012,-240,40.895019,-73.942581,Office,US


In [9]:
X_test = check_ins[~check_ins.isin(X_train)].dropna()

In [10]:
del check_ins

In [11]:
#feature one, based on user's history: for each venue they visited, get the ratio by number of check ins 
#they've made to that venue divided by total check ins, 
#their top location is the one with the highest ratio
#create cosine similarity between all users
#predicting check-in of never previously visited venue by grabbing the highest frequency 
#location of your closest neighbors
#STEP ONE: Aggregate each user in train, with count

X_train_counts = X_train.groupby(['User_ID','Venue_ID']).size().reset_index(name="Frequency")


In [12]:
Total_Visits = X_train.groupby(['User_ID']).size().reset_index(name="Total_Visits")
X_train_counts = pd.merge(X_train_counts, Total_Visits, on = 'User_ID', how='left', sort = 'False')
del X_train

In [13]:
np.mean(Total_Visits['Total_Visits'])

162.24366812227075

In [14]:
X_train_counts['Adj_Freq'] = X_train_counts['Frequency'] / X_train_counts['Total_Visits'] * 1.0

In [15]:
X_train_counts.head()

Unnamed: 0,User_ID,Venue_ID,Frequency,Total_Visits,Adj_Freq
0,54,3fd66200f964a5204ded1ee3,1,108,0.009259
1,54,3fd66200f964a5209fe61ee3,1,108,0.009259
2,54,40919700f964a520e1f21ee3,1,108,0.009259
3,54,409ad180f964a520eef21ee3,2,108,0.018519
4,54,41059b00f964a520850b1fe3,1,108,0.009259


In [16]:
X_train_counts = X_train_counts.pivot(index='User_ID', columns='Venue_ID', values='Adj_Freq')

In [17]:
X_train_counts.head()

Venue_ID,3fd66200f964a52000e71ee3,3fd66200f964a52000ee1ee3,3fd66200f964a52000f11ee3,3fd66200f964a52001e81ee3,3fd66200f964a52002f01ee3,3fd66200f964a52003e71ee3,3fd66200f964a52003e81ee3,3fd66200f964a52004e61ee3,3fd66200f964a52005e71ee3,3fd66200f964a52005eb1ee3,...,523dd9e7498e092f9a033bb5,52458a1411d2b5993ac0425b,524842f711d2289df8ba9723,524a968f11d2921536d1e87d,524dd87611d2233c88389112,5255b0cb498e1ef25a04735c,5261c77611d2d6de373293f2,526f51e7498e26445f274a5e,528218eb498e0679e73b2490,529d18fb11d26ddfd0f8152d
User_ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
54,,,,,,,,,,,...,,,,,,,,,,
182,,,,,,,,,,,...,,,,,,,,,,
346,,,,,,,,,,,...,0.005263,,,,,,,,,
419,,,,,,,,,,,...,,,,0.014706,,,,,,
541,,,,,,,,,,,...,,,,,,,,,,


In [18]:
X_train_counts = X_train_counts.fillna(0)

In [19]:
X_train_counts.head()

Venue_ID,3fd66200f964a52000e71ee3,3fd66200f964a52000ee1ee3,3fd66200f964a52000f11ee3,3fd66200f964a52001e81ee3,3fd66200f964a52002f01ee3,3fd66200f964a52003e71ee3,3fd66200f964a52003e81ee3,3fd66200f964a52004e61ee3,3fd66200f964a52005e71ee3,3fd66200f964a52005eb1ee3,...,523dd9e7498e092f9a033bb5,52458a1411d2b5993ac0425b,524842f711d2289df8ba9723,524a968f11d2921536d1e87d,524dd87611d2233c88389112,5255b0cb498e1ef25a04735c,5261c77611d2d6de373293f2,526f51e7498e26445f274a5e,528218eb498e0679e73b2490,529d18fb11d26ddfd0f8152d
User_ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
54,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
182,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
346,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.005263,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
419,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.014706,0.0,0.0,0.0,0.0,0.0,0.0
541,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [20]:
from sklearn.metrics.pairwise import cosine_similarity
sim_scores = cosine_similarity(X_train_counts)

In [21]:
X_train_counts.shape

(2290, 153078)

In [22]:
#use nearest 3 neighbors
sim_scores.shape

(2290, 2290)

In [23]:
sim_scores

array([[1., 0., 0., ..., 0., 0., 0.],
       [0., 1., 0., ..., 0., 0., 0.],
       [0., 0., 1., ..., 0., 0., 0.],
       ...,
       [0., 0., 0., ..., 1., 0., 0.],
       [0., 0., 0., ..., 0., 1., 0.],
       [0., 0., 0., ..., 0., 0., 1.]])

FOR EACH USER (X):

1- Use the cosine similarity matrix above to get top 5 most similar users to (X)

2- Refer to top 5 users similar to the user (X) as: u1, u2, u3, u4, u5

3- For every location L1, L2, L3.., calculate the probabilty of user X visiting location L as = (L*sim(u1) + L*sim(u2) ... + L*sim(u5))/ (sim(u1)+sim(u2)...+sim(u5))

4- Sort probabilities from high to low, use top 1 as prediction

In [24]:
sim_scores[0].shape

(2290,)

## 1- Use the cosine similarity matrix above to get top 5 most similar users to (X)

In [37]:
np.argsort(sim_scores[0], axis=0)

array([1144, 1507, 1506, ...,  653, 1230,    0])

In [38]:
# Sort using cosine_sim, return the index of the values
sorted_sim = np.argsort(sim_scores, axis=1)
sorted_sim

array([[1144, 1507, 1506, ...,  653, 1230,    0],
       [   0, 1530, 1529, ...,  690, 1446,    1],
       [   0, 1532, 1531, ...,  192,  826,    2],
       ...,
       [   0, 1528, 1527, ..., 1998, 1792, 2287],
       [   0, 1533, 1532, ...,  176, 1742, 2288],
       [   0, 1533, 1532, ...,  489, 1922, 2289]])

In [40]:
# The indices of the most similiar users to user 0 (use to lookup actual u_id in X_train_counts)
sorted_sim[0][-6:-1]

array([ 148,  953,  340,  653, 1230])

In [75]:
sim_scores[0].take(sorted_sim[0][-6:-1])
#Okay, those are the scores for item 0, but let's get them for all users

array([0.03472911, 0.03490525, 0.0414371 , 0.04825264, 0.12707131])

In [97]:
# The mapping of index to User_ID
user_IDs = X_train_counts.index.values
user_IDs

array([    54,    182,    346, ..., 266416, 266654, 266701])

In [105]:
top_n_idx = np.array([])
top_n_userIDs = []
top_n_scores = []
for i in range(len(sorted_sim)):
    top_n_idx.append(sorted_sim[i][-6:-1])
    top_n_userIDs.append(user_IDs.take(sorted_sim[i][-6:-1])) #Inefficient
    top_n_scores.append(sim_scores[i].take(sorted_sim[i][-6:-1]))

AttributeError: 'numpy.ndarray' object has no attribute 'append'

now: 
 * `top_n_idx` contains the ids of most similiar users (in the sort_sim array)
 * `top_n_userIDs` contains the actual User Ids of top 5 most similiar users (in ascending order)
 * `top_n_scores` contains the similiarity scores of top 5 most similiar users

## 3- For every location L1, L2, L3.., calculate the probabilty of user X visiting location L as = (Lsim(u1) + Lsim(u2) ... + L*sim(u5))/ (sim(u1)+sim(u2)...+sim(u5))

In [107]:
X_train_counts.head()

Venue_ID,3fd66200f964a52000e71ee3,3fd66200f964a52000ee1ee3,3fd66200f964a52000f11ee3,3fd66200f964a52001e81ee3,3fd66200f964a52002f01ee3,3fd66200f964a52003e71ee3,3fd66200f964a52003e81ee3,3fd66200f964a52004e61ee3,3fd66200f964a52005e71ee3,3fd66200f964a52005eb1ee3,...,523dd9e7498e092f9a033bb5,52458a1411d2b5993ac0425b,524842f711d2289df8ba9723,524a968f11d2921536d1e87d,524dd87611d2233c88389112,5255b0cb498e1ef25a04735c,5261c77611d2d6de373293f2,526f51e7498e26445f274a5e,528218eb498e0679e73b2490,529d18fb11d26ddfd0f8152d
User_ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
54,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
182,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
346,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0052631578947368,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
419,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0147058823529411,0.0,0.0,0.0,0.0,0.0,0.0
541,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [109]:
# Let's take Location #0
X_train_counts.iloc[:,0].values

array([0., 0., 0., ..., 0., 0., 0.])

In [None]:
# the probabilty of user 0 visiting location L as = 
# (L*sim(u1) + L*sim(u2) ... + L*sim(u5))/ (sim(u1)+sim(u2)...+sim(u5))

In [113]:
c = X_train_counts.values
c

array([[0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       ...,
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.]])

In [114]:
u_index = 0
l_index = 0 
L = c[u_index, l_index]
L

0.0

In [None]:

# L comes from X_train_counts (index: [u_index, l_index])
# sim(u1, u2, u3...etc) come from top_n_scores index by u_index
X_train_counts.iloc[:,0].values.take()

In [103]:
np.array(top_n_scores)

array([[0.03472911, 0.03490525, 0.0414371 , 0.04825264, 0.12707131],
       [0.0758062 , 0.07912026, 0.09640683, 0.10012267, 0.17867358],
       [0.00141324, 0.0019728 , 0.00235619, 0.00254678, 0.00259541],
       ...,
       [0.0151608 , 0.01546104, 0.01687436, 0.02078478, 0.0243127 ],
       [0.00120551, 0.00226075, 0.00229389, 0.00258971, 0.00352135],
       [0.00392546, 0.0040032 , 0.0041965 , 0.0060372 , 0.01060475]])

In [86]:
top_n_idx[1]

array([ 443,  780,  215,  690, 1446])

In [87]:
top_n_scores[1]

array([0.0758062 , 0.07912026, 0.09640683, 0.10012267, 0.17867358])

## 2- Refer to top 5 users similar to the user (X) as: u1, u2, u3, u4, u5

In [90]:
X_train_counts.iloc[sorted_sim[0][-6:-1]]

Venue_ID,3fd66200f964a52000e71ee3,3fd66200f964a52000ee1ee3,3fd66200f964a52000f11ee3,3fd66200f964a52001e81ee3,3fd66200f964a52002f01ee3,3fd66200f964a52003e71ee3,3fd66200f964a52003e81ee3,3fd66200f964a52004e61ee3,3fd66200f964a52005e71ee3,3fd66200f964a52005eb1ee3,...,523dd9e7498e092f9a033bb5,52458a1411d2b5993ac0425b,524842f711d2289df8ba9723,524a968f11d2921536d1e87d,524dd87611d2233c88389112,5255b0cb498e1ef25a04735c,5261c77611d2d6de373293f2,526f51e7498e26445f274a5e,528218eb498e0679e73b2490,529d18fb11d26ddfd0f8152d
User_ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
12779,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
98605,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
31236,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
65026,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
132245,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [91]:
# Retrieve u5, u4, u3, u2, u1 for user #0
X_train_counts.iloc[sorted_sim[0][-6:-1]].index

Int64Index([12779, 98605, 31236, 65026, 132245], dtype='int64', name='User_ID')

In [56]:
pd.options.display.float_format = '{:,.20f}'.format

In [57]:
# Let's do it for L1 for now.
X_train_counts.iloc[:,0]

User_ID
54       0.00000000000000000000
182      0.00000000000000000000
346      0.00000000000000000000
419      0.00000000000000000000
541      0.00000000000000000000
545      0.00000000000000000000
562      0.00000000000000000000
799      0.00000000000000000000
884      0.00000000000000000000
889      0.00000000000000000000
975      0.00000000000000000000
992      0.00000000000000000000
1009     0.00000000000000000000
1067     0.00000000000000000000
1167     0.00000000000000000000
1258     0.00000000000000000000
1268     0.00000000000000000000
1341     0.00000000000000000000
1437     0.00000000000000000000
1472     0.00000000000000000000
1509     0.00000000000000000000
1605     0.00000000000000000000
1625     0.00000000000000000000
1662     0.00000000000000000000
1743     0.00000000000000000000
1780     0.00000000000000000000
1782     0.00000000000000000000
2087     0.00000000000000000000
2092     0.00000000000000000000
2197     0.00000000000000000000
                  ...          
