<h1>Matrix Factorization for Check-In Prediction</h1>
<h2>MS&E 234</h2>

<h3>Setup</h3>

We import necessary libraries and read in our dataset.

In [1]:
import pandas as pd
import numpy as np
from matplotlib import pyplot as plt

The check-in dataset below is modified from the original check-in dataset to 1) limit data points to users with known genders and 2) include venue category.

In [2]:
# Tab-separated file; its first column is used as the row index.
check_ins = pd.read_csv('gender-checkins-complete.csv', sep='\t', index_col=0)

In [3]:
check_ins.head()

Unnamed: 0,User_ID,Venue_ID,UTC_Time,Timezone_Offset,Lat,Long,Category,Country_Code
0,21939,4dd53b151f6ec4e0bb8c0480,Tue Apr 03 18:00:49 +0000 2012,-240,39.2856,-76.612047,Clothing Store,US
1,163646,4b70040ff964a52080032de3,Tue Apr 03 18:01:31 +0000 2012,-240,25.716845,-80.281378,College Cafeteria,US
2,256534,4b29929cf964a5200fa124e3,Tue Apr 03 18:01:37 +0000 2012,-360,40.726135,-111.852087,American Restaurant,US
3,176836,4b66f88ff964a520eb322be3,Tue Apr 03 18:01:40 +0000 2012,-300,29.661129,-95.115077,Community College,US
4,181560,4bc7086715a7ef3bef9878da,Tue Apr 03 18:02:41 +0000 2012,-240,40.745164,-73.982519,Medical Center,US


We choose the prediction target: either the specific venue (`Venue_ID`) or the venue category (`Category`).

In [4]:
# Name of the check_ins column used as the prediction target throughout the notebook.
category = 'Category'  # alternative: 'Venue_ID' (predict the specific venue instead)

### Preprocessing

The following cell keeps only users who have made more than 80 check-ins.

In [5]:
threshold = 80 # Users with (threshold) or fewer check-ins are removed; only users with more than (threshold) remain.

value_counts = check_ins['User_ID'].value_counts() # Number of check-ins per user
to_remove = value_counts[value_counts <= threshold].index # User IDs at or below the threshold
check_ins = check_ins[~check_ins['User_ID'].isin(to_remove)] # Keep rows whose user is not in to_remove

In [6]:
"There are %d check-ins, made by %d users." %(len(check_ins), check_ins['User_ID'].nunique())

'There are 465613 check-ins, made by 2290 users.'

In [7]:
check_ins[category].nunique()

433

### Data Split

In [8]:
split = 0.8

# Per-user holdout: the first `split` fraction of each user's rows (in the row
# order of check_ins, which looks chronological -- TODO confirm) becomes
# training data; the remaining rows are left for the test set.
#
# Each row's position within its user's history is compared with that user's
# training size. This replaces `nlargest(n, 'User_ID')` inside `groupby.apply`,
# which ranked on a column that is constant within every group (so it only
# returned the first n rows through tie-breaking) and depended on the grouping
# column being passed to the applied function.
by_user = check_ins.groupby('User_ID')
train_size = (by_user['User_ID'].transform('size') * split).astype(int)
in_train = by_user.cumcount() < train_size

# A stable sort groups the rows by user while keeping each user's rows in
# their original order, i.e. the same layout the groupby/apply produced.
X_train = check_ins[in_train].sort_values('User_ID', kind='mergesort')
X_train.head()

Unnamed: 0,User_ID,Venue_ID,UTC_Time,Timezone_Offset,Lat,Long,Category,Country_Code
8621,54,4bb3a86c4019a593e14138b8,Sat Apr 07 23:51:17 +0000 2012,-600,20.648591,-156.442308,Surf Spot,US
14826,54,4c3bf087b36ac928e4850386,Mon Apr 09 23:58:24 +0000 2012,-600,21.008267,-156.556955,Lake,US
21779,54,4c0f13fcd64c0f47b055295d,Thu Apr 12 07:16:01 +0000 2012,-600,20.926547,-156.694711,Japanese Restaurant,US
28025,54,4de0117c45dd3eae8764d6ac,Fri Apr 13 22:48:37 +0000 2012,-420,37.781213,-122.402973,Tech Startup,US
28446,54,49ca8f4df964a520b9581fe3,Sat Apr 14 00:23:43 +0000 2012,-420,37.782464,-122.407823,Coffee Shop,US


In [9]:
# Held-out check-ins: every row whose index label was not selected for X_train.
# Selecting by index label (instead of masking cell-by-cell with
# DataFrame.isin and then calling dropna()) keeps the column dtypes intact and
# does not discard test rows that merely have a missing value in some column.
X_test = check_ins.loc[~check_ins.index.isin(X_train.index)]

### Ratings Matrix Creation

In [10]:
# Long-format table: one row per (user, target value) pair seen in training,
# with the number of check-ins for that pair in the "Frequency" column.
R_df = (
    X_train
    .groupby(['User_ID', category])
    .size()
    .rename("Frequency")
    .reset_index()
)

In [11]:
# Total number of training check-ins per user.
Total_Visits = X_train.groupby(['User_ID']).size().reset_index(name="Total_Visits")
# Attach each user's total to every (user, target) row. `sort` takes a boolean;
# the previous string 'False' was truthy and therefore requested the opposite
# of what was written. A left join with sort=False keeps R_df's row order.
R_df = pd.merge(R_df, Total_Visits, on='User_ID', how='left', sort=False)

In [12]:
np.mean(Total_Visits['Total_Visits'])

162.24366812227075

In [13]:
# Share of a user's training check-ins that fall on each target value.
R_df['Adj_Freq'] = R_df['Frequency'].div(R_df['Total_Visits'])

In [14]:
R_df.head()

Unnamed: 0,User_ID,Category,Frequency,Total_Visits,Adj_Freq
0,54,Airport,5,108,0.046296
1,54,American Restaurant,1,108,0.009259
2,54,Bar,1,108,0.009259
3,54,Beach,2,108,0.018519
4,54,Breakfast Spot,1,108,0.009259


In [15]:
# Reshape to wide format: one row per user, one column per target value, cells
# holding Adj_Freq. User/target pairs absent from the training data become 0.
R_df = R_df.pivot(index='User_ID', columns=category, values='Adj_Freq').fillna(0)

In [16]:
R_df.head()

Category,Accessories Store,Afghan Restaurant,African Restaurant,Airport,Airport Food Court,Airport Gate,Airport Lounge,Airport Terminal,Airport Tram,American Restaurant,...,Well,Whisky Bar,Wine Bar,Wine Shop,Winery,Wings Joint,Women's Store,Yoga Studio,Yogurt,Zoo
User_ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
54,0.0,0.0,0.0,0.046296,0.0,0.0,0.0,0.0,0.0,0.009259,...,0.0,0.0,0.0,0.0,0.009259,0.0,0.0,0.0,0.0,0.0
182,0.0,0.0,0.0,0.0125,0.0,0.0,0.0,0.0125,0.0,0.0125,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0125
346,0.0,0.0,0.0,0.010526,0.0,0.0,0.0,0.0,0.0,0.015789,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
419,0.0,0.0,0.0,0.014706,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
541,0.0,0.0,0.0,0.072289,0.0,0.0,0.0,0.006024,0.0,0.03012,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [17]:
# Plain NumPy array of the ratings matrix. `.values` replaces the deprecated
# `DataFrame.as_matrix()`, which no longer exists in pandas 1.0 and later.
R = R_df.values

  """Entry point for launching an IPython kernel.


### Singular Value Decomposition

In [18]:
from scipy.sparse.linalg import svds

# Truncated SVD of the ratings matrix R, keeping k = 50 singular values and
# their vectors: U holds the left singular vectors, sigma the singular values,
# Vt the right singular vectors.
U, sigma, Vt = svds(R, k = 50)

In [19]:
# Expand the 1-D array of singular values into a diagonal matrix so it can be
# used in the matrix product below. The guard makes the cell safe to re-run:
# np.diag applied to a 2-D array would extract the diagonal again and turn
# sigma back into a 1-D array.
if np.ndim(sigma) == 1:
    sigma = np.diag(sigma)

In [20]:
# Low-rank reconstruction of the ratings matrix: (U x sigma) x Vt.
all_user_predicted_ratings = U @ sigma @ Vt
# Rows keep a default 0..n-1 index; columns reuse the target labels of R_df.
preds_df = pd.DataFrame(data=all_user_predicted_ratings, columns=R_df.columns)

In [21]:
preds_df.head()

Category,Accessories Store,Afghan Restaurant,African Restaurant,Airport,Airport Food Court,Airport Gate,Airport Lounge,Airport Terminal,Airport Tram,American Restaurant,...,Well,Whisky Bar,Wine Bar,Wine Shop,Winery,Wings Joint,Women's Store,Yoga Studio,Yogurt,Zoo
0,7.4e-05,-4e-06,5.9e-05,0.046695,-8e-06,0.001405,0.000426,0.004053,5.9e-05,0.016338,...,0.000928,0.000394,0.001497,0.000341,0.000496,0.000809,8.7e-05,0.00039,8e-06,0.000531
1,0.000275,-7e-06,-2.3e-05,0.012758,4e-05,0.002067,0.000696,0.006647,7.6e-05,0.012332,...,-6.2e-05,0.000617,0.000658,-6e-05,-0.000355,0.000611,-0.000126,0.000988,-5e-06,0.001534
2,0.000144,-5e-06,1e-05,0.009717,1e-06,3.5e-05,-0.000361,0.003012,1.8e-05,0.017625,...,4e-06,9.4e-05,0.001182,0.000928,0.001133,0.001521,0.000856,0.000125,-2.3e-05,0.000502
3,-6.8e-05,-6e-06,0.000331,0.014919,-3e-06,0.000183,-0.000699,0.002398,-5.8e-05,5.1e-05,...,0.000446,-5.4e-05,0.001217,6e-06,-0.000375,0.000472,0.000987,0.000567,3.7e-05,0.000564
4,4.1e-05,-8e-06,-8.2e-05,0.071913,8e-06,0.003099,0.002305,0.008484,0.000179,0.026015,...,-0.000141,0.000195,0.00111,0.000266,0.001123,0.003105,-7.6e-05,0.000466,-3e-06,0.000452


In [22]:
def predict_check_ins(preds_df, num_recommendations=1, test_df=None, user_ids=None, label_col=None):
    """Return the share of users with at least one top-N prediction in their held-out check-ins.

    For every row of ``preds_df`` the ``num_recommendations`` highest-scoring
    columns are taken as that user's recommendations. The user counts as
    correct when at least one recommended label occurs among the user's rows
    in ``test_df``.

    Parameters
    ----------
    preds_df : pandas.DataFrame
        Predicted scores, one row per user and one column per label. Rows are
        read by position, so any row index is accepted.
    num_recommendations : int, default 1
        Number of top-scoring labels recommended per user.
    test_df : pandas.DataFrame, optional
        Held-out check-ins with a 'User_ID' column and a ``label_col`` column.
        Defaults to the notebook-level ``X_test``.
    user_ids : sequence, optional
        User ID for each row of ``preds_df``, in row order. Defaults to the
        notebook-level ``R_df.index``.
    label_col : str, optional
        Column of ``test_df`` holding the labels. Defaults to the
        notebook-level ``category``.

    Returns
    -------
    float
        Fraction of users with at least one hit; 0.0 when ``preds_df`` has no
        rows.
    """
    # Fall back to the notebook-level objects so existing calls keep working.
    if test_df is None:
        test_df = X_test
    if user_ids is None:
        user_ids = R_df.index
    if label_col is None:
        label_col = category

    user_ids = list(user_ids)
    n_users = len(preds_df)
    if n_users == 0:
        # No users to score; avoid dividing by zero.
        return 0.0

    # Collect each user's held-out labels once instead of rescanning the whole
    # test frame for every user.
    held_out = {
        user: set(labels)
        for user, labels in test_df.groupby('User_ID')[label_col]
    }

    correct = 0
    for pos in range(n_users):
        # Positional access throughout: row `pos` of preds_df belongs to user_ids[pos].
        top_labels = (
            preds_df.iloc[pos]
            .sort_values(ascending=False)
            .iloc[:num_recommendations]
            .index
        )
        if not held_out.get(user_ids[pos], set()).isdisjoint(top_labels):
            correct += 1
    return correct / n_users

# Score the reconstructed ratings with the default of one recommendation per
# user: the fraction of users whose top prediction occurs in their test rows.
acc = predict_check_ins(preds_df)

In [23]:
acc

0.8362445414847162

### Analysis