<h1>Matrix Factorization for Check-In Prediction</h1>
<h2>MS&E 234</h2>

<h3>Setup</h3>

We import necessary libraries and read in our dataset.

In [1]:
import pandas as pd
import numpy as np
from matplotlib import pyplot as plt

The check-in dataset below is modified from the original check-in dataset to 1) limit data points to users with known genders and 2) include venue category.

In [2]:
check_ins = pd.read_csv('gender-checkins-complete.csv', delimiter='\t', index_col=0)
check_ins.head()

Unnamed: 0,User_ID,Venue_ID,UTC_Time,Timezone_Offset,Lat,Long,Category,Country_Code
0,21939,4dd53b151f6ec4e0bb8c0480,Tue Apr 03 18:00:49 +0000 2012,-240,39.2856,-76.612047,Clothing Store,US
1,163646,4b70040ff964a52080032de3,Tue Apr 03 18:01:31 +0000 2012,-240,25.716845,-80.281378,College Cafeteria,US
2,256534,4b29929cf964a5200fa124e3,Tue Apr 03 18:01:37 +0000 2012,-360,40.726135,-111.852087,American Restaurant,US
3,176836,4b66f88ff964a520eb322be3,Tue Apr 03 18:01:40 +0000 2012,-300,29.661129,-95.115077,Community College,US
4,181560,4bc7086715a7ef3bef9878da,Tue Apr 03 18:02:41 +0000 2012,-240,40.745164,-73.982519,Medical Center,US


We will choose the category to predict: specific venue or venue category.

In [3]:
category = 'Venue_ID'  # 'Venue_ID' or 'Category'

We now read in our user profile dataset for gender.

In [4]:
user_profiles = 'dataset_UbiComp2016/dataset_UbiComp2016_UserProfile_NYC.txt'
genders = pd.read_csv(user_profiles, usecols=[0, 1], delimiter='\t', header=None, names=['User_ID', 'Gender']).set_index('User_ID')
genders.head()

Unnamed: 0_level_0,Gender
User_ID,Unnamed: 1_level_1
15861,male
235138,male
143801,male
128426,male
134292,female


### Preprocessing

The following functions reduce the data to users who have made 80+ check-ins in the past.

In [5]:
threshold = 80 # Any user that has less than (threshold) check-ins will be removed.

value_counts = check_ins['User_ID'].value_counts() # Specific column 
to_remove = value_counts[value_counts <= threshold].index
check_ins = check_ins[~check_ins['User_ID'].isin(to_remove)]

In [6]:
"There are %d check-ins, made by %d users." %(len(check_ins), check_ins['User_ID'].nunique())

'There are 465613 check-ins, made by 2290 users.'

In [7]:
check_ins[category].nunique()

184838

### Data Split

In [8]:
split = 0.8

X_train = (check_ins.groupby('User_ID', group_keys=False).apply(lambda x: x.nlargest(int(len(x) * split), 'User_ID')))
X_train.head()

Unnamed: 0,User_ID,Venue_ID,UTC_Time,Timezone_Offset,Lat,Long,Category,Country_Code
8621,54,4bb3a86c4019a593e14138b8,Sat Apr 07 23:51:17 +0000 2012,-600,20.648591,-156.442308,Surf Spot,US
14826,54,4c3bf087b36ac928e4850386,Mon Apr 09 23:58:24 +0000 2012,-600,21.008267,-156.556955,Lake,US
21779,54,4c0f13fcd64c0f47b055295d,Thu Apr 12 07:16:01 +0000 2012,-600,20.926547,-156.694711,Japanese Restaurant,US
28025,54,4de0117c45dd3eae8764d6ac,Fri Apr 13 22:48:37 +0000 2012,-420,37.781213,-122.402973,Tech Startup,US
28446,54,49ca8f4df964a520b9581fe3,Sat Apr 14 00:23:43 +0000 2012,-420,37.782464,-122.407823,Coffee Shop,US


In [9]:
X_test = check_ins[~check_ins.isin(X_train)].dropna()

### Ratings Matrix Creation

In [10]:
R_df = X_train.groupby(['User_ID', category]).size().reset_index(name="Frequency")

In [11]:
Total_Visits = X_train.groupby(['User_ID']).size().reset_index(name="Total_Visits")
R_df = pd.merge(R_df, Total_Visits, on = 'User_ID', how='left', sort = 'False')

In [12]:
np.mean(Total_Visits['Total_Visits'])

162.24366812227075

In [13]:
R_df['Adj_Freq'] = R_df['Frequency'] / R_df['Total_Visits'] * 1.0

In [14]:
R_df.head()

Unnamed: 0,User_ID,Venue_ID,Frequency,Total_Visits,Adj_Freq
0,54,3fd66200f964a5204ded1ee3,1,108,0.009259
1,54,3fd66200f964a5209fe61ee3,1,108,0.009259
2,54,40919700f964a520e1f21ee3,1,108,0.009259
3,54,409ad180f964a520eef21ee3,2,108,0.018519
4,54,41059b00f964a520850b1fe3,1,108,0.009259


In [15]:
R_df = R_df.pivot(index='User_ID', columns=category, values='Adj_Freq').fillna(0)

In [16]:
R_df.head()

Venue_ID,3fd66200f964a52000e71ee3,3fd66200f964a52000ee1ee3,3fd66200f964a52000f11ee3,3fd66200f964a52001e81ee3,3fd66200f964a52002f01ee3,3fd66200f964a52003e71ee3,3fd66200f964a52003e81ee3,3fd66200f964a52004e61ee3,3fd66200f964a52005e71ee3,3fd66200f964a52005eb1ee3,...,523dd9e7498e092f9a033bb5,52458a1411d2b5993ac0425b,524842f711d2289df8ba9723,524a968f11d2921536d1e87d,524dd87611d2233c88389112,5255b0cb498e1ef25a04735c,5261c77611d2d6de373293f2,526f51e7498e26445f274a5e,528218eb498e0679e73b2490,529d18fb11d26ddfd0f8152d
User_ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
54,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
182,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
346,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.005263,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
419,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.014706,0.0,0.0,0.0,0.0,0.0,0.0
541,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [17]:
R = R_df.as_matrix()

  """Entry point for launching an IPython kernel.


### Singular Value Decomposition

In [None]:
from scipy.sparse.linalg import svds

U, sigma, Vt = svds(R, k = 2000)  # Can replace k with different values.

In [40]:
sigma = np.diag(sigma)

In [41]:
all_user_predicted_ratings = np.dot(np.dot(U, sigma), Vt)
preds_df = pd.DataFrame(all_user_predicted_ratings, columns = R_df.columns)

In [42]:
preds_df.head()

Venue_ID,3fd66200f964a52000e71ee3,3fd66200f964a52000ee1ee3,3fd66200f964a52000f11ee3,3fd66200f964a52001e81ee3,3fd66200f964a52002f01ee3,3fd66200f964a52003e71ee3,3fd66200f964a52003e81ee3,3fd66200f964a52004e61ee3,3fd66200f964a52005e71ee3,3fd66200f964a52005eb1ee3,...,523dd9e7498e092f9a033bb5,52458a1411d2b5993ac0425b,524842f711d2289df8ba9723,524a968f11d2921536d1e87d,524dd87611d2233c88389112,5255b0cb498e1ef25a04735c,5261c77611d2d6de373293f2,526f51e7498e26445f274a5e,528218eb498e0679e73b2490,529d18fb11d26ddfd0f8152d
0,2.204474e-05,-1.437689e-06,1.58728e-05,4.943619e-05,0.0001167933,-2.61061e-06,-4.353868e-05,-1.404628e-05,1.980355e-06,-2.4e-05,...,-2.322132e-08,1.106068e-18,1.872472e-09,-1.282256e-07,7.039888e-08,-6.384596e-07,-1.798347e-08,1.216149e-07,-1.036015e-05,2.432297e-07
1,-5.143999e-06,1.201463e-06,-5.25006e-06,7.723282e-06,-1.527678e-06,1.780849e-07,-3.259251e-05,-6.562665e-06,-1.163035e-07,-1.5e-05,...,8.795904e-09,5.895968e-19,1.487384e-09,-3.795065e-07,3.441375e-09,7.827405e-08,-2.686871e-09,2.43491e-08,2.42686e-07,4.86982e-08
2,1.140327e-07,1.636746e-07,-2.233803e-08,9.56932e-08,-5.367945e-07,-3.145709e-09,-4.97206e-09,-5.779243e-07,4.89531e-09,-1e-06,...,0.005263156,6.814197999999999e-19,-1.151399e-11,-1.00053e-09,2.643362e-10,-3.629568e-10,3.387661e-11,8.227292e-11,-2.132442e-09,1.645458e-10
3,4.449409e-07,-1.381218e-06,3.455196e-07,1.829117e-06,2.933219e-06,1.060649e-07,1.892362e-06,4.799406e-07,1.717562e-08,1e-06,...,-3.580844e-10,-5.076716e-19,6.090642e-10,0.01470554,-3.609058e-09,-5.282172e-08,-2.916882e-10,1.585508e-09,-5.454237e-07,3.171017e-09
4,-1.135357e-06,5.770844e-06,1.143056e-05,4.874111e-06,-1.323835e-06,-9.941994e-08,1.192101e-05,-5.579878e-06,1.496867e-07,-1.6e-05,...,-5.777954e-09,1.099269e-19,-2.675524e-09,3.549306e-08,2.616095e-09,-1.623641e-08,-1.606976e-08,6.536105e-08,6.523073e-07,1.307221e-07


In [43]:
# Mask predictions that were already visited.
# mask = R_df.reset_index().drop(columns=['User_ID']) != 0
# preds = preds_df.mask(mask, other=float('-inf'))

In [44]:
mrr_df = pd.DataFrame(index=list(range(10)), columns=['n', 'score', 'gender', 'method']).fillna(0)

In [45]:
def predict_check_ins(preds_df, num_recommendations=5):
    correct = 0
    gender_counts = {'male': 0, 'female': 0}
    mrrs = {'male' : np.zeros(num_recommendations), 'female' : np.zeros(num_recommendations)}
    for row in preds_df.index:
        user_id = R_df.index[row]
        gender = genders.loc[user_id].at['Gender']
        gender_counts[gender] += 1
        preds_row = preds_df.iloc[row].sort_values(ascending=False)[0:num_recommendations]
        X_test_rows = X_test[X_test['User_ID'] == user_id]
        if not X_test_rows[X_test_rows[category].isin([preds_row.index[0]])].empty: correct += 1
        for i in range(len(preds_row.index)):
            if not X_test_rows[X_test_rows[category].isin([preds_row.index[i]])].empty:
                for j in range(i, len(preds_row.index)): mrrs[gender][j] += (1.0 / (i + 1))
                break
    print('Accuracy: ', correct / len(preds_df.index))
    return mrrs['male'] / gender_counts['male'], mrrs['female'] / gender_counts['female']

In [46]:
male_mrr, female_mrr = predict_check_ins(preds_df)
for i in range(male_mrr.shape[0]):
    mrr_df.loc[(2 * i)] = pd.Series({'n': i + 1, 'score': male_mrr[i], 'gender': 'male', 'method': 'model-based'})
    mrr_df.loc[(2 * i) + 1] = pd.Series({'n': i + 1, 'score': female_mrr[i], 'gender': 'female', 'method': 'model-based'})
mrr_df

Accuracy:  0.6471615720524018


Unnamed: 0,n,score,gender,method
0,1,0.645833,male,model-based
1,1,0.649626,female,model-based
2,2,0.710685,male,model-based
3,2,0.710723,female,model-based
4,3,0.727487,male,model-based
5,3,0.724023,female,model-based
6,4,0.735719,male,model-based
7,4,0.732751,female,model-based
8,5,0.738945,male,model-based
9,5,0.735744,female,model-based


In [47]:
mrr_df.to_csv('matrix_factorization_MRR.csv')

### Acknowledgements
Much of this code is adapted from [Nick Becker's Matrix Factorization implementation](https://github.com/beckernick/matrix_factorization_recommenders).