# 5. SVD

In [3]:
# imports

import pandas as pd
import numpy as np
import json
import gzip

from scipy.sparse import csr_matrix
from scipy.sparse.linalg import svds
from sklearn.model_selection import train_test_split

from sklearn.metrics import root_mean_squared_error

In [4]:
# define file paths

path_reviews_subset = 'data/subset_reviews.parquet'
path_meta = 'original_data/meta-Utah.json.gz'

In [5]:
# define a function for reading the data using a generator

def parse(path):
  g = gzip.open(path, 'r')
  for l in g:
    yield json.loads(l)

In [6]:
# import the reviews filtered by the 10,001 users in the subset

reviews = pd.read_parquet(path_reviews_subset)
reviews

Unnamed: 0,user_id,name,time,rating,text,pics,resp,gmap_id
0,116427980967433332299,Amanda Tapp,1501002398116,5,"Extremely easy to work with auto loans, great ...",,{'text': 'We are glad to hear you had such a w...,0x87528767d0ec0e4d:0x7a2f1637a6fb6925
1,100217552787897641896,Rylee Jones,1566432339019,5,,,,0x875287dced2a1f1f:0xf3764a9211d4f382
2,113719864440680408253,Kylie McDonald,1559939531153,5,Highly recommend this business! They truly car...,,"{'text': 'Thank you, Kylie! It was a pleasure ...",0x8752841a66574037:0x6a51c0f67ca3002
3,114710026425309062285,Jess Bird,1540481859746,5,,,"{'text': 'Thank you, Jess!', 'time': 159667147...",0x8752f365da14f295:0x264218c77da46a71
4,114105421795834263422,Maxwell McLeod,1544839018564,5,Tom and the team do an excellent job. I have b...,,,0x875287d36010a61b:0xab575aa5992155b3
...,...,...,...,...,...,...,...,...
305257,107626136867067342591,Rachelle Taysom,1525060448816,3,,,,0x875287112eb7ef65:0x3aefb61bbcb5f2a9
305258,108155158391983335470,Cafe Guru,1603655474608,5,,,,0x875287112eb7ef65:0x3aefb61bbcb5f2a9
305259,104620742288190585924,Sean Smith,1578929371061,2,,,,0x875287112eb7ef65:0x3aefb61bbcb5f2a9
305260,102429264321348600901,Eric Anderson,1564336239651,5,,,,0x875287112eb7ef65:0x3aefb61bbcb5f2a9


In [7]:
# train-test-split

train, test = train_test_split(reviews, test_size=0.2, random_state=42)
train.shape, test.shape

((244209, 8), (61053, 8))

In [8]:
# create 2 dataframes, for train and test, where the rows and columns are the same in both dataframes
# the rows are all the unique users in reviews and the columns are all the unique items in reviews

train_ratings = pd.DataFrame(index=reviews.user_id.unique(), columns=reviews.gmap_id.unique())
test_ratings = train_ratings.copy()

In [9]:
# fill the train dataframe row by row by using a loop

for i in range(len(train)):
  train_ratings.loc[train['user_id'].iloc[i], train['gmap_id'].iloc[i]] = train['rating'].iloc[i]

In [10]:
# fill the test dataframe row by row by using a loop

for i in range(len(test)):
  test_ratings.loc[train['user_id'].iloc[i], train['gmap_id'].iloc[i]] = train['rating'].iloc[i]

In [11]:
# make sure train and test are the same size

train_ratings.shape, test_ratings.shape

((10001, 29040), (10001, 29040))

- I do not want to fill in the NaN's will 0's, because it will bias the ratings. Instead, I will fill it with the mean.
- Before filling with the mean, I will first de-mean it, which will make the mean 0. That way, filling it with 0 is the same as filling with the mean, which will also allow me to keep the data sparse. Sparse data will squeeze out the 0's and only keep the ones with values.
- To de-mean, I will use the mean of the non-NaN values.

In [12]:
# calculate the mean of the non-NaN values
userMean = np.nanmean(train_ratings)
print(f'The mean is {userMean}')

# demean the train data
train_demeaned = train_ratings - userMean

# check to make sure the resulting mean is 0
np.nanmean(train_demeaned)

The mean is 4


0

In [13]:
# fill the train NaN's with 0

train_demeaned = train_demeaned.fillna(0).astype(float)

  train_demeaned = train_demeaned.fillna(0).astype(float)


In [14]:
# make the data sparse

train_sparse = csr_matrix(train_demeaned)

In [15]:
# matrix factorization using svds

U, sigma, Vt = svds(train_sparse, k=50)

# diagonalize the sigma

sigma = np.diag(sigma)

In [16]:
# get the predictions by taking 2 dot products in succession
# 1. dot product of U and sigma
# 2. dot product of the above and Vt

predicted_ratings = U @ sigma @ Vt

In [17]:
# add back the mean

predicted_ratings += userMean

# turn it into a dataframe

predicted_df = pd.DataFrame(predicted_ratings, columns=train_ratings.columns, index=train_ratings.index)

In [18]:
# use clip to clip anything lower than 1 to 1, and anything higher than 5 to 5

predicted_df = np.clip(predicted_df, 1, 5)

In [19]:
# check the predictions

predicted_df

Unnamed: 0,0x87528767d0ec0e4d:0x7a2f1637a6fb6925,0x875287dced2a1f1f:0xf3764a9211d4f382,0x8752841a66574037:0x6a51c0f67ca3002,0x8752f365da14f295:0x264218c77da46a71,0x875287d36010a61b:0xab575aa5992155b3,0x87528440b7ee298f:0x525424c7bbad1c2c,0x80ca44f91c78df35:0x4de85c05e1117565,0x874d9bf9d1db7c85:0xf1c3706a2b497a3,0x875261e8b5a75a37:0x9d82b30caf13f89d,0x874d907b753aa797:0x1b835764d4684400,...,0x80cb298797a23155:0x682d89a022c5d033,0x8734d418624bd435:0x7d7e27921054e85e,0x80caf3e4ff8f7bcd:0x93be0c0a28cb63ce,0x8734f86c18c4aa2d:0xfa57972b3f6d329a,0x80ca44998390637d:0x12a7f30929faf885,0x874d909522cb134d:0x9c7d37589ba82b27,0x87528714131dc5e1:0x57130d869abea558,0x80ca453710952cab:0x70a40bca02898341,0x874ebbe4a8181e5b:0xb5afc0f672d353ef,0x875287112eb7ef65:0x3aefb61bbcb5f2a9
116427980967433332299,4.000003,3.999990,4.000001,4.000000,4.000118,4.0,4.0,4.0,4.000004,4.000000,...,4.000038,4.000099,3.999953,4.000027,4.000008,4.000024,4.000994,4.000267,3.999940,4.001817
100217552787897641896,3.999990,4.000508,4.000005,3.999982,4.000829,4.0,4.0,4.0,4.000015,3.999944,...,4.001108,3.999222,4.001699,3.999969,4.000230,3.996648,3.996059,3.999675,4.001826,3.987627
113719864440680408253,4.000001,4.000005,4.000004,4.000003,4.000116,4.0,4.0,4.0,3.999995,4.000003,...,4.000358,3.999926,4.000824,4.000028,4.000271,3.999992,3.999744,4.000590,3.999861,4.000631
114710026425309062285,4.000000,3.999982,4.000003,4.000043,4.000292,4.0,4.0,4.0,4.000048,3.999999,...,4.000222,3.999703,3.999966,4.000009,4.000050,4.000087,4.000465,4.000803,4.000194,3.999117
114105421795834263422,4.000000,4.000400,4.000036,4.000050,4.005781,4.0,4.0,4.0,4.000127,3.999956,...,4.003368,3.998631,3.997162,4.000474,3.999757,3.996051,4.000010,4.003196,3.995960,3.989056
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
105646211836738285046,4.000001,4.000043,4.000022,3.999998,3.999519,4.0,4.0,4.0,3.999980,4.000043,...,4.004075,3.999946,4.012891,4.000278,4.003967,4.000515,4.002584,4.005919,3.999546,4.006999
109687837453765494674,4.000014,4.000120,4.000027,4.000050,4.001891,4.0,4.0,4.0,4.000122,3.999968,...,4.000466,3.999675,3.999634,4.000252,3.999857,3.993529,3.996993,4.001294,4.002336,4.014419
115531377356090474644,4.000006,3.999905,3.999993,3.999989,3.998795,4.0,4.0,4.0,4.000128,4.000002,...,3.999865,3.999858,3.998973,3.999820,3.999768,3.999082,4.006305,3.998585,4.001682,4.007449
116283247678981674310,4.000003,3.999956,4.000003,4.000006,4.000530,4.0,4.0,4.0,4.000011,4.000000,...,3.999643,3.999829,4.000322,4.000040,3.999989,3.998512,4.000563,3.998964,4.004204,3.998179


### Helper Functions

In [20]:
# define a function to get all the reviews for a user, sorted by rating in descending order

def get_user_rated_sorted(user_id, df):
    user_reviews = df[df['user_id'] == user_id]
    user_rated = dict(zip(user_reviews['gmap_id'], user_reviews['rating']))
    user_rated = pd.DataFrame(list(user_rated.items()), columns=['gmap_id', 'rating'])
    user_rated.sort_values(by='rating', ascending=False, inplace=True)
    return user_rated

# define a function to get n most popular businesses, popular determined as 1) the most # of reviews and 2) highest average review

def get_popular(n, df):
  popular = df.groupby('gmap_id')['rating'].agg(['count','mean']).sort_values(by=['count','mean'], ascending=False)
  return popular.head(n)

# create a function to return the business name using the gmap_id

def get_business_name(gmap_id):
  meta_generator = parse(path_meta)
  for place in meta_generator:
    if place.get('gmap_id') == gmap_id:
      name = place['name']
      break
  return name

# create a function to return the business rating using the gmap_id

def get_business_rating(gmap_id):
  meta_generator = parse(path_meta)
  for place in meta_generator:
    if place.get('gmap_id') == gmap_id:
      avg_rating = place['avg_rating']
      break
  return avg_rating

### SVD recommender function

In [34]:
'116427980967433332299' in reviews['user_id'].values

False

In [21]:
# define a function to get n_recs for a user using SVD model
# the recommendations are the items with the highest predicted ratings

def get_svd_recommendations(user_id, org_df, pred_df, n_recs):

  print(org_df['user_id'].values)
  # if the user_id is not in predicted_df
  if user_id not in org_df['user_id'].values:
    print(f'User {user_id} has no reviews, recommendations are based on the most popular businesses')

    # return the most popular places
    recs = get_popular(n_recs, org_df)
    recs.reset_index(inplace=True)

    recs.drop(columns='count', inplace=True)
    recs.columns = ['gmap_id','pred_rating']

    recs['name'] = recs['gmap_id'].apply(get_business_name)
    recs['avg_rating'] = recs['gmap_id'].apply(get_business_rating)


  else:
    # get the list of what the user already rated
    user_rated_sorted = get_user_rated_sorted(user_id, org_df)
    num_rated = len(user_rated_sorted)

    # create a dictionary to add recommendations to
    recs = {}

    # get all recommedations, sorted in descending order of predicted rating
    all_recs = pred_df.loc[user_id].sort_values(ascending=False).head(n_recs + num_rated)

    # for each item in all_recs, check if the user has already rated it
    for key, value in all_recs.items():
      if key not in user_rated_sorted['gmap_id'].values:
        recs[key] = value
      if len(recs) == n_recs:
        break

    # change recs into a dataframe with column names
    recs = pd.DataFrame(list(recs.items()), columns=['gmap_id','pred_rating'])

    # add the business name and average rating into the dataframe
    recs['name'] = recs['gmap_id'].apply(get_business_name)
    recs['avg_rating'] = recs['gmap_id'].apply(get_business_rating)

  return recs


In [22]:
# test function 1 (user with no reviews)

get_svd_recommendations('12345', reviews, predicted_df, 5)

['116427980967433332299' '100217552787897641896' '113719864440680408253'
 ... '104620742288190585924' '102429264321348600901'
 '103249695925682618930']
User 12345 has no reviews, recommendations are based on the most popular businesses


Unnamed: 0,gmap_id,pred_rating,name,avg_rating
0,0x875289bef87fcb6b:0x9e865a4dadee3648,4.417339,Fashion Place,4.4
1,0x8752f508b7dc56a9:0x13d77d6e854d7e79,4.491701,City Creek Center,4.5
2,0x8752879236e85383:0xbaf443f16b40940c,4.624454,Loveland Living Planet Aquarium,4.6
3,0x8752f51b7961dc89:0x73f1a804eb909466,4.506818,Hogle Zoo,4.5
4,0x87528e94d44e7fc5:0x42b1cfae697ba4a6,4.281991,Jordan Landing,4.3


In [35]:
# test function 2 (user with reviews)

get_svd_recommendations('116427980967433332299', reviews, predicted_df, 10)

['116427980967433332299' '100217552787897641896' '113719864440680408253'
 ... '104620742288190585924' '102429264321348600901'
 '103249695925682618930']


Unnamed: 0,gmap_id,pred_rating,name,avg_rating
0,0x875287add55327ab:0x5594e747799c742c,4.007759,Walmart Supercenter,3.8
1,0x875289bef87fcb6b:0x9e865a4dadee3648,4.007476,Fashion Place,4.4
2,0x8752f508e4890ee9:0x8c57d6417504663d,4.007254,Temple Square,4.8
3,0x87528a279f0dd025:0xc7f9f41c09effe40,4.007128,Costco Wholesale,4.5
4,0x87528876d86f0e83:0x8dfef306e6b39b8f,4.005724,Megaplex Theatres at Jordan Commons,4.5
5,0x8752f50103ebf04d:0xf0e545531f11b120,4.004695,Vivint Arena,4.6
6,0x875280233a075a0f:0x60df99f574b09b2e,4.004304,Outlets at Traverse Mountain,4.4
7,0x875303b828f60367:0xf86b78c965d59f77,4.004283,Walmart Supercenter,3.8
8,0x875287d17c84d34f:0x7cd695da7be1d5c0,4.00384,Hale Centre Theatre,4.8
9,0x875289e5b2fed5c3:0x2437aeeafcef8d24,4.003742,Wendy's,3.2


# Calculate the RMSE for the model

In [29]:
# change the dataframes to numpy, numpy will change the

pred_np = predicted_df.to_numpy()
train_np = train_ratings.to_numpy()
test_np = test_ratings.to_numpy()

In [30]:
# filter preds for non-nan values in train, filter train for non-nan values

train_pred_np = pred_np[train_ratings.notna()]
train_np = train_np[train_ratings.notna()]

# filter preds for non-nan values in test, filter test for non-nan values

test_pred_np = pred_np[test_ratings.notna()]
test_np = test_np[test_ratings.notna()]

# check the shape of train

train_pred_np.shape, train_np.shape

((241199,), (241199,))

In [31]:
# check the shape of test

test_pred_np.shape, test_np.shape

((60876,), (60876,))

In [32]:
# calculate the RMSE

root_mean_squared_error(train_np, train_pred_np), root_mean_squared_error(test_np, test_pred_np)

(1.0681925856652141, 1.0735364343023943)

# Tune the SVD model

### 1. Tune on train

In [33]:
# tune the SVD model on k to get the model that yields the lowest RMSE

# define a variable to keep track of the best rmse, start with the highest number possible
best_rmse = float('inf')

# define a variable to keep track of the best k
best_k = None

# define range of k's to try
k_range = [20, 50, 100, 150]

# create a loop to try
for k in k_range:
  U, sigma, Vt = svds(train_sparse, k=k)
  sigma = np.diag(sigma)
  pred = U @ sigma @ Vt

  # create a mask with the non-nan's in the pre-demeaned and sparsified dataframe
  mask = train_ratings.notna().values

  # calculate the RMSE
  rmse = root_mean_squared_error(train_sparse.toarray()[mask], pred[mask])

  # if the rmse is lower, replace the previous as the best rmse, and best k
  if rmse < best_rmse:
    best_rmse = rmse
    best_k = k


In [34]:
best_rmse, best_k

(0.9964536397877145, 150)

### 2. Tune on test

In [35]:
# demean test using the mean that was calculated from train

test_demeaned = test_ratings - userMean

# fillna with 0

test_demeaned = test_demeaned.fillna(0).astype(float)

# make the data sparse

test_sparse = csr_matrix(test_demeaned)

  test_demeaned = test_demeaned.fillna(0).astype(float)


In [36]:
# tune the SVD model on k to get the model that yields the lowest test RMSE

# define a variable to keep track of the best rmse, start with the highest number possible
best_rmse = float('inf')

# define a variable to keep track of the best k
best_k = None

# define range of k's to try
k_range = [20, 50, 100, 150]

# create a loop to try
for k in k_range:
  U, sigma, Vt = svds(test_sparse, k=k)
  sigma = np.diag(sigma)
  pred = U @ sigma @ Vt

  # create a mask with the non-nan's in the pre-demeaned and sparsified dataframe
  mask = test_ratings.notna().values

  # calculate the RMSE
  rmse = root_mean_squared_error(test_sparse.toarray()[mask], pred[mask])

  # if the rmse is lower, replace the previous as the best rmse, and best k
  if rmse < best_rmse:
    best_rmse = rmse
    best_k = k


In [37]:
best_rmse, best_k

(1.0026504269302328, 150)

### Retrain model using the best k

In [38]:
# matrix factorization using svds

U, sigma, Vt = svds(train_sparse, k=150)

# diagonalize the sigma

sigma = np.diag(sigma)

In [39]:
# get the predictions by taking 2 dot products in succession
# 1. dot product of U and sigma
# 2. dot product of the above and Vt

predicted_ratings = U @ sigma @ Vt

In [40]:
# add back the mean

predicted_ratings += userMean

# turn it into a dataframe

predicted_df = pd.DataFrame(predicted_ratings, columns=train_ratings.columns, index=train_ratings.index)

In [41]:
# use clip to clip anything lower than 1 to 1, and anything higher than 5 to 5

predicted_df = np.clip(predicted_df, 1, 5)

In [42]:
# check the predictions

predicted_df

Unnamed: 0,0x87528767d0ec0e4d:0x7a2f1637a6fb6925,0x875287dced2a1f1f:0xf3764a9211d4f382,0x8752841a66574037:0x6a51c0f67ca3002,0x8752f365da14f295:0x264218c77da46a71,0x875287d36010a61b:0xab575aa5992155b3,0x87528440b7ee298f:0x525424c7bbad1c2c,0x80ca44f91c78df35:0x4de85c05e1117565,0x874d9bf9d1db7c85:0xf1c3706a2b497a3,0x875261e8b5a75a37:0x9d82b30caf13f89d,0x874d907b753aa797:0x1b835764d4684400,...,0x80cb298797a23155:0x682d89a022c5d033,0x8734d418624bd435:0x7d7e27921054e85e,0x80caf3e4ff8f7bcd:0x93be0c0a28cb63ce,0x8734f86c18c4aa2d:0xfa57972b3f6d329a,0x80ca44998390637d:0x12a7f30929faf885,0x874d909522cb134d:0x9c7d37589ba82b27,0x87528714131dc5e1:0x57130d869abea558,0x80ca453710952cab:0x70a40bca02898341,0x874ebbe4a8181e5b:0xb5afc0f672d353ef,0x875287112eb7ef65:0x3aefb61bbcb5f2a9
116427980967433332299,4.000016,3.999961,4.000000,4.000004,3.999680,4.0,4.0,4.0,4.000021,4.000000,...,4.000180,4.000053,3.999731,4.000017,3.999958,4.000351,4.001875,4.000455,3.999769,4.003237
100217552787897641896,3.999961,4.004470,4.000035,3.999954,4.003231,4.0,4.0,4.0,4.000398,3.999654,...,4.003615,3.998861,4.002876,4.000564,4.001857,4.002792,3.994860,3.992100,3.998246,3.985941
113719864440680408253,4.000000,4.000035,4.000064,3.999991,3.999848,4.0,4.0,4.0,3.999975,3.999990,...,4.001706,4.000129,4.001026,4.000040,4.000730,4.000033,4.000518,3.997481,4.000240,3.999555
114710026425309062285,4.000004,3.999954,3.999991,4.000177,4.000453,4.0,4.0,4.0,4.000126,3.999990,...,4.000083,3.999832,4.000384,4.000013,3.999992,4.001031,4.000863,4.002993,3.999573,3.996493
114105421795834263422,3.999929,3.999978,3.999959,3.999860,4.016722,4.0,4.0,4.0,3.999750,4.000107,...,3.995807,4.006097,3.998318,4.000425,3.997472,3.995481,4.040272,4.000448,3.996724,4.044256
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
105646211836738285046,4.000005,4.000250,4.000087,3.999987,4.000124,4.0,4.0,4.0,3.999866,4.000053,...,4.011551,4.000655,4.017009,4.000083,4.005804,4.005128,4.004594,3.996572,3.998239,3.996287
109687837453765494674,4.000049,4.000076,3.999979,4.000028,4.002655,4.0,4.0,4.0,4.000111,3.999982,...,3.997215,4.000113,3.998293,4.000002,3.998756,4.001381,4.007530,3.999544,3.998055,4.033119
115531377356090474644,3.999985,3.999538,4.000055,3.999995,4.001197,4.0,4.0,4.0,3.999948,4.000005,...,3.996987,3.999420,3.999477,3.999536,4.000055,3.999050,4.003609,4.000372,3.999605,4.009863
116283247678981674310,4.000021,4.000016,4.000007,4.000017,3.997892,4.0,4.0,4.0,4.000159,3.999972,...,3.998740,4.001662,3.998959,4.000285,4.000012,4.004081,4.010821,4.000664,4.003487,4.004092


In [43]:
predicted_df

Unnamed: 0,0x87528767d0ec0e4d:0x7a2f1637a6fb6925,0x875287dced2a1f1f:0xf3764a9211d4f382,0x8752841a66574037:0x6a51c0f67ca3002,0x8752f365da14f295:0x264218c77da46a71,0x875287d36010a61b:0xab575aa5992155b3,0x87528440b7ee298f:0x525424c7bbad1c2c,0x80ca44f91c78df35:0x4de85c05e1117565,0x874d9bf9d1db7c85:0xf1c3706a2b497a3,0x875261e8b5a75a37:0x9d82b30caf13f89d,0x874d907b753aa797:0x1b835764d4684400,...,0x80cb298797a23155:0x682d89a022c5d033,0x8734d418624bd435:0x7d7e27921054e85e,0x80caf3e4ff8f7bcd:0x93be0c0a28cb63ce,0x8734f86c18c4aa2d:0xfa57972b3f6d329a,0x80ca44998390637d:0x12a7f30929faf885,0x874d909522cb134d:0x9c7d37589ba82b27,0x87528714131dc5e1:0x57130d869abea558,0x80ca453710952cab:0x70a40bca02898341,0x874ebbe4a8181e5b:0xb5afc0f672d353ef,0x875287112eb7ef65:0x3aefb61bbcb5f2a9
116427980967433332299,4.000003,3.999990,4.000001,4.000000,4.000118,4.0,4.0,4.0,4.000004,4.000000,...,4.000038,4.000099,3.999953,4.000027,4.000008,4.000024,4.000994,4.000267,3.999940,4.001817
100217552787897641896,3.999990,4.000508,4.000005,3.999982,4.000829,4.0,4.0,4.0,4.000015,3.999944,...,4.001108,3.999222,4.001699,3.999969,4.000230,3.996648,3.996059,3.999675,4.001826,3.987627
113719864440680408253,4.000001,4.000005,4.000004,4.000003,4.000116,4.0,4.0,4.0,3.999995,4.000003,...,4.000358,3.999926,4.000824,4.000028,4.000271,3.999992,3.999744,4.000590,3.999861,4.000631
114710026425309062285,4.000000,3.999982,4.000003,4.000043,4.000292,4.0,4.0,4.0,4.000048,3.999999,...,4.000222,3.999703,3.999966,4.000009,4.000050,4.000087,4.000465,4.000803,4.000194,3.999117
114105421795834263422,4.000000,4.000400,4.000036,4.000050,4.005781,4.0,4.0,4.0,4.000127,3.999956,...,4.003368,3.998631,3.997162,4.000474,3.999757,3.996051,4.000010,4.003196,3.995960,3.989056
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
105646211836738285046,4.000001,4.000043,4.000022,3.999998,3.999519,4.0,4.0,4.0,3.999980,4.000043,...,4.004075,3.999946,4.012891,4.000278,4.003967,4.000515,4.002584,4.005919,3.999546,4.006999
109687837453765494674,4.000014,4.000120,4.000027,4.000050,4.001891,4.0,4.0,4.0,4.000122,3.999968,...,4.000466,3.999675,3.999634,4.000252,3.999857,3.993529,3.996993,4.001294,4.002336,4.014419
115531377356090474644,4.000006,3.999905,3.999993,3.999989,3.998795,4.0,4.0,4.0,4.000128,4.000002,...,3.999865,3.999858,3.998973,3.999820,3.999768,3.999082,4.006305,3.998585,4.001682,4.007449
116283247678981674310,4.000003,3.999956,4.000003,4.000006,4.000530,4.0,4.0,4.0,4.000011,4.000000,...,3.999643,3.999829,4.000322,4.000040,3.999989,3.998512,4.000563,3.998964,4.004204,3.998179


In [42]:
predicted_df.loc['116427980967433332299'].sort_values(ascending=False).head(10)

0x875287add55327ab:0x5594e747799c742c    4.007759
0x875289bef87fcb6b:0x9e865a4dadee3648    4.007476
0x8752f508e4890ee9:0x8c57d6417504663d    4.007254
0x87528a279f0dd025:0xc7f9f41c09effe40    4.007128
0x87528876d86f0e83:0x8dfef306e6b39b8f    4.005724
0x8752f50103ebf04d:0xf0e545531f11b120    4.004695
0x875280233a075a0f:0x60df99f574b09b2e    4.004304
0x875303b828f60367:0xf86b78c965d59f77    4.004283
0x875287d17c84d34f:0x7cd695da7be1d5c0    4.003840
0x875289e5b2fed5c3:0x2437aeeafcef8d24    4.003742
Name: 116427980967433332299, dtype: float64

In [44]:
# save the predictions

predicted_df.to_parquet('data/svd_preds.parquet', index=True)