In [9]:
import pickle
import os
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import surprise
import psycopg2
import matplotlib.pyplot as plt
from surprise import Reader,NormalPredictor,accuracy,SVD,Dataset,evaluate,CoClustering,NMF,KNNBasic
from surprise.model_selection import cross_validate,train_test_split,KFold, GridSearchCV
from sklearn.preprocessing import MinMaxScaler,StandardScaler
from surprise import dump

In [3]:
# Query data from SQL database
conn = psycopg2.connect("host=localhost dbname=trailrec user=briangraham")
cur = conn.cursor()
sql_query = """
SELECT trail_id,ride_date,rider_name,rider_url FROM ridelogs;
"""
df = pd.read_sql_query(sql_query,conn)

In [5]:
# get total ride counts
rider_name_list = df['rider_name'].unique()
ridecounts = df.groupby('rider_name')['trail_id'].count()
df_rider_total = pd.DataFrame({'ride_counts':ridecounts})

# get total ride counts by trail
df_rider_trail_counts_group = df.groupby(['rider_name','trail_id'])
df_rider_trail_counts = df_rider_trail_counts_group.count()
df_rider_trail_counts = df_rider_trail_counts[['ride_date']]

# Combine and get rider ratings
df_combined = df_rider_trail_counts.join(df_rider_total,on='rider_name')
df_combined.columns = ['nrides', 'total_rides']

df_combined['ride_fraction'] = df_combined['nrides']/df_combined['total_rides']
df_combined['log_ride_fraction'] = (np.log(df_combined['ride_fraction']))
scaler = MinMaxScaler(feature_range=(1, 5), copy=True)
scaler.fit(df_combined[['log_ride_fraction']])
df_combined['scaled_log_ride_fraction'] = scaler.transform(df_combined[['log_ride_fraction']])

In [7]:
# prepare data 
df_sub = df_combined
df_formatted = df_sub.reset_index()
df_formatted.head()

Unnamed: 0,rider_name,trail_id,nrides,total_rides,ride_fraction,log_ride_fraction,scaled_log_ride_fraction
0,0-austintatious-0,aline-lower,3,37,0.081081,-2.512306,3.830262
1,0-austintatious-0,central-scrutinizer,1,37,0.027027,-3.610918,3.318744
2,0-austintatious-0,comfortably-numb-foreplay-descent,1,37,0.027027,-3.610918,3.318744
3,0-austintatious-0,crank-it-up-lower,1,37,0.027027,-3.610918,3.318744
4,0-austintatious-0,crank-it-up-upper,1,37,0.027027,-3.610918,3.318744


In [8]:
# A reader is still needed but only the rating_scale param is requiered.
reader = Reader(rating_scale=(1, 5))
# The columns must correspond to user id, item id and ratings (in that order).
data = Dataset.load_from_df(df_formatted[['rider_name','trail_id','scaled_log_ride_fraction']], reader)

In [10]:
param_grid = {'n_epochs': [5, 10], 'lr_all': [0.002, 0.005],
              'reg_all': [0.4, 0.6]}

In [21]:
%time
gs = GridSearchCV(SVD, param_grid, measures=['rmse', 'mae'], cv=5)

gs.fit(data)

# best RMSE score
print(gs.best_score['rmse'])

# combination of parameters that gave the best RMSE score
print(gs.best_params['rmse'])

CPU times: user 4 µs, sys: 1 µs, total: 5 µs
Wall time: 10 µs
0.481898301678084
{'n_epochs': 10, 'lr_all': 0.005, 'reg_all': 0.4}


In [25]:
param_grid2 = {'n_epochs': [5,20], 'lr_all': [0.002, 0.010],
              'reg_all': [0.2, 0.6]}

In [28]:
gs2 = GridSearchCV(SVD, param_grid, measures=['rmse', 'mae'], cv=3)

gs2.fit(data)

# best RMSE score
print(gs2.best_score['rmse'])

# combination of parameters that gave the best RMSE score
print(gs2.best_params['rmse'])
%time

0.4862330952282517
{'n_epochs': 10, 'lr_all': 0.005, 'reg_all': 0.4}
CPU times: user 4 µs, sys: 1 µs, total: 5 µs
Wall time: 10 µs


In [None]:
# Same output when extending grid search.  Keep original algo

In [31]:
algo = gs.best_estimator['rmse']
trainset = data.build_full_trainset()
algo.fit(trainset)

<surprise.prediction_algorithms.matrix_factorization.SVD at 0x11b232518>

In [33]:
from collections import defaultdict

from surprise import SVD
from surprise import Dataset


def get_top_n(predictions, n=10):
    '''Return the top-N recommendation for each user from a set of predictions.

    Args:
        predictions(list of Prediction objects): The list of predictions, as
            returned by the test method of an algorithm.
        n(int): The number of recommendation to output for each user. Default
            is 10.

    Returns:
    A dict where keys are user (raw) ids and values are lists of tuples:
        [(raw item id, rating estimation), ...] of size n.
    '''

    # First map the predictions to each user.
    top_n = defaultdict(list)
    for uid, iid, true_r, est, _ in predictions:
        top_n[uid].append((iid, est))

    # Then sort the predictions for each user and retrieve the k highest ones.
    for uid, user_ratings in top_n.items():
        user_ratings.sort(key=lambda x: x[1], reverse=True)
        top_n[uid] = user_ratings[:n]

    return top_n

In [36]:
# Than predict ratings for all pairs (u, i) that are NOT in the training set.
# This takes a while...
testset = trainset.build_anti_testset()
predictions = algo.test(testset)

In [77]:
df = pd.DataFrame(predictions)

In [83]:
df.shape

(26819118, 5)

In [86]:
df.head(n=1)

Unnamed: 0,uid,iid,r_ui,est,details
0,0-austintatious-0,fool-s-gold-9921,2.78538,3.095925,{'was_impossible': False}


In [88]:
df.dtypes

uid         object
iid         object
r_ui       float64
est        float64
details     object
dtype: object

In [105]:
gl_float = df.select_dtypes(include=['float'])
df_mem_opt = gl_float.apply(pd.to_numeric,downcast='float')

In [108]:
df_mem_opt['iid'] = df['iid'].astype('category')
df_mem_opt['uid'] = df['uid'].astype('category')

In [91]:
def mem_usage(pandas_obj):
    if isinstance(pandas_obj,pd.DataFrame):
        usage_b = pandas_obj.memory_usage(deep=True).sum()
    else: # we assume if not a df it's a series
        usage_b = pandas_obj.memory_usage(deep=True)
    usage_mb = usage_b / 1024 ** 2 # convert bytes to megabytes
    return "{:03.2f} MB".format(usage_mb)

In [109]:
mem_usage(df_mem_opt)

'308.12 MB'

In [111]:
df_mem_opt = df_mem_opt[['uid','iid','est','r_ui']]

In [113]:
df_mem_opt.to_pickle('all_rider_trail_precitions.pickle')

In [120]:
df_mem_opt['est'].pdf()

AttributeError: 'Series' object has no attribute 'pdf'

In [37]:
top_n = get_top_n(predictions, n=10)

# Print the recommended items for each user
for uid, user_ratings in top_n.items():
    print(uid, [iid for (iid, _) in user_ratings])

KeyboardInterrupt: 