In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import surprise
import psycopg2

# Get proxy ratings for trails based on ridelogs

In [2]:
# Query data from SQL database
# NOTE(review): the connection string is hard-coded; consider moving it to a
# config cell or environment variable before sharing the notebook.
sql_query = """
SELECT trail_id,ride_date,rider_name,rider_url FROM ridelogs;
"""
conn = psycopg2.connect("host=localhost dbname=trailrec user=briangraham")
try:
    # pandas manages its own cursor for the query, so no explicit cursor is
    # opened here.
    df = pd.read_sql_query(sql_query, conn)
finally:
    # The ride logs now live in `df`; release the database connection instead
    # of leaving it open for the lifetime of the kernel.
    conn.close()

In [3]:
# Preview the first rows of the ride-log frame returned by the query.
df.head()

Unnamed: 0,trail_id,ride_date,rider_name,rider_url
0,1-87-dh,Jul 5,BCpov,https://www.trailforks.com/profile/BCpov/
1,1-87-dh,Jul 5,canadaka,https://www.trailforks.com/profile/canadaka/
2,1-87-dh,May 27,campasternak,https://www.trailforks.com/profile/campasternak/
3,1-87-dh,May 21,campasternak,https://www.trailforks.com/profile/campasternak/
4,1-87-dh,May 20,superlightracer,https://www.trailforks.com/profile/superlightr...


In [4]:
# Distinct rider names present in the ride logs (first step towards one
# effective rating per trail per unique rider).
rider_name_list = df.loc[:, 'rider_name'].unique()

In [5]:
# Total number of logged rides per rider (non-null trail_id entries).
rides_grouped_by_rider = df.groupby('rider_name')
ridecounts = rides_grouped_by_rider['trail_id'].count()

In [6]:
# Wrap the per-rider totals in a one-column frame so they can be joined later.
df_rider_total = pd.DataFrame(dict(ride_counts=ridecounts))
df_rider_total.head()

Unnamed: 0_level_0,ride_counts
rider_name,Unnamed: 1_level_1
0-austintatious-0,37
007123,12
00dinder,14
06gmc,2
0ut5ide,45


In [7]:
# Count how many times each rider logged each trail. Every column holds the
# per-group non-null count, so only 'ride_date' is kept as the ride counter.
df_rider_trail_counts_group = df.groupby(by=['rider_name', 'trail_id'])
df_rider_trail_counts = df_rider_trail_counts_group.count()[['ride_date']]

In [8]:
# Preview the per-(rider, trail) ride counts; the index is (rider_name, trail_id).
df_rider_trail_counts.head()

Unnamed: 0_level_0,Unnamed: 1_level_0,ride_date
rider_name,trail_id,Unnamed: 2_level_1
0-austintatious-0,aline-lower,3
0-austintatious-0,central-scrutinizer,1
0-austintatious-0,comfortably-numb-foreplay-descent,1
0-austintatious-0,crank-it-up-lower,1
0-austintatious-0,crank-it-up-upper,1


In [106]:
# Attach each rider's overall ride total to their per-trail counts.
df_combined = df_rider_trail_counts.join(other=df_rider_total, on='rider_name')
# Columns arrive as (ride_date, ride_counts); give them descriptive names.
df_combined.columns = ['nrides', 'total_rides']

# Proxy rating: the share of a rider's logged rides spent on this trail.
ride_share = df_combined['nrides'] / df_combined['total_rides']
df_combined['rider_rating'] = ride_share
df_combined

Unnamed: 0_level_0,Unnamed: 1_level_0,nrides,total_rides,rider_rating
rider_name,trail_id,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
0-austintatious-0,aline-lower,3,37,0.081081
0-austintatious-0,central-scrutinizer,1,37,0.027027
0-austintatious-0,comfortably-numb-foreplay-descent,1,37,0.027027
0-austintatious-0,crank-it-up-lower,1,37,0.027027
0-austintatious-0,crank-it-up-upper,1,37,0.027027
0-austintatious-0,dinah-moe-humm,1,37,0.027027
0-austintatious-0,drop-in-clinic,1,37,0.027027
0-austintatious-0,duffman-lower,1,37,0.027027
0-austintatious-0,duffman-upper,1,37,0.027027
0-austintatious-0,easy-does-it-lower,3,37,0.081081


# Modeling

In [82]:
# prepare data
# Optionally restrict to a subset of users: here, riders with more than 100
# logged rides in total.
is_frequent_rider = df_combined['total_rides'] > 100
df_sub = df_combined[is_frequent_rider]

# Flatten the (rider_name, trail_id) index into columns and keep the
# user / item / rating triple.
rating_triple = ['rider_name', 'trail_id', 'rider_rating']
df_formatted = df_sub.reset_index()[rating_triple]
df_formatted.shape

(253360, 3)

In [99]:
# prepare data
# Riders must have logged more than this many rides in total to be kept
# (option to take a subset of users).
MIN_TOTAL_RIDES = 500
# Per-trail ride counts above this value are treated as equally strong signal.
MAX_NRIDES = 5

df_sub = df_combined[df_combined['total_rides'] > MIN_TOTAL_RIDES]
# .copy() makes df_formatted an independent frame so the column assignment
# below never targets a view of df_combined.
df_formatted = df_sub.reset_index()[['rider_name','trail_id','nrides']].copy()
# Cap the ride counts. Assigning the clipped Series back through the frame
# replaces the chained form `df_formatted.nrides[mask] = 5`, which triggers
# SettingWithCopyWarning and silently does nothing under pandas Copy-on-Write.
df_formatted['nrides'] = df_formatted['nrides'].clip(upper=MAX_NRIDES)
df_formatted.head()

Unnamed: 0,rider_name,trail_id,nrides
0,AFM73,50-shades-of-green-part-2,1
1,AFM73,backdoor-0,5
2,AFM73,bear-back,5
3,AFM73,bear-buns-48564,2
4,AFM73,bear-ridge-31224,5


In [84]:
from surprise import NormalPredictor
from surprise import Dataset
from surprise import Reader
from surprise.model_selection import cross_validate
from sklearn.preprocessing import MinMaxScaler

In [100]:
# Rescale the ride counts linearly onto the 1-5 rating range.
nrides_column = df_formatted[['nrides']]
scaler = MinMaxScaler(feature_range=(1, 5), copy=True).fit(nrides_column)
df_formatted['nrides'] = scaler.transform(nrides_column)
df_formatted.head()

Unnamed: 0,rider_name,trail_id,nrides
0,AFM73,50-shades-of-green-part-2,1.0
1,AFM73,backdoor-0,5.0
2,AFM73,bear-back,5.0
3,AFM73,bear-buns-48564,2.0
4,AFM73,bear-ridge-31224,5.0


In [102]:
# Loading from a DataFrame still needs a Reader, but only its rating_scale
# parameter is required.
reader = Reader(rating_scale=(1, 5))

# Column order matters: user id, item id, rating.
rating_columns = ['rider_name', 'trail_id', 'nrides']
data = Dataset.load_from_df(df_formatted[rating_columns], reader)

In [105]:
from surprise import SVD
from surprise import accuracy
from surprise.model_selection import train_test_split

# Fixed seed so the random split, the SVD initialisation and therefore the
# reported error are reproducible after a kernel restart.
RANDOM_SEED = 42

# sample random trainset and testset
# test set is made of 25% of the ratings.
trainset, testset = train_test_split(data, test_size=.25, random_state=RANDOM_SEED)

# We'll use the famous SVD algorithm.
algo = SVD(random_state=RANDOM_SEED)

# Train the algorithm on the trainset, and predict ratings for the testset
algo.fit(trainset)
predictions = algo.test(testset)

# Then compute RMSE
accuracy.rmse(predictions)

RMSE: 1.1897


1.1897395888190727

In [109]:
# Mean absolute error of the same predictions; the call prints the score and
# also returns it, which is why the value appears twice in the output.
accuracy.mae(predictions)

MAE:  0.9392


0.9392142544697032

In [90]:
from collections import defaultdict
def get_top_n(predictions, n=10):
    '''Pick, for every user, the n predictions with the highest estimates.

    Args:
        predictions(iterable of 5-item records): Each record unpacks as
            (user id, item id, true rating, estimated rating, details), as
            produced by the test method of an algorithm.
        n(int): How many items to keep per user. Default is 10.

    Returns:
    A defaultdict(list) keyed by (raw) user id whose values are lists of
        (raw item id, rating estimation) tuples, best estimate first, holding
        at most n entries each.
    '''

    # Bucket every (item, estimate) pair under the user it was predicted for.
    per_user = defaultdict(list)
    for user, item, _actual, estimate, _details in predictions:
        per_user[user].append((item, estimate))

    # Rank each bucket by estimate, highest first, and keep the first n pairs.
    for user in list(per_user):
        ranked = sorted(per_user[user], key=lambda pair: pair[1], reverse=True)
        per_user[user] = ranked[:n]

    return per_user

In [91]:
# Keep, per rider, the 10 trails with the highest estimated rating.
# NOTE(review): as written in this notebook, `predictions` is produced by
# algo.test(testset), so these appear to be trails the rider has already
# logged -- confirm whether unseen trails were intended instead.
top_n = get_top_n(predictions, n=10)

In [93]:
# Print the recommended items for each user
for uid, user_ratings in top_n.items():
    # The bare tuple expression used before was evaluated and discarded inside
    # the loop, so nothing was shown; print the rider and their trail ids.
    print(uid, [iid for (iid, _) in user_ratings])

Mountainrider59 ['huff--puff', 'premature-excavation', 'mercy-me-1383', 'new-severed-entrance', 'take-it-easy', 'femur', 'quadzilla-lower-28151', 'upper-john-deer', 'angry-midget', 'barking-dog']
jaclynj ['shorn-scrotum', 'penny-lane', 'cardiac-bypass', 'pingu', '7th-secret', 'ak89', 'skid-rd-connector', 'ramble-on-28051', 'd-yer-mak-er', 'mushroom-trail']
jrshields22 ['upper-gearjammer', 'expresso', 'upper-lower-gearjammer', 'uppercut', 'new-severed-entrance', 'big-trouble-little-chainring', '50-shades-of-green-part-2', 'high-school-league', 'pingu', 'children-s-adventure-playground']
GrumpyKat ['mary-jane', 'spanky-and-our-gang', 'nookie', 'corona', 'rusty-nail-34315', 'black-fly', 'hayden', 'black-hole-49335', 'purgatory', 'home-run-1492']
KeithLH ['2cents', 'merv-s-2752', 'shed-bike-trail', 'lost-oakly', 'upper-da-plow', 'big-trouble-little-chainring', 'valley-trail-day-lots-to-upper-village', 'wasp', 'family-cross', 'squid-line']
mtbkid13 ['arizona', 'skyline-to-ov', 'skyline-3747

alexcapon ['selkirk-slacker', 'connector-76889', 'trial--error', 'kobe-s', 'branch-7-fsr-climb', 'peaches-en-regalia', 'toads-short-forest', 'scallimag', 'zoot-allures', 'antler']
diegogarciamtb ['crank-it-up-upper', 'lower-expresso', 'drop-in-clinic', 'karate-monkey', 'schleyer', 'water-towers-connector', 'bline-upper', 'braemar-trail', 'floppy-bunny', 'micro-climate-access']
northvanct ['for-the-kids', 'water-towers-connector', 'skid-rd-connector', 'executioner-connector', 'ramble-on-28051', 'empress-bypass', 'hydraulic-connector', 'family-cross', 'upper-dale-s-trail', 'leave-of-absence']
WhatAboutBob ['miner-29-er', 'bearacouga-37465', 'galloping', 'west-vista', 'fiori', 'bighorn', 'mesa', 'big-easy', 'miller-time-adaptive-loop', 'campbell-dh']
AGriez ['starz-line-2', 'upper-griffen', 'lower-snake', 'crinkum-crankum', 'ramble-on-28051', 'braemar-trail', 'upper-gearjammer', 'shorn-scrotum', 'academy-climb', 'upper-john-deer']
sitobello ['griffen-uphill', 'bridle-path-110901', 'lower-

ryan-97 ['new-severed-entrance', 'pseudotsuga-part-3', 'griffen-uphill', 'pingu', 'floppy-bunny', 'upper-john-deer', 'skid-rd-connector', 'pangor', 'tazer-s-alternate-25815', 'pile-rocks']
Vingedon ['cemetery-bypass', 'kiddie-up', 'rtrail', 'fairy-creek-bridge-trail', 'broken-derailleur', 'red-sonja', 'kush-36194', 'snoop-lion', 'visitor-centre-trail', 'tr3']
VMoney ['femur', 'mongo-945', 'spotlem', '7th-secret', 'the-way-of-the-noodle', 'duck-farm-loop', 'kerry-d-over']
AdeMiller ['world-cup-single-track', 'karate-monkey', 'devils-club', 'no-joke-upper', 'blueseum', 'blue-velvet-lower', 'freight-train--no-joke', 'blueberry-bathtub', 'in-deep', 'a-cut-above']
Lescrawley ['dog-on-it-33399', 'i-m-not-satisfied-6729', 'dusty-beaver', 'mikes', 'boot-chute', 'pseudotsuga-part-1', 'tower-classic', '50-shades-of-green-part-2', 'stump-town', 'whiskey-business']
KalumI ['sidewinder-7076', 'bobsled', 'shorn-scrotum', 'academy-climb', 'new-severed-entrance', 'kirkford', 'john-deer', 'ambidextrous

Dan1985 ['tracks-from-hell', 'lower-flume-25695', 'angry-midget', 'sidewinder-1', 'lower-expresso', 'micro-climate-access', 'cardiac-bypass', 'heart-darkness', 'out-of-the-dust', 'r--r-climb']
drugo ['hornet', 'world-cup-single-track', 'schleyer', 'blueseum', 'angry-pirate-upper', 'expresso', 'little-alder', 'packard-goose', 'upper-oilcan', 'tracks-from-hell']
ArturJKelowna ['lower-apex-trail-access', 'clifton-ridgeline', 'balsom-root-bluff-trail', 'lookout-access', 'shale-trail', 'bacon-by-ndit', 'bear-bones', 'rsvp', 'holy-pail-up-to-soultrain-kid-dino-myt', 'rocky-screech']
mschlender ['penny-lane', 'new-severed-entrance', '50-shades-of-green-part-2', 'upper-john-deer', 'kirkford', 'tracks-from-hell', 'pseudotsuga-part-1', 'high-school-league', 'pseudotsuga-part-3', 'pseudotsuga-part-2']
ryan788h ['crank-it-up-upper', 'aline-lower', 'pulp-fiction', 'bline-upper', 'duffman-upper', 'bline-lower', 'crabapple-turns', 'freight-train--no-joke', 'beach-cut', 'i-m-not-satisfied-6729']
basti

In [110]:
from surprise.model_selection import KFold
from collections import defaultdict
def precision_recall_at_k(predictions, k=10, threshold=3.5):
    '''Return precision and recall at k metrics for each user.

    Args:
        predictions(iterable of 5-item records): Each record unpacks as
            (user id, item id, true rating, estimated rating, details), as
            returned by the test method of an algorithm.
        k(int): Number of top-estimated items considered per user. Default
            is 10.
        threshold(float): A true rating >= threshold marks an item as
            relevant; an estimate >= threshold marks it as recommended.
            Default is 3.5.

    Returns:
        A (precisions, recalls) tuple of dicts keyed by (raw) user id.
        A metric that is undefined for a user (nothing recommended in the
        top k, or no relevant item at all) is reported as 0.
    '''

    # First map the predictions to each user.
    user_est_true = defaultdict(list)
    for uid, _, true_r, est, _ in predictions:
        user_est_true[uid].append((est, true_r))

    precisions = dict()
    recalls = dict()
    for uid, user_ratings in user_est_true.items():

        # Sort user ratings by estimated value, best first.
        user_ratings.sort(key=lambda x: x[0], reverse=True)
        top_k = user_ratings[:k]

        # Number of relevant items
        n_rel = sum((true_r >= threshold) for (_, true_r) in user_ratings)

        # Number of recommended items in top k
        n_rec_k = sum((est >= threshold) for (est, _) in top_k)

        # Number of relevant and recommended items in top k
        n_rel_and_rec_k = sum(((true_r >= threshold) and (est >= threshold))
                              for (est, true_r) in top_k)

        # Precision@K: Proportion of recommended items that are relevant.
        # Undefined when nothing is recommended; scoring that case as 1 would
        # reward users the model recommends nothing to, so it is set to 0.
        precisions[uid] = n_rel_and_rec_k / n_rec_k if n_rec_k != 0 else 0

        # Recall@K: Proportion of relevant items that are recommended.
        # Undefined when the user has no relevant item; likewise set to 0.
        recalls[uid] = n_rel_and_rec_k / n_rel if n_rel != 0 else 0

    return precisions, recalls

In [None]:
from surprise import Dataset

# Cross-validated precision@k / recall@k for the trail ratings.
# The evaluation runs on `data`, the dataset built from the ride logs, instead
# of loading the built-in MovieLens 'ml-100k' sample: that sample says nothing
# about the trail model and replaced the trail `data` object.
# Seeds are fixed so the folds and the SVD fits are reproducible.
kf = KFold(n_splits=5, random_state=42)
algo = SVD(random_state=42)

for trainset, testset in kf.split(data):
    algo.fit(trainset)
    predictions = algo.test(testset)
    precisions, recalls = precision_recall_at_k(predictions, k=5, threshold=4)

    # Precision and recall can then be averaged over all users
    # (first line printed per fold: mean precision; second: mean recall).
    print(sum(prec for prec in precisions.values()) / len(precisions))
    print(sum(rec for rec in recalls.values()) / len(recalls))