In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.feature_extraction.text import TfidfVectorizer

# Simple Content-Based Recommendation
## Trail Description Only

In [31]:
# Load the pickled trail table; reset_index() moves the stored index into a
# column and leaves a fresh default integer index on the frame.
# NOTE(review): absolute, machine-specific path, and pickle should only be
# read from trusted files.
trail_data = pd.read_pickle(
    '/Users/briangraham/insight/trailrec/data/trail_info_combined.pickle'
).reset_index()
# Missing descriptions become '' so the vectorizer only ever sees strings.
trail_data['description'] = trail_data['description'].fillna('')
# TF-IDF weights over the description text, English stop words removed.
tfidf = TfidfVectorizer(stop_words='english')
tfidf_matrix = tfidf.fit_transform(trail_data['description'])
# (n_trails, n_terms)
tfidf_matrix.shape

(3935, 7255)

In [32]:
from sklearn.metrics.pairwise import linear_kernel
# TfidfVectorizer L2-normalises every row by default, so the plain dot product
# between two rows is already their cosine similarity.
# With Y omitted, linear_kernel compares the matrix against itself.
cosine_sim = linear_kernel(tfidf_matrix)

In [38]:
# Reverse lookup: trail_id -> index label of that trail's row in trail_data.
indices = pd.Series(
    data=trail_data.index,
    index=trail_data['trail_id'],
)

In [45]:
def get_recommendations(trail_id, cosine_sim=cosine_sim, top_n=10):
    """Return the trails most similar to ``trail_id``.

    Parameters
    ----------
    trail_id : str
        Identifier looked up in the notebook-level ``indices`` Series.
    cosine_sim : array-like, optional
        Pairwise similarity matrix; row ``i`` is read as the similarities of
        every trail to the trail at position ``i`` of ``trail_data``.
        Defaults to the ``cosine_sim`` object that existed when this
        function was defined.
    top_n : int, optional
        Number of recommendations to return (default 10).

    Returns
    -------
    pandas.Series
        ``trail_id`` values of the ``top_n`` most similar trails, most
        similar first, carrying their index labels from ``trail_data``.
        The query trail itself is never included.

    Raises
    ------
    KeyError
        If ``trail_id`` is not present in ``indices``.
    ValueError
        If ``top_n`` is negative.
    """
    if top_n < 0:
        raise ValueError("top_n must be non-negative")

    idx = indices[trail_id]
    # A trail_id that occurs more than once yields a Series of positions;
    # use the first occurrence so that exactly one row is selected.
    if isinstance(idx, pd.Series):
        idx = idx.iloc[0]

    # Pairwise similarity scores of all trails with that trail
    scores = np.asarray(cosine_sim[idx])

    # Rank from most to least similar. A stable sort keeps tied scores in
    # ascending row order, matching sorted(..., reverse=True) on (row, score).
    ranked = np.argsort(-scores, kind='stable')

    # Remove the query trail explicitly instead of assuming it ranks first:
    # with tied scores (duplicate descriptions, or an all-zero row from an
    # empty description) another trail can sort ahead of it, and slicing off
    # position 0 would then drop a real match and return the query itself.
    ranked = ranked[ranked != idx]

    return trail_data['trail_id'].iloc[ranked[:top_n]]

In [46]:
# Recommendations for one trail using the TF-IDF similarity matrix.
get_recommendations(trail_id='1-87-dh', cosine_sim=cosine_sim)

2099                memphis
877                    dcdh
1262    freight-train-upper
1015              dreadhead
218             barn-burner
2437             pale-rider
3278                   styx
3603        ultimate-frisby
2966           section-zero
2033                   mach
Name: trail_id, dtype: object

In [139]:
# Description text of 'memphis', to compare by eye with the query trail.
trail_data.loc[indices['memphis'], 'description']

'Another outstanding Prevost trail that likely played a big role in grooming Steve Smith towards the UCI World Cup DH championship. Steep, fast, twisty, with lots of challenging features multiple drops and booters, a significant  rock drop, lots of roots, off camber sections, and lots of opportunity for spontaneous line creativity. Also often used as an Island Cup DH race course.'

In [140]:
# Description text of the query trail '1-87-dh' itself.
trail_data.loc[indices['1-87-dh'], 'description']

'Sustained steep and fast DH trail often used as an Island Cup DH race course. Lots of booters, berms, straightaways, step-ups, step-downs, a steep treed section - a classic island trail.'

In [141]:
# Description text of 'barn-burner', to compare by eye with the query trail.
trail_data.loc[indices['barn-burner'], 'description']

'This intermediate machine made trail has lots of berms with a step up, step down, and some steep sections. Be sure to pin it on the last berm which is over six feet tall.'

In [142]:
# Description text of 'dcdh', to compare by eye with the query trail.
trail_data.loc[indices['dcdh'], 'description']

'Is the islands only DH race in the island series.'

# Model based on metadata only

In [146]:
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics.pairwise import cosine_similarity

In [147]:
# Raw term counts (English stop words removed) instead of TF-IDF weights.
# NOTE(review): the section heading says "metadata only", yet the text being
# vectorized here is still the description column - confirm the intended input.
count = CountVectorizer(stop_words='english')
count_matrix = count.fit_transform(trail_data['description'])
# Count vectors are not unit length, so cosine_similarity (which normalises
# its inputs) is used; with Y omitted the matrix is compared against itself.
cosine_sim2 = cosine_similarity(count_matrix)

In [150]:
# Same query trail, ranked with the count-vector similarity matrix.
get_recommendations(trail_id='1-87-dh', cosine_sim=cosine_sim2)

2099              memphis
218           barn-burner
2966         section-zero
2347           numbskulls
3603      ultimate-frisby
877                  dcdh
1018        drifter-41753
665               choices
541     byrne-creek-trail
3278                 styx
Name: trail_id, dtype: object