In [3]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.feature_extraction.text import TfidfVectorizer

# Simple Content-Based Recommendation
## Trail Description Only

In [4]:
# Load the combined trail table.
# NOTE(review): absolute, user-specific path and a pickle input -- only load
# pickles from a trusted source; consider a configurable data directory.
trail_data = pd.read_pickle('/Users/briangraham/insight/trailrec/data/trail_info_combined.pickle')
# Missing descriptions become '' so the vectorizer only ever sees strings,
# then the frame gets a fresh index via reset_index().
trail_data = trail_data.assign(description=trail_data['description'].fillna('')).reset_index()
# Fit TF-IDF on the description text with English stop words removed.
tfidf = TfidfVectorizer(stop_words='english')
tfidf_matrix = tfidf.fit_transform(trail_data['description'])
# Displayed as (n_documents, n_terms).
tfidf_matrix.shape

(3935, 7255)

In [5]:
from sklearn.metrics.pairwise import linear_kernel
# TfidfVectorizer L2-normalises each row by default, so the plain dot product
# of the matrix with itself is already the pairwise cosine similarity.
# linear_kernel(X) uses Y = X when Y is omitted.
cosine_sim = linear_kernel(tfidf_matrix)

In [6]:
# Lookup table keyed by trail_id whose values are the row labels of trail_data;
# get_recommendations() uses it to find a trail's row in the similarity matrix.
indices = pd.Series(trail_data.index, index=trail_data['trail_id'])

In [7]:
def get_recommendations(trail_id, cosine_sim=cosine_sim, top_n=10):
    """Return the trail_ids of the trails most similar to ``trail_id``.

    Parameters
    ----------
    trail_id
        Key looked up in the module-level ``indices`` Series to find the
        trail's row position.
    cosine_sim
        Square pairwise-similarity matrix whose rows/columns follow the row
        order of ``trail_data``. Defaults to the matrix bound to
        ``cosine_sim`` when this cell was executed.
    top_n : int
        Number of recommendations to return (default 10).

    Returns
    -------
    pandas.Series
        ``trail_id`` values of the ``top_n`` most similar trails, most
        similar first, indexed by their row label in ``trail_data``.

    Raises
    ------
    KeyError
        If ``trail_id`` is not present in ``indices``.
    """
    idx = indices[trail_id]

    # Pairwise similarity scores of every trail with the query trail
    scores = np.asarray(cosine_sim[idx])

    # Descending order; the stable sort keeps row order among ties
    order = np.argsort(-scores, kind='stable')

    # Drop the query trail by position rather than assuming it sorts first:
    # with tied scores (duplicate or empty descriptions) slicing off element 0
    # could discard a different trail and recommend the query trail itself.
    order = order[order != idx][:top_n]

    return trail_data['trail_id'].iloc[order]

In [8]:
# Recommendations for one downhill trail using the TF-IDF similarity matrix
get_recommendations('1-87-dh', cosine_sim)

2099                memphis
877                    dcdh
1262    freight-train-upper
1015              dreadhead
218             barn-burner
2437             pale-rider
3278                   styx
3603        ultimate-frisby
2966           section-zero
2033                   mach
Name: trail_id, dtype: object

In [34]:
# Description of the top recommendation, for a manual sanity check
trail_data.loc[indices['memphis'], 'description']

'Another outstanding Prevost trail that likely played a big role in grooming Steve Smith towards the UCI World Cup DH championship. Steep, fast, twisty, with lots of challenging features multiple drops and booters, a significant  rock drop, lots of roots, off camber sections, and lots of opportunity for spontaneous line creativity. Also often used as an Island Cup DH race course.'

In [10]:
# Description of the query trail itself, to compare against its recommendations
trail_data.loc[indices['1-87-dh'], 'description']

'Sustained steep and fast DH trail often used as an Island Cup DH race course. Lots of booters, berms, straightaways, step-ups, step-downs, a steep treed section - a classic island trail.'

In [11]:
# Description of another recommended trail
trail_data.loc[indices['barn-burner'], 'description']

'This intermediate machine made trail has lots of berms with a step up, step down, and some steep sections. Be sure to pin it on the last berm which is over six feet tall.'

In [12]:
# Description of another recommended trail
trail_data.loc[indices['dcdh'], 'description']

'Is the islands only DH race in the island series.'

In [45]:
# Rank vocabulary terms by inverse document frequency, highest first.
# Note that a high IDF means the term is rare across descriptions.
# A dedicated name is used for the ordering: rebinding `indices` here would
# clobber the trail_id -> row lookup Series that get_recommendations() reads.
idf_order = np.argsort(tfidf.idf_)[::-1]
# get_feature_names() was removed in scikit-learn 1.2; use its replacement
# when available and fall back for older installations.
if hasattr(tfidf, 'get_feature_names_out'):
    features = list(tfidf.get_feature_names_out())
else:
    features = tfidf.get_feature_names()
top_n = 100
# The original read the undefined name `feature_names` here.
top_features = [features[i] for i in idf_order[:top_n]]
print(top_features)

['zoolander', 'met', 'merritt', 'bug', 'merely', 'mercy', 'merchant', 'mequinna', 'menu', 'mentum', 'mental', 'menhinick', 'menhinck', 'memory', 'buildings', 'memorable', 'member', 'melts', 'melt', 'builds', 'mellowed', 'buildup', 'megasaurus', 'megahurts', 'mega', 'meeting', 'bullet', 'bullets', 'mediator', 'medford', 'bump', 'merry', 'metalvery', 'remoteness', 'methanex', 'mixing', 'mixes', 'mixedamongst', 'browny', 'brsa100', 'brushy', 'bs', 'minus', 'btlc', 'ministry', 'btweaked', 'minimalist', 'mindset', 'mindful', 'mindbender', 'min', 'mimic', 'millstream', 'millipede', 'bubbling', 'milks', 'milking', 'milk', 'miles', 'buckethead', 'buddies', 'mic', 'mettle', 'metro', 'bumped', 'bumping', 'meander', 'bunnies', 'marley', 'markle', 'markingthis', 'marker', 'burmed', 'marine', 'marginally', 'marbley', 'marathon', 'mar', 'mapping', 'mapped', 'manyfall', 'manual', 'manpower', 'manouvering', 'manoeuvre', 'manning', 'manner', 'manmade', 'manhandler', 'mangle', 'maneuvers', 'manditory', 

# Model based on metadata only
## TODO: not edited yet — the cells below still vectorize the trail descriptions, not metadata

In [13]:
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics.pairwise import cosine_similarity

In [14]:
# Raw term counts (English stop words removed) instead of TF-IDF weights
count = CountVectorizer(stop_words='english')
count_matrix = count.fit_transform(trail_data['description'])
# cosine_similarity(X) compares X with itself when Y is omitted and
# normalises the rows, which plain counts need.
cosine_sim2 = cosine_similarity(count_matrix)

In [15]:
# Same query trail, scored with the count-vector similarity matrix
get_recommendations('1-87-dh', cosine_sim=cosine_sim2)

2099              memphis
218           barn-burner
2966         section-zero
2347           numbskulls
3603      ultimate-frisby
877                  dcdh
1018        drifter-41753
665               choices
541     byrne-creek-trail
3278                 styx
Name: trail_id, dtype: object

In [28]:
def top_mean_feats(Xtr, features, grp_ids=None, min_tfidf=0.1, top_n=25):
    ''' Return the top n features that on average are most important amongst documents in rows
        identified by indices in grp_ids.

        Xtr       -- document-term matrix supporting row indexing and .toarray()
        features  -- feature names, passed through to top_tfidf_feats
        grp_ids   -- row indices to restrict to; None or an empty collection
                     means "use every row"
        min_tfidf -- weights strictly below this value are zeroed before averaging
        top_n     -- passed through to top_tfidf_feats

        Returns whatever top_tfidf_feats(tfidf_means, features, top_n) returns. '''
    # An explicit None/empty test replaces `if grp_ids:`, which raised
    # ValueError for multi-element numpy arrays and treated a scalar 0 as
    # "all rows".
    use_all_rows = grp_ids is None or np.size(grp_ids) == 0
    if use_all_rows:
        D = Xtr.toarray()
    else:
        D = Xtr[grp_ids].toarray()

    # Suppress weak weights so they do not contribute to the column means
    D[D < min_tfidf] = 0
    tfidf_means = np.mean(D, axis=0)
    return top_tfidf_feats(tfidf_means, features, top_n)