In [89]:
import pickle
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import LatentDirichletAllocation
from sklearn.decomposition import TruncatedSVD
from sklearn.decomposition import NMF

Load Cleaned Data

In [10]:
# NOTE(security): pickle.load can execute arbitrary code while unpickling;
# only load pickle files that come from a trusted source.
# The context managers close each file handle as soon as its object is loaded.
with open('../src/cleaned_hike_desc.pickle', 'rb') as hikes_file:
    hikes_df = pickle.load(hikes_file)
# Index the hike descriptions by hike_id (plain rebind instead of inplace mutation).
hikes_df = hikes_df.set_index('hike_id')

with open('../src/cleaned_reviews.pickle', 'rb') as reviews_file:
    reviews = pickle.load(reviews_file)

In [11]:
# Index reviews by (hike_id, user_id). The guard keeps this cell safe to re-run:
# an unconditional set_index raises KeyError once the two columns have already
# been moved into the index.
review_index = ['hike_id', 'user_id']
if list(reviews.index.names) != review_index:
    reviews = reviews.set_index(review_index)
reviews.head()

Unnamed: 0_level_0,Unnamed: 1_level_0,user_desc,cleaned_reviews
hike_id,user_id,Unnamed: 2_level_1,Unnamed: 3_level_1
hike_1,steph-scott-2,Beautiful in fall. Went on a Tuesday late morn...,beautiful fall go tuesday late morning midfall...
hike_1,diana-pinho-3,went on a Tuesday AM around 9ish and didn't ex...,go tuesday around ish do not expect see many p...
hike_1,jason-pennacchio,"Hiked Oct 2020, hiked from parking lot #3. Arr...",hiked oct hike parking lot arrive lot full ext...
hike_1,lolita-dickson,We started with the blue trail and finished wi...,start blue trail finish red one slippery today...
hike_1,allie-jay-1,"Definitely start with the red trail , it’s ste...",definitely start red trail steep difficult get...


Set up Document-Term Matrices (TF-IDF for SVD/NMF, CountVectorizer for LDA)

## SVD

In [60]:
def display_topics(model, feature_names, no_top_words, topic_names=None):
    """Print the highest-weighted terms of every topic in a fitted model.

    Parameters
    ----------
    model : object
        Fitted model exposing ``components_``; each row is treated as one
        topic's weights over the vocabulary.
    feature_names : sequence of str
        Vocabulary terms, indexed by column position of ``components_``.
    no_top_words : int
        Number of terms to print per topic, highest weight first.
    topic_names : sequence of str, optional
        Labels indexed by topic number. A topic whose label is missing or
        empty (including topics past the end of a short list) gets the
        numeric ``Topic  <ix>`` header instead.
    """
    for ix, topic in enumerate(model.components_):
        # Bounds check: a topic_names list shorter than the number of topics
        # falls back to the numeric header instead of raising IndexError.
        label = topic_names[ix] if topic_names and ix < len(topic_names) else None
        if label:
            print("\nTopic: '", label, "'")
        else:
            print("\nTopic ", ix)
        # argsort is ascending, so take the last `no_top_words` indices in reverse.
        top_indices = topic.argsort()[:-no_top_words - 1:-1]
        print(", ".join([feature_names[i] for i in top_indices]))

Trail Descriptions

In [80]:
# TF-IDF document-term matrix for the cleaned trail descriptions
# (vocabulary capped at 2000 terms; terms must appear in at least 10 descriptions).
vectorizer = TfidfVectorizer(max_features=2000, min_df=10)
hikes_dtm = vectorizer.fit_transform(hikes_df['cleaned_descriptions'])

# LSA with 5 components. TruncatedSVD's default randomized solver is stochastic,
# so random_state is pinned to make the printed topics reproducible across re-runs.
lsa = TruncatedSVD(n_components=5, random_state=42)
hikes_topic = lsa.fit_transform(hikes_dtm)

# NOTE(review): get_feature_names() was deprecated in scikit-learn 1.0 and removed
# in 1.2; switch to get_feature_names_out() when the environment is upgraded.
display_topics(lsa, vectorizer.get_feature_names(), 10)


Topic  0
trail, use, loop, also, able, dog, mile, near, locate, traffic

Topic  1
trip, nature, primarily, hiking, walk, bird, watch, use, run, set

Topic  2
rate, moderate, well, october, wild, flower, beautiful, use, features, difficult

Topic  3
chance, see, wildlife, features, lake, offer, rate, moderate, run, loop

Topic  4
wildlife, see, chance, offer, wild, flower, beautiful, bird, watch, good


*The terms "trail" and "also" show up in multiple topics. Consider adding them to the stop words.*
### Topic breakdown:
* Topic 0: Things the Trail offers
* Topic 1: Not much?
* Topic 2/3: Describe the trail features

In [68]:
# Descriptions: NMF with 20 topics fitted on `hikes_dtm`.
# random_state fixes any randomness in NMF's initialisation so that re-running
# the cell yields the same topics.
nmf_model = NMF(n_components=20, random_state=42)
trail_desc = nmf_model.fit_transform(hikes_dtm)
display_topics(nmf_model, vectorizer.get_feature_names(), 10)


Topic  0
trail, locate, near, mile, virginia, running, pennsylvania, appalachian, west, valley

Topic  1
trail, offer, number, option, activity, traffic, mile, locate, near, lightly

Topic  2
good, skill, level, feature, near, locate, mile, walk, trail, carolina

Topic  3
loop, trail, feature, locate, near, mile, waterfall, massachusetts, run, connecticut

Topic  4
leash, kept, must, dog, also, able, use, trail, virginia, pennsylvania

Topic  5
april, use, well, september, traffic, october, november, bird, heavily, watch

Topic  6
wild, flower, beautiful, feature, near, locate, mile, colorado, mountain, virginia

Topic  7
offer, wildlife, see, chance, near, locate, mile, lightly, back, virginia

Topic  8
forest, great, set, feature, near, locate, mile, virginia, massachusetts, walk

Topic  9
nature, trip, use, primarily, hiking, walk, bird, traffic, watch, mile

Topic  10
accessible, yearround, heavily, traffic, carolina, tennessee, feature, north, near, locate

Topic  11
rate, modera

Trail Reviews

In [81]:
# TF-IDF document-term matrix for the cleaned review text. Compared with the
# description vectorizer, max_df=0.4 also drops terms found in more than 40%
# of the reviews.
vectorizer = TfidfVectorizer(
    max_df=0.4,
    min_df=10,
    max_features=2000,
)
reviews_dtm = vectorizer.fit_transform(reviews['cleaned_reviews'])

In [88]:
# LSA with 5 components on the review matrix. random_state pins the stochastic
# randomized SVD solver so the printed topics are reproducible.
lsa = TruncatedSVD(n_components=5, random_state=42)
reviews_topics = lsa.fit_transform(reviews_dtm)
display_topics(lsa, vectorizer.get_feature_names(), 10)


Topic  0
great, view, not, beautiful, nice, easy, get, go, lot, good

Topic  1
great, view, beautiful, kid, easy, family, awesome, short, amazing, workout

Topic  2
nice, beautiful, easy, view, walk, short, waterfall, fall, scenery, quick

Topic  3
beautiful, view, worth, amazing, fall, love, absolutely, top, waterfall, lake

Topic  4
view, top, amazing, awesome, worth, good, steep, nice, climb, summit


TODO: Add "hikes" and "trail" to the stop words.

In [87]:
# Reviews: NMF with 5 topics fitted on `reviews_dtm`.
# random_state fixes any randomness in NMF's initialisation so that re-running
# the cell yields the same topics.
# NOTE(review): the name `trail_desc` is kept for compatibility, but here it holds
# the review topic weights and overwrites the value assigned in the
# trail-description NMF cell.
nmf_model = NMF(n_components=5, random_state=42)
trail_desc = nmf_model.fit_transform(reviews_dtm)

In [86]:
# Print the ten highest-weighted terms for each topic in `nmf_model`.
display_topics(
    nmf_model,
    feature_names=vectorizer.get_feature_names(),
    no_top_words=10,
)


Topic  0
not, go, get, lot, pron, mile, back, way, take, see

Topic  1
great, kid, view, dog, family, little, day, place, love, workout

Topic  2
nice, walk, little, short, really, wood, view, dog, place, loop

Topic  3
beautiful, fall, waterfall, love, scenery, lake, day, absolutely, view, place

Topic  4
view, top, amazing, awesome, worth, good, steep, short, climb, summit

Topic  5
easy, walk, pretty, kid, fun, follow, short, dog, well, family


* Topic 0: Not much
* Topic 1 & 2: Family Friendly
* Topic 3: Categorizes easier hikes
* Topic 4: Harder hikes with a good payoff

In [90]:
# LDA on raw term counts of the trail descriptions.
# NOTE(review): this rebinds `vectorizer` and `hikes_dtm`, replacing the TF-IDF
# objects created earlier in the notebook; earlier cells re-run after this one
# will see the count-based objects instead.
vectorizer = CountVectorizer(max_features=2000, min_df=10)
hikes_dtm = vectorizer.fit_transform(hikes_df['cleaned_descriptions'])

# random_state makes the stochastic LDA fit reproducible across re-runs.
lda_hikes = LatentDirichletAllocation(n_components=2, random_state=42)
lda_hikes.fit(hikes_dtm)

LatentDirichletAllocation(n_components=2)

In [92]:
import pyLDAvis
import pyLDAvis.sklearn

# Enable inline rendering, then build the interactive pyLDAvis view from the
# fitted LDA model, its document-term matrix and the vectorizer that produced it.
pyLDAvis.enable_notebook()
pyLDAvis.sklearn.prepare(
    lda_model=lda_hikes,
    dtm=hikes_dtm,
    vectorizer=vectorizer,
)

  """
