# Trail Reviews Visualization Notebook 
Author: Andrew Auyeung. 
This notebook contains the visualizations produced after the trail reviews have been cleaned and run through some form of topic modeling.

In [None]:
import pickle
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import LatentDirichletAllocation
from sklearn.decomposition import TruncatedSVD
from sklearn.decomposition import NMF
from sklearn.cluster import KMeans
from sklearn.decomposition import PCA
from sklearn.manifold import TSNE
import nlp_vis

In [None]:
# Load the cleaned review table (first CSV column becomes the index) and
# discard every row that contains a missing value.
reviews = pd.read_csv("../src/cleaned_reviews_5.csv", index_col=0).dropna()

In [None]:
# TF-IDF document-term matrix over unigrams and bigrams of the cleaned reviews.
vectorizer = TfidfVectorizer(
    stop_words='english',
    ngram_range=(1, 2),
    min_df=0.01,
    max_df=0.7,
    max_features=1000,
)
r_dtm = vectorizer.fit_transform(reviews['cleaned_reviews'])

In [None]:
# Factorize the TF-IDF matrix into 10 NMF topics.
nmf_model = NMF(n_components=10, random_state=None)
nmf_model.fit(r_dtm)

# `get_feature_names()` was removed in scikit-learn 1.2; prefer the `_out`
# variant and fall back to the old method on releases that predate it.
if hasattr(vectorizer, 'get_feature_names_out'):
    feature_names = list(vectorizer.get_feature_names_out())
else:
    feature_names = vectorizer.get_feature_names()

# Show the 10 top words of every topic.
nlp_vis.display_topics(model=nmf_model, feature_names=feature_names, no_top_words=10)

In [None]:
# Name each NMF topic after its three highest-weighted vocabulary terms.

# `get_feature_names()` was removed in scikit-learn 1.2; prefer the `_out`
# variant and fall back to the old method on releases that predate it.
if hasattr(vectorizer, 'get_feature_names_out'):
    words = list(vectorizer.get_feature_names_out())
else:
    words = vectorizer.get_feature_names()

# argsort is ascending, so the reversed slice [:-4:-1] yields the indices of
# the three largest weights of every topic, largest first.
top_term_ids = nmf_model.components_.argsort()[:, :-4:-1]

# Join with ', ' (the previous ' ,' separator put the space on the wrong side
# of the comma), matching the CorEx topic names built later in the notebook.
topic_names = [', '.join(words[i] for i in topic) for topic in top_term_ids]

In [None]:
# Display the generated NMF topic names.
topic_names

In [None]:
# Project every review's TF-IDF vector onto the fitted NMF topics.
topic_results = nmf_model.transform(r_dtm)

In [None]:
# Per-review NMF topic weights as a labelled frame, one column per topic.
X = pd.DataFrame(data=topic_results, columns=topic_names, index=reviews.index)

# Dominant topic of each review: position of the largest weight in its row.
y = np.argmax(topic_results, axis=1)


# Visualization of NMF

## PCA Vis

In [None]:
# Reduce the NMF topic-weight frame to two principal components for plotting.
pca = PCA(n_components=2)
pca_features = pca.fit_transform(X)

In [None]:
# Plot the two PCA components with the nlp_vis helper, passing each review's
# dominant-topic index and the topic names.
nlp_vis.plot_PCA_2D(pca_features, y, X.columns)

## t-SNE Vis

In [None]:
# Embed the NMF topic weights in two dimensions with t-SNE (300 iterations).
# NOTE(review): newer scikit-learn releases rename `n_iter` to `max_iter` —
# confirm against the pinned scikit-learn version before upgrading.
tsne = TSNE(n_components=2, n_iter=300)
t_results = tsne.fit_transform(X)

In [None]:
# Plot the t-SNE embedding with the nlp_vis helper, passing each review's
# dominant-topic index and the topic names.
nlp_vis.plot_tSNE_2D(t_results, y, X.columns)

In [None]:
# Second-level decomposition: fit a 6-component TruncatedSVD (LSA) on the
# NMF topic-weight frame.
lsa_level_2 = TruncatedSVD(n_components=6)
lsa_level_2.fit(X)

In [None]:
# Project the NMF topic weights onto the 6 second-level LSA components.
topic_results_level2 = lsa_level_2.transform(X)

In [None]:
# Get Topic Clusters
# Label each second-level LSA component with the names of the three NMF
# topics that have the largest values in that component.
# argsort is ascending, so the reversed slice [:-4:-1] yields the indices of
# the three largest values of every component, largest first.
top_topic_ids = lsa_level_2.components_.argsort()[:, :-4:-1]

# Join with ', ' (the previous ' ,' separator put the space on the wrong side
# of the comma).
topic_clusters = [
    ', '.join(topic_names[i] for i in cluster)
    for cluster in top_topic_ids
]

In [None]:
# Display the labels built for the second-level topic clusters.
topic_clusters

# CorEx Topic Modeling

In [None]:
from nltk.corpus import stopwords
from corextopic import corextopic as ct
from corextopic import vis_topic as vt
import re
import scipy.sparse as ss
from sklearn.feature_extraction.text import CountVectorizer

In [None]:
# Count-based document-term matrix (unigrams and bigrams) for CorEx.
vectorizer = CountVectorizer(stop_words='english', max_df=0.8, min_df=0.01, ngram_range=(1,2))
r_dtm = vectorizer.fit_transform(reviews['cleaned_reviews'])

# Vocabulary as a plain list, one entry per column of `r_dtm`.
# `get_feature_names()` was removed in scikit-learn 1.2; prefer the `_out`
# variant and fall back to the old method on releases that predate it.
if hasattr(vectorizer, 'get_feature_names_out'):
    words = list(vectorizer.get_feature_names_out())
else:
    words = list(vectorizer.get_feature_names())


In [None]:
# Train Topic Model
# Fit an unanchored CorEx model with 20 hidden topics on the count matrix,
# passing the vocabulary list as `words`.
topic_model = ct.Corex(n_hidden=20, words=words, verbose=False)
topic_model.fit(r_dtm, words=words)

In [None]:
def show_CorEx_topics(model):
    """
    Print the words associated with each topic of a CorEx model.

    For every topic returned by ``model.get_topics()`` this prints the topic
    number, the matching entry of ``model.tcs`` and the topic's words joined
    by ", ".

    Per the original author's note: if the model is not anchored, the topic
    correlation will be sorted in descending order.

    Parameters
    ----------
    model : object
        Fitted model exposing ``get_topics()`` (an iterable of topics, each a
        sequence of entries whose first element is the word) and an indexable
        ``tcs`` attribute.

    Returns
    -------
    None
        The topics are written to standard output.
    """
    for n, topic in enumerate(model.get_topics()):
        # Take only the first element (the word) of every entry.  Unpacking
        # with `words, _ = zip(*topic)` breaks as soon as an entry carries
        # more than two fields (e.g. word, mutual information, sign) and also
        # on a topic with no entries at all.
        topic_words = [entry[0] for entry in topic]
        print(f'Topic {n}: TC Score:{model.tcs[n]}: \n', ', '.join(topic_words))
# Print the topics of the model trained above using the notebook-local
# helper; the nlp_vis variant is left commented out.
# nlp_vis.show_CorEx_topics(topic_model)
show_CorEx_topics(topic_model)

In [None]:
# Label every CorEx topic with its first three words.
#
# Do not reuse the name `words` for the per-topic words here: `words` holds
# the vectorizer vocabulary that the CorEx fits further down pass as
# `words=words`, and rebinding it would hand them a handful of topic words
# instead of the full vocabulary.
#
# Only the first element (the word) of each topic entry is read, so entries
# may carry any number of extra fields (e.g. mutual information, sign).
topic_names_corex = [
    ', '.join([entry[0] for entry in topic][:3])
    for topic in topic_model.get_topics()
]


In [None]:
# Display the generated CorEx topic names.
topic_names_corex

In [None]:
# Bar chart of the `tcs` value (total correlation) of every CorEx topic.
tc_per_topic = topic_model.tcs
topic_positions = range(tc_per_topic.shape[0])

plt.figure(figsize=(10, 5))
plt.bar(topic_positions, tc_per_topic, width=0.5, color='#4e79a7')
plt.ylabel('Total Correlation (nats)', fontsize=16)
plt.xlabel('Topic', fontsize=16)
plt.show()

The total correlation drops off at around 4 clusters and again at around 8 clusters.

## Try 8 Topics CorEx

In [None]:
# Train Topic Model
topic_model = ct.Corex(n_hidden=8, words=words, verbose=False)
topic_model.fit(r_dtm, words=words)

In [None]:
# Show the dimensions of the document-term matrix.
r_dtm.shape

In [None]:
# Label every topic of the 8-topic CorEx model with its first three words.
#
# Do not reuse the name `words` for the per-topic words here: `words` holds
# the vectorizer vocabulary that the anchored CorEx fit further down passes
# as `words=words`, and rebinding it would hand that fit a handful of topic
# words instead of the full vocabulary.
#
# Only the first element (the word) of each topic entry is read, so entries
# may carry any number of extra fields (e.g. mutual information, sign).
topic_names_corex = [
    ', '.join([entry[0] for entry in topic][:3])
    for topic in topic_model.get_topics()
]


In [None]:
# Print the words and `tcs` entry of every topic in the current model.
show_CorEx_topics(topic_model)
# Topic Names:

In [None]:
# CorEx's `p_y_given_x` matrix as a labelled frame: one row per review, one
# column per topic name.
doc_topic_probs = topic_model.p_y_given_x
X_corex = pd.DataFrame(data=doc_topic_probs, columns=topic_names_corex, index=reviews.index)

# Index of the largest entry in each row, i.e. each review's top topic.
y = np.argmax(doc_topic_probs, axis=1)

In [None]:
# Reduce the CorEx topic frame to two principal components for plotting.
pca = PCA(2)
pca_features = pca.fit_transform(X_corex)

In [None]:
# Plot the two PCA components with the nlp_vis helper, passing each review's
# top-topic index and the CorEx topic names.
nlp_vis.plot_PCA_2D(pca_features, y, topic_names_corex)

In [None]:
# Persist the current `topic_model` to disk with pickle.
with open('../models/reviews_corex.mdl', 'wb') as towrite:
    pickle.dump(topic_model, towrite)

In [None]:
# Anchor groups handed to CorEx: nine groups, one list of words per group.
anchor_words = [
    ['parking', 'crowd'],
    ['rock', 'rocky'],
    ['ice', 'snow'],
    ['lake', 'waterfall', 'pond'],
    ['easy'],
    ['hard'],
    ['bug'],
    ['family'],
    ['maintain'],
]

# Fit a 10-topic CorEx model with the anchor groups above (anchor strength 3)
# and print the resulting topics.
anchored_topic_model = ct.Corex(n_hidden=10)
anchored_topic_model.fit(
    r_dtm,
    words=words,
    anchors=anchor_words,
    anchor_strength=3,
)
show_CorEx_topics(anchored_topic_model)

In [None]:
# Display the first word of every anchor group.
[word[0] for word in anchor_words]

In [None]:
# Column labels: the first word of every anchor group.
anchor_labels = [group[0] for group in anchor_words]

# Keep one `p_y_given_x` column per anchor group.  The column count is taken
# from `anchor_words` instead of a hard-coded 9, so the slice and the labels
# cannot drift apart when the anchor list is edited.
# NOTE(review): like the original, this assumes the anchored topics occupy
# the leading columns of `p_y_given_x` — confirm against corextopic.
anchor_corex_results = pd.DataFrame(
    data=anchored_topic_model.p_y_given_x[:, :len(anchor_labels)],
    index=reviews.index,
    columns=anchor_labels,
)

In [None]:
# Join the hike id and cleaned review text with the anchored-topic columns
# on the shared index, for spot-checking.
check = reviews[['hike_id', 'cleaned_reviews']].merge(anchor_corex_results, left_index=True, right_index=True)

In [None]:
# reviews = reviews.merge(anchor_corex_results, left_index=True, right_index=True)

In [None]:
# Look at seven randomly sampled rows.
check.sample(7)

In [None]:
# Read the cleaned text of the review with index label 359180 in a single
# row/column lookup.
check.loc[359180, 'cleaned_reviews']

In [None]:
from sklearn.metrics.pairwise import cosine_similarity

In [None]:
# Attach the anchored-topic columns to the full reviews frame on the index.
r_corex = reviews.merge(anchor_corex_results, left_index=True, right_index=True)

## Save Reviews with CorEx Tables to CSV

In [None]:
# r_corex.to_csv('../src/reviews_corex.csv')

In [None]:
# Preview the first seven rows of the merged frame.
r_corex.head(7)

-----------------------------------------------------------

In [None]:
# Count-based document-term matrix (unigrams and bigrams) for LDA.
vectorizer = CountVectorizer(stop_words='english', max_df=0.8, min_df=0.01, ngram_range=(1,2))
r_dtm = vectorizer.fit_transform(reviews['cleaned_reviews'])

# Vocabulary as a plain list, one entry per column of `r_dtm`.
# `get_feature_names()` was removed in scikit-learn 1.2; prefer the `_out`
# variant and fall back to the old method on releases that predate it.
if hasattr(vectorizer, 'get_feature_names_out'):
    words = list(vectorizer.get_feature_names_out())
else:
    words = list(vectorizer.get_feature_names())

In [None]:
# Fit a 5-topic LDA model on the count matrix (n_jobs=-1, verbose output).
lda_reviews = LatentDirichletAllocation(n_components=5, n_jobs=-1, verbose=True)
lda_reviews.fit(r_dtm)

In [None]:
import pyLDAvis

# The scikit-learn adapter is named `pyLDAvis.lda_model` in newer pyLDAvis
# releases and `pyLDAvis.sklearn` in older ones; import whichever exists so
# the cell runs on both.
try:
    import pyLDAvis.lda_model as pyldavis_adapter
except ImportError:
    import pyLDAvis.sklearn as pyldavis_adapter

pyLDAvis.enable_notebook()

# Build the interactive LDA visualization; as the last expression of the cell
# it is rendered inline.
pyldavis_adapter.prepare(lda_reviews, r_dtm, vectorizer)

In [None]:
# Show the dimensions of the fitted LDA `components_` matrix.
lda_reviews.components_.shape

In [None]:
# Transform the count matrix with the fitted LDA model (per-review output).
lda_top_results = lda_reviews.transform(r_dtm)

# Try LDA with More Aggressive Stop Words

In [None]:
# Start from NLTK's English stop-word list and append three extra words.
words_to_add = ['trail', 'hike', 'great']
stop_words = stopwords.words('english')
stop_words.extend(words_to_add)

In [None]:
# Count-based document-term matrix built with the extended stop-word list.
vectorizer = CountVectorizer(stop_words=stop_words, max_df=0.8, min_df=0.01, ngram_range=(1,2))
r_dtm = vectorizer.fit_transform(reviews['cleaned_reviews'])

# Vocabulary as a plain list, one entry per column of `r_dtm`.
# `get_feature_names()` was removed in scikit-learn 1.2; prefer the `_out`
# variant and fall back to the old method on releases that predate it.
if hasattr(vectorizer, 'get_feature_names_out'):
    words = list(vectorizer.get_feature_names_out())
else:
    words = list(vectorizer.get_feature_names())

In [None]:
# Refit the 5-topic LDA model on the matrix built with the extended
# stop-word list (n_jobs=-1, verbose output).
lda_reviews = LatentDirichletAllocation(n_components=5, n_jobs=-1, verbose=True)
lda_reviews.fit(r_dtm)