# spaCy Implementation
Adapted from [these](https://www.analyticsvidhya.com/blog/2020/04/beginners-guide-exploratory-data-analysis-text-data/) [tutorials](https://realpython.com/natural-language-processing-spacy-python/)

In [9]:
#imports
import pandas as pd
import numpy as np
import contractions
import re
import spacy



In [23]:
# Read the route data into a DataFrame
df_final = pd.read_csv(filepath_or_buffer="./data/routes.csv")

# Load the large English spaCy model; the dependency parser and the
# named-entity recognizer are switched off because nothing below uses them
nlp = spacy.load(name='en_core_web_lg', disable=['parser', 'ner'])

In [46]:
# Combine the free-text columns into one all-lowercase column.
# Missing values are blanked *before* joining: str(np.nan) is the literal
# word "nan", so stringifying first would leave "nan" in the text where
# neither dropna() nor replace("np.nan", "") could ever remove it.
text_columns = ['description', 'location', 'protection']
text_parts = df_final[text_columns].astype(object).fillna('').astype(str)
df_final['text_combined'] = (
    text_parts['description'] + ' '
    + text_parts['location'] + ' '
    + text_parts['protection']
).str.lower()

# Expand contractions (e.g. "don't" -> "do not").
df_final['text_combined'] = df_final['text_combined'].map(contractions.fix)

# NOTE(review): contractions.fix may emit capitalised expansions (e.g. "I am");
# re-lowercase so the a-z filter below does not blank those letters - confirm.
# Then strip possessive "'s" and replace every character that is not a
# lowercase letter or whitespace with a space.
df_final['text_combined'] = (
    df_final['text_combined']
    .str.lower()
    .str.replace("'s ", " ", regex=False)
    .str.replace(r'[^a-z\s]', ' ', regex=True)
)


In [48]:
# Lemmatization with stopword and whitespace removal.
# nlp.pipe streams the texts through the pipeline in batches instead of
# invoking nlp() once per row; the resulting docs (and lemmas) are the same.
# The list is assigned positionally, so row order is preserved.
df_final['lemmatized_text_combined'] = [
    ' '.join(
        token.lemma_
        for token in doc
        if not token.is_stop and not token.is_space
    )
    for doc in nlp.pipe(df_final['text_combined'])
]


In [34]:
# Spot-check: show the lemmatized text stored for the row labelled 0.
# NOTE(review): the recorded output below ("str") comes from an earlier run
# (In [34]) and does not look like the result of this expression - re-run to refresh.
df_final.loc[0, 'lemmatized_text_combined']

str

In [49]:
#find poor pre-processing and resolve
def oov_tokens(string):
    """
    Return the out-of-vocabulary words found in `string`.

    The text is passed through the module-level `nlp` pipeline and every token
    flagged `is_oov` is kept in document order. The result is a single
    space-separated string, which is empty when no token is out-of-vocabulary.
    """
    unknown_words = []
    for token in nlp(string):
        if token.is_oov:
            unknown_words.append(str(token))
    return " ".join(unknown_words)



# Collect the out-of-vocabulary words of every lemmatized document
df_final['oov'] = df_final['lemmatized_text_combined'].map(oov_tokens)

In [51]:
# Show the 20 most common values of the 'oov' column.
# Each value is a row's complete space-joined OOV string, so this counts rows
# whose OOV string is exactly that value (not individual token occurrences);
# the empty string corresponds to rows with no OOV tokens at all.
df_final['oov'].value_counts().head(20)

              83788
bouldery       1750
lieback         963
undercling      900
balancy         813
sidepull        802
cruxy           763
undercle        647
farth           519
chockstone      483
tcus            448
handcrack       434
tricam          428
incut           356
topout          231
permadraw       215
bouldere        178
balancey        172
coldshut        166
laybacke        146
Name: oov, dtype: int64

Note: `farth` is the lemmatizer's (incorrect) output for "farthest".

In [None]:
#wordclouds - https://www.analyticsvidhya.com/blog/2020/04/beginners-guide-exploratory-data-analysis-text-data/
# Importing wordcloud for plotting word clouds and textwrap for wrapping longer text
from wordcloud import WordCloud
from textwrap import wrap

# Function for generating word clouds
def generate_wordcloud(data, title):
    """
    Draw a word cloud for `data` and display it under `title`.

    `data` is handed to `WordCloud.generate_from_frequencies`; `title` is
    wrapped at 60 characters so long titles span several lines.

    NOTE(review): `plt` is not imported anywhere in the visible code;
    presumably `import matplotlib.pyplot as plt` is needed - confirm.
    """
    cloud_builder = WordCloud(
        width=400,
        height=330,
        max_words=150,
        colormap="Dark2",
    )
    cloud = cloud_builder.generate_from_frequencies(data)

    plt.figure(figsize=(10, 8))
    plt.imshow(cloud, interpolation='bilinear')
    plt.axis("off")  # hide the axes; only the cloud image is of interest
    wrapped_title = '\n'.join(wrap(title, 60))
    plt.title(wrapped_title, fontsize=13)
    plt.show()
  
# Transposing document term matrix
# NOTE(review): `df_dtm` is not defined anywhere in the visible code; it has to
# be built before this cell can run.
df_dtm = df_dtm.transpose()

# Plot one word cloud per column of the transposed matrix, with that column's
# values sorted from highest to lowest. The column label doubles as the title.
for product in df_dtm.columns:
    generate_wordcloud(df_dtm[product].sort_values(ascending=False), product)