# spaCy Implementation
Adapted from [these](https://www.analyticsvidhya.com/blog/2020/04/beginners-guide-exploratory-data-analysis-text-data/) [tutorials](https://realpython.com/natural-language-processing-spacy-python/)

In [79]:
#imports
import pandas as pd
import numpy as np
import contractions
import re
import spacy

from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer

In [None]:
#https://stackoverflow.com/questions/6116978/how-to-replace-multiple-substrings-of-a-string
def multiple_replace(string, rep_dict):
    """
    Replace every occurrence of each key of `rep_dict` in `string` with its value, in one pass.

    Keys are matched literally (they are regex-escaped) and longer keys are tried first,
    so a key that is a prefix of another key cannot shadow it. Replacement text is never
    re-scanned, so swaps such as {"a": "b", "b": "a"} behave as expected.

    Returns `string` unchanged when `rep_dict` is empty.
    """
    # An empty mapping would compile to the empty pattern, which matches at every
    # position and makes the lookup below fail with KeyError('').
    if not rep_dict:
        return string
    longest_first = sorted(rep_dict, key=len, reverse=True)
    pattern = re.compile("|".join(re.escape(key) for key in longest_first), flags=re.DOTALL)
    return pattern.sub(lambda match: rep_dict[match.group(0)], string)

In [55]:
# Read the routes table. It was already pre-processed by the 1-data-collection notebook;
# its text columns still need to be combined (done in a later cell).
df_final = pd.read_csv(filepath_or_buffer="./data/routes.csv")

# Load the large English spaCy model; the disabled pipeline component ('ner') is not needed here.
nlp = spacy.load(name='en_core_web_lg', disable=['ner'])

In [80]:
# Sanity check: show the rows whose lemmatized text still contains a newline character.
# NOTE(review): this reads 'lemmatized_text_combined', which this notebook only creates in a
# later cell - run the lemmatization cell first.
df_final[df_final['lemmatized_text_combined'].str.contains('\n')]

Unnamed: 0,route_name,safety,fa,description,location,protection,grade.YDS,type.trad,metadata.parent_lnglat,metadata.parent_sector,...,plus,minus,plus_minus,grade_numeric_plus_minus,year_established,stratify,grade_reduced,text_combined,lemmatized_text_combined,oov


In [56]:
#combine the nlp features into one all-lowercase column, expand contractions, remove non-letters
# Missing values are blanked BEFORE the columns are stringified: str(NaN) is "nan", so joining
# the raw columns used to inject the literal token "nan" into the text, and the old
# replace("np.nan", "") clean-up never matched it (nor could dropna, since the joined
# string is never null).
text_fields = df_final[['description', 'location', 'protection']].fillna('').astype(str)
df_final['text_combined'] = (
    text_fields['description'] + " " + text_fields['location'] + " " + text_fields['protection']
).str.lower()

#expand contractions, remove non-letters
df_final['text_combined'] = df_final['text_combined'].map(contractions.fix)
# Strip leftover possessive "'s", then turn every character that is not a-z or whitespace into a space.
df_final['text_combined'] = (
    df_final['text_combined']
    .str.replace("'s ", " ", regex=False)
    .str.replace(r'[^a-z\s]', ' ', regex=True)
)


In [58]:
#lemmatization with stopword and whitespace removal
# nlp.pipe streams the texts through the pipeline in batches, which spaCy documents as more
# efficient than calling nlp() once per row; the token filter is unchanged.
df_final['lemmatized_text_combined'] = [
    ' '.join(token.lemma_ for token in doc if not token.is_stop and not token.is_space)
    for doc in nlp.pipe(df_final['text_combined'])
]


In [81]:
# Preview the lemmatized text column.
df_final['lemmatized_text_combined']

0         go slab bolt steep corner gear leave corner st...
1         fun move break scree fill ledge big bush crux ...
2         pretty cool orange arete sport interesting cli...
3         climb open look slab eastern aspect wall close...
4         good route wall crux move crack thin seam time...
                                ...                        
127508    brush steep slab low corner west face horizont...
127509    friction slab past bolt gear bulge bolt easy r...
127510    face climb right mom meatfloaf step gully low ...
127511    start leave mom meatloaf climb bolt face cross...
127512    begin distinct layback crack split low angle e...
Name: lemmatized_text_combined, Length: 127504, dtype: object

Note: the lemma "farth" corresponds to "farthest".

In [72]:
# Row and column counts of the frame (compare with the shape reported after the split below).
df_final.shape

(127513, 29)

In [74]:
# Remove the rows whose lemmatized text is empty or is exactly the string 'nan'.
empty_or_nan = df_final['lemmatized_text_combined'].isin(['', 'nan'])
nans_to_drop = df_final.index[empty_or_nan]
df_final = df_final.drop(index=nans_to_drop)

In [75]:
# Hold out 10% of the rows as the test set, stratified on the 'stratify' column.
train, test = train_test_split(
    df_final,
    stratify=df_final['stratify'],
    random_state=42,
    test_size=.1,
)
# Carve 20% of the remaining rows off as the validation set, stratified the same way.
train, validation = train_test_split(
    train,
    stratify=train['stratify'],
    random_state=42,
    test_size=.2,
)

# Shapes of the full frame and of each split.
tuple(frame.shape for frame in (df_final, train, validation, test))

((127504, 29), (91802, 29), (22951, 29), (12751, 29))

In [76]:
# Write the full frame and each split to ./data, leaving out the index column.
for csv_path, frame in (
    ("./data/spacy_routes.csv", df_final),
    ("./data/spacy_train.csv", train),
    ("./data/spacy_val.csv", validation),
    ("./data/spacy_test.csv", test),
):
    frame.to_csv(csv_path, index=False)

In [77]:
#find poor pre-processing and resolve
def oov_tokens(string):
    """
    Run `string` (the lemmatized combined text) through the spaCy pipeline and collect
    the tokens it flags as out-of-vocabulary.

    The flagged tokens are returned in their original order as one space-separated
    string, which is empty when nothing is flagged.
    """
    unknown = []
    for token in nlp(string):
        if token.is_oov:
            unknown.append(str(token))
    return " ".join(unknown)



# Store each row's out-of-vocabulary tokens (space-separated) in an 'oov' column.
lemmatized_texts = df_final['lemmatized_text_combined']
df_final['oov'] = lemmatized_texts.map(oov_tokens)

In [78]:
# 20 most frequent values of the 'oov' column. Each value is a row's whole space-joined
# OOV string (not an individual token); the empty string counts rows with no OOV tokens.
df_final['oov'].value_counts().head(20)

              83779
bouldery       1750
lieback         963
undercling      900
balancy         813
sidepull        802
cruxy           763
undercle        647
farth           519
chockstone      483
tcus            448
handcrack       434
tricam          428
incut           356
topout          231
permadraw       215
bouldere        178
balancey        172
coldshut        166
laybacke        146
Name: oov, dtype: int64

In [None]:
# Variant or badly lemmatized spellings of climbing terms, mapped to one canonical form.
words_to_replace = {
    "balancey" : 'balancy',
    'bouldere' : 'bouldery',
    'laybacke' : 'lieback',
    'layback' : 'lieback',
    'undercle' : 'undercling',
}

# The original call was x.replace() with no arguments, which raises TypeError.
# The lemmatized text is a space-joined token list, so whole tokens are swapped through a
# dictionary lookup; unlike a substring replacement this cannot corrupt longer words that
# merely contain one of the keys.
df_final['lemmatized_text_combined_cleaned'] = df_final['lemmatized_text_combined'].apply(
    lambda x: ' '.join(words_to_replace.get(token, token) for token in x.split(' '))
)

In [None]:
#wordclouds - https://www.analyticsvidhya.com/blog/2020/04/beginners-guide-exploratory-data-analysis-text-data/
# Importing wordcloud for plotting word clouds and textwrap for wrapping longer text
from wordcloud import WordCloud
from textwrap import wrap

# Function for generating word clouds
def generate_wordcloud(data, title):
    """
    Draw a word cloud from term frequencies and display it.

    `data` is passed to WordCloud.generate_from_frequencies; `title` is wrapped at
    60 characters and shown above the image.
    """
    # NOTE(review): `plt` is not imported in the visible cells - presumably
    # matplotlib.pyplot; confirm it is imported before this cell runs.
    cloud_builder = WordCloud(width=400, height=330, max_words=150, colormap="Dark2")
    wc = cloud_builder.generate_from_frequencies(data)

    wrapped_title = '\n'.join(wrap(title, 60))

    plt.figure(figsize=(10, 8))
    plt.imshow(wc, interpolation='bilinear')
    plt.axis("off")
    plt.title(wrapped_title, fontsize=13)
    plt.show()
  
# Transposing document term matrix
# NOTE(review): df_dtm is not defined in the visible cells - it has to be built (e.g. from
# the imported CountVectorizer) before this cell can run. Re-running this cell transposes
# the matrix back again.
df_dtm = df_dtm.transpose()

# Plotting one word cloud per column of the transposed matrix, most frequent terms first
for column_label in df_dtm.columns:
    generate_wordcloud(df_dtm[column_label].sort_values(ascending=False), column_label)