# Using `spaCy` for cosine similarity

Prior to running this code, it's necessary to install `spaCy` on your machine, and also to download its English libraries. 

In [1]:
import spacy
import pandas as pd
from spacy.tokens import Doc
from spacy.vocab import Vocab

In [2]:
# Load the pre-defined English model:
nlp = spacy.load('en_core_web_sm')

In [3]:
# Read in a CSV file with a column of text abstracts.
df = pd.read_csv('resources/fedreg_18-05-22-14-45.csv')

#### Prepare the text data for processing

In [4]:
df=df[['document_number','abstract']] # Keep only the columns we need 
df=df.head(20) # Trim the dataset down to size, for example purposes
df=df.dropna(how='any') # Drop any rows with missing data
df['abstract_utf']=df['abstract'].apply(lambda x: x.decode('utf-8')) # Convert the text to UTF8 format
df.head()

AttributeError: 'str' object has no attribute 'decode'

In [None]:
# Preprocess and vectorize the text column.
df['tokens'] = df['abstract_utf'].apply(lambda x: nlp(x))

In [None]:
# Display POS tagging for first abstract.
spacy.displacy.render(df['tokens'][1], style='ent',jupyter=True)

#### Note that the 4th and 5th abstracts are similar but not identical. We would expect these to have a high cosine similarity score.

In [None]:
print(df['abstract'][0])
print('\n')
print(df['abstract'][3])
print('\n')
print(df['abstract'][4])
print('\n')
print(df['abstract'][5])

In [None]:
# Assign variable names.
doc0=df['tokens'][0]
doc3=df['tokens'][3]
doc4=df['tokens'][4]
doc5=df['tokens'][5]

In [None]:
# As expected, abstracts 4 and 5 are highly similar.
print(doc4.similarity(doc5)) 

In [None]:
# Abstracts 4 and 3 are somewhat similar.
print(doc4.similarity(doc3)) 

In [None]:
# Abstracts 4 and 0 are not really very similar.
print(doc4.similarity(doc0)) 