In [None]:
# setup for google colab

# !pip install spacy==3.1.4
# !python -m spacy download en_core_web_md

# restart runtime before running notebook

In [None]:
import spacy
import pandas as pd
import matplotlib.pyplot as plt

# Similarity with SpaCy

### Load model

We will use the medium English pipeline trained on written web text (blogs, news, comments), which ships with a reduced word vector table (20k unique vectors for 685k words).  
https://spacy.io/models/en#en_core_web_md  

More spaCy models can be found at: 
https://spacy.io/models   

We create a [text-processing pipeline](https://spacy.io/api/language) by using the `spacy.load` function. 

In [None]:
# Load the medium English pipeline; it must already be installed
# (see the commented-out `spacy download` step in the setup cell).
nlp = spacy.load("en_core_web_md")

## Document similarity
### Text processing
The `Doc` class is a container for accessing linguistic annotations; it is a sequence of `Token` objects. The most common way to get a `Doc` object is by calling the `nlp` object on a text.

In [None]:
# Run the pipeline on the search query and on three example result titles;
# each call returns a Doc that can be compared with .similarity.
query = nlp("happy white woman") 
doc1 = nlp("Two Happy Asian Girls Stock Photo 113778403 : Shutterstock")
doc2 = nlp("Happy Young Woman In White Dress Walking In Spring Park ...")
doc3 = nlp("Happy Couple White Woman Black Man Stock Photo (Edit Now) 47205217")

### Cosine similarity
To check the similarity between the query and the documents (titles from search results), we will use [cosine similarity](https://en.wikipedia.org/wiki/Cosine_similarity).  
We can use the `.similarity` method to compute a semantic similarity estimate. A document's vector representation is the average of its token vectors.  
https://spacy.io/usage/linguistic-features#vectors-similarity

In [None]:
# Similarity estimate between the first example title and the query.
doc1.similarity(query)

In [None]:
# Similarity estimate between the second example title and the query.
doc2.similarity(query)

In [None]:
# Similarity estimate between the third example title and the query.
doc3.similarity(query)

## Load data

In [None]:
# Load the search-results dataset. The path is relative, so the notebook must be
# run from the directory that contains data/. Later cells read the 'query',
# 'engine', 'title' and 'baseline' columns.
df = pd.read_csv('data/dataset.csv')
# Preview the first rows.
df.head()

In [None]:
# limit our dataset to a single query: 'happy white woman'
# (queries are stored with '+' in place of spaces)
dataset = df[df['query'] == 'happy+white+woman']

# use the first 30 results for Bing and Google.
# .copy() makes each subset an independent frame, so adding the 'similarity'
# column to it later does not raise pandas' SettingWithCopyWarning.
google_dataset = dataset[dataset['engine']=='google'][:30].copy()
bing_dataset = dataset[dataset['engine']=='bing'][:30].copy()

# report how many rows each engine actually contributed (may be fewer than 30)
print("Google dataset size: ", google_dataset.shape[0])
print("Bing dataset size: ", bing_dataset.shape[0])

## Calculate similarity between query and image title

In [None]:
def get_similarity(text, query):
    """Return the spaCy similarity estimate between ``text`` and ``query``.

    Both arguments are raw strings; each is run through the notebook-level
    ``nlp`` pipeline and the resulting documents are compared with
    ``Doc.similarity``.
    """
    text_doc, query_doc = nlp(text), nlp(query)
    return text_doc.similarity(query_doc)

In [None]:
# Score every result title against the query text and store the score in a new
# 'similarity' column; the extra `query=` keyword is forwarded by Series.apply
# to get_similarity.
google_dataset['similarity'] = google_dataset['title'].apply(get_similarity, query='happy white woman')
bing_dataset['similarity'] = bing_dataset['title'].apply(get_similarity, query='happy white woman')

### Similarity for Google results

In [None]:
# Summary statistics of the Google similarity scores.
google_dataset['similarity'].describe()

In [None]:
# Widen pandas' column display so long titles are not truncated in the output.
pd.options.display.max_colwidth = 150

# Google row(s) with the highest similarity to the query (ties are all shown).
google_dataset[google_dataset['similarity'] == google_dataset['similarity'].max()]

In [None]:
# Google row(s) with the lowest similarity to the query (ties are all shown).
google_dataset[google_dataset['similarity'] == google_dataset['similarity'].min()]

### Similarity for Bing results

In [None]:
# Summary statistics of the Bing similarity scores.
bing_dataset['similarity'].describe()

In [None]:
# Bing row(s) with the highest similarity to the query (ties are all shown).
bing_dataset[bing_dataset['similarity'] == bing_dataset['similarity'].max()]

In [None]:
# Bing row(s) with the lowest similarity to the query (ties are all shown).
bing_dataset[bing_dataset['similarity'] == bing_dataset['similarity'].min()]

### Plot results
We will use a scatter plot to present the results.

In [None]:
# Scatter each engine's similarity scores (y) against that engine's own
# 'baseline' values (x).
plt.scatter(bing_dataset['baseline'], bing_dataset['similarity'])
# Google's points must be paired with Google's baseline values, not Bing's.
plt.scatter(google_dataset['baseline'], google_dataset['similarity'])
plt.legend(['bing', 'google'])
# Label the axes to match the data actually plotted on them.
plt.xlabel("baseline")
plt.ylabel("similarity")
plt.show()

In [None]:
def print_summary(bing_data, google_data, query):
    """Print mean similarity per engine for ``query``.

    Two groups of lines are printed, Google first and Bing second in each:
    the mean over the first five rows, then the mean over all rows.
    Both frames must have a 'similarity' column. Returns None.
    """
    engines = (("Google", google_data), ("Bing", bing_data))

    print('Similarity for query: ', query)
    # Mean over the first five rows of each engine's results.
    for label, frame in engines:
        print(f"\tMean similarity for {label} results (top 5): ", frame['similarity'][:5].mean())
    print()
    # Mean over every row that was passed in.
    for label, frame in engines:
        print(f"\tMean similarity for {label} results: ", frame['similarity'].mean())

In [None]:
# Mean similarity per engine for the 'happy white woman' query.
print_summary(bing_dataset, google_dataset, 'happy white woman')

## Check other queries

In [None]:
# Independent copies of the rows for the two comparison queries
# (a = 'happy asian woman', b = 'happy black woman').
a_dataset = df.loc[df['query'] == 'happy+asian+woman'].copy()
b_dataset = df.loc[df['query'] == 'happy+black+woman'].copy()

# Score every title against the plain-text form of its query.
a_dataset['similarity'] = a_dataset['title'].apply(get_similarity, query='happy asian woman')
b_dataset['similarity'] = b_dataset['title'].apply(get_similarity, query='happy black woman')

# First 30 rows per engine for query a.
a_google_dataset = a_dataset.loc[a_dataset['engine'] == 'google'].iloc[:30]
a_bing_dataset = a_dataset.loc[a_dataset['engine'] == 'bing'].iloc[:30]

# First 30 rows per engine for query b.
b_google_dataset = b_dataset.loc[b_dataset['engine'] == 'google'].iloc[:30]
b_bing_dataset = b_dataset.loc[b_dataset['engine'] == 'bing'].iloc[:30]

In [None]:
# Mean similarity per engine for the 'happy asian woman' query.
print_summary(a_bing_dataset, a_google_dataset, 'happy asian woman')

In [None]:
# Mean similarity per engine for the 'happy black woman' query.
print_summary(b_bing_dataset, b_google_dataset, 'happy black woman')

## TODO:
* Check the similarity between two texts that contain the same words in a different order. 
* Check how the similarity between two different documents changes as the texts get longer. For example, compare cooking recipes with sports news.