In [None]:
# setup for google colab

# !pip install spacy==3.1.4
# !pip install spacy-transformers
# !python -m spacy download en_core_web_trf

# restart runtime before running notebook

In [None]:
import spacy
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np

# Transformers with SpaCy
Pipeline based on RoBERTa: 
https://spacy.io/models/en#en_core_web_trf

* [Attention Is All You Need](https://arxiv.org/abs/1706.03762)
* [BERT: Pre-training of Deep Bidirectional Transformers for Language Understanding](https://arxiv.org/abs/1810.04805)
* [RoBERTa: A Robustly Optimized BERT Pretraining Approach](https://arxiv.org/abs/1907.11692)

### Load model

In [None]:
# Ask spaCy to use a GPU if one is available, then load the transformer-based
# English pipeline that every later cell uses through the global `nlp`.
spacy.prefer_gpu()
nlp = spacy.load("en_core_web_trf")

In [None]:
# Run the pipeline on the search query and on two example titles so their
# transformer outputs can be inspected in the cells below.
query = nlp("happy white woman") 
doc1 = nlp("Happy Young  Woman In White Dress Walking In Spring Park in New York...")
doc2 = nlp("Happy Couple White Woman Black Man Stock Photo (Edit Now) 47205217")

### Word vectors

In [None]:
# Inspect the transformer tokenization stored on doc1; its 'input_texts' entry
# is what get_word_vector scans later in this notebook.
doc1._.trf_data.tokens

In [None]:
# Inspect the raw transformer output tensors stored on the query.
query._.trf_data.tensors

In [None]:
# Shape of the token-level vectors (first output tensor).
query._.trf_data.tensors[0].shape

In [None]:
# Shape of the document-level vector (second output tensor).
query._.trf_data.tensors[1].shape

### Load dataset

In [None]:
# Load the search results and keep only the rows for the query under study.
dataset = pd.read_csv('data/dataset.csv')
dataset = dataset[dataset['query'] == 'happy+white+woman']

# Add the query text itself as an extra row (engine 'q') so it is embedded and
# projected together with the titles. DataFrame.append was removed in
# pandas 2.0, so the row is added with pd.concat; columns missing from the new
# row are filled with NaN, exactly as append did.
query_row = pd.DataFrame([{'title': 'happy white woman', 'engine': 'q'}])
dataset = pd.concat([dataset, query_row], ignore_index=True)

### Word embeddings
BERT-based models produce different vector representations for the same word depending on its context. We will compare the embeddings of the word 'white'.

In [None]:
def get_word_vector(text, word):
    """Return the contextual transformer vector of `word` inside `text`.

    The text is run through the module-level `nlp` pipeline and its
    wordpieces are scanned for `word`.

    Args:
        text: Text to embed.
        word: Lower-case string to look for. It is matched as a substring of
            each lower-cased wordpiece, so it also hits pieces that carry a
            prefix marker or that merely contain the word.

    Returns:
        The vector of the last matching wordpiece. When nothing matches, a
        zero vector with the same width and dtype as the model output, so the
        fallback always agrees with the real vectors instead of relying on a
        hard-coded size of 768.
    """
    doc = nlp(text)
    trf_data = doc._.trf_data

    # Only the first row of the token tensor / first wordpiece list is used —
    # assumes the whole text fits in a single transformer span (TODO confirm
    # for long inputs).
    wordpieces = trf_data.tokens['input_texts'][0]
    token_vectors = trf_data.tensors[0][0]

    match_index = None
    for i, token in enumerate(wordpieces):
        if word in token.lower():
            # No break on purpose: the last occurrence wins.
            match_index = i

    if match_index is None:
        return np.zeros(token_vectors.shape[-1], dtype=token_vectors.dtype)
    return token_vectors[match_index]

In [None]:
# Embed the word 'white' in the context of every title (and of the query row).
dataset['vector white'] = dataset['title'].apply(
    lambda title: get_word_vector(title, word='white')
)

## Cosine similarity
We will use cosine similarity to compare the vector representations of the word 'white'.  
https://deepai.org/machine-learning-glossary-and-terms/cosine-similarity

In [None]:
def cos_sim(v1, v2):
    """Return the cosine similarity of two vectors.

    Args:
        v1: First vector (any 1-D array-like accepted by NumPy).
        v2: Second vector of the same length.

    Returns:
        dot(v1, v2) / (|v1| * |v2|). When either vector has zero length the
        similarity is undefined and NaN is returned explicitly, instead of
        triggering a 0/0 division and its RuntimeWarning. NaN keeps such rows
        out of pandas means, which skip NaN by default.
    """
    denominator = np.linalg.norm(v1) * np.linalg.norm(v2)
    if denominator == 0:
        return np.nan
    return np.dot(v1, v2) / denominator

In [None]:
# The 'white' embedding of the query row (engine 'q') is the reference vector.
query_vector = dataset.loc[dataset['engine'] == 'q', 'vector white'].iloc[0]

In [None]:
# Cosine similarity between each row's 'white' vector and the query's.
dataset['similarity'] = dataset['vector white'].apply(
    lambda vector: cos_sim(vector, v2=query_vector)
)

# Keep the first 30 rows of each engine, in the order they appear in the dataset.
google_dataset = dataset.loc[dataset['engine'] == 'google'].head(30)
bing_dataset = dataset.loc[dataset['engine'] == 'bing'].head(30)

In [None]:
# Preview the first Google rows together with their similarity scores.
google_dataset.head()

In [None]:
# Preview the first Bing rows together with their similarity scores.
bing_dataset.head()

#### Plot results

In [None]:
# One scatter series per engine: baseline score on x, 'white' similarity on y.
for engine_name, engine_results in (('bing', bing_dataset), ('google', google_dataset)):
    plt.scatter(
        engine_results['baseline'],
        engine_results['similarity'],
        label=engine_name,
    )

plt.xlabel("baseline")
plt.ylabel("similarity")
plt.legend()
plt.show()

In [None]:
def print_summary(bing_data, google_data, query):
    """Print the mean 'similarity' of the Google and Bing results.

    Two groups of figures are printed: the mean over the first five rows of
    each frame, and the mean over all rows passed in (labelled "top 30", so
    callers are expected to pass frames already cut to 30 rows).

    Args:
        bing_data: DataFrame of Bing results with a 'similarity' column.
        google_data: DataFrame of Google results with a 'similarity' column.
        query: Accepted for the caller's convenience; not used in the output.
    """
    print('Similarity for word vectors')

    google_top5_mean = google_data['similarity'][:5].mean()
    print("\tMean similarity for Google results (top 5): ", google_top5_mean)
    bing_top5_mean = bing_data['similarity'][:5].mean()
    print("\tMean similarity for Bing results (top 5): ", bing_top5_mean)

    print()

    google_overall_mean = google_data['similarity'].mean()
    print("\tMean similarity for Google results (top 30): ", google_overall_mean)
    bing_overall_mean = bing_data['similarity'].mean()
    print("\tMean similarity for Bing results (top 30): ", bing_overall_mean)

    
# Report the mean similarities for the top-30 frames of both engines.
print_summary(
    bing_data=bing_dataset,
    google_data=google_dataset,
    query='happy white woman',
)

## PCA
To reduce dimensionality we can use PCA. After the reduction we can plot the word embeddings on a 2D scatter plot.  
https://builtin.com/data-science/step-step-explanation-principal-component-analysis

In [None]:
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler

In [None]:
# Standardize every embedding dimension before PCA so that no single dimension
# dominates the projection because of its scale.
standardized = StandardScaler().fit_transform(dataset['vector white'].tolist())

# Project onto the first two principal components. random_state is pinned
# because PCA may choose a randomized SVD solver for inputs this wide, which
# would otherwise make the coordinates change from run to run.
pca = PCA(n_components=2, random_state=0)
dataset[['PC 1', 'PC 2']] = pca.fit_transform(standardized)

dataset.head()

In [None]:
# Split the projected points by source. Cell-specific names are used on purpose:
# rebinding `query` (the spaCy Doc created earlier) or the top-30
# `google_dataset` / `bing_dataset` frames here would silently break the
# earlier inspection and summary cells when they are re-run.
google_points = dataset[dataset['engine'] == 'google']
bing_points = dataset[dataset['engine'] == 'bing']
query_point = dataset[dataset['engine'] == 'q']

plt.scatter(bing_points['PC 1'], bing_points['PC 2'])
plt.scatter(google_points['PC 1'], google_points['PC 2'])
plt.scatter(query_point['PC 1'], query_point['PC 2'])

plt.legend(['bing', 'google', 'query'])
plt.xlabel("PC 1")
plt.ylabel("PC 2")
plt.show()

## TODO: 
* Check out the spaCy sentence-transformers integration (spacy-sentence-bert) if you want to measure the similarity between two whole documents.  
https://spacy.io/universe/project/spacy-sentence-bert