In [None]:
# setup for google colab

# !pip install spacy==3.1.4
# !python -m spacy download en_core_web_md

# restart runtime before running notebook  

In [None]:
import spacy
from spacy import displacy
import pandas as pd

# Dependency parsing with spaCy

### Load model

In [None]:
# Load spaCy's "en_core_web_md" pipeline; it must already be installed
# (see the commented-out download command in the Colab setup cell).
nlp = spacy.load("en_core_web_md")

## Load and filter data

In [None]:
# Load the full dataset, then keep only the rows for the query under study.
df = pd.read_csv('data/dataset.csv')
dataset = df[df['query'] == 'happy+white+woman']
# Preview up to 10 random rows. The fixed seed keeps the preview stable across
# "Restart & Run All", and the cap avoids the ValueError that sample(10)
# raises when fewer than 10 rows match the query.
dataset.sample(n=min(10, len(dataset)), random_state=42)

In [None]:
# One independent frame per search engine; .copy() detaches each subset from
# `dataset` so new columns can be added to it later.
google_dataset = dataset.loc[dataset['engine'].eq('google')].copy()
bing_dataset = dataset.loc[dataset['engine'].eq('bing')].copy()

In [None]:
# Report how many rows each engine contributed.
print("Google dataset size: ", len(google_dataset))
print("Bing dataset size: ", len(bing_dataset))

In [None]:
# Keep only the first 78 Bing rows.
# NOTE(review): 78 is presumably the Google row count (to compare equal-sized
# samples) -- confirm against the sizes printed in the previous cell.
# .copy() makes the truncated frame independent of the frame it was sliced
# from, so adding columns to it later does not trigger SettingWithCopyWarning.
bing_dataset = bing_dataset.iloc[:78].copy()
print("Bing dataset size: ", bing_dataset.shape[0])

### Most common words
We will use a word cloud to visualize the most frequent words.  
https://amueller.github.io/word_cloud/generated/wordcloud.WordCloud.html

In [None]:
from wordcloud import WordCloud
import matplotlib.pyplot as plt
import itertools
from collections import Counter

In [None]:
def get_words(text):
    """Return the lowercased lemmas of the content words in ``text``.

    The text is parsed with the module-level ``nlp`` pipeline. A token is kept
    only if it is purely alphabetic and is not a stop word.
    """
    return [
        tok.lemma_.lower()
        for tok in nlp(text)
        if tok.is_alpha and not tok.is_stop
    ]

            
# Flatten the per-title lemma lists into one flat list per engine ...
google_words = [lemma for lemmas in google_dataset['title'].apply(get_words) for lemma in lemmas]
bing_words = [lemma for lemmas in bing_dataset['title'].apply(get_words) for lemma in lemmas]

# ... and tally how often each lemma occurs.
google_words_cnt = Counter(google_words)
bing_words_cnt = Counter(bing_words)

In [None]:
# Ten most frequent lemmas in the Google titles, as (lemma, count) pairs.
google_words_cnt.most_common(10)

In [None]:
# Ten most frequent lemmas in the Bing titles, as (lemma, count) pairs.
bing_words_cnt.most_common(10)

In [None]:
# Default size for every figure drawn from here on.
plt.rcParams["figure.figsize"] = (20, 10)


def draw_word_cloud(words_cnt):
    """Display a word cloud built from word frequencies.

    ``words_cnt`` maps each word to its frequency (the notebook passes a
    ``Counter``). The cloud is rendered on a white 800x400 canvas and shown
    with the axes hidden.
    """
    cloud = WordCloud(width=800, height=400, background_color="white")
    cloud.fit_words(words_cnt)  # fills the cloud in place and returns it
    plt.imshow(cloud, interpolation='bilinear')
    plt.axis("off")
    plt.show()

In [None]:
# Word cloud of the lemma frequencies in the Google titles.
draw_word_cloud(google_words_cnt)

In [None]:
# Word cloud of the lemma frequencies in the Bing titles.
draw_word_cloud(bing_words_cnt)

## Visualize dependencies 
https://spacy.io/usage/visualizers#dep

In [None]:
# displaCy rendering options, reused by the following cell as well.
options = dict(compact=True, distance=90)

# Parse one example title and draw its dependency tree.
doc = nlp('"Happy White Woman Pregnant, Black Man. Stock Photo - Image of ..."')
displacy.render(doc, style="dep", options=options)

In [None]:
# Second example title, drawn with the same `options` defined in the previous cell.
doc = nlp('"Happy Woman With Big Smile, Studio White Background ..."')
displacy.render(doc, style="dep", options=options)

## Get dependency types and dependency heads for the word 'white'
https://spacy.io/api/dependencyparser  
https://spacy.io/usage/linguistic-features#dependency-parse

In [None]:
def get_dep_type(text, word):
    """Return the dependency label of every occurrence of ``word`` in ``text``.

    ``text`` is parsed with the module-level ``nlp`` pipeline and tokens are
    matched against ``word`` case-insensitively. One label is returned per
    matching token, in document order; the list is empty when nothing matches.
    """
    return [
        tok.dep_
        for tok in nlp(text)
        if tok.text.lower() == word.lower()
    ]

def get_dep_head(text, word):
    """Return the lowercased head of every occurrence of ``word`` in ``text``.

    ``text`` is parsed with the module-level ``nlp`` pipeline and tokens are
    matched against ``word`` case-insensitively. For each matching token the
    text of its syntactic head is returned in lower case, in document order.
    """
    return [
        tok.head.text.lower()
        for tok in nlp(text)
        if tok.text.lower() == word.lower()
    ]

In [None]:
# For both engines, store the dependency labels and the heads of "white"
# found in each title as two new list-valued columns.
for frame in (google_dataset, bing_dataset):
    frame['deps'] = frame['title'].apply(get_dep_type, word='white')
    frame['deps head'] = frame['title'].apply(get_dep_head, word='white')

In [None]:
# Tally the dependency labels assigned to "white" across the Google titles.
deps = [label for title_labels in google_dataset['deps'] for label in title_labels]
Counter(deps)

In [None]:
# Same tally for the Bing titles.
deps = [label for title_labels in bing_dataset['deps'] for label in title_labels]
Counter(deps)

In [None]:
# Count how often each word appears as the head of "white" in the Bing titles.
bing_heads = [head for title_heads in bing_dataset['deps head'] for head in title_heads]
bing_cnt = Counter(bing_heads)
bing_cnt.most_common()

In [None]:
# Count how often each word appears as the head of "white" in the Google titles.
google_heads = [head for title_heads in google_dataset['deps head'] for head in title_heads]
google_cnt = Counter(google_heads)
google_cnt.most_common()

### Plot results
We will use a bar plot to show the most common heads.

In [None]:
# Rank heads by their combined count over both engines and keep the top 15.
cnt = google_cnt + bing_cnt
labels = [head for head, _ in cnt.most_common(15)]
# Per-engine counts in the same order as `labels`; a Counter yields 0 for a
# head that an engine never produced.
google_values = [google_cnt[head] for head in labels]
bing_values = [bing_cnt[head] for head in labels]

In [None]:
import matplotlib.pyplot as plt
import numpy as np


# Grouped bar chart: for each head word, one bar per engine, placed side by
# side around the tick position.
x = np.arange(len(labels))
width = 0.4

fig, ax = plt.subplots()
ax.bar(x - width / 2, google_values, width=width, label='google')
ax.bar(x + width / 2, bing_values, width=width, label='bing')
ax.set_xticks(x)
ax.set_xticklabels(labels, fontsize=15, rotation=45)
# Label both axes so the figure can be read on its own.
ax.set_xlabel("Head word", fontsize=15)
ax.set_ylabel("Number of occurrences", fontsize=15)
ax.legend(fontsize=20)
ax.set_title("Heads count for word 'white'", fontsize=15)

plt.show()

## Check other queries

In [None]:
# Repeat the head extraction for two other queries, each matched against its
# own adjective.
a_dataset = df.loc[df['query'] == 'happy+asian+woman'].copy()
b_dataset = df.loc[df['query'] == 'happy+black+woman'].copy()

a_dataset['deps head'] = a_dataset['title'].apply(get_dep_head, word='asian')
b_dataset['deps head'] = b_dataset['title'].apply(get_dep_head, word='black')

# Flatten the per-title head lists into one list per query.
a_heads = [head for title_heads in a_dataset['deps head'] for head in title_heads]
b_heads = [head for title_heads in b_dataset['deps head'] for head in title_heads]

In [None]:
# Heads of "asian" in the 'happy+asian+woman' titles, most frequent first.
a_cnt = Counter(a_heads)
a_cnt.most_common()

In [None]:
# Heads of "black" in the 'happy+black+woman' titles, most frequent first.
b_cnt = Counter(b_heads)
b_cnt.most_common()

## TODO:
* Check which noun phrases we can find in the documents. Use the `doc.noun_chunks` iterator.  
https://spacy.io/api/doc#noun_chunks