## Benchmark

*This notebook implements a variety of algorithms, and checks to see how they work on a set of test images.*

**Flickr API**

*Import the libraries used throughout the notebook (Flickr API, Google Vision, TextBlob, pandas); credentials are set up in the cells that follow.*

In [1]:
import io
import json
import os
import pprint

import flickrapi
import numpy as np
import pandas as pd
from google.cloud import vision
from google.cloud.vision import types
from PIL import Image, ImageDraw
from textblob import TextBlob

#pp = pprint.PrettyPrinter(indent=4)

### Establish connections to Flickr and Google

Establish Flickr connection

In [5]:
# Flickr API credentials. The FLICKR_API_KEY / FLICKR_API_SECRET environment
# variables take precedence so that secrets need not live in the notebook; the
# literals are kept only as a backward-compatible fallback.
# NOTE(review): these literals are stored in plain text -- rotate the key and
# drop the fallbacks once the environment variables are in place.
api_key = os.environ.get('FLICKR_API_KEY', u'37528c980c419716e0879a417ef8211c')
api_secret = os.environ.get('FLICKR_API_SECRET', u'41075654a535c203')

# establish connection; responses are requested in 'parsed-json' format and are
# indexed like nested dicts by assemble_urls
flickr = flickrapi.FlickrAPI(api_key, api_secret, format='parsed-json')

Establish Google connection

In [4]:
#os.environ["GOOGLE_APPLICATION_CREDENTIALS"] = \
#"/Users/ctoews/Documents/Insight/Project/googleAPI/MyFirstProject-76680dcd1ad6.json"

def explicit(key_path='/Users/ctoews/Documents/Insight/Project/googleAPI/MyFirstProject-76680dcd1ad6.json'):
    """Authenticate to Google Cloud Storage with a service-account key and print the buckets.

    Parameters
    ----------
    key_path : str, optional
        Path to the service-account JSON private-key file. The default is the
        path that used to be hard-coded, so a bare ``explicit()`` call behaves
        exactly as before.
    """
    # Imported here, as in the original cell, so the function is self-contained.
    from google.cloud import storage

    # Explicitly use service account credentials by specifying the private key
    # file.
    storage_client = storage.Client.from_service_account_json(key_path)

    # Make an authenticated API request
    buckets = list(storage_client.list_buckets())
    print(buckets)

# authenticate google: expose the service-account key file through the
# GOOGLE_APPLICATION_CREDENTIALS environment variable
os.environ["GOOGLE_APPLICATION_CREDENTIALS"] = (
    "/Users/ctoews/Documents/Insight/Project/googleAPI/MyFirstProject-76680dcd1ad6.json"
)

# sanity check: prints the storage buckets visible with the key file
explicit()

# Vision client plus one reusable Image message; later cells set
# image.source.image_uri on it before each label_detection request
client = vision.ImageAnnotatorClient()
image = types.Image()

[<Bucket: toews-images>]


### Get data

In [6]:
def assemble_urls(photoset):
    """Build the static-image URL of every photo in a Flickr photoset response.

    `photoset` is indexed as ``photoset['photoset']['photo']``; each photo entry
    supplies 'farm', 'server', 'id' and 'secret'. Only 'farm' is converted with
    ``str``; the other three fields are concatenated as they are.

    Returns a list of URL strings, one per photo, in the order of the response.
    """
    return [
        "".join([
            "https://farm", str(entry['farm']), ".staticflickr.com/",
            entry['server'], "/", entry['id'], "_", entry['secret'], ".jpg",
        ])
        for entry in photoset['photoset']['photo']
    ]
    
# fetch each photoset from Flickr and turn it into a list of image URLs

# "bad" photoset
badset = flickr.photosets.getPhotos(
    user_id='138072685@N02', photoset_id='72157690932631201')
bad_urls = assemble_urls(badset)

# "good" photoset
goodset = flickr.photosets.getPhotos(
    user_id='138072685@N02', photoset_id='72157690932695551')
good_urls = assemble_urls(goodset)

In [7]:
# inspect the URLs assembled for the "bad" photoset
bad_urls

['https://farm5.staticflickr.com/4613/25993550898_065d0b3880.jpg',
 'https://farm5.staticflickr.com/4764/28087291009_3d20f9a4a2.jpg',
 'https://farm5.staticflickr.com/4619/28087290939_6d2f4261b4.jpg',
 'https://farm5.staticflickr.com/4717/25993550618_14c87ffa16.jpg',
 'https://farm5.staticflickr.com/4714/28087290769_b7df9499e8.jpg',
 'https://farm5.staticflickr.com/4626/28087290719_61ed3cca32.jpg',
 'https://farm5.staticflickr.com/4625/28087290709_4dffc807d3.jpg',
 'https://farm5.staticflickr.com/4719/28087290669_a2beb02023.jpg',
 'https://farm5.staticflickr.com/4751/28087291099_93066c995e.jpg',
 'https://farm5.staticflickr.com/4629/28087290529_9c762237c2.jpg']

*Authenticate*

In [8]:
from google.cloud import storage

# Build the storage client directly from the service-account private key file.
storage_client = storage.Client.from_service_account_json(
    '/Users/ctoews/Documents/Insight/Project/googleAPI/MyFirstProject-76680dcd1ad6.json'
)

# Listing the buckets makes an authenticated API request; print what came back.
buckets = list(storage_client.list_buckets())
print(buckets)

[<Bucket: toews-images>]


*Pass photo URLs to Google Vision for labelling*

In [9]:
def describe_image(url):
    """Return the Google Vision labels for the image at `url` as one string.

    Every label description is followed by a single space, so a non-empty
    result ends with a trailing space (the same format the inline loops used to
    build). If the response carries an error message, it is printed and the
    labels that were returned (possibly none) are still joined and returned.
    """
    # Uses the notebook-level `image` message and Vision `client`.
    image.source.image_uri = url
    response = client.label_detection(image=image)

    # A failed annotation is reported in response.error instead of raising;
    # without this check it cannot be told apart from an image with no labels.
    if response.error.message:
        print("label_detection failed for {}: {}".format(url, response.error.message))

    return ''.join(label.description + ' ' for label in response.label_annotations)


# Plain `for` loops rather than comprehensions: this keeps the loop variable
# `url` defined at notebook level, as it was before.
bad_labels = []
for url in bad_urls:
    bad_labels.append(describe_image(url))

good_labels = []
for url in good_urls:
    good_labels.append(describe_image(url))

bl = pd.DataFrame(bad_labels, columns=['labels'])
gl = pd.DataFrame(good_labels, columns=['labels'])

In [12]:
# Re-run label detection for one image so the raw response can be inspected.
# The URL is chosen explicitly (last "good" photo) instead of relying on a loop
# variable left behind by an earlier cell.
url = good_urls[-1]
image.source.image_uri = url
response = client.label_detection(image=image)
labels = response.label_annotations

['', '', '', '', '', '', '', '', '', '']

In [13]:
# inspect the URLs assembled for the "good" photoset
good_urls

['https://farm5.staticflickr.com/4623/39834715572_1559b597ec.jpg',
 'https://farm5.staticflickr.com/4605/39834715692_e499c7d71f.jpg',
 'https://farm5.staticflickr.com/4630/39834715602_3314a7eaf4.jpg',
 'https://farm5.staticflickr.com/4653/39834716592_efe5420940.jpg',
 'https://farm5.staticflickr.com/4674/39834715812_c9b8157bc5.jpg',
 'https://farm5.staticflickr.com/4708/39834715942_d993de82f6.jpg',
 'https://farm5.staticflickr.com/4673/39834716042_ae01ea0ceb.jpg',
 'https://farm5.staticflickr.com/4699/39834716362_1c539bed39.jpg',
 'https://farm5.staticflickr.com/4611/39834716422_36a95d3667.jpg',
 'https://farm5.staticflickr.com/4723/39834716482_00c2ce1e07.jpg']

In [None]:
# TextBlob sentiment of every label string.
# Iterate over the label lists themselves rather than a fixed range of 10, so
# the cell works for photosets of any size.
good_sentiment = [TextBlob(text).sentiment for text in good_labels]
bad_sentiment = [TextBlob(text).sentiment for text in bad_labels]

In [None]:
# Each entry of bad_sentiment is a (polarity, subjectivity) pair; average
# column-wise so the two measures are reported separately instead of being
# blended into a single number.
np.mean(bad_sentiment, axis=0)

In [None]:
# stack the "bad" label frame on top of the "good" one, then print one label
# string per line
all_labels = pd.concat([bl, gl])
for label_text in all_labels['labels']:
    print(label_text)

**Match to poems**

In [None]:
import pandas as pd
import spacy
import pickle
import poeml_utility as pml  # project-local helper module

# load spaCy's 'en' model; `parser` is called on text in the cells below
parser = spacy.load('en')

In [None]:
parser = spacy.load('en')
allvecs = pd.read_pickle('allvecs.pkl')

# custom stop words are read from a pickle file
with open('sharespeares_stopwords.pkl', 'rb') as stopword_file:
    shakespeares_stopwords = pickle.load(stopword_file)

In [None]:
from collections import Counter, OrderedDict
from nltk.corpus import stopwords
from nltk import SnowballStemmer
import string
# A custom stoplist: NLTK's English stop words together with the pickled
# Shakespeare-specific ones
STOPLIST = set(stopwords.words('english')).union(shakespeares_stopwords)
# List of symbols we don't care about: every punctuation character on its own,
# plus a few multi-character extras
SYMBOLS = list(string.punctuation) + ["-----", "---", "...", "“", "”", "'s"]

In [None]:
# strip blanks and other terrible things from every label string
data = all_labels['labels']
data_clean = [pml.cleanText(raw_label) for raw_label in data]

In [None]:
# and convert to lemmas
def tokenizeText(sample):
    """Lemmatize `sample` with spaCy and drop stop words, symbols and blank tokens.

    Each token becomes its lower-cased, stripped lemma, except tokens whose
    lemma is the placeholder "-PRON-", which keep their lower-cased surface
    form. Tokens found in STOPLIST or SYMBOLS, and tokens equal to "", " ",
    "\\n" or "\\n\\n", are discarded.

    Returns the surviving tokens as a list of strings in document order.
    """
    blanks = ("", " ", "\n", "\n\n")

    kept = []
    for tok in parser(sample):
        # "-PRON-" is a placeholder lemma; fall back to the word itself there
        if tok.lemma_ == "-PRON-":
            word = tok.lower_
        else:
            word = tok.lemma_.lower().strip()

        # a single pass applies the stoplist, symbol and blank filters together
        if word in STOPLIST or word in SYMBOLS or word in blanks:
            continue
        kept.append(word)

    return kept


# tokenize every cleaned label string
label_token = [tokenizeText(text) for text in data_clean]

In [None]:
# recombine each token list into one space-separated string
input_label = [' '.join(tokens) for tokens in label_token]

In [None]:
import sqlalchemy # pandas-mysql interface library
import sqlalchemy.exc # exception handling
import poeml_utility as pml
from sklearn.metrics.pairwise import euclidean_distances, cosine_distances, cosine_similarity
from sklearn import preprocessing
from sklearn.preprocessing import normalize

# open the database connection through the project helper; `engine` is handed
# to pd.read_sql in the cells below
engine = pml.connect_db()

In [None]:
# parse every raw label string with spaCy
parsed_labels = [parser(text) for text in all_labels['labels']]

In [None]:
# inspect the parsed label documents
parsed_labels

In [None]:
# calculate the embeddings for the picture labels
#
# `all_labels` is pd.concat([bl, gl]), so `parsed_labels` lists the "bad"
# photos first and the "good" photos after them. Slice accordingly so that each
# array really holds the group its name says, whatever the photoset sizes are.
n_bad = len(bad_labels)

# Every parsed document already exposes its vector, so nothing is re-parsed and
# the vector width is not hard-coded; dtype=float keeps the arrays float64.
bad_pics_vecs = np.array([doc.vector for doc in parsed_labels[:n_bad]], dtype=float)
good_pics_vecs = np.array([doc.vector for doc in parsed_labels[n_bad:]], dtype=float)

In [None]:
# load every row of the sonnet_sentences table, ordered by its index column,
# and show how many rows came back
query = "select * from sonnet_sentences order by index;"
sonnet_sentences = pd.read_sql(query,engine)
len(sonnet_sentences)

In [None]:
# load every row of the poem_embeddings table, ordered by its index column,
# and show the frame's shape
query = "select * from poem_embeddings order by index;"
poem_embeddings = pd.read_sql(query,engine)
poem_embeddings.shape

In [None]:
# identify test cases (row positions chosen by hand)
bad_idx = 282
good_idx= 35

# extract relevant embeddings
# column 0 is skipped by the `1:` slice -- presumably the 'index' column that a
# later cell deletes; verify against the table schema
bad_vec = poem_embeddings.iloc[bad_idx,1:]
good_vec = poem_embeddings.iloc[good_idx,1:]

# check: print the value at column position 2 of the matching sonnet rows
# (presumably the sentence text -- confirm)
print("bad: \n",sonnet_sentences.iloc[bad_idx,2])
print("good: \n",sonnet_sentences.iloc[good_idx,2])

In [None]:
# Cosine distance from each test poem to every picture-label embedding.
# Naming: first letter = poem (b/g), second letter = picture set (b/g).
bad_query = bad_vec.values.reshape((1, -1))
good_query = good_vec.values.reshape((1, -1))

bb = cosine_distances(bad_query, bad_pics_vecs).flatten()
bg = cosine_distances(bad_query, good_pics_vecs).flatten()
gb = cosine_distances(good_query, bad_pics_vecs).flatten()
gg = cosine_distances(good_query, good_pics_vecs).flatten()

test_results = pd.DataFrame(data={'bb': bb, 'bg': bg, 'gb': gb, 'gg': gg})

In [None]:
# sign of the distance difference per picture index: -1 where the poem is
# closer to the picture set of its own kind, +1 where it is closer to the other
print("bad poem: \n",np.sign(test_results['bb']-test_results['bg']))
print("good poem: \n",np.sign(test_results['gg']-test_results['gb']))

In [None]:
# Same comparison as the cosine cell, using Euclidean distance.
# Naming: first letter = poem (b/g), second letter = picture set (b/g).
bad_query = bad_vec.values.reshape((1, -1))
good_query = good_vec.values.reshape((1, -1))

bb = euclidean_distances(bad_query, bad_pics_vecs).flatten()
bg = euclidean_distances(bad_query, good_pics_vecs).flatten()
gb = euclidean_distances(good_query, bad_pics_vecs).flatten()
gg = euclidean_distances(good_query, good_pics_vecs).flatten()

test_results = pd.DataFrame(data={'bb': bb, 'bg': bg, 'gb': gb, 'gg': gg})

In [None]:
# sign of the distance difference per picture index: -1 where the poem is
# closer to the picture set of its own kind, +1 where it is closer to the other
print("bad poem: \n",np.sign(test_results['bb']-test_results['bg']))
print("good poem: \n",np.sign(test_results['gg']-test_results['gb']))

In [None]:
# inspect the bad-poem / good-picture distances from the most recent distance cell
bg

In [None]:
# rescale every row (axis=1) of the picture embeddings with sklearn's normalize;
# the arrays are overwritten with their normalized versions
bad_pics_vecs = normalize(bad_pics_vecs,axis=1)
good_pics_vecs = normalize(good_pics_vecs,axis=1)

In [None]:
# Drop the 'index' column so only embedding columns remain. The guard makes the
# cell safe to re-run: a bare `del` raises KeyError once the column is gone.
if 'index' in poem_embeddings.columns:
    del poem_embeddings['index']

In [None]:
# Cosine distance from the "good" test poem to every poem embedding.
# good_vec is a pandas Series, so reshape its underlying array (.values), as
# the other distance cells do; Series.reshape is not available in current pandas.
dists = cosine_distances(good_vec.values.reshape((1, -1)), poem_embeddings)

In [None]:
# dists has a single row; argsort orders its columns by increasing distance,
# so the expression below shows the five smallest distances
idx=np.argsort(dists)
dists[0,idx[0][0:5]]

In [None]:
# show the five smallest distances again (same expression as the previous cell)
dists[0,idx[0][0:5]]

In [None]:
# scratch: show the repr of the loaded spaCy pipeline
parser

In [None]:
# scratch: parse a single word to look at its vector in the next cell
x=parser("god")

In [None]:
# scratch: display the vector of the single-word document
x.vector

### Play with new testset

In [None]:
# albums page of the Flickr account whose photosets are queried in this notebook
test_images_url = "https://www.flickr.com/photos/138072685@N02/albums"

In [None]:
# fetch the photo list of the new test photoset from the same Flickr account
test_flickr   = flickr.photosets.getPhotos(user_id='138072685@N02',photoset_id='72157669045554809')


In [None]:
# build the static-image URL of every photo in the test photoset
test_urls = assemble_urls(test_flickr)


In [None]:
# Label every test image with Google Vision; each image's labels are joined
# into one string in which every description is followed by a single space.
test_labels = []
for url in test_urls:
    image.source.image_uri = url
    response = client.label_detection(image=image)

    # A failed annotation is reported in response.error instead of raising;
    # without this check it cannot be told apart from an image with no labels.
    if response.error.message:
        print("label_detection failed for {}: {}".format(url, response.error.message))

    labels = response.label_annotations
    test_labels.append(''.join(label.description + ' ' for label in labels))

In [None]:
# inspect the label strings collected for the test images
test_labels

In [None]:
# load the whole quotes table and display its quoteText column
query = "select * from quotes;"
quotes = pd.read_sql(query,engine)
quotes.quoteText

In [None]:
# load pickled quote vectors (presumably embeddings of the quotes table -- confirm)
quotevecs = pd.read_pickle('quote_vecs.pkl')

In [None]:
# show the quotes whose text contains the substring 'sun'
quotes.loc[quotes.quoteText.str.contains('sun'),:]

In [None]:
# pick two quotes by row position (hand-chosen) and embed their text with spaCy
q1 = quotes.iloc[1362,:]
q2 = quotes.iloc[987,:]
q1v = parser(q1.quoteText).vector
q2v = parser(q2.quoteText).vector

In [None]:
import spacy
# NOTE(review): loads the same 'en' model that earlier cells already assign to `parser`
parser = spacy.load('en')

In [None]:
# embed the label string of the last test image
image_text = test_labels[-1]
image_vector = parser(image_text).vector

In [None]:
from sklearn.metrics.pairwise import euclidean_distances, cosine_distances, cosine_similarity


In [None]:
# cosine similarity between the test image's label vector and the second quote's vector
cosine_similarity(image_vector.reshape(1,-1),q2v.reshape(1,-1))

In [None]:
# Stack the two quote vectors as rows and show the resulting shape.
# np.array(a, b) would treat `b` as the dtype argument and fail, so the two
# row vectors are stacked with np.vstack instead.
np.vstack((q1v.reshape(1, -1), q2v.reshape(1, -1))).shape

In [None]:
# shape of the first quote vector viewed as a single row
q1v.reshape(1,-1).shape