## Test Revised Code

The purpose of this notebook is to make sure my revised PoeML app actually works.  

In [None]:
def get_path_and_file_names():
    """Return the hard-coded locations of the resources the app reads.

    Returns:
        A 7-tuple ``(root_dir, api_dir, pkl_dir, api_file, poem_file,
        vec_file, vectorizer_file)``. The directory pieces and ``api_file``
        carry their own leading slash; callers join them by concatenation.
    """
    locations = (
        "/Users/ctoews/Documents/Insight/app_demo",  # root_dir
        "/flaskexample/static/api",                  # api_dir
        "/flaskexample/static/pkl",                  # pkl_dir
        "/MyFirstProject-76680dcd1ad6.json",         # api_file
        "df1_smallpoems.pkl",                        # poem_file
        "df1_vecs.pkl",                              # vec_file
        "d1_vectorizer_replacement.pkl",             # vectorizer_file
    )
    return locations

 
def get_runtime_parameters():
    """Return the default run settings as ``(n_matches_per_photo, lam, batch)``."""
    settings = {
        "n_matches_per_photo": 3,  # number of matches requested per photo
        "lam": .1,                 # regularization parameter
        "batch": False,            # True: average all images into one query
    }
    return settings["n_matches_per_photo"], settings["lam"], settings["batch"]


def get_pkl_files(root_dir,pkl_dir,poem_file,vec_file,vectorizer_file):
    """Load the pickled poems, poem vectors and vectorizer from disk.

    Args:
        root_dir: project root directory.
        pkl_dir: pickle sub-directory, appended directly to ``root_dir``.
        poem_file: file name of the pickled poem DataFrame.
        vec_file: file name of the pickled poem-vector DataFrame.
        vectorizer_file: file name of the pickled vectorizer object.

    Returns:
        ``(df_poems, df_vecs, vectorizer)``.
    """
    import pickle
    import pandas as pd

    pkl_path = root_dir + pkl_dir + '/'
    df_poems = pd.read_pickle(pkl_path + poem_file)
    df_vecs = pd.read_pickle(pkl_path + vec_file)

    # Use a context manager so the handle is closed even if unpickling fails
    # (the original left the file object open).
    # NOTE: unpickling executes arbitrary code; only load trusted files.
    with open(pkl_path + vectorizer_file, "rb") as handle:
        vectorizer = pickle.load(handle)

    return df_poems, df_vecs, vectorizer



def get_stopwords():
    """Return ``(STOPLIST, SYMBOLS)`` used to filter tokens.

    STOPLIST is NLTK's English stop-word list. SYMBOLS is every ASCII
    punctuation character, a few extra dash/quote/ellipsis strings, and the
    ten digit characters.
    """
    import string
    from nltk.corpus import stopwords

    extra_marks = ["-----", "--", "---", "...", "“", "”", "'s"]
    # One entry per punctuation character, then the extras, then the digits.
    symbols = list(string.punctuation) + extra_marks + list(string.digits)
    stoplist = stopwords.words('english')
    return stoplist, symbols


def tokenizeText(sample):
    """Tokenize *sample*, lemmatize it, and drop stop words and symbols.

    Relies on a module-level callable named ``parser`` (the spaCy pipeline)
    and on ``get_stopwords()`` from this file.

    Args:
        sample: text to tokenize.

    Returns:
        List of lower-cased lemmas (pronouns keep their lower-cased surface
        form), in document order, with stop words, symbols and
        whitespace-only tokens removed.
    """
    global STOPLIST
    global SYMBOLS

    # Fix: the original stored the stop words in ``STOPWORDS`` but filtered
    # against ``STOPLIST``, which was never assigned anywhere.
    STOPLIST, SYMBOLS = get_stopwords()

    # One set lookup replaces the two list scans and the repeated
    # ``while x in tokens: tokens.remove(x)`` passes.
    rejected = set(STOPLIST) | set(SYMBOLS) | {"", " ", "\n", "\n\n"}

    # spaCy marks pronoun lemmas as "-PRON-"; keep the surface form for those.
    lemmas = [
        tok.lemma_.lower().strip() if tok.lemma_ != "-PRON-" else tok.lower_
        for tok in parser(sample)
    ]

    return [lemma for lemma in lemmas if lemma not in rejected]



# extract image urls from information in photoset object (returned from Flickr api call)
def assemble_urls(photoset):
    """Build one static-Flickr .jpg URL per photo in a photoset response.

    Args:
        photoset: mapping with a ``['photoset']['photo']`` list whose items
            provide ``farm``, ``server``, ``id`` and ``secret``.

    Returns:
        List of URL strings, in the order the photos are listed.
    """
    return [
        "".join([
            "https://farm", str(entry['farm']), ".staticflickr.com/",
            entry['server'], "/", entry['id'], "_", entry['secret'], ".jpg",
        ])
        for entry in photoset['photoset']['photo']
    ]



# extract userid and albumid from Flickr album url (used to form image urls)
def parse_url(url):
    """Extract the user id and album id from a Flickr album URL.

    Args:
        url: URL of the form ``.../photos/<userid>/albums/<albumid>``.

    Returns:
        ``(userid, albumid)``; a part that cannot be found is returned as
        the empty string.
    """
    import re

    user_match = re.search(r'photos/(.+?)/', url)
    # Stop at the next '/', '?' or '#': the original ``(.*)`` swallowed a
    # trailing slash, a "/with/<photo>" suffix or a query string into the id.
    album_match = re.search(r'albums/([^/?#]+)', url)

    userid = user_match.group(1) if user_match else ''
    albumid = album_match.group(1) if album_match else ''

    return userid, albumid


def get_flickr_urls(url):
    """Return the image URLs of every photo in a Flickr album.

    Args:
        url: Flickr album URL; user and album ids are taken from it with
            ``parse_url``.

    Returns:
        List of image URLs built by ``assemble_urls``.
    """
    import os

    import flickrapi

    # SECURITY: credentials should not live in source control. The
    # FLICKR_API_KEY / FLICKR_API_SECRET environment variables take
    # precedence; the literals remain only as a backward-compatible fallback
    # and should be rotated and removed.
    api_key = os.environ.get('FLICKR_API_KEY', u'37528c980c419716e0879a417ef8211c')
    api_secret = os.environ.get('FLICKR_API_SECRET', u'41075654a535c203')

    # establish connection
    flickr = flickrapi.FlickrAPI(api_key, api_secret, format='parsed-json')

    # extract user and album id
    userid, albumid = parse_url(url)

    # fetch album info
    albuminfo = flickr.photosets.getPhotos(user_id=userid, photoset_id=albumid)

    # extract individual photo urls
    return assemble_urls(albuminfo)



def get_photo_urls(url):
    """Turn the user's input into a list of image URLs.

    Args:
        url: either a Flickr album URL or a comma-separated list of image
            URLs.

    Returns:
        List of image URLs. For a comma-separated list, surrounding
        whitespace is stripped and empty entries are dropped, so input such
        as ``"a.jpg, b.jpg,"`` yields two clean URLs.
    """
    # input could be a Flickr photo album url
    if 'www.flickr.com/photos/' in url:
        return get_flickr_urls(url)

    # or a list of image jpegs
    cleaned = (part.strip() for part in url.split(','))
    return [part for part in cleaned if part]



# connect to google api
def explicit(root_dir, api_dir, api_file):
    """Authenticate to Google Cloud Storage and print the visible buckets.

    The service-account key file is located by concatenating ``root_dir``,
    ``api_dir``, ``'/'`` and ``api_file``.
    """
    from google.cloud import storage

    key_path = root_dir + api_dir + '/' + api_file

    # Build the client straight from the private key file.
    storage_client = storage.Client.from_service_account_json(key_path)

    # Listing buckets makes one authenticated API request.
    print(list(storage_client.list_buckets()))



def get_labels_for_remote_images(photo_urls, root_dir, api_dir, api_file):
    """Label each remote image with Google Cloud Vision.

    Args:
        photo_urls: list of image URLs.
        root_dir, api_dir, api_file: pieces of the credential file path.

    Returns:
        DataFrame with columns ``keywords`` (label descriptions, each
        followed by a space) and ``url``; rows whose keyword string is empty
        are removed.
    """
    import os
    from google.cloud import vision
    from google.cloud.vision import types
    import pandas as pd

    # authenticate
    os.environ["GOOGLE_APPLICATION_CREDENTIALS"] = \
        root_dir+ api_dir + '/' + api_file
    explicit(root_dir, api_dir, api_file)

    # connect to Google api; a single Image message is reused for every url
    client = vision.ImageAnnotatorClient()
    image = types.Image()

    def describe(uri):
        # point the shared message at this url and collect its labels
        image.source.image_uri = uri
        annotations = client.label_detection(image=image).label_annotations
        return ''.join(note.description + ' ' for note in annotations)

    all_labels = [describe(uri) for uri in photo_urls]

    # store labels as dataframe
    df_all_labels = pd.DataFrame({'keywords':all_labels,'url':photo_urls})

    # eliminate any photo that came back with zero labels
    has_labels = df_all_labels.keywords.apply(len) != 0
    return df_all_labels.loc[has_labels]


def get_labels_for_local_images(photo_urls, root_dir, api_dir, api_file):
    """Label each local image file with Google Cloud Vision.

    The original sent the local path as ``image_uri`` and used ``pd``
    without importing it. The file bytes are now read and passed as
    ``content``, the same way ``get_labels_for_images`` in this notebook
    handles ``image_location == 'local'``.

    Args:
        photo_urls: list of local image file paths.
        root_dir, api_dir, api_file: pieces of the credential file path.

    Returns:
        DataFrame with one column, ``labels``, holding the space-terminated
        label descriptions for each image.
    """
    import io
    import os
    from google.cloud import vision
    from google.cloud.vision import types
    import pandas as pd

    # authenticate
    os.environ["GOOGLE_APPLICATION_CREDENTIALS"] = \
        root_dir + api_dir + '/' + api_file

    explicit(root_dir, api_dir, api_file)

    # connect to Google api
    client = vision.ImageAnnotatorClient()

    # send each file's bytes to Google, extract labels
    all_labels = []
    for path in photo_urls:
        with io.open(path, 'rb') as image_file:
            content = image_file.read()
        image = types.Image(content=content)
        response = client.label_detection(image=image)
        all_labels.append(
            ''.join(label.description + ' ' for label in response.label_annotations))

    # store labels as dataframe
    return pd.DataFrame(all_labels, columns=['labels'])



def extract_n_top_words_from_poem(poem_vector,feature_names,ntopwords=10):
    """Return the highest-weighted words of one sparse document row.

    Args:
        poem_vector: a sparse row exposing ``.indices`` and ``.data``.
        feature_names: sequence mapping a column index to its word.
        ntopwords: maximum number of words to return (default 10).

    Returns:
        ``(keywords, weights)``: the words in order of decreasing weight and
        a NumPy array of the matching weights.
    """
    import numpy as np

    indices = poem_vector.indices
    data = poem_vector.data

    # Rank stored entries by weight, largest first. The original slice
    # ``[:-ntopwords:-1]`` stopped one short and returned ntopwords - 1 items.
    rank_idx = data.argsort()[::-1][:ntopwords]

    # form list of such words and return it, along with weights
    keywords = [feature_names[indices[i]] for i in rank_idx]
    weights = data[rank_idx]

    return keywords, np.array(weights)


# transform the image labels with the vectorizer
def weight_labels(df_all_labels, vectorizer):
    """Replace each image's raw label string with its top weighted keywords.

    Args:
        df_all_labels: DataFrame with a ``keywords`` column of label strings.
            It is no longer modified in place; the original overwrote the
            caller's ``keywords`` column.
        vectorizer: fitted vectorizer exposing ``transform`` and a
            feature-name accessor.

    Returns:
        A new DataFrame with ``keywords`` (list of words) and ``weights``
        (array of weights) per image; rows with no keywords are removed.
    """
    import spacy
    import pandas as pd

    # ``get_feature_names`` is absent from newer scikit-learn releases;
    # fall back to ``get_feature_names_out`` when it is missing.
    if hasattr(vectorizer, 'get_feature_names'):
        feature_names = vectorizer.get_feature_names()
    else:
        feature_names = vectorizer.get_feature_names_out()

    # NOTE(review): kept from the original, whose comment says the vectorizer
    # seems to need the parser for tokenizing. As a local it is not visible
    # to other functions; confirm whether this load can be dropped.
    parser = spacy.load('en')

    image_words = []
    image_weights = []
    for row in vectorizer.transform(df_all_labels['keywords'].tolist()):
        kw, wt = extract_n_top_words_from_poem(row, feature_names)
        image_words.append(kw)
        image_weights.append(wt)

    # Work on a copy so the caller's frame keeps its raw label strings.
    df_images = df_all_labels.copy()
    df_images['keywords'] = image_words
    df_images['weights'] = image_weights

    # eliminate any photo that came back with zero keywords
    return df_images.loc[df_images.keywords.apply(len) != 0]



def images2vec(df_images):
    """Embed every image as the weighted sum of its keyword vectors.

    Args:
        df_images: DataFrame with ``keywords`` and ``weights`` per row.

    Returns:
        Array of shape ``(len(df_images), 384)``; row j is
        ``dot(weights_j, keyword_vectors_j)``.
    """
    import spacy
    import pandas as pd
    import numpy as np

    # load parser, whose document vectors provide the embeddings
    parser = spacy.load('en')

    image_vectors = np.zeros((len(df_images),384))
    for row_no, record in enumerate(df_images.itertuples()):
        # one row per keyword, filled with that keyword's vector
        word_vecs = np.zeros((len(record.keywords),384))
        for word_no, word in enumerate(record.keywords):
            word_vecs[word_no,:] = parser(word).vector
        image_vectors[row_no,:] = np.dot(record.weights, word_vecs)

    return image_vectors



def find_best_match(image_vectors, poem_vectors, image_sentiment, poem_sentiment,n_matches_per_photo=3,batch=True,lam=.1):
    """Rank poems for each image by sentiment-regularized cosine similarity.

    The score of an image/poem pair is
    ``cosine_similarity - lam * |image_sentiment - poem_sentiment|``.

    Args:
        image_vectors: 2-D array, one row per image.
        poem_vectors: 2-D array, one row per poem.
        image_sentiment: one sentiment value per image.
        poem_sentiment: one sentiment value per poem.
        n_matches_per_photo: number of top poems to keep per image.
        batch: when true, average all images (vectors and sentiment) into a
            single query.
        lam: weight of the sentiment penalty.

    Returns:
        ``(ix, scores)``: positions of the best poems per image, best first,
        and the matching scores.
    """
    import numpy as np
    from sklearn.metrics.pairwise import cosine_similarity

    if batch:
        # ``-1`` keeps whatever embedding width the input has instead of the
        # hard-coded 384.
        image_vectors = np.mean(image_vectors,axis=0).reshape(1,-1)
        image_sentiment = [np.mean(image_sentiment)]

    # assess the cosine similarity for each image/poem pair
    sim = cosine_similarity(image_vectors,poem_vectors)

    # absolute sentiment gap for every image/poem pair, via broadcasting
    image_s = np.asarray(image_sentiment, dtype=float).reshape(-1, 1)
    poem_s = np.asarray(poem_sentiment, dtype=float).reshape(1, -1)
    dif = np.abs(image_s - poem_s)

    # the net score is a weighted difference
    net = sim - lam*dif

    # reversed slice of the ascending argsort = top scores, best first
    ix = net.argsort(axis=1)[:,:-n_matches_per_photo-1:-1]
    scores = np.take_along_axis(net, ix, axis=1)

    return ix, scores



def gather_results(ix,scores,df_images,df_poems,photo_urls):
    """Assemble the matched poems and image metadata into one DataFrame.

    Args:
        ix: 2-D integer array; row i holds the ``df_poems`` labels of the
            poems matched to query i, best first.
        scores: unused; kept for interface compatibility.
        df_images: DataFrame with ``url``, ``keywords``, ``weights`` and
            ``sentiment`` columns, row i belonging to query i by position.
        df_poems: DataFrame with a ``poem`` column.
        photo_urls: unused; kept for interface compatibility.

    Returns:
        DataFrame with poem columns ``'1'``, ``'2'``, ... plus ``url``,
        ``keywords``, ``weights`` and ``sentiment``. With one result row per
        image the metadata is copied row for row; otherwise (batch mode) the
        image with the most keywords supplies the metadata.
    """
    import pandas as pd
    import numpy as np

    # gather top N poems (for each picture, or for the "average" picture)
    poem_cols = [str(rank) for rank in range(1, ix.shape[1] + 1)]
    results = pd.DataFrame(
        [df_poems.loc[row, 'poem'].tolist() for row in ix],
        columns=poem_cols)

    # Rows of ``results`` correspond to rows of ``df_images`` by position.
    # Upstream filtering can leave gaps in the image index, so renumber it
    # before any label-based alignment (the original lost or misplaced rows).
    images = df_images.reset_index(drop=True)
    meta_cols = ['url','keywords','weights','sentiment']

    # collect image urls and keywords
    if len(results) == len(images):
        for col in meta_cols:
            results[col] = images[col]

    # if in batchmode, use the image with the most keywords
    else:
        lengths = images['keywords'].apply(len)
        best = int(np.argmax(lengths.values))
        results['url'] = images.loc[best, 'url']
        results['keywords'] = [images.loc[best, 'keywords']]
        results['weights'] = [images.loc[best, 'weights']]
        results['sentiment'] = images.loc[best, 'sentiment']

    return results


def ModelIt(url):
    """Match poems to the images referenced by *url*.

    Args:
        url: a Flickr album URL or a comma-separated list of image URLs.

    Returns:
        A list of dict records (one per result row of ``gather_results``),
        or ``-1`` when no image yields usable labels. The original computed
        the results but never returned them.
    """
    from textblob import TextBlob

    # load up path and file names, as well as runtime parameters
    root_dir, api_dir, pkl_dir, api_file, poem_file, vec_file, vectorizer_file =\
        get_path_and_file_names()
    n_matches_per_photo, lam, batch = get_runtime_parameters()

    # some of the larger data structures are stored in binary form, to expedite runtime
    df_poems, df_vecs, vectorizer = get_pkl_files(root_dir,pkl_dir,poem_file,vec_file,vectorizer_file)
    poem_vectors = df_vecs.values

    # Set the variable "photo_urls", which is a list of urls of all images
    photo_urls = get_photo_urls(url)

    # Connect to Google-Cloud-Vision API and extract labels for each image
    df_all_labels = get_labels_for_remote_images(photo_urls, root_dir, api_dir, api_file)

    # nothing was labeled: exit before handing an empty batch to the vectorizer
    if len(df_all_labels) == 0:
        return -1

    # weight the keywords by the vectorizer used to process the poetry text
    df_images = weight_labels(df_all_labels, vectorizer)

    # if after extracting and weighting labels, nothing remains, exit gracefully
    if len(df_images) == 0:
        return -1

    # append sentiment analysis for each image (on a copy, so the column is
    # not written into a slice of another frame)
    df_images = df_images.copy()
    df_images['sentiment'] = [TextBlob(' '.join(x)).sentiment[0] for x in df_images.keywords]

    # embed image vectors via word2vec
    image_vectors = images2vec(df_images)

    # return sorted scores; forward every runtime parameter (the original
    # passed only ``batch``, so ``n_matches_per_photo`` and ``lam`` were ignored)
    ix, scores = find_best_match(image_vectors, poem_vectors,
                                 df_images['sentiment'], df_poems['sentiment'],
                                 n_matches_per_photo=n_matches_per_photo,
                                 batch=batch, lam=lam)

    # gather all relevant info into a dataframe
    results = gather_results(ix,scores,df_images,df_poems,photo_urls)

    # return a list of dict records, one per result row
    return results.to_dict('records')


In [None]:
from PIL import Image, ImageDraw
import pandas as pd
import spacy
import numpy as np
from textblob import TextBlob

# Step-by-step replay of the ModelIt pipeline at notebook top level, so the
# intermediate objects stay available for inspection.
# NOTE: `url`, `vectorizer`, `poem_vectors` and `df_poems` are not assigned in
# this cell; they must already exist in the notebook session.

# load up path and file names, as well as runtime parameters
root_dir, api_dir, pkl_dir, api_file, poem_file, vec_file, vectorizer_file =\
get_path_and_file_names()
n_matches_per_photo, lam, batch = get_runtime_parameters()
# resolve the input into a list of image urls
photo_urls = get_photo_urls(url)
# label every image through the Google Vision helper
df_all_labels = get_labels_for_remote_images(photo_urls, root_dir, api_dir, api_file)
# keep the top weighted keywords per image
df_images = weight_labels(df_all_labels, vectorizer)
# polarity (first element of TextBlob's sentiment) of the joined keywords
df_images['sentiment'] = [TextBlob(' '.join(x)).sentiment[0] for x in df_images.keywords]
image_vectors = images2vec(df_images)
# only `batch` is forwarded; n_matches_per_photo and lam fall back to the
# defaults of find_best_match
ix, scores = find_best_match(image_vectors, poem_vectors, df_images['sentiment'], df_poems['sentiment'],batch=batch)

In [None]:
results = gather_results(ix,scores,df_images,df_poems,photo_urls)

In [None]:
df_all_labels

In [None]:
df_poems.loc[2719,'keywords']

In [None]:
#url="https://www.flickr.com/photos/138072685@N02/albums/72157691244283801"
url = "http://sites.psu.edu/mgeppingerpassionblog/wp-content/uploads/sites/32731/2015/09/roads-diverging.jpg,https://i.ytimg.com/vi/opKg3fyqWt4/hqdefault.jpg"

In [None]:
results = ModelIt(url)
results


In [None]:
results

In [None]:
results[0]['1']

## Test newly revised code

In [20]:
# this file contains the backend to PoeML

def get_path_and_file_names():
    """Return the hard-coded locations of the resources the app reads.

    Returns:
        A 7-tuple ``(root_dir, api_dir, pkl_dir, api_file, poem_file,
        vec_file, vectorizer_file)``. The directory pieces and ``api_file``
        carry their own leading slash; callers join them by concatenation.
    """
    # alternative root noted in the original: "/home/ubuntu/app_demo"
    locations = (
        "/Users/ctoews/Documents/Insight/app_demo",  # root_dir
        "/flaskexample/static/api",                  # api_dir
        "/flaskexample/static/pkl",                  # pkl_dir
        "/MyFirstProject-76680dcd1ad6.json",         # api_file
        "df1_smallpoems.pkl",                        # poem_file
        "df1_vecs.pkl",                              # vec_file
        "d1_vectorizer_replacement.pkl",             # vectorizer_file
    )
    return locations


# def get_runtime_parameters():
#     n_matches_per_photo = 3 # maximum number of images to return
#     lam = .1        # regularization parameter
#     batch = True    # use averaging technique to handle multiple images
#     return n_matches_per_photo, lam, batch


def get_pkl_files(root_dir,pkl_dir,poem_file,vec_file,vectorizer_file):
    """Load the pickled poems, poem vectors and vectorizer from disk.

    Args:
        root_dir: project root directory.
        pkl_dir: pickle sub-directory, appended directly to ``root_dir``.
        poem_file: file name of the pickled poem DataFrame.
        vec_file: file name of the pickled poem-vector DataFrame.
        vectorizer_file: file name of the pickled vectorizer object.

    Returns:
        ``(df_poems, df_vecs, vectorizer)``.
    """
    import pickle
    import pandas as pd

    pkl_path = root_dir + pkl_dir + '/'
    df_poems = pd.read_pickle(pkl_path + poem_file)
    df_vecs = pd.read_pickle(pkl_path + vec_file)

    # Use a context manager so the handle is closed even if unpickling fails
    # (the original left the file object open).
    # NOTE: unpickling executes arbitrary code; only load trusted files.
    with open(pkl_path + vectorizer_file, "rb") as handle:
        vectorizer = pickle.load(handle)

    return df_poems, df_vecs, vectorizer



def get_stopwords():
    """Return ``(STOPLIST, SYMBOLS)`` used to filter tokens.

    STOPLIST is NLTK's English stop-word list. SYMBOLS is every ASCII
    punctuation character, a few extra dash/quote/ellipsis strings, and the
    ten digit characters.
    """
    import string
    from nltk.corpus import stopwords

    extra_marks = ["-----", "--", "---", "...", "“", "”", "'s"]
    # One entry per punctuation character, then the extras, then the digits.
    symbols = list(string.punctuation) + extra_marks + list(string.digits)
    stoplist = stopwords.words('english')
    return stoplist, symbols


def tokenizeText(sample):
    """Tokenize *sample*, lemmatize it, and drop stop words and symbols.

    Relies on a module-level callable named ``parser`` (the spaCy pipeline)
    and on ``get_stopwords()`` from this file.

    Args:
        sample: text to tokenize.

    Returns:
        List of lower-cased lemmas (pronouns keep their lower-cased surface
        form), in document order, with stop words, symbols and
        whitespace-only tokens removed.
    """
    global STOPLIST
    global SYMBOLS

    # Fix: the original stored the stop words in ``STOPWORDS`` but filtered
    # against ``STOPLIST``, which was never assigned anywhere.
    STOPLIST, SYMBOLS = get_stopwords()

    # One set lookup replaces the two list scans and the repeated
    # ``while x in tokens: tokens.remove(x)`` passes.
    rejected = set(STOPLIST) | set(SYMBOLS) | {"", " ", "\n", "\n\n"}

    # spaCy marks pronoun lemmas as "-PRON-"; keep the surface form for those.
    lemmas = [
        tok.lemma_.lower().strip() if tok.lemma_ != "-PRON-" else tok.lower_
        for tok in parser(sample)
    ]

    return [lemma for lemma in lemmas if lemma not in rejected]



# extract image urls from information in photoset object (returned from Flickr api call)
def assemble_urls(photoset):
    """Build one static-Flickr .jpg URL per photo in a photoset response.

    Args:
        photoset: mapping with a ``['photoset']['photo']`` list whose items
            provide ``farm``, ``server``, ``id`` and ``secret``.

    Returns:
        List of URL strings, in the order the photos are listed.
    """
    return [
        "".join([
            "https://farm", str(entry['farm']), ".staticflickr.com/",
            entry['server'], "/", entry['id'], "_", entry['secret'], ".jpg",
        ])
        for entry in photoset['photoset']['photo']
    ]



# extract userid and albumid from Flickr album url (used to form image urls)
def parse_url(url):
    """Extract the user id and album id from a Flickr album URL.

    Args:
        url: URL of the form ``.../photos/<userid>/albums/<albumid>``.

    Returns:
        ``(userid, albumid)``; a part that cannot be found is returned as
        the empty string.
    """
    import re

    user_match = re.search(r'photos/(.+?)/', url)
    # Stop at the next '/', '?' or '#': the original ``(.*)`` swallowed a
    # trailing slash, a "/with/<photo>" suffix or a query string into the id.
    album_match = re.search(r'albums/([^/?#]+)', url)

    userid = user_match.group(1) if user_match else ''
    albumid = album_match.group(1) if album_match else ''

    return userid, albumid


def get_flickr_urls(url):
    """Return the image URLs of every photo in a Flickr album.

    Args:
        url: Flickr album URL; user and album ids are taken from it with
            ``parse_url``.

    Returns:
        List of image URLs built by ``assemble_urls``.
    """
    import os

    import flickrapi

    # SECURITY: credentials should not live in source control. The
    # FLICKR_API_KEY / FLICKR_API_SECRET environment variables take
    # precedence; the literals remain only as a backward-compatible fallback
    # and should be rotated and removed.
    api_key = os.environ.get('FLICKR_API_KEY', u'37528c980c419716e0879a417ef8211c')
    api_secret = os.environ.get('FLICKR_API_SECRET', u'41075654a535c203')

    # establish connection
    flickr = flickrapi.FlickrAPI(api_key, api_secret, format='parsed-json')

    # extract user and album id
    userid, albumid = parse_url(url)

    # fetch album info
    albuminfo = flickr.photosets.getPhotos(user_id=userid, photoset_id=albumid)

    # extract individual photo urls
    return assemble_urls(albuminfo)



def get_photo_urls(url):
    """Turn the user's input into a list of image locations.

    Args:
        url: either a Flickr album URL or a comma-separated list of image
            URLs or local file names.

    Returns:
        List of image locations. For a comma-separated list, surrounding
        whitespace is stripped and empty entries are dropped, so input such
        as ``"a.jpg, b.jpg,"`` yields two clean entries.
    """
    # input could be a Flickr photo album url
    if 'www.flickr.com/photos/' in url:
        return get_flickr_urls(url)

    # or a list of image jpeg urls, or even local filenames
    cleaned = (part.strip() for part in url.split(','))
    return [part for part in cleaned if part]



# connect to google api
def explicit(root_dir, api_dir, api_file):
    """Authenticate to Google Cloud Storage and print the visible buckets.

    The service-account key file is located by concatenating ``root_dir``,
    ``api_dir``, ``'/'`` and ``api_file``.
    """
    from google.cloud import storage

    key_path = root_dir + api_dir + '/' + api_file

    # Build the client straight from the private key file.
    storage_client = storage.Client.from_service_account_json(key_path)

    # Listing buckets makes one authenticated API request.
    print(list(storage_client.list_buckets()))



def get_labels_for_images(photo_urls, root_dir, api_dir, api_file,image_location):
    """Label each image, remote or local, with Google Cloud Vision.

    Args:
        photo_urls: list of image URLs (remote) or file paths (local).
        root_dir, api_dir, api_file: pieces of the credential file path.
        image_location: ``'remote'`` to pass each entry as an image URI,
            ``'local'`` to read each entry as a file and send its bytes.

    Returns:
        DataFrame with columns ``keywords`` (label descriptions, each
        followed by a space) and ``url``; rows whose keyword string is empty
        are removed. Any other ``image_location`` yields an empty frame with
        the same two columns.
    """
    import os, io
    from google.cloud import vision
    from google.cloud.vision import types
    import pandas as pd

    # authenticate
    os.environ["GOOGLE_APPLICATION_CREDENTIALS"] = \
        root_dir+ api_dir + '/' + api_file
    explicit(root_dir, api_dir, api_file)

    # connect to Google api
    client = vision.ImageAnnotatorClient()

    # Unknown location: return a well-formed empty frame. The original built
    # it from lists of different lengths, which raises ValueError.
    if image_location not in ('remote', 'local'):
        return pd.DataFrame({'keywords': [], 'url': []})

    # feed each photo to Google, extract labels
    all_labels = []
    for url in photo_urls:
        # different syntax for remote and local images
        if image_location == 'remote':
            image = types.Image()
            image.source.image_uri = url
        else:
            with io.open(url, 'rb') as image_file:
                content = image_file.read()
            image = types.Image(content=content)

        # get and parse labels
        response = client.label_detection(image=image)
        all_labels.append(
            ''.join(label.description + ' ' for label in response.label_annotations))

    # store labels as dataframe
    df_all_labels = pd.DataFrame({'keywords':all_labels,'url':photo_urls})

    # eliminate any photo that came back with zero labels
    return df_all_labels.loc[df_all_labels.keywords.apply(len) != 0]



def extract_n_top_words_from_poem(poem_vector,feature_names,ntopwords=10):
    """Return the highest-weighted words of one sparse document row.

    Args:
        poem_vector: a sparse row exposing ``.indices`` and ``.data``.
        feature_names: sequence mapping a column index to its word.
        ntopwords: maximum number of words to return (default 10).

    Returns:
        ``(keywords, weights)``: the words in order of decreasing weight and
        a NumPy array of the matching weights.
    """
    import numpy as np

    indices = poem_vector.indices
    data = poem_vector.data

    # Rank stored entries by weight, largest first. The original slice
    # ``[:-ntopwords:-1]`` stopped one short and returned ntopwords - 1 items.
    rank_idx = data.argsort()[::-1][:ntopwords]

    # form list of such words and return it, along with weights
    keywords = [feature_names[indices[i]] for i in rank_idx]
    weights = data[rank_idx]

    return keywords, np.array(weights)


# transform the image labels with the vectorizer
def weight_labels(df_all_labels, vectorizer):
    """Replace each image's raw label string with its top weighted keywords.

    Args:
        df_all_labels: DataFrame with a ``keywords`` column of label strings.
            It is no longer modified in place; the original overwrote the
            caller's ``keywords`` column.
        vectorizer: fitted vectorizer exposing ``transform`` and a
            feature-name accessor.

    Returns:
        A new DataFrame with ``keywords`` (list of words) and ``weights``
        (array of weights) per image; rows with no keywords are removed.
    """
    import spacy
    import pandas as pd

    # ``get_feature_names`` is absent from newer scikit-learn releases;
    # fall back to ``get_feature_names_out`` when it is missing.
    if hasattr(vectorizer, 'get_feature_names'):
        feature_names = vectorizer.get_feature_names()
    else:
        feature_names = vectorizer.get_feature_names_out()

    # NOTE(review): kept from the original, whose comment says the vectorizer
    # seems to need the parser for tokenizing. As a local it is not visible
    # to other functions; confirm whether this load can be dropped.
    parser = spacy.load('en')

    image_words = []
    image_weights = []
    for row in vectorizer.transform(df_all_labels['keywords'].tolist()):
        kw, wt = extract_n_top_words_from_poem(row, feature_names)
        image_words.append(kw)
        image_weights.append(wt)

    # Work on a copy so the caller's frame keeps its raw label strings.
    df_images = df_all_labels.copy()
    df_images['keywords'] = image_words
    df_images['weights'] = image_weights

    # eliminate any photo that came back with zero keywords
    return df_images.loc[df_images.keywords.apply(len) != 0]



def images2vec(df_images):
    """Embed every image as the weighted sum of its keyword vectors.

    Args:
        df_images: DataFrame with ``keywords`` and ``weights`` per row.

    Returns:
        Array of shape ``(len(df_images), 384)``; row j is
        ``dot(weights_j, keyword_vectors_j)``.
    """
    import spacy
    import pandas as pd
    import numpy as np

    # load parser, whose document vectors provide the embeddings
    parser = spacy.load('en')

    image_vectors = np.zeros((len(df_images),384))
    for row_no, record in enumerate(df_images.itertuples()):
        # one row per keyword, filled with that keyword's vector
        word_vecs = np.zeros((len(record.keywords),384))
        for word_no, word in enumerate(record.keywords):
            word_vecs[word_no,:] = parser(word).vector
        image_vectors[row_no,:] = np.dot(record.weights, word_vecs)

    return image_vectors



def find_best_match(image_vectors, poem_vectors, image_sentiment, poem_sentiment,n_matches_per_photo=3,batch=False,lam=0.1,gamma=0.0):
    """Rank poems for each image by sentiment-regularized cosine similarity.

    The score of an image/poem pair is
    ``cosine_similarity - lam * |image_sentiment - poem_sentiment|``.

    Args:
        image_vectors: 2-D array, one row per image.
        poem_vectors: 2-D array, one row per poem.
        image_sentiment: one sentiment value per image.
        poem_sentiment: one sentiment value per poem.
        n_matches_per_photo: number of top poems to keep per image.
        batch: when true, average all images (vectors and sentiment) into a
            single query.
        lam: weight of the sentiment penalty.
        gamma: accepted for interface compatibility; not used in the score.

    Returns:
        ``(ix, scores)``: positions of the best poems per image, best first,
        and the matching scores.
    """
    import numpy as np
    from sklearn.metrics.pairwise import cosine_similarity

    if batch:
        # ``-1`` keeps whatever embedding width the input has instead of the
        # hard-coded 384.
        image_vectors = np.mean(image_vectors,axis=0).reshape(1,-1)
        image_sentiment = [np.mean(image_sentiment)]

    # assess the cosine similarity for each image/poem pair
    sim = cosine_similarity(image_vectors,poem_vectors)

    # absolute sentiment gap for every image/poem pair, via broadcasting
    image_s = np.asarray(image_sentiment, dtype=float).reshape(-1, 1)
    poem_s = np.asarray(poem_sentiment, dtype=float).reshape(1, -1)
    dif = np.abs(image_s - poem_s)

    # the net score is a weighted difference
    net = sim - lam*dif

    # reversed slice of the ascending argsort = top scores, best first
    ix = net.argsort(axis=1)[:,:-n_matches_per_photo-1:-1]
    scores = np.take_along_axis(net, ix, axis=1)

    return ix, scores



def gather_results(ix,scores,df_images,df_poems,photo_urls):
    """Assemble the matched poems and image metadata into one DataFrame.

    Args:
        ix: 2-D integer array; row i holds the ``df_poems`` labels of the
            poems matched to query i, best first.
        scores: unused; kept for interface compatibility.
        df_images: DataFrame with ``url``, ``keywords``, ``weights`` and
            ``sentiment`` columns, row i belonging to query i by position.
        df_poems: DataFrame with a ``poem`` column.
        photo_urls: unused; kept for interface compatibility.

    Returns:
        DataFrame with poem columns ``'1'``, ``'2'``, ... plus ``url``,
        ``keywords``, ``weights`` and ``sentiment``. With one result row per
        image the metadata is copied row for row; otherwise (batch mode) the
        image with the most keywords supplies the metadata.
    """
    import pandas as pd
    import numpy as np

    # gather top N poems (for each picture, or for the "average" picture)
    poem_cols = [str(rank) for rank in range(1, ix.shape[1] + 1)]
    results = pd.DataFrame(
        [df_poems.loc[row, 'poem'].tolist() for row in ix],
        columns=poem_cols)

    # Rows of ``results`` correspond to rows of ``df_images`` by position.
    # Upstream filtering can leave gaps in the image index, so renumber it
    # before any label-based alignment (the original lost or misplaced rows).
    images = df_images.reset_index(drop=True)
    meta_cols = ['url','keywords','weights','sentiment']

    # collect image urls and keywords
    if len(results) == len(images):
        for col in meta_cols:
            results[col] = images[col]

    # if in batchmode, use the image with the most keywords
    else:
        lengths = images['keywords'].apply(len)
        best = int(np.argmax(lengths.values))
        results['url'] = images.loc[best, 'url']
        results['keywords'] = [images.loc[best, 'keywords']]
        results['weights'] = [images.loc[best, 'weights']]
        results['sentiment'] = images.loc[best, 'sentiment']

    return results


def ModelIt(url,image_location='remote', n_matches_per_photo = 3,batch=False,lam=0.1,gamma=0.0):
    """Match poems to the images referenced by *url*.

    Args:
        url: a Flickr album URL, or a comma-separated list of image URLs or
            local file names.
        image_location: ``'remote'`` or ``'local'``; passed to
            ``get_labels_for_images``.
        n_matches_per_photo: number of poems to return per image.
        batch: when true, all images are averaged into a single query.
        lam: weight of the sentiment penalty in the match score.
        gamma: forwarded to ``find_best_match``.

    Returns:
        A list of dict records (one per result row of ``gather_results``),
        or ``-1`` when no image yields usable labels.
    """
    from textblob import TextBlob

    # load up path and file names
    root_dir, api_dir, pkl_dir, api_file, poem_file, vec_file, vectorizer_file =\
        get_path_and_file_names()

    # some of the larger data structures are stored in binary form, to expedite runtime
    df_poems, df_vecs, vectorizer = get_pkl_files(root_dir,pkl_dir,poem_file,vec_file,vectorizer_file)
    poem_vectors = df_vecs.values

    # Set the variable "photo_urls", which is a list of urls of all images
    photo_urls = get_photo_urls(url)

    # Connect to Google-Cloud-Vision API and extract labels for each image
    df_all_labels = get_labels_for_images(photo_urls, root_dir, api_dir, api_file, image_location)

    # nothing was labeled: exit before handing an empty batch to the vectorizer
    if len(df_all_labels) == 0:
        return -1

    # weight the keywords by the vectorizer used to process the poetry text
    df_images = weight_labels(df_all_labels, vectorizer)

    # if after extracting and weighting labels, nothing remains, exit gracefully
    if len(df_images) == 0:
        return -1

    # append sentiment analysis for each image (on a copy, so the column is
    # not written into a slice of another frame)
    df_images = df_images.copy()
    df_images['sentiment'] = [TextBlob(' '.join(x)).sentiment[0] for x in df_images.keywords]

    # embed image vectors via word2vec
    image_vectors = images2vec(df_images)

    # return sorted scores; forward every tuning argument (the original
    # passed only ``batch``, so the caller's other settings were ignored)
    ix, scores = find_best_match(image_vectors, poem_vectors,
                                 df_images['sentiment'], df_poems['sentiment'],
                                 n_matches_per_photo=n_matches_per_photo,
                                 batch=batch, lam=lam, gamma=gamma)

    # gather all relevant info into a dataframe
    results = gather_results(ix,scores,df_images,df_poems,photo_urls)

    # return a list of dict records, one per result row
    return results.to_dict('records')


In [18]:
url = "https://www.flickr.com/photos/138072685@N02/albums/72157691244283801"
results = ModelIt(url)

[<Bucket: toews-images>]


In [25]:
results

[{'1': '"Nature" is what we see -- The Hill -- the Afternoon -- Squirrel -- Eclipse -- the Bumble bee -- Nay -- Nature is Heaven -- Nature is what we hear -- The Bobolink -- the Sea -- Thunder -- the Cricket -- Nay -- Nature is Harmony -- Nature is what we know -- Yet have no art to say -- So impotent Our Wisdom is To her Simplicity.',
  '2': 'Nature assigns the Sun -- That -- is Astronomy -- Nature cannot enact a Friend -- That -- is Astrology.',
  '3': 'My neighbor s daughter has created a city you cannot see on an island to which you cannot swim ruled by a noble princess and her athletic consort all the buildings are glass so that lies are impossible beneath the city they have buried certain words which can never be spoken again chiefly the word divorce which is eaten by maggots when it rains you hear chimes rabbits race through its suburbs the name of the city is one you can almost pronounce',
  'keywords': ['forest', 'nature', 'autumn', 'path', 'tree'],
  'sentiment': 0.0,
  'url'

In [None]:
url = "/Users/ctoews/Documents/Insight/Project/photos/roads-diverging_test.jpg"
#results2 = ModelIt(image_path,image_location='local')
image_location = 'local'

In [24]:
# Unrolled copy of ModelIt's body, run at notebook top level so every
# intermediate object can be inspected. `url` must already be defined in the
# session. The original cell used `return`, which is a SyntaxError outside a
# function; the outcome is now stored in `results` instead.
# NOTE(review): this overrides any `image_location` set in an earlier cell.
image_location='remote'
n_matches_per_photo = 3
batch=False
lam=0.1
gamma=0.0

from PIL import Image, ImageDraw
import pandas as pd
import spacy
import numpy as np
from textblob import TextBlob

# load up path and file names, as well as runtime parameters
root_dir, api_dir, pkl_dir, api_file, poem_file, vec_file, vectorizer_file =\
    get_path_and_file_names()

# some of the larger data structures are stored in binary form, to expedite runtime
df_poems, df_vecs, vectorizer = get_pkl_files(root_dir,pkl_dir,poem_file,vec_file,vectorizer_file)
poem_vectors = df_vecs.values

# Set the variable "photo_urls", which is a list of urls of all images
photo_urls = get_photo_urls(url)

# Connect to Google-Cloud-Vision API and extract labels for each image
df_all_labels = get_labels_for_images(photo_urls, root_dir, api_dir, api_file, image_location)

# weight the keywords by the vectorizer used to process the poetry text
df_images = weight_labels(df_all_labels, vectorizer)

# append sentiment analysis for each image
df_images['sentiment'] = [TextBlob(' '.join(x)).sentiment[0] for x in df_images.keywords]

# if after extracting and weighting labels, nothing remains, record the same
# sentinel ModelIt returns
if len(df_images)==0:
    results = -1
else:
    # otherwise, embed image vectors via word2vec
    image_vectors = images2vec(df_images)

    # return sorted scores, using the parameters set at the top of the cell
    ix, scores = find_best_match(image_vectors, poem_vectors,
                                 df_images['sentiment'], df_poems['sentiment'],
                                 n_matches_per_photo=n_matches_per_photo,
                                 batch=batch, lam=lam, gamma=gamma)

    # gather all relevant info into a dataframe, then convert it to a list
    # of dict records
    results = gather_results(ix,scores,df_images,df_poems,photo_urls).to_dict('records')


ConnectionError: ('Connection aborted.', RemoteDisconnected('Remote end closed connection without response',))

In [44]:
# For the first two result records, find the df_poems index label of each
# returned poem by matching on the poem text (first match wins).
# Expects `results` to be a list of dict records keyed '1', '2', '3'.
poem_keys = ['1','2','3']
ix = [[df_poems.loc[df_poems.poem==results[i][key]].index[0] for key in poem_keys] for i in [0,1]]
ix

[[585, 360, 4001], [2719, 3247, 3236]]

In [60]:
df_ex1 = pd.DataFrame(results)

In [76]:
df_ex2_image=df_ex1.loc[1,['keywords','sentiment','weights']]

In [77]:
df_ex2_poems = df_poems.iloc[ix[1],:]

In [78]:
df_ex2_image

keywords                                   [dog, like]
sentiment                                            0
weights      [0.9951580293529281, 0.09828782536202801]
Name: 1, dtype: object

In [79]:
df_ex2_poems

Unnamed: 0,author,poem,sentiment,keywords,weights
2719,Ogden Nash,The truth I do not stretch or shove When I sta...,0.1875,"[dog, test, state, stretch, wet, truth, love]","[0.6147142694012745, 0.3695667852259223, 0.351..."
3247,Rowan Ricardo Phillips,Yesterday s newspaper becomes last week s News...,0.066667,"[dog, wait, week, key, yesterday, spread, soon...","[0.5572529827948403, 0.3109458903811767, 0.221..."
3236,Adrian Matejka,after Wolfgang Amadeus Mozart In the wobbly pi...,0.071875,"[dog, nod, belly, nose, sunlight, knee, living...","[0.39774979938022664, 0.24090738384399552, 0.2..."


In [164]:
#for index, row in df_ex1_poems.iterrows():
    #print(row['weights'])
    #print("{:06.2f}".format(row['weights'])
    
# Replace every weight in each row's list with its two-decimal string form
# (for display); the column holds strings afterwards, not floats.
df_ex1_poems.weights = [["{:.2f}".format(num) for num in df_ex1_poems.weights.values[i]] for i in range(len(df_ex1_poems))]

In [165]:
df_ex1_poems

Unnamed: 0,author,poem,sentiment,keywords,weights
585,Emily Dickinson,"""Nature"" is what we see -- The Hill -- the Aft...",0.0,"[nature, cricket, thunder, wisdom, art, aftern...","[0.80, 0.23, 0.22, 0.22, 0.19, 0.19, 0.17, 0.1..."
360,Emily Dickinson,Nature assigns the Sun -- That -- is Astronomy...,0.0,"[nature, friend, sun]","[0.84, 0.44, 0.33]"
4001,Alicia Ostriker,My neighbor s daughter has created a city you ...,0.049206,"[city, word, building, island, create, daughte...","[0.57, 0.29, 0.23, 0.23, 0.23, 0.23, 0.22, 0.2..."


In [205]:
import pickle
df_ex1_image.to_pickle('df_ex1_image.pkl')
df_ex1_poems.to_pickle('df_ex1_poems.pkl')

In [None]:
# For each row of `ix`, collect the matched poems' text, keywords, weights
# and sentiment from df_poems, together with per-rank column names
# (p_1.., p_kw_1.., p_wt_1.., p_st_1..).
# NOTE(review): uses `ix[i,:]` and `ix.shape`, so `ix` must be a 2-D NumPy
# array here (as returned by find_best_match), not the list of lists built in
# the earlier lookup cell.
poem_txtlist = [ df_poems.loc[ix[i,:],'poem'].tolist() for i in range(len(ix))]
poem_txtcols = ['p_'+str(i) for i in range(1,ix.shape[1]+1)]
poem_kwlist = [ df_poems.loc[ix[i,:],'keywords'].tolist() for i in range(len(ix))]
poem_kwcols = ['p_kw_'+str(i) for i in range(1,ix.shape[1]+1)]
poem_wtlist = [ df_poems.loc[ix[i,:],'weights'].tolist() for i in range(len(ix))]
poem_wtcols = ['p_wt_'+str(i) for i in range(1,ix.shape[1]+1)]
poem_sntmtlist = [ df_poems.loc[ix[i,:],'sentiment'].tolist() for i in range(len(ix))]
poem_sntmtcols = ['p_st_'+str(i) for i in range(1,ix.shape[1]+1)]

In [203]:
df_ex1_poems

Unnamed: 0,author,poem,sentiment,keywords,weights
585,Emily Dickinson,"""Nature"" is what we see -- The Hill -- the Aft...",0.0,"[nature, cricket, thunder, wisdom, art, aftern...","[0.80, 0.23, 0.22, 0.22, 0.19, 0.19, 0.17, 0.1..."
360,Emily Dickinson,Nature assigns the Sun -- That -- is Astronomy...,0.0,"[nature, friend, sun]","[0.84, 0.44, 0.33]"
4001,Alicia Ostriker,My neighbor s daughter has created a city you ...,0.05,"[city, word, building, island, create, daughte...","[0.57, 0.29, 0.23, 0.23, 0.23, 0.23, 0.22, 0.2..."


In [168]:
df_ex1_poems

Unnamed: 0,author,poem,sentiment,keywords,weights
585,Emily Dickinson,"""Nature"" is what we see -- The Hill -- the Aft...",0.0,"[nature, cricket, thunder, wisdom, art, aftern...","[0.80, 0.23, 0.22, 0.22, 0.19, 0.19, 0.17, 0.1..."
360,Emily Dickinson,Nature assigns the Sun -- That -- is Astronomy...,0.0,"[nature, friend, sun]","[0.84, 0.44, 0.33]"
4001,Alicia Ostriker,My neighbor s daughter has created a city you ...,0.049206,"[city, word, building, island, create, daughte...","[0.57, 0.29, 0.23, 0.23, 0.23, 0.23, 0.22, 0.2..."


In [187]:
def df_floats_to_strings(df):
    """Format the ``sentiment`` column as two-decimal strings, in place.

    Args:
        df: DataFrame with a numeric ``sentiment`` column.

    Returns:
        The same DataFrame object, with ``sentiment`` now holding strings.
    """
    # (formatting of the ``weights`` column is intentionally left disabled,
    # as in the original)
    as_text = list(map("{:.2f}".format, df.sentiment.values))
    df.sentiment = as_text

    return df

In [200]:
#df_floats_to_strings(df_ex1_image)
df_ex1_image.sentiment

0.0

In [161]:
df_ex1_image=pd.read_pickle('df_ex1_image.pkl')
df_ex1_poems=pd.read_pickle('df_ex1_poems.pkl')


In [209]:
for set_ in (df_ex1_poems,df_ex2_poems):
    print(set_)
    print('boo!')

               author                                               poem  \
585   Emily Dickinson  "Nature" is what we see -- The Hill -- the Aft...   
360   Emily Dickinson  Nature assigns the Sun -- That -- is Astronomy...   
4001  Alicia Ostriker  My neighbor s daughter has created a city you ...   

     sentiment                                           keywords  \
585       0.00  [nature, cricket, thunder, wisdom, art, aftern...   
360       0.00                              [nature, friend, sun]   
4001      0.05  [city, word, building, island, create, daughte...   

                                                weights  
585   [0.80, 0.23, 0.22, 0.22, 0.19, 0.19, 0.17, 0.1...  
360                                  [0.84, 0.44, 0.33]  
4001  [0.57, 0.29, 0.23, 0.23, 0.23, 0.23, 0.22, 0.2...  
boo!
                      author  \
2719              Ogden Nash   
3247  Rowan Ricardo Phillips   
3236          Adrian Matejka   

                                                   p

In [211]:
from sklearn.preprocessing import OneHotEncoder

In [214]:
test = np.random?

In [220]:
test = np.random.randint(0,5,size=10)

In [225]:
encoder = OneHotEncoder()
X = encoder.fit(test.reshape(-1,1))

In [227]:
dir(X)

['__class__',
 '__delattr__',
 '__dict__',
 '__dir__',
 '__doc__',
 '__eq__',
 '__format__',
 '__ge__',
 '__getattribute__',
 '__getstate__',
 '__gt__',
 '__hash__',
 '__init__',
 '__init_subclass__',
 '__le__',
 '__lt__',
 '__module__',
 '__ne__',
 '__new__',
 '__reduce__',
 '__reduce_ex__',
 '__repr__',
 '__setattr__',
 '__setstate__',
 '__sizeof__',
 '__str__',
 '__subclasshook__',
 '__weakref__',
 '_fit_transform',
 '_get_param_names',
 '_transform',
 'active_features_',
 'categorical_features',
 'dtype',
 'feature_indices_',
 'fit',
 'fit_transform',
 'get_params',
 'handle_unknown',
 'n_values',
 'n_values_',
 'set_params',
 'sparse',
 'transform']

In [248]:
X.transform(test.reshape(-1,1)).todense()

matrix([[0., 0., 0., 1., 0.],
        [0., 0., 0., 0., 1.],
        [0., 0., 0., 0., 1.],
        [1., 0., 0., 0., 0.],
        [0., 1., 0., 0., 0.],
        [0., 0., 0., 0., 1.],
        [0., 0., 0., 1., 0.],
        [0., 0., 1., 0., 0.],
        [0., 0., 0., 0., 1.],
        [0., 0., 1., 0., 0.]])

In [259]:
X.active_features_

array([0, 1, 2, 3, 4])