In [458]:
from google.cloud import datastore
import config
import sys,os,os.path
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_extraction import stop_words

In [459]:
# Point the Google client libraries at a service-account key file.
# An already exported GOOGLE_APPLICATION_CREDENTIALS takes precedence; the
# absolute path below is only a fallback and exists on the original author's
# machine only.
os.environ.setdefault('GOOGLE_APPLICATION_CREDENTIALS', '/Users/leoliber/Repos/simoti/ad-server/keyfile.json')
# Shared Datastore client used by the helper functions below
ds = datastore.Client(project=config.PROJECT_ID)

In [460]:
def getStopWords(language):
  ''' Get explicit stop words by language

  Args:
    language (str): 2-character language code; 'he' and 'en' are the codes
      present in the lookup below
  
  Returns:
    The list of stop words for `language` when it is found in the lookup,
    else the string 'english' (note: not the `language` argument itself)
  '''

  # Hebrew stop words from: https://github.com/stopwords-iso/stopwords-he
  stopWords = {
    'he': ["אבל","או","אולי","אותה","אותו","אותי","אותך","אותם","אותן","אותנו","אז","אחר","אחרות","אחרי","אחריכן","אחרים","אחרת","אי","איזה","איך","אין","איפה","איתה","איתו","איתי","איתך","איתכם","איתכן","איתם","איתן","איתנו","אך","אל","אלה","אלו","אם","אנחנו","אני","אס","אף","אצל","אשר","את","אתה","אתכם","אתכן","אתם","אתן","באיזומידה","באמצע","באמצעות","בגלל","בין","בלי","במידה","במקוםשבו","ברם","בשביל","בשעהש","בתוך","גם","דרך","הוא","היא","היה","היכן","היתה","היתי","הם","הן","הנה","הסיבהשבגללה","הרי","ואילו","ואת","זאת","זה","זות","יהיה","יוכל","יוכלו","יותרמדי","יכול","יכולה","יכולות","יכולים","יכל","יכלה","יכלו","יש","כאן","כאשר","כולם","כולן","כזה","כי","כיצד","כך","ככה","כל","כלל","כמו","כן","כפי","כש","לא","לאו","לאיזותכלית","לאן","לבין","לה","להיות","להם","להן","לו","לי","לכם","לכן","למה","למטה","למעלה","למקוםשבו","למרות","לנו","לעבר","לעיכן","לפיכך","לפני","מאד","מאחורי","מאיזוסיבה","מאין","מאיפה","מבלי","מבעד","מדוע","מה","מהיכן","מול","מחוץ","מי","מכאן","מכיוון","מלבד","מן","מנין","מסוגל","מעט","מעטים","מעל","מצד","מקוםבו","מתחת","מתי","נגד","נגר","נו","עד","עז","על","עלי","עליה","עליהם","עליהן","עליו","עליך","עליכם","עלינו","עם","עצמה","עצמהם","עצמהן","עצמו","עצמי","עצמם","עצמן","עצמנו","פה","רק","שוב","של","שלה","שלהם","שלהן","שלו","שלי","שלך","שלכה","שלכם","שלכן","שלנו","שם","תהיה","תחת"],
    'en': list(stop_words.ENGLISH_STOP_WORDS)
  }
  return stopWords.get(language, 'english') # fall back to the string 'english' when the language is not found in the lookup

In [461]:
def getArticleById(publisherId, articleId):
  ''' Fetch a single article from Datastore by its id

  Args:
    publisherId (int or str): ID or name of the parent 'publishers' key
    articleId (int or str): ID or name of the 'articles' key

  Returns:
    Whatever `ds.get` yields for that key (the article entity)
  '''
  # Key path: publishers/<publisherId>/articles/<articleId>
  keyPath = ('publishers', publisherId, 'articles', articleId)
  return ds.get(key=datastore.Key(*keyPath, project=config.PROJECT_ID))

In [462]:
def getFrequencyMatrix(article, language='en'):
    ''' Build a TF-IDF matrix over 1- to 3-word n-grams of the given text

    Args:
      article (str or iterable of str): Raw document(s) to vectorize. A single
        string is treated as one document.
      language (str): Language code handed to getStopWords (default 'en')

    Returns:
      Tuple (TfIdfMatrix, featureNames): the matrix produced by
      TfidfVectorizer.fit_transform and the list of n-gram feature names
    '''
    # TODO: save and get the language from the article (not the publisher)
    stopWords = getStopWords(language)

    # fit_transform expects an iterable of documents and rejects a bare string
    if isinstance(article, str):
        article = [article]

    vectorizer = TfidfVectorizer(ngram_range=(1, 3),
                                 lowercase=True,
                                 max_features=None,
                                 stop_words=stopWords)
    TfIdfMatrix = vectorizer.fit_transform(article)

    # get_feature_names() was removed in scikit-learn 1.2; prefer the
    # replacement when it exists and keep returning a plain list either way
    if hasattr(vectorizer, 'get_feature_names_out'):
        featureNames = vectorizer.get_feature_names_out().tolist()
    else:
        featureNames = vectorizer.get_feature_names()
    return (TfIdfMatrix, featureNames)

In [463]:
def getSnippets():
  ''' Fetch every entity of kind 'snippets' from Datastore

  Args:
    None

  Returns:
    list: all entities yielded by the query (no status filter is applied)
  '''
  # TODO: only active snippets are wanted, but adding the filter
  #   add_filter('status', '=', 'active')
  # to the query did not work when tried, so it stays disabled for now
  return list(ds.query(kind='snippets').fetch())

## Start

In [564]:
# Map every n-gram of the article to its TF-IDF weight
article = getArticleById('martech.zone', 'randy-stocklin-ecommerce')
freq, feat = getFrequencyMatrix([article['content']])
freq = freq.toarray()[0]  # one document was passed, so keep only its row
articleDict = dict(zip(feat, freq))  # zip stops at the shorter of the two

In [565]:
# Get snippets dict

In [566]:
def getScoredDicionary(snippets):
    ''' Index the snippets' word-pouch scores by word

    Args:
      snippets (list): Snippets, each providing parallel 'wordPouch' (words)
        and 'wordPouchScores' (scores) sequences

    Returns:
      dict: word -> list with one slot per snippet, holding that snippet's
      score for the word, or 0 when the snippet does not contain the word
    '''
    snippetsDict = {}
    snippetCount = len(snippets)
    for i in range(snippetCount):
        # Read the pouch of the snippet being processed; the loop bound used
        # to come from an undefined global `snippet`
        wordPouch = snippets[i]['wordPouch']
        wordPouchScores = snippets[i]['wordPouchScores']
        for j, featureName in enumerate(wordPouch):
            # First sighting of a word creates its zero-filled score list
            snippetsDict.setdefault(featureName, [0] * snippetCount)[i] = wordPouchScores[j]
    return snippetsDict

In [567]:
# Load the snippets and index their word-pouch scores by word
snippets = getSnippets()
snippetsDict = getScoredDicionary(snippets)

In [568]:
# Words that have a non-zero score in every snippet
commonWords = [word for word, scores in snippetsDict.items()
               if all(score != 0 for score in scores)]
print('Common words between all {} snippets:'.format(len(snippets)), commonWords)

Common words between all 2 snippets: ['Africa rose']


In [569]:
# Score each snippet against the article: for every n-gram of the article,
# multiply its article weight by the snippet's score for the same n-gram and
# accumulate the positive products together with the matching n-grams
scoredSnippets = {}
for i, snippetEntity in enumerate(snippets):
    snippetId = snippetEntity.key.id
    for key, articleWeight in articleDict.items():
        # An n-gram unknown to the snippets contributes a score of 0
        snippetWeight = snippetsDict[key][i] if key in snippetsDict else 0
        keyScore = articleWeight * snippetWeight
        if keyScore > 0:
            entry = scoredSnippets.setdefault(snippetId, {})
            entry['score'] = entry.get('score', 0) + keyScore
            entry['commonWords'] = entry.get('commonWords', set()) | {key}


In [570]:
# Show the match result per snippet id: summed score and the n-grams shared with the article
scoredSnippets

{5682617542246400: {'commonWords': {'customer experience',
   'online shopping experience',
   'shopping experience',
   'store experience'},
  'score': 0.06903143514241572}}

In [571]:
# Inspect the article's n-gram -> TF-IDF weight mapping (prints the whole dict)
articleDict

{'14': 0.031450273186121945,
 '14 power': 0.031450273186121945,
 '14 power options': 0.031450273186121945,
 '2017': 0.031450273186121945,
 '2017 abm': 0.031450273186121945,
 '2017 abm agencies': 0.031450273186121945,
 '600': 0.031450273186121945,
 '600 unique': 0.031450273186121945,
 '600 unique styles': 0.031450273186121945,
 'abm': 0.031450273186121945,
 'abm agencies': 0.031450273186121945,
 'abm agencies clients': 0.031450273186121945,
 'accelerate': 0.031450273186121945,
 'accelerate buyer': 0.031450273186121945,
 'accelerate buyer journey': 0.031450273186121945,
 'advocates': 0.031450273186121945,
 'advocates download': 0.031450273186121945,
 'advocates download score': 0.031450273186121945,
 'affordable': 0.094350819558365842,
 'affordable shades': 0.031450273186121945,
 'affordable shades online': 0.031450273186121945,
 'affordable straightforward': 0.031450273186121945,
 'affordable straightforward online': 0.031450273186121945,
 'affordable way': 0.031450273186121945,
 'affor

In [563]:
# Inspect the word -> per-snippet score lists (prints the whole dict)
snippetsDict

{'10 percent': [0.8916361500063442, 0],
 '10 year': [0, 0.47880911201148124],
 '12\n\nFuture MaaS': [0.3486828317327879, 0],
 '13 percent': [0.8916361500063442, 0],
 '15 markets': [1.0, 0],
 '18 percent': [0.8916361500063442, 0],
 '2 percent': [0.8916361500063442, 0],
 '20\n\nMaking MaaS': [0.3486828317327879, 0],
 '20 percent': [0.8916361500063442, 0],
 '20 year': [0, 0.47880911201148124],
 '200 Africa': [0.5448416756727376, 0],
 '21 percent': [0.8916361500063442, 0],
 '23 percent': [0.8916361500063442, 0],
 '24 percent': [0.8916361500063442, 0],
 '25 percent': [0.8916361500063442, 0],
 '26 percent': [0.8916361500063442, 0],
 '27 percent': [0.8916361500063442, 0],
 '28 percent': [0.8916361500063442, 0],
 '28 year': [0, 0.47880911201148124],
 '3 percent': [0.8916361500063442, 0],
 '30 percent': [0.8916361500063442, 0],
 '33 percent': [0.8916361500063442, 0],
 '34 percent': [0.8916361500063442, 0],
 '35 percent': [0.8916361500063442, 0],
 '38 percent': [0.8916361500063442, 0],
 '3D imag

# Snippet Location finder

In [None]:
# NOTE(review): placeholder for the location finder. This rebinds `article`,
# which held the fetched Datastore entity earlier, to an empty string.
article = ''