In [458]:
import sys,os,os.path

import numpy as np
import pandas as pd
from google.cloud import datastore
from sklearn.feature_extraction import stop_words
from sklearn.feature_extraction.text import TfidfVectorizer

import config

In [459]:
# Point the Google client libraries at a service-account key file.
# setdefault keeps any GOOGLE_APPLICATION_CREDENTIALS value that is already
# exported in the environment, so the machine-specific path below only acts
# as a fallback instead of silently overriding the caller's configuration.
os.environ.setdefault('GOOGLE_APPLICATION_CREDENTIALS', '/Users/leoliber/Repos/simoti/ad-server/keyfile.json')

# Shared Datastore client used by every helper in this notebook.
ds = datastore.Client(project=config.PROJECT_ID)

In [460]:
def getStopWords(language):
  ''' Look up the explicit stop-word list for a language code

  Args:
    language (str): 2 characters language name ('he' and 'en' are known)

  Returns:
    a list of stop words when the language is in the lookup,
    otherwise the string 'english'
  '''

  # Hebrew stop words from: https://github.com/stopwords-iso/stopwords-he
  hebrew = ["אבל","או","אולי","אותה","אותו","אותי","אותך","אותם","אותן","אותנו","אז","אחר","אחרות","אחרי","אחריכן","אחרים","אחרת","אי","איזה","איך","אין","איפה","איתה","איתו","איתי","איתך","איתכם","איתכן","איתם","איתן","איתנו","אך","אל","אלה","אלו","אם","אנחנו","אני","אס","אף","אצל","אשר","את","אתה","אתכם","אתכן","אתם","אתן","באיזומידה","באמצע","באמצעות","בגלל","בין","בלי","במידה","במקוםשבו","ברם","בשביל","בשעהש","בתוך","גם","דרך","הוא","היא","היה","היכן","היתה","היתי","הם","הן","הנה","הסיבהשבגללה","הרי","ואילו","ואת","זאת","זה","זות","יהיה","יוכל","יוכלו","יותרמדי","יכול","יכולה","יכולות","יכולים","יכל","יכלה","יכלו","יש","כאן","כאשר","כולם","כולן","כזה","כי","כיצד","כך","ככה","כל","כלל","כמו","כן","כפי","כש","לא","לאו","לאיזותכלית","לאן","לבין","לה","להיות","להם","להן","לו","לי","לכם","לכן","למה","למטה","למעלה","למקוםשבו","למרות","לנו","לעבר","לעיכן","לפיכך","לפני","מאד","מאחורי","מאיזוסיבה","מאין","מאיפה","מבלי","מבעד","מדוע","מה","מהיכן","מול","מחוץ","מי","מכאן","מכיוון","מלבד","מן","מנין","מסוגל","מעט","מעטים","מעל","מצד","מקוםבו","מתחת","מתי","נגד","נגר","נו","עד","עז","על","עלי","עליה","עליהם","עליהן","עליו","עליך","עליכם","עלינו","עם","עצמה","עצמהם","עצמהן","עצמו","עצמי","עצמם","עצמן","עצמנו","פה","רק","שוב","של","שלה","שלהם","שלהן","שלו","שלי","שלך","שלכה","שלכם","שלכן","שלנו","שם","תהיה","תחת"]
  english = list(stop_words.ENGLISH_STOP_WORDS)
  lookup = {'he': hebrew, 'en': english}

  if language in lookup:
    return lookup[language]
  # Unknown language: fall back to the literal 'english' rather than a list.
  return 'english'

In [461]:
def getArticleById(publisherId, articleId):
  ''' Fetch a single article entity from Datastore

  Args:
    publisherId: Publisher key identifier (the notebook passes a string
      such as 'martech.zone')
    articleId: Article key identifier, nested under the publisher

  Returns:
    The result of ds.get() for the key
    publishers/<publisherId>/articles/<articleId>
  '''
  keyPath = ('publishers', publisherId, 'articles', articleId)
  return ds.get(key=datastore.Key(*keyPath, project=config.PROJECT_ID))

In [462]:
def getFrequencyMatrix(article):
    ''' Fit a TF-IDF vectorizer (1- to 3-grams, lowercased) on the given documents

    Args:
      article: collection of document strings handed to fit_transform
        (the notebook passes a one-element list with the article content)

    Returns:
      tuple of (matrix returned by fit_transform, names returned by get_feature_names)
    '''
    # TODO: store the language on the article and read it from there (not from the publisher)
    vectorizerOptions = {
        'ngram_range': (1, 3),
        'lowercase': True,
        'max_features': None,
        'stop_words': getStopWords('en'),
    }
    tfidf = TfidfVectorizer(**vectorizerOptions)
    matrix = tfidf.fit_transform(article)
    features = tfidf.get_feature_names()
    return (matrix, features)

In [463]:
def getSnippets():
  ''' Fetch every entity of kind 'snippets' from Datastore

  Args:
    None

  Returns:
    list of snippet entities (no status filtering is applied)
  '''
  # NOTE: restricting the query to status = 'active' was attempted but did not
  # work, so that filter is currently left out.
  snippetQuery = ds.query(kind='snippets')
  return [entity for entity in snippetQuery.fetch()]

## Start

In [479]:
# Build a {feature: tf-idf weight} lookup for one article
article = getArticleById('martech.zone', 'ecommerce-shipping-options')
if article is None:
    # Fail with a clear message instead of "'NoneType' object is not subscriptable"
    raise LookupError('Article not found in Datastore')

(freq, feat) = getFrequencyMatrix([article['content']])
freq = freq.toarray()[0]  # a single document was vectorised, so row 0 is the article

# Inject an extra feature with weight 1 so the article shares at least this term
feat.append('MaaS ecosystem')
freq = np.append(freq, 1)

# zip stops at the shorter sequence, matching the previous min(len(feat), len(freq)) bound
articleDict = dict(zip(feat, freq))

In [480]:
# Get snippets dict

In [481]:
def getScoredDicionary(snippets):
    ''' Pivot snippets into a per-feature score table

    Args:
      snippets: sequence of mappings, each with a 'wordPouch' list of feature
        names and a parallel 'wordPouchScores' list of scores

    Returns:
      dict mapping every feature name to a list with one slot per snippet;
      slot i holds the score snippet i assigns to the feature, or 0 when the
      snippet does not contain it

    Raises:
      IndexError: if a snippet has fewer scores than words
    '''
    snippetsDict = {}
    snippetCount = len(snippets)
    for i, snippet in enumerate(snippets):
        # Read the pouch of the snippet being processed (not a leftover global),
        # so snippets with different pouch sizes are handled correctly.
        words = snippet['wordPouch']
        scores = snippet['wordPouchScores']
        for j, featureName in enumerate(words):
            # First sighting of a feature creates its zero-filled row
            snippetsDict.setdefault(featureName, [0] * snippetCount)[i] = scores[j]
    return snippetsDict

In [482]:
# Fetch the snippet entities and pivot them into a {feature: [score per snippet]} lookup
snippets = getSnippets()
snippetsDict = getScoredDicionary(snippets)

In [483]:
# Features whose score is non-zero for every snippet
commonWords = [
    word
    for word, scores in snippetsDict.items()
    if all(score != 0 for score in scores)
]
print('Common words between all {} snippets:'.format(len(snippets)), commonWords)

Common words between all 2 snippets: ['Africa rose']


In [530]:
# Score each snippet against the article: for every article feature, multiply the
# article weight by the snippet's score for that feature and sum the positive products.
commonWordsBag = {}                   # snippet index -> set of features that contributed
snippetScores = [0] * len(snippets)   # snippet index -> accumulated score
noScores = [0] * len(snippets)        # shared read-only row for features no snippet has
for i in range(len(snippets)):
    for key, articleWeight in articleDict.items():
        keyScore = articleWeight * snippetsDict.get(key, noScores)[i]
        if keyScore > 0:
            snippetScores[i] += keyScore
            commonWordsBag.setdefault(i, set()).add(key)

# Display the per-snippet scores as the cell output
snippetScores

[0.32148986304024385, 0]