In [2]:
import os
import pandas as pd
import flask
import jmespath
import pymongo
from flask_classful import FlaskView, route
from flask_jwt_extended import current_user, jwt_required

import amz_local_search.analysis as sd
import amz_local_search.artifacts.asin_report as asin_report_artifact
import amz_local_search.artifacts.search as search_artifact
import bb_utils.time.utils as tu
from amz_local_search.base.action_logger import (ActionContext, ActionError,
                                                 ActionState)
from amz_local_search.base.celery_job import celery_client
from amz_local_search.base.errors import (UserResourceNotFound,
                                          UserUsageLimitExceeded)
from amz_local_search.base.instances import action_logger, artifact_updater
from amz_local_search.config import get_config
from bb_utils.time.utils import utc_time
from web_app_api.extensions import get_user_id_from_jwt
from web_app_api.utils import proxy_to_obj
from bb_utils.utils.mmh3_hash import json_to_hash128_s

In [3]:
def connect_to_mongo_db():
    mongo_uri = get_config().mongo_local_search_db_uri()
    client = pymongo.MongoClient(mongo_uri)
    return client.get_database()


def merge_two_dicts(x, y):
    z = x.copy()
    z.update(y)
    return z


class Catalog(FlaskView):
    route_prefix = 'api/local-search/query'

    def get_asins_meta(self, mongo_db, asins):
        product_details = dict(
            (p['asin'], p)
            for p in mongo_db.products.find({'_id': {
                '$in': asins
            }}))
        return product_details

    def asin_report_key(self, query, asin):
        return json_to_hash128_s([query, asin])

    def get_asin_reports_from_actions(self, mongo_db, user_id,
                                      scan_depth=10000):
        matches = mongo_db.actions.find(
            {
                'user.id': user_id,
                'action_sig': 'amz_local_search.report(query, asin)',
                'state.success': True,
                'args.asin': {
                    '$nin': ['@latest', '@LATEST']
                }
            }, {
                'args.asin': 1,
                'args.query': 1,
                'occurred_at': 1
            }).sort([('occurred_at', -1)]).limit(scan_depth)

        reports = {}
        for match in matches:
            asin = match['args'].get('asin', '').strip()
            query = match['args'].get('query', '').strip()

            if not asin or asin.startswith('@'):
                continue

            key = self.asin_report_key(query, asin)

            report = reports.get(key)
            if report:
                report['frequency'] = report['frequency'] + 1
            else:
                reports[key] = {
                    'key': key,
                    'query': query,
                    'asin': asin,
                    'timestamp': match['occurred_at'],
                    'frequency': 1
                }
        return reports


In [4]:
mongo_db = connect_to_mongo_db()
user_id = 1

catalog = Catalog()
reports = catalog.get_asin_reports_from_actions(mongo_db, user_id)
reports

{'2KiodrgdzGqYjWRMFJL6Vj': {'asin': u'B07RFL8J76',
  'frequency': 2,
  'key': '2KiodrgdzGqYjWRMFJL6Vj',
  'query': u'carabiner clip',
  'timestamp': datetime.datetime(2019, 11, 1, 18, 59, 15, 915000)},
 '4Krb38NF5JXtqPvV2NDkUC': {'asin': u'B07DD9RQFF',
  'frequency': 1,
  'key': '4Krb38NF5JXtqPvV2NDkUC',
  'query': u'b07dd9rqff',
  'timestamp': datetime.datetime(2019, 10, 17, 5, 24, 28, 518000)},
 '4Y77FXZPmTpnxmHGj3FJjj': {'asin': u'B00T56FQAE',
  'frequency': 1,
  'key': '4Y77FXZPmTpnxmHGj3FJjj',
  'query': u'biotin',
  'timestamp': datetime.datetime(2019, 10, 18, 7, 10, 15, 39000)},
 '55WckJx2QMN3amjM63iwye': {'asin': u'B006SFUDT4',
  'frequency': 2,
  'key': '55WckJx2QMN3amjM63iwye',
  'query': u'bath toy organizer',
  'timestamp': datetime.datetime(2019, 11, 1, 19, 2, 1, 551000)},
 '5GXurd6zjKy3FNCoviXHoj': {'asin': u'B013NNG18U',
  'frequency': 57,
  'key': '5GXurd6zjKy3FNCoviXHoj',
  'query': u'carabiner clip',
  'timestamp': datetime.datetime(2019, 12, 13, 21, 11, 32, 807000)},

In [5]:
reports_df = pd.DataFrame(reports.values())
reports_df.head(3)

Unnamed: 0,asin,frequency,key,query,timestamp
0,B07W1KDNRV,1,JKWywnNPFsSdfq83XtMaHb,outdoor dog fence system,2019-10-16 01:03:21.038
1,B07TV1TRDR,6,WEepzdhygS395iLP5vsgXN,carabiner clip,2019-11-27 19:24:36.278
2,B0001ZWZ8O,1,Z7aNboQ8DL9V43FyoxnaLo,outdoor dog fence system,2019-10-19 08:19:42.704


In [6]:
reports_df.groupby(['query']).apply(lambda g: g.sort_values(by=['frequency', 'timestamp'], ascending=[False, True]).to_dict(orient='records')).to_dict()

{u'b07dd9rqff': [{'asin': u'B07DD9RQFF',
   'frequency': 1,
   'key': '4Krb38NF5JXtqPvV2NDkUC',
   'query': u'b07dd9rqff',
   'timestamp': Timestamp('2019-10-17 05:24:28.518000')}],
 u'bath toy organizer': [{'asin': u'B01M6ZFU02',
   'frequency': 9,
   'key': 'B8LnPiTUYEaLsYxLYHrQxL',
   'query': u'bath toy organizer',
   'timestamp': Timestamp('2019-10-16 03:00:49.222000')},
  {'asin': u'B077YNNT54',
   'frequency': 3,
   'key': 'tWT2D2wnPbDr6g2Frwo2LB',
   'query': u'bath toy organizer',
   'timestamp': Timestamp('2019-10-16 02:57:31.588000')},
  {'asin': u'B01DAPMQO4',
   'frequency': 3,
   'key': 'brSeE5xBXw8gEV2PQfAUnX',
   'query': u'bath toy organizer',
   'timestamp': Timestamp('2019-11-01 19:02:11.320000')},
  {'asin': u'B000MVQBG4',
   'frequency': 3,
   'key': 'B9j9SSSg4nDswY9w74GN5e',
   'query': u'bath toy organizer',
   'timestamp': Timestamp('2019-11-01 19:02:18.533000')},
  {'asin': u'B006SFUDT4',
   'frequency': 2,
   'key': '55WckJx2QMN3amjM63iwye',
   'query': u'bath

In [7]:
#reports_df.groupby(['query']).agg({'frequency': 'sum', 'timestamp': 'last'}).reset_index().to_dict(orient='records')

reports_df.groupby(['asin']).agg({'frequency': 'sum', 'timestamp': 'last', 'query': [('size', 'count')]}).reset_index().sort_values(by=['timestamp'], ascending=[False]).to_dict(orient='records')

ValueError: The column label 'timestamp' is not unique.
For a multi-index, the label must be a tuple with elements corresponding to each level.

In [None]:
SAMPLE_QUERY_WHITELIST = ['carabiner clip']

def get_searched_queries(mongo_db, user_id):
    utc_time_now = utc_time()

    cursor = search_artifact.coll(mongo_db).find({
        'user_id':
        user_id,
        'type_id':
        search_artifact.TYPE_ID
    }).sort([('modified_at', -1)])

    results = []
    queries_in_results = set()

    for item in cursor:
        artifact_updater.apply_timeouts_to_state(item, utc_time_now)
        record = {
            'query': item['key']['query'] if 'key' in item else item['instance_id'],            
            'created_at': item['created_at'],
            'modified_at': item['modified_at'],
            'action_id': item['update']['action_id'],
            'action_done': item['update']['done'],
            'action_success': item['update'].get('success'),
            'updates_succeeded': item['stat'].get('updates_succeeded') if 'stat' in item else None,
            'avg_state_var': item['meta'].get('avg_state_var') if 'meta' in item else None            
        }
        results.append(record)
        queries_in_results.add(record['query'])

    injected_results = []
    for sample_query in SAMPLE_QUERY_WHITELIST:
        if sample_query not in queries_in_results:
            injected_results.append({
                'is_sample': True,
                'query': sample_query,
                'action_done': True,
                'action_success': True
            })

    return injected_results + results

get_searched_queries(mongo_db, user_id)

In [8]:
import flask
import pandas as pd
import pymongo
from flask_classful import FlaskView
from flask_jwt_extended import jwt_required

import amz_local_search.artifacts.search as search_artifact
from amz_local_search.base.instances import artifact_updater
from amz_local_search.config import get_config
from bb_utils.time.utils import utc_time
from bb_utils.utils.mmh3_hash import json_to_hash128_s
from web_app_api.extensions import get_user_id_from_jwt


def connect_to_mongo_db():
    mongo_uri = get_config().mongo_local_search_db_uri()
    client = pymongo.MongoClient(mongo_uri)
    return client.get_database()


SAMPLE_QUERY_WHITELIST = ['carabiner clip']


class Catalog(FlaskView):
    route_prefix = 'api/local-search/query'

    def get_asins_meta(self, mongo_db, asins):
        product_details = dict(
            (p['asin'], p)
            for p in mongo_db.products.find({'_id': {
                '$in': asins
            }}))
        return product_details

    def asin_report_key(self, query, asin):
        return json_to_hash128_s([query, asin])

    def get_asin_reports_from_actions(self, mongo_db, user_id,
                                      scan_depth=1000):
        matches = mongo_db.actions.find(
            {
                'user.id': user_id,
                'action_sig': 'amz_local_search.report(query, asin)',
                'state.success': True,
                'args.asin': {
                    '$nin': ['@latest', '@LATEST']
                }
            }, {
                'args.asin': 1,
                'args.query': 1,
                'occurred_at': 1
            }).sort([('occurred_at', -1)]).limit(scan_depth)

        reports = {}
        for match in matches:
            asin = match['args'].get('asin', '').strip()
            query = match['args'].get('query', '').strip()

            if not asin or asin.startswith('@'):
                continue

            key = self.asin_report_key(query, asin)

            report = reports.get(key)
            if report:
                report['frequency'] = report['frequency'] + 1
            else:
                reports[key] = {
                    'key': key,
                    'query': query,
                    'asin': asin,
                    'timestamp': match['occurred_at'],
                    'frequency': 1
                }
        return reports

    def get_searched_queries_from_artifacts(self, mongo_db, user_id):
        utc_time_now = utc_time()

        cursor = search_artifact.coll(mongo_db).find({
            'user_id':
            user_id,
            'type_id':
            search_artifact.TYPE_ID
        }).sort([('modified_at', -1)])

        results = []
        queries_in_results = set()

        for item in cursor:
            artifact_updater.apply_timeouts_to_state(item, utc_time_now)
            record = {
                'query':
                item['key']['query'] if 'key' in item else item['instance_id'],
                'created_at':
                item['created_at'],
                'modified_at':
                item['modified_at'],
                'action_id':
                item['update']['action_id'],
                'action_done':
                item['update']['done'],
                'action_success':
                item['update'].get('success'),
                'updates_succeeded':
                item['stat'].get('updates_succeeded')
                if 'stat' in item else None,
                'avg_state_var':
                item['meta'].get('avg_state_var') if 'meta' in item else None
            }
            results.append(record)
            queries_in_results.add(record['query'])

        injected_results = []
        for sample_query in SAMPLE_QUERY_WHITELIST:
            if sample_query not in queries_in_results:
                injected_results.append({
                    'is_sample': True,
                    'query': sample_query,
                    'action_done': True,
                    'action_success': True
                })

        return injected_results + results

    def get_view_for_asins(self, asin_reports_df):
        result = {}

        by_asin = asin_reports_df.groupby(['asin'])

        def group_to_sorted_records(g):
            sorted_g = g.sort_values(by=['timestamp', 'frequency'],
                                     ascending=[False, True])
            return sorted_g.to_dict(orient='records')

        result['groups'] = by_asin.apply(group_to_sorted_records).to_dict()

        agg_asins = by_asin.agg({
            'frequency': 'sum',
            'timestamp': 'last',
            'query': 'count'
        })
        result['keys'] = agg_asins.reset_index().sort_values(
            by=['timestamp'], ascending=[False]).to_dict(orient='records')

        return result

    def get_view_for_queries(self, mongo_db, user_id, asin_reports_df):
        result = {}

        by_query = asin_reports_df.groupby(['query'])

        def group_to_sorted_records(g):
            sorted_g = g.sort_values(by=['timestamp', 'frequency'],
                                     ascending=[False, True])
            return sorted_g.to_dict(orient='records')

        result['groups'] = by_query.apply(group_to_sorted_records).to_dict()

        result['keys'] = self.get_searched_queries_from_artifacts(mongo_db, user_id)

        return result


    def index(self):
        user_id = 1

        result = {}

        mongo_db = connect_to_mongo_db()

        asin_reports_dict = self.get_asin_reports_from_actions(
            mongo_db, user_id)
        asin_reports_df = pd.DataFrame(asin_reports_dict.values())

        result['by_asins'] = self.get_view_for_asins(asin_reports_df)

        result['by_queries'] = self.get_view_for_queries(
            mongo_db, user_id, asin_reports_df)
        return result

Catalog().index()

{'by_asins': {'groups': {u'0071357807': [{'asin': u'0071357807',
     'frequency': 2,
     'key': 'pbVDgQKZixxPKi9DQUZVcj',
     'query': u'outdoor dog fence system',
     'timestamp': Timestamp('2019-12-03 07:08:28.939000')}],
   u'0071774351': [{'asin': u'0071774351',
     'frequency': 1,
     'key': 'ZWqCPVYdij5U6PqQiBcy82',
     'query': u'outdoor dog fence system',
     'timestamp': Timestamp('2019-12-03 06:52:51.296000')}],
   u'0525568077': [{'asin': u'0525568077',
     'frequency': 1,
     'key': 'G8VFU9QePM98hf7ijhFeCn',
     'query': u'carabiner clip',
     'timestamp': Timestamp('2019-10-16 03:05:27.938000')}],
   u'0692927999': [{'asin': u'0692927999',
     'frequency': 1,
     'key': 'L4grc8oZwu2AhJy2o3gHWa',
     'query': u'coloring book',
     'timestamp': Timestamp('2019-10-11 04:22:45.452000')}],
   u'1945710799': [{'asin': u'1945710799',
     'frequency': 5,
     'key': 'zdi93LeGfPKAaQSoYs8PkV',
     'query': u'coloring book',
     'timestamp': Timestamp('2019-10-16 0

In [13]:
reports_df['asin'].to_list()

[u'B07W1KDNRV',
 u'B07TV1TRDR',
 u'B0001ZWZ8O',
 u'1945710799',
 u'B07CWKGZYL',
 u'B077YNNT54',
 u'B07VH4GK85',
 u'XYZ',
 u'B07RFD9P55',
 u'B0719B4LNH',
 u'B07TT353XQ',
 u'B000MVQBG4',
 u'B013NNG18U',
 u'B07DNHV7X9',
 u'UNDEFINED',
 u'B00IOZWC2M',
 u'B07JK9DYNR',
 u'B07MJ7N6VW',
 u'B01DAPMQO4',
 u'B07DD9RQFF',
 u'B00T56FQAE',
 u'UNDEFINED',
 u'B07X9962NK',
 u'0525568077',
 u'0071774351',
 u'1999896963',
 u'B072NCYVQK',
 u'B009SZXM4E',
 u'B07GQV8H5V',
 u'B07RMKR1GB',
 u'B016MAVG86',
 u'B00NPLSZF8',
 u'B076ZVX8TM',
 u'0692927999',
 u'B07SZ4VP91',
 u'B01C5QSENQ',
 u'B07KT6BM9Y',
 u'B073DXLPZ9',
 u'B01M6ZFU02',
 u'0071357807',
 u'B00ZTNK8FA',
 u'B07JJZTYSV',
 u'B006SFUDT4',
 u'B018L2WM86',
 u'B073GJ59QM',
 u'B07DD9RQFF',
 u'B06W9NCZYX',
 u'B07QH1YW88',
 u'B07VC529JB',
 u'B07RFL8J76']