In [2]:
import pymongo
import json
import pandas as pd
from pprint import pprint
import bb_utils.time.utils as tu
from bb_utils.io.utils import open_maybe_compressed
from amz_scrapy.spiders.amazon.search_api_spider import JsonResultsParser
from bson.objectid import ObjectId

In [3]:
LOCAL_MONGO_URI = 'mongodb://amzjet:^Y7zanY9#N_b9M7kALfdeoP1@localhost:30010/%s?authSource=admin'
SEARCH_FILE_NAME = '/Users/evgenyp/Projects/AmzJet/AmzJet/amz-apps/amz_local_search/resources/amazon/search-carabiner+clip-3p-3s.json.gz'

In [6]:
def convert_search_page_data(crawler_item, product_details_map = None):
    result_ts_utc = tu.utc_time(crawler_item['crawl_date'])

    payload = crawler_item['payload']

    req_params = payload['request']

    query = req_params['args']['query']
    ip_location = req_params['args']['location']
    max_pages_per_session = req_params['args']['max_pages']
    max_items_on_page = req_params['args']['max_page_results']

    page_num = req_params['search_page_num']
    session_id = req_params['proxy_session_id']

    search_data = JsonResultsParser.parse(payload['data'])

    asins_data = search_data['asin_details']

    ranks_data = search_data['search_results']
    first_item_rank = ranks_data['first_item_rank']
    total_item_count = ranks_data['total_results']
    page_count = ranks_data['page_count']

    asins = []
    rank_details_list = []
    product_details_list = []
    for item in ranks_data['results']:
        asin = item['asin']
        asins.append(asin)

        product_details = asins_data[asin]

        rank_details_list.append({
            'get_by': product_details.get('get_it_by_date'),
            'avail_msg': product_details.get('availability_message')
        })
        
        # Product attributes are returned in a separate collection that
        # can be used for display purposes.
        if product_details_map and not asin in product_details_map:
            product_details_map[asin] = {
                'asin': asin,
                'title': product_details.get('image_url'),
                'title': product_details.get('title')
            }

    search_page = {
        'args': {
            'query': query,
            'ip_loc': {
                'country': ip_location['country'],
                'state': ip_location['state']
            },
            'page_num': page_num,
            'session_id': session_id,
            'max_pages': max_pages_per_session,
            'max_items': max_items_on_page
        },
        'result': {
            'meta': {
                'time': result_ts_utc,
                'date': tu.to_midnight(result_ts_utc),
                'first_rank': first_item_rank,
                'total_matches': total_item_count,
                'total_pages': page_count,
                'num_items': len(asins)
            },
            'asins': asins,
            'details': rank_details_list
        },
    }
    return search_page,


def parse_paginated_search_results_from_file(fname, product_details_map=None):
    with open_maybe_compressed(fname) as file:
        for line in file:
            crawler_item = json.loads(line)
            search_page = convert_search_page_data(crawler_item, product_details_map)
            yield search_page

search_pages = list(parse_paginated_search_results_from_file(SEARCH_FILE_NAME))
print search_pages[0]

({'args': {'ip_loc': {'country': u'US', 'state': u'CO'}, 'page_num': 1, 'max_pages': 3, 'max_items': 48, 'session_id': u'5721be00b69c4631b87034e4a03db53b', 'query': u'carabiner clip'}, 'result': {'asins': [u'B0719B4LNH', u'B073DXLPZ9', u'B07CWKGZYL', u'B07DQFQG26', u'B07DQFQG26', u'B01MTQ4ZFH', u'B074JC8FHN', u'B01GJ4EGKU', u'B07715VC1T', u'B072NCYVQK', u'B0743BM3Q1', u'B06XD3H1FM', u'B07QSHRCKP', u'B07KTZ46FL', u'B01GA3N9MQ', u'B001DZNZZI', u'B074PRGQH6', u'B07DD9RQFF', u'B07C8G5HZM', u'B0725JFXCW', u'B01GA1TYHC', u'B07H2N3P9W', u'B07FLHH648', u'B077695Q9D', u'B07S7GVKRZ', u'B07S2J8ZB1', u'B07QH1YW88', u'B075R6Y9QH', u'B01L6DMUKC', u'B07NRFQYCF', u'B07KYHWKMZ', u'B016CMFESW', u'B07TV1TRDR', u'B07Q85RB26', u'B07BKSC8CF', u'B07PZ3GNGM', u'B07BPVFPYS', u'B07JLQ616L', u'B07238R98K', u'B01H1SK1SO', u'B07K7H917R', u'B07P6LK19C', u'B01M11GO6C', u'B07RYS9Z9P', u'B078CYFH5K', u'B07NYS4FMY', u'B073YFCH4J', u'B06XW3B9HB', u'B0758LHH52'], 'meta': {'num_items': 49, 'total_matches': 12031, 'total_p

In [51]:
# def create_search_task(user, args):
#     task_data = mongo_db.tasks.insert_one({
#             'args': args,
#             'user': {
#                 'id': user.get('id'),
#                 'trace': user.get('trace')
#             },
#             'created_at': tu.utc_time(),
#             'started_at': None,
#             'finished_at': None,
#         })

In [27]:
def get_mongo_db():
    mongo_uri = LOCAL_MONGO_URI % 'amz_local_search'
    client = pymongo.MongoClient(mongo_uri)
    return client.get_database()


def save_search_pages(mongo_db, norm_query, time_utc, search_pages):    
    tracker_id = ObjectId()

    for page in search_pages:
        page.pop('_id', None)
        page['args']['parent_id'] = tracker_id
    mongo_db.search_pages.insert_many(search_pages)
    
    tracker_record = mongo_db.search_tracker.insert_one({
        '_id': tracker_id,
        'query': norm_query,
        'time': time_utc,
        'date': tu.to_midnight(time_utc)
    })

    
mongo_db = get_mongo_db()
save_search_pages(mongo_db, 'carabiner clip', tu.utc_time(), search_pages)

In [28]:
def try_find_search_pages(mongo_db, norm_query, time_utc):
    date_utc = tu.to_midnight(time_utc)
    pages = mongo_db.search_pages.find({'args.query': norm_query, 'result.meta.date': date_utc})
    return pages

mongo_db = get_mongo_db()

cur = try_find_search_pages(mongo_db, 'carabiner clip', tu.utc_time())
len(list(cur))

600

In [24]:
def make_dataframe(search_pages, asins):
    asins_to_match = set(asins)
    state_to_matches = {}

    # NOTE: This will not return any rows, when search results are missing for a certain
    # state, for example, due to proxy issues. One of the options is to interpolate results
    # from other states.
    def generate_rows():
        for page in search_pages:
            args = page['args']
            state = args['ip_loc']['state']

            # Prepare data to track missing ASINs.
            #

            res_meta = page['result']['meta']

            total_items = res_meta['total_matches']
            num_items = res_meta['num_items']

            first_rank = res_meta['first_rank']
            last_rank = first_rank + num_items - 1

            state_matches = state_to_matches.get(state, {})
            state_to_matches[state] = state_matches

            state_matches['last_seen_rank'] = max(state_matches.get('last_seen_rank', last_rank), last_rank)
            state_matches['total_items'] = total_items
            matched_asins = state_matches.get('asins', set())

            # Enumerate ASINs on the page.
            for asin_index, asin in enumerate(page['result']['asins']):
                if asin in asins_to_match:
                    matched_asins.add(asin)

                    asin_details = page['result']['details'][asin_index]

                    # NOTE: It was observed that Amazon JSON search API can return
                    # +1 item over requested and that first_rank item on the next page
                    # does not account for that extra item. We ignore that issue here.
                    pos = first_rank + asin_index
                    yield [asin, state, pos, total_items, asin_details.get('get_by'), asin_details.get('avail_msg')]

        # NOTE: This assumes that pages were crawled in order.
        for state, state_matches in state_to_matches.items():
            matched_asins = state_matches.get('asins', [])
            for asin in asins:
                if asin not in matched_asins:
                    yield [
                        asin, state, -(state_matches['last_seen_rank'] + 1), state_matches['total_items'], None, None
                    ]

    headers = ['asin', 'state', 'pos', 'total', 'get_by', 'avail_msg']
    df = pd.DataFrame(generate_rows(), columns=headers)
    return df

pos_df = make_dataframe(search_pages, ['B0719B4LNH', 'B07P2VF8DN', 'SOMEASIN'])
pos_df

Unnamed: 0,asin,state,pos,total,get_by,avail_msg
0,B0719B4LNH,CO,1,12031,2019-08-22,
1,B0719B4LNH,NJ,1,14402,2019-08-22,
2,B0719B4LNH,NY,1,14816,2019-08-22,
3,B0719B4LNH,TN,1,14814,2019-08-22,
4,B0719B4LNH,MA,1,14814,2019-08-22,
5,B0719B4LNH,GA,1,14402,2019-08-22,
6,B0719B4LNH,WV,1,12031,2019-08-22,
7,B0719B4LNH,TN,1,14816,2019-08-22,
8,B0719B4LNH,CT,1,14402,2019-08-22,
9,B0719B4LNH,NH,1,12031,2019-08-22,


In [25]:
asin_df = pos_df[pos_df['pos'] > 0].groupby(['asin']).agg({'pos': ['min', 'max', 'mean', 'std'], 'get_by': ['min'], 'total': ['min']})
asin_df.columns = ['_'.join(col).strip() for col in asin_df.columns.values]
asin_df

Unnamed: 0_level_0,total_min,pos_min,pos_max,pos_mean,pos_std,get_by_min
asin,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
B0719B4LNH,12023,1,12,1.114583,1.122683,2019-08-21
B07P2VF8DN,14814,84,132,117.98913,6.439054,NaT


In [43]:
hits_df = pos_df[pos_df.pos > 0].groupby(['asin'])
misses_df = pos_df[pos_df.pos < 0].groupby(['asin'])

#df.drop_duplicates(subset=['A', 'B’], take_last=True, inplace=True)

asins_df = hits_df.agg({'pos': ['min', 'max', 'mean', 'std'], 'get_by': ['min'], 'total': ['min']})
asins_df.columns = ['_'.join(col).strip() for col in asins_df.columns.values]
asins_df

Unnamed: 0_level_0,total_min,pos_min,pos_max,pos_mean,pos_std,get_by_min
asin,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
B0719B4LNH,12023,1,12,1.114583,1.122683,2019-08-21
B07P2VF8DN,14814,84,132,117.98913,6.439054,NaT


In [44]:
misses_df = pos_df[pos_df.pos < 0].groupby( ['asin']).head(1).groupby([ 'asin'])
misses_df 

asins_df2 = misses_df.agg({'pos': ['min', 'max', 'mean', 'std'], 'get_by': ['min'], 'total': ['min']})
asins_df2.columns = ['_'.join(col).strip() for col in asins_df2.columns.values]
asins_df2

Unnamed: 0_level_0,total_min,pos_min,pos_max,pos_mean,pos_std,get_by_min
asin,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
B0719B4LNH,54239,-145,-145,-145,,NaT
B07P2VF8DN,54239,-145,-145,-145,,NaT
SOMEASIN,54239,-145,-145,-145,,NaT


In [45]:
print len(hits_df)
print len(misses_df)

2
3


In [50]:
misses_df.head(5)

Unnamed: 0,asin,state,pos,total,get_by,avail_msg
188,B0719B4LNH,WA,-145,54239,NaT,
189,B07P2VF8DN,WA,-145,54239,NaT,
190,SOMEASIN,WA,-145,54239,NaT,


In [51]:
pd.concat([asins_df, asins_df2]).drop_duplicates().head(10)

Unnamed: 0_level_0,total_min,pos_min,pos_max,pos_mean,pos_std,get_by_min
asin,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
B0719B4LNH,12023,1,12,1.114583,1.122683,2019-08-21
B07P2VF8DN,14814,84,132,117.98913,6.439054,NaT
B0719B4LNH,54239,-145,-145,-145.0,,NaT
