In [4]:
import json
import os

import pymongo
from bson.objectid import ObjectId

import bb_utils.time.utils as tu
from amz_scrapy.spiders.amazon.search_api_spider import JsonResultsParser
from bb_utils.io.utils import open_maybe_compressed

In [5]:
# Location of the crawler dump of web search results (one JSON document per
# line). The default is the original author's local checkout; set the
# WEB_RESULTS_FNAME environment variable to run the notebook against a copy
# stored elsewhere without editing this cell.
WEB_RESULTS_FNAME = os.environ.get(
    'WEB_RESULTS_FNAME',
    '/Users/evgenyp/Projects/AmzJet/AmzJet/__staging__/scrapy-launchers/local-local-search-web.json')

# Converter for web results

In [14]:
def norm_query(query):
    """Return ``query`` lower-cased with every whitespace run collapsed to one space.

    Leading and trailing whitespace is dropped as part of the collapse.
    """
    tokens = query.lower().split()
    return ' '.join(tokens)

def convert_search_page_data(crawler_item, product_details_map=None):
    """Reshape one crawled search-results item into a search-page record.

    Args:
        crawler_item: mapping with a 'payload' entry (holding 'request' and
            'response' sub-mappings) and a 'crawl_date' entry.
        product_details_map: optional dict keyed by ASIN. When supplied, each
            ASIN that is not yet a key gets an entry holding its 'asin',
            'title' and 'image_url'; entries already present are not touched.

    Returns:
        dict with two keys: 'args' (query, IP location, page number, session
        id, max pages) and 'result' ('meta' plus the 'asins',
        'sponsored_asins' and 'details' lists). 'details' is always an empty
        list for now.

    Raises:
        KeyError: when a directly indexed key is missing from the input.
    """
    payload = crawler_item['payload']

    request = payload['request']
    first_rank = request['first_item_rank']

    response_meta = payload['response']['meta']

    organic_asins, promoted_asins = [], []
    rank_details = []

    collect_details = product_details_map is not None

    for entry in payload['response']['products']:
        asin = entry['asin']

        # A truthy 'is_sponsored' flag routes the ASIN to the sponsored list.
        bucket = promoted_asins if entry.get('is_sponsored') else organic_asins
        bucket.append(asin)

        # Enable once available:
        # rank_details.append({
        #    'get_by':
        #    product_details.get('get_it_by_date'),
        #    'avail_msg':
        #    product_details.get('availability_message')
        # })

        # Display attributes go into a separate, caller-owned collection;
        # only the first occurrence of an ASIN is recorded.
        if collect_details and asin not in product_details_map:
            product_details_map[asin] = {
                'asin': asin,
                'title': entry.get('title'),
                'image_url': entry.get('image'),
            }

    crawl_time_utc = tu.utc_time(crawler_item['crawl_date'])
    request_args = request['args']
    location = request_args['location']

    page_args = {
        'query': request_args['query'],
        'ip_loc': {
            'country': location['country'],
            'state': location['state']
        },
        'page_num': request['search_page_num'],
        'session_id': request['proxy_session_id'],
        'max_pages': request_args['max_pages']
    }

    result_meta = {
        'time': crawl_time_utc,
        'date': tu.to_midnight(crawl_time_utc),
        'first_rank': first_rank,
        'total_matches': response_meta['total_results'],
        'internal': response_meta['internal'],
        'parser': response_meta['parser']
    }

    return {
        'args': page_args,
        'result': {
            'meta': result_meta,
            'asins': organic_asins,
            'sponsored_asins': promoted_asins,
            'details': rank_details
        },
    }


def parse_paginated_search_results_from_file(fname, product_details_map=None):
    """Lazily yield one converted search page per line of ``fname``.

    Each line is decoded with ``json.loads`` and passed, together with
    ``product_details_map``, to ``convert_search_page_data``. The file is
    opened through ``open_maybe_compressed`` and only once iteration starts,
    because this is a generator.
    """
    with open_maybe_compressed(fname) as source:
        for raw_line in source:
            yield convert_search_page_data(json.loads(raw_line),
                                           product_details_map)
            
# Parse every page in the results file into memory. The shared dict is filled
# as a side effect with one entry (asin, title, image_url) per distinct ASIN.
product_details_map = {}
pages = list(parse_paginated_search_results_from_file(WEB_RESULTS_FNAME, product_details_map))
# Bare expression: shows the parsed pages as the cell output.
pages

[{'args': {'ip_loc': {'country': u'US', 'state': u'NY'},
   'max_pages': 3,
   'page_num': 1,
   'query': u'biotin',
   'session_id': u'e2487d0833664af397ed60993197eed8'},
  'result': {'asins': [u'B00IOZWC2M',
    u'B009SZXM4E',
    u'B00T56FQAE',
    u'B0719HYML3',
    u'B07FJQ32BG',
    u'B00OJOFX2S',
    u'B01AMJCHB8',
    u'B00CYA4RIK',
    u'B00JKK3MAG',
    u'B0185PXU3K',
    u'B07DVRNCYS',
    u'B07J5Q6XXL',
    u'B07QR1299Y',
    u'B07GRF5H23',
    u'B00DS5BG52',
    u'B0016QX5RC',
    u'B00VX14U6K',
    u'B07RRYLSX9',
    u'B01M05I0PJ',
    u'B07PP31LC8',
    u'B07RX7WYBM',
    u'B079C73K48',
    u'B0797CG5R8',
    u'B07FWB52YG',
    u'B000GG6EU8',
    u'B00CDKAZY6',
    u'B07WW5W7L2',
    u'B00UNT5T04',
    u'B00DS5BGAC',
    u'B008PA091C',
    u'B00U3SMG8S',
    u'B07WFZY7BV',
    u'B01D6AQQQC',
    u'B008NC7SHU',
    u'B07VKHZ9MJ',
    u'B07Q6BV39V',
    u'B00ZQ0CKJI',
    u'B074H67KND',
    u'B072L3KTCH',
    u'B07B8TFKNK',
    u'B07CNC9CLD',
    u'B00VKHG5PG',
    u'B07PG