In [4]:
import os
import datetime
import pymongo
import pandas as pd
import bb_utils.time.utils as tu
from pandas.plotting import scatter_matrix

In [5]:
DB_NAME = 'amz_local_search'

# Build the connection URI from the environment so that no credentials live in
# the notebook. Indexing os.environ (instead of calling .get) makes an unset
# variable fail right here with a KeyError that names it, rather than with a
# TypeError from `None % (...)` or -- for a `%s` placeholder -- with the text
# "None" silently formatted into the URI as the password.
# NOTE(review): PyMongo expects reserved characters in the password to be
# percent-escaped; presumably AMZ_MONGO_PASSWORD is stored already escaped --
# confirm before adding any URL quoting here.
MONGO_URI = os.environ['AMZ_MONGO_URI_TEMPLATE'] % (
    os.environ['AMZ_MONGO_PASSWORD'], DB_NAME)

def connect_to_db(mongo_uri):
    """Open a MongoDB client for ``mongo_uri`` and return a database handle.

    The handle is whatever ``MongoClient.get_database()`` yields when called
    without a name (PyMongo is expected to fall back to the database named in
    the URI -- confirm against the installed PyMongo version).
    """
    return pymongo.MongoClient(mongo_uri).get_database()

# Notebook-wide database handle; later cells pass it to the query helpers.
db = connect_to_db(mongo_uri=MONGO_URI)

In [6]:
# Search phrase whose result rankings are analysed in this notebook.
query = 'carabiner clip'

# How many top results are kept per search.
max_asins_per_search = 20

# Placeholder rank for an ASIN that is absent from a search's kept results:
# ten positions past the cut-off.
rank_na_value = 10 + max_asins_per_search

In [11]:
from amz_local_search.analysis import get_results_per_query, _replace_with_none

    
# Restrict the analysis to the last 90 days, counted back from tu.utc_time()
# (presumably the current UTC timestamp -- confirm in bb_utils).
start_date = tu.utc_time() - datetime.timedelta(days=90)

# One record per observed search result for `query`.
rank_iter = get_results_per_query(
    db,
    query,
    max_asins_per_result=max_asins_per_search,
    start_date=start_date,
)

# Materialise the records into a frame with one row per result.
raw_rank_df = pd.DataFrame(
    rank_iter,
    columns=[
        'date',
        'state',
        'rank',
        'asin',
        'avail_msg',
        'get_by',
    ],
)
raw_rank_df.head(3)

Unnamed: 0,date,state,rank,asin,avail_msg,get_by
0,2019-09-17,CO,1,B07CWKGZYL,,2019-09-18
1,2019-09-17,CO,2,B0719B4LNH,,2019-09-18
2,2019-09-17,CO,3,B07DQFQG26,,NaT


# Availability

In [12]:
# Keep only the rows that carry an availability message.
raw_rank_df[raw_rank_df['avail_msg'].notnull()]

Unnamed: 0,date,state,rank,asin,avail_msg,get_by
12256,2019-10-15,SC,17,B077695Q9D,Currently unavailable,NaT
12277,2019-10-15,WV,18,B077695Q9D,Currently unavailable,NaT
18691,2019-10-18,VA,12,B07BPVFPYS,Only 1 left in stock - order soon.,2019-10-21
19155,2019-10-18,MD,16,B077695Q9D,"In stock on October 25, 2019",NaT
19195,2019-10-18,GA,16,B07BPVFPYS,Only 1 left in stock - order soon.,2019-10-21
19497,2019-10-19,CA,18,B077695Q9D,"In stock on October 24, 2019",NaT
19815,2019-10-19,IL,16,B077695Q9D,"In stock on October 24, 2019",NaT
19836,2019-10-19,NC,17,B077695Q9D,"In stock on October 24, 2019",NaT
20075,2019-10-19,MN,16,B077695Q9D,"In stock on October 24, 2019",NaT
20255,2019-10-19,LA,16,B077695Q9D,"In stock on October 24, 2019",NaT


In [13]:
# All observations of one ASIN on a single day, across states.
raw_rank_df[
    (raw_rank_df['asin'] == 'B07BPVFPYS')
    & (raw_rank_df['date'] == '2019-10-18')
]

Unnamed: 0,date,state,rank,asin,avail_msg,get_by
18459,2019-10-18,MA,20,B07BPVFPYS,,2019-10-23
18691,2019-10-18,VA,12,B07BPVFPYS,Only 1 left in stock - order soon.,2019-10-21
19195,2019-10-18,GA,16,B07BPVFPYS,Only 1 left in stock - order soon.,2019-10-21


# Global rank

In [15]:
report = {}

# Build a series holding one rank per (state, date, asin) combination.
#
# Every ASIN seen in the kept results gets a value for every observed
# state/date pair: where an ASIN falls outside the kept results for a given
# pair, the gap is filled with the placeholder `rank_na_value`, so that
# dropping out of the results still shows up as rank variability.
#
# NOTE: ranks are averaged per state/date/asin, so intra-day variations
# are ignored.
rank_df = (
    raw_rank_df
    .groupby(['state', 'date', 'asin'])
    .agg({'rank': 'mean'})
    .unstack('asin')['rank']
    .fillna(rank_na_value)
    .stack('asin')
)

# Per-ASIN summary: mean, standard deviation and best (minimum) rank over
# all state/date pairs, ordered by mean rank and truncated to the first
# `max_asins_per_search` rows.
rank_per_asin = (
    rank_df
    .groupby(['asin'])
    .agg(['mean', 'std', 'min'])
    .sort_values('mean')
    .iloc[:max_asins_per_search]
)

# Store the summary as a list of per-ASIN records. `_replace_with_none`
# presumably swaps missing values for None -- confirm in
# amz_local_search.analysis.
report['rank_per_asin'] = (
    _replace_with_none(rank_per_asin)
    .reset_index()
    .to_dict(orient='records')
)

# Per-state variability score.

# Empirical normalisation weight.
norm_weight = max_asins_per_search * rank_na_value * 0.9

# Despite the "var" names, these hold standard deviations: the spread of
# each ASIN's rank over dates within a state ...
var_per_asin_in_state = rank_df.groupby(['state', 'asin']).std()

# ... summed over all ASINs, giving one total per state.
total_var_per_state = var_per_asin_in_state.unstack('state').sum()

# Scale by the weight and list the most variable states first.
var_per_state = total_var_per_state.div(norm_weight).sort_values(
    ascending=False)

In [18]:
# Summary statistics of the per-state variability score.
(
    var_per_state
    .agg(['mean', 'median', 'max', 'min'])
    .to_dict()
)

{'max': 0.5056768273096222,
 'mean': 0.26127087553273204,
 'median': 0.2580383235254625,
 'min': 0.11260424681169723}