In [16]:
import os
import datetime
import pymongo
import pandas as pd
import bb_utils.time.utils as tu
from pandas.plotting import scatter_matrix

In [17]:
DB_NAME = 'amz_local_search'
MONGO_URI = os.environ.get('AMZ_MONGO_URI_TEMPLATE') % (os.environ.get('AMZ_MONGO_PASSWORD'), DB_NAME)

def connect_to_db(mongo_uri):
    client = pymongo.MongoClient(mongo_uri)
    return client.get_database()

db = connect_to_db(MONGO_URI)

In [18]:
query = 'learning resources'
query = 'learning resources'

max_asins_per_search = 20
rank_na_value = max_asins_per_search + 10

In [19]:
from amz_local_search.analysis import get_results_per_query, _replace_with_none

    
start_date = tu.utc_time() - datetime.timedelta(days=90)
end_date = tu.utc_time()
rank_iter = get_results_per_query(db, query, max_asins_per_result=max_asins_per_search, start_date=start_date)

raw_rank_df = pd.DataFrame(
    rank_iter,
    columns=['date', 'state', 'rank', 'asin', 'avail_msg', 'get_by'])
raw_rank_df.head(3)

Unnamed: 0,date,state,rank,asin,avail_msg,get_by
0,2019-12-22,MI,1,B07P8WBK9F,,2019-12-24
1,2019-12-22,MI,2,B01N6L9JK2,,2019-12-24
2,2019-12-22,MI,3,B00B2B0I62,,2019-12-24


In [20]:
meta = {}
meta['num_days'] = raw_rank_df['date'].nunique() if len(raw_rank_df) else 0
meta['start_date'] = start_date
meta['end_date'] = end_date

In [25]:
# NOTE: This will ignore intra-day variations.
rank_df = raw_rank_df.groupby(['state', 'date',
                               'asin']).agg({'rank': 'mean'})

rank_df = rank_df.unstack('asin')['rank'].fillna(rank_na_value).stack(
    'asin')

# Aggregated metrics per ASIN.
#

rank_per_asin = rank_df.groupby(['asin']).agg(
    ['mean', 'std',
     'min']).sort_values(['mean'], ascending=True)[:max_asins_per_search]

rank_per_asin.head(3)

Unnamed: 0_level_0,mean,std,min
asin,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
B07P8WBK9F,2.326531,4.714286,1.0
B01N6L9JK2,3.091837,3.676606,1.0
B00B2B0I62,3.877551,2.681982,2.0


In [29]:
var_per_asin_in_state = rank_df.groupby(['state', 'asin']).std()
var_per_asin_in_state.head(3)

state  asin      
AK     B00000DMCE   NaN
       B00000DMD2   NaN
       B00000JGWY   NaN
dtype: float64

In [42]:
# Aggregate metrics per state.
#

var_per_asin_in_state = rank_df.groupby(['state', 'asin']).std()
total_var_per_state = var_per_asin_in_state.unstack('state').sum()

# Normalize using an empirical weight.
norm_weight = max_asins_per_search * rank_na_value * 0.9
var_per_state = (total_var_per_state /
                 norm_weight).sort_values(ascending=False)

var_per_state.head(3)

state
WY    0.0
MO    0.0
MI    0.0
dtype: float64

In [43]:
norm_weight = (len(rank_per_asin) * rank_na_value * 0.9)
rank_per_asin['std'].sum() / norm_weight

0.16342104693375972