In [1]:
import os
import pymongo
import pandas as pd
import bb_utils.time.utils as tu

In [2]:
# Name of the database that is substituted into the connection URI.
DB_NAME = 'amz_local_search'

# Build the connection URI from the environment. Indexing os.environ (rather
# than .get) raises a KeyError naming the missing variable. With .get, a
# missing password would be formatted into the URI as the literal string
# 'None', and a missing template would fail with an opaque
# "NoneType % tuple" TypeError.
# The template is filled with (password, database name), in that order.
MONGO_URI = os.environ['AMZ_MONGO_URI_TEMPLATE'] % (
    os.environ['AMZ_MONGO_PASSWORD'], DB_NAME)

def connect_to_db(mongo_uri):
    """Open a MongoDB client for ``mongo_uri`` and return its database.

    ``get_database()`` is called without a name, so the database is
    whatever the client resolves from the URI itself.
    """
    return pymongo.MongoClient(mongo_uri).get_database()

# Database handle used by the data-loading cells below.
db = connect_to_db(MONGO_URI)

In [3]:
from amz_local_search.analysis import _get_historical_data_iter, _replace_with_none, _num_series_to_js_series, _to_js_ts

# Search query and product (ASIN) whose rank history is analysed.
query = 'carabiner clip'
asin = 'B0719B4LNH'

# Filter passed through to the history lookup.
data_filter = {
    'tag': 'device',
    'device_type': 'desktop'
}
rank_iter = _get_historical_data_iter(
    db, user_id=None, query=query, asin=asin, data_filter=data_filter)

raw_rank_df = pd.DataFrame(rank_iter,
                           columns=[
                               'date', 'state', 'rank', 'avail_msg',
                               'get_by_date', 'device_type'
                           ])
# Rows without a device_type are labelled 'mobile'. Assign the result back
# instead of calling fillna(inplace=True) on the selected column: the
# in-place form operates on an intermediate object, warns in recent pandas
# and leaves the frame unchanged under Copy-on-Write.
raw_rank_df['device_type'] = raw_rank_df['device_type'].fillna('mobile')
raw_rank_df.head(5)

Unnamed: 0,date,state,rank,avail_msg,get_by_date,device_type
0,2020-01-02,FL,3,,2020-01-04 00:00:00,desktop
1,2020-01-02,CA,3,,2020-01-03 00:00:00,desktop
2,2020-01-03,FL,3,,2020-01-06 00:00:00,desktop
3,2020-01-05,FL,2,,2020-01-07 00:00:00,desktop
4,2020-01-05,CA,2,,2020-01-07 00:00:00,desktop


In [23]:
from amz_local_search.analysis import normalize_rank_df
# Empty container for the report.
report = dict()

# normalize_rank_df returns two frames: the first is displayed below, the
# second is used by the chart cell further down.
(daily_rank_df, rank_df_with_state_cols) = normalize_rank_df(raw_rank_df)

daily_rank_df

Unnamed: 0,state,date,get_by_date,avail_msg,rank
0,CA,2020-01-02,2020-01-03 00:00:00,,3.0
1,FL,2020-01-02,2020-01-04 00:00:00,,3.0
2,CA,2020-01-03,,,2.666667
3,FL,2020-01-03,2020-01-06 00:00:00,,3.0
4,CA,2020-01-05,2020-01-07 00:00:00,,2.0
5,FL,2020-01-05,2020-01-07 00:00:00,,2.0


In [24]:
# Latest recorded date within each state, broadcast back onto every row of
# daily_rank_df. The groupby is built inline so this cell runs on a fresh
# kernel; previously it relied on `ranks_by_state`, which is only assigned
# in a cell further down the notebook.
daily_rank_df.groupby(['state'])['date'].transform('max')

0   2020-01-05
1   2020-01-05
2   2020-01-05
3   2020-01-05
4   2020-01-05
5   2020-01-05
Name: date, dtype: datetime64[ns]

In [25]:
# Two groupings of the daily ranks: one per state, one per date.
ranks_by_state = daily_rank_df.groupby(['state'])
ranks_by_date = daily_rank_df.groupby('date')

# Keep only the rows whose date equals the latest date seen for their
# state, then group those rows by state again.
last_per_state = daily_rank_df.loc[
    daily_rank_df['date'].eq(ranks_by_state['date'].transform('max'))
].groupby('state')

last_per_state.head(10)

Unnamed: 0,state,date,get_by_date,avail_msg,rank
4,CA,2020-01-05,2020-01-07 00:00:00,,2.0
5,FL,2020-01-05,2020-01-07 00:00:00,,2.0


In [20]:
# Per-state summary statistics over every daily rank of that state.
agg_df = ranks_by_state['rank'].agg(['mean', 'std', 'min', 'max', 'median'])

# 'last' is the mean rank over the rows of each state's latest date;
# taking the plain last element would only pick up a single data point.
agg_df['last'] = last_per_state['rank'].mean()
# Last get_by_date per state, converted with tu.unix_seconds and scaled by
# 1000 (presumably seconds -> milliseconds; confirm against bb_utils).
# Null dates become None.
agg_df['get_by'] = last_per_state['get_by_date'].last().map(
    lambda value: None if pd.isnull(value) else 1000 * tu.unix_seconds(value))
agg_df['avail_msg'] = last_per_state['avail_msg'].last()


# Summary of the latest ranks across all states.
agg_df['last'].agg(['mean', 'std', 'min', 'max', 'median'])

mean      2.0
std       0.0
min       2.0
max       2.0
median    2.0
Name: last, dtype: float64

In [21]:
# Display the per-state aggregate table built in the previous cell.
agg_df

Unnamed: 0_level_0,mean,std,min,max,median,last,get_by,avail_msg
state,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
CA,2.555556,0.509175,2.0,3.0,2.666667,2.0,1578355000000.0,
FL,2.666667,0.57735,2.0,3.0,3.0,2.0,1578355000000.0,


In [28]:
 pivoted_rank_df = _replace_with_none(rank_df_with_state_cols)

rank_chart_data = []
for state in pivoted_rank_df:
    ranks = pivoted_rank_df[state]
    data = _num_series_to_js_series(ranks.iteritems())
    rank_chart_data.append({'name': state, 'data': data})

# TODO: This needs to be computed over the time series with
# the missing values filled.
rank_df_indicators = ranks_by_date['rank'].agg(
    ['min', 'max', 'mean', 'median'])

rank_chart_indicators = {'range': [], 'average': [], 'median': []}
for ts, row in rank_df_indicators.iterrows():
    ts_js = _to_js_ts(ts)
    rank_chart_indicators['range'].append([ts_js, row['min'], row['max']])
    rank_chart_indicators['average'].append([ts_js, row['mean']])
    rank_chart_indicators['median'].append([ts_js, row['median']])

report = {}
report['aggRanks'] = _replace_with_none(agg_df).reset_index().to_dict(
    orient='records')
report['rankHistory'] = rank_chart_data
report['rankHistoryIndicators'] = rank_chart_indicators
report['rankHistoryIndicators']

{'average': [[1577923200000.0, 3.0],
  [1578009600000.0, 2.8333333333333335],
  [1578182400000.0, 2.0]],
 'median': [[1577923200000.0, 3.0],
  [1578009600000.0, 2.8333333333333335],
  [1578182400000.0, 2.0]],
 'range': [[1577923200000.0, 3.0, 3.0],
  [1578009600000.0, 2.666666666666667, 3.0],
  [1578182400000.0, 2.0, 2.0]]}