In [3]:
import os

import pymongo

import pandas as pd
import matplotlib.pyplot as plt
import bb_utils.time.utils as tu

In [4]:
DB_NAME = 'amz_local_search'
MONGO_URI = os.environ.get('AMZ_MONGO_URI_TEMPLATE') % (os.environ.get('AMZ_MONGO_PASSWORD'), DB_NAME)

def connect_to_db(mongo_uri):
    client = pymongo.MongoClient(mongo_uri)
    return client.get_database()

In [5]:
db = connect_to_db(MONGO_URI)

In [6]:
NUM_STATES = 50
NUM_PAGES = 3

available_results = list(db.search_pages.find({'args.query': 'carabiner clip'}))
print('Total pages: ', len(available_results))
print('Days covered: ', len(available_results) / NUM_STATES / NUM_PAGES / 2)

('Total pages: ', 101539)
('Days covered: ', 338)


In [7]:
def repalce_with_none(df):
    return df.astype(object).where(df.notnull(), None)

In [8]:
def _get_historical_data_iter(db, user_id, query, asin):
    ranks = db.search_pages.aggregate([
        {
            '$match': {
                'args.query': query,
                'result.asins': asin,
                'args.max_pages': {
                    '$gte': 3
                }
            }
        },
        {
            '$unwind': {
                'path': '$result.asins',
                'includeArrayIndex': 'index'
            }
        },
        {
            '$match': {
                'result.asins': asin
            }
        },
        {
            '$project': {
                'asin': '$result.asins',
                'loc': '$args.ip_loc',
                'meta': '$result.meta',
                'rank': {
                    '$add': ["$result.meta.first_rank", "$index"]
                }
            }
        },
        {
            '$sort': {
                'loc.state': 1
            }
        },
    ])

    for rank_data in ranks:
        yield (rank_data['meta']['date'], rank_data['loc']['state'],
               rank_data['rank'])
    
rank_df = pd.DataFrame(_get_historical_data_iter(db, None, 'swords for kids', 'B00EAHXP1U'), columns=['date', 'state', 'rank'])
rank_df.head(3)

Unnamed: 0,date,state,rank


In [9]:
rank_df.tail(20)

Unnamed: 0,date,state,rank


In [10]:
pivoted_rank_df = rank_df.pivot_table(index='state',
                                      columns='date',
                                      values='rank')

pivoted_rank_df = repalce_with_none(pivoted_rank_df)

rank_chart_data = []
for state, ranks in pivoted_rank_df.iterrows():
    data = []
    for ts, rank in ranks.iteritems():
        data.append([tu.unix_seconds(ts) * 1000, rank])
    rank_chart_data.append({'name': state, 'data': data})
    
rank_chart_data

DataError: No numeric types to aggregate

In [11]:
df.pivot_table(index='date', columns='state', values='rank').plot(style='.-', figsize=(20,10))

NameError: name 'df' is not defined

In [12]:
df.pivot_table(index='date', columns='state', values='rank').plot(style='.-', figsize=(20,10))

NameError: name 'df' is not defined

In [13]:
df.pivot_table(index='date', columns='state', values='rank').resample('D').interpolate(method='cubic').plot()

NameError: name 'df' is not defined

In [14]:
df.pivot_table(index='date', columns='state', values='rank')

NameError: name 'df' is not defined

In [15]:
def repalce_with_none(df):
    return df.astype(object).where(df.notnull(), None)

df = pd.DataFrame(get_historical_data_iter(db, 'carabiner clip', 'B07P2VF8DN'), columns=['date', 'state', 'rank'])
repalce_with_none(df.pivot_table(index='state', columns='date', values='rank').reset_index()).to_dict(orient='records')

NameError: name 'get_historical_data_iter' is not defined

In [16]:
def _get_historical_data_iter(db, query, asin):
    asins_to_match = [asin]
    ranks = db.search_pages.aggregate([
        {
            '$match': {
                'result.asins': {
                    '$in': asins_to_match
                }
            }
        },
        {
            '$unwind': {
                'path': '$result.asins',
                'includeArrayIndex': 'index'
            }
        },
        {
            '$match': {
                'result.asins': {
                    '$in': asins_to_match
                }
            }
        },
        {
            '$project': {
                'asin': '$result.asins',
                'loc': '$args.ip_loc',
                'meta': '$result.meta',
                'rank': {
                    '$add': ["$result.meta.first_rank", "$index"]
                }
            }
        },
        {
            '$sort': {
                'loc.state': 1
            }
        },
    ])

    for rank_data in ranks:
        yield (rank_data['meta']['date'], rank_data['loc']['state'], rank_data['rank'])



query = 'carabiner clip'
asin = 'B07P2VF8DN'

rank_iter = _get_historical_data_iter(db, query, asin)
rank_df = pd.DataFrame(rank_iter, columns=['date', 'state', 'rank'])

repalce_with_none(rank_df.groupby('state')['rank'].agg(['last', 'mean', 'std', 'min', 'max'])).reset_index().to_dict(orient='records')

[{'last': 135,
  'max': 142,
  'mean': 95.0,
  'min': 6,
  'state': u'AK',
  'std': 53.35876586594598},
 {'last': 128,
  'max': 138,
  'mean': 89.5925925925926,
  'min': 11,
  'state': u'AL',
  'std': 45.613981712138745},
 {'last': 136,
  'max': 143,
  'mean': 103.29166666666667,
  'min': 6,
  'state': u'AR',
  'std': 44.553318489264036},
 {'last': 143,
  'max': 143,
  'mean': 97.13793103448276,
  'min': 6,
  'state': u'AZ',
  'std': 48.47143793582466},
 {'last': 109,
  'max': 140,
  'mean': 67.39285714285714,
  'min': 6,
  'state': u'CA',
  'std': 52.68181777680514},
 {'last': 98,
  'max': 142,
  'mean': 101.56,
  'min': 11,
  'state': u'CO',
  'std': 49.00942086306809},
 {'last': 104,
  'max': 143,
  'mean': 93.38461538461539,
  'min': 11,
  'state': u'CT',
  'std': 46.886310943026366},
 {'last': 98,
  'max': 135,
  'mean': 83.31818181818181,
  'min': 11,
  'state': u'DE',
  'std': 46.75201687896558},
 {'last': 143,
  'max': 143,
  'mean': 69.42028985507247,
  'min': 6,
  'state': u'

In [17]:
agg_df = rank_df.groupby('state')['rank'].agg(['last', 'mean', 'std', 'min', 'max'])
agg_df.describe()

#agg_df['is_outlier'] = agg_df[]
#agg_df.describe()

Unnamed: 0,last,mean,std,min,max
count,50.0,50.0,50.0,50.0,50.0
mean,109.74,94.523033,48.276552,7.5,141.86
std,28.301446,9.035824,3.562372,2.31455,2.040508
min,46.0,67.392857,37.321157,6.0,135.0
25%,93.0,89.998792,46.500031,6.0,141.0
50%,119.0,96.845926,48.49199,6.0,143.0
75%,131.75,100.691071,50.428383,11.0,143.0
max,143.0,107.384615,54.772096,11.0,144.0


In [18]:
agg_df['last'].agg(['last', 'mean', 'std', 'min', 'max']).to_dict()



{'max': 143.0, 'mean': 109.74, 'min': 46.0, 'std': 28.301445841770946}

In [19]:
agg_df[agg_df['last'] == agg_df['last'].min()]

Unnamed: 0_level_0,last,mean,std,min,max
state,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
WA,46,81.580645,42.49296,6,139


In [20]:
agg_df[agg_df['last'] == agg_df['last'].max()].to_dict(orient='records')

[{'last': 143,
  'max': 143,
  'mean': 97.13793103448276,
  'min': 6,
  'std': 48.47143793582466},
 {'last': 143,
  'max': 143,
  'mean': 69.42028985507247,
  'min': 6,
  'std': 54.77209621188657},
 {'last': 143,
  'max': 143,
  'mean': 81.47368421052632,
  'min': 6,
  'std': 51.649855782365755},
 {'last': 143,
  'max': 143,
  'mean': 100.47826086956522,
  'min': 11,
  'std': 46.630919868120074},
 {'last': 143,
  'max': 143,
  'mean': 93.78260869565217,
  'min': 11,
  'std': 52.966504442764354}]