In [71]:
%matplotlib notebook

import pandas as pd
import numpy as np
from pprint import pprint

import ..search_task as search_task 

In [72]:
SEARCH_FILE_NAME = '/Users/evgenyp/Projects/AmzJet/AmzJet/amz-apps/amz_local_search/resources/amazon/search-carabiner+clip-3p-3s.json.gz'

state_to_results = search_task.group_search_results_from_file(SEARCH_FILE_NAME)

# Total results per state

In [73]:
def get_total_matches_per_state_df(state_to_results):
    for state, search_samples in state_to_results.items():
        for sample_id, paged_results in search_samples.items():
            for page_num, page in sorted(paged_results.items(), key=lambda item: item[0]):
                yield state, page['total_items']
                break
            
total_matches_df = pd.DataFrame(get_total_matches_per_state_df(state_to_results), columns=['state', 'items'])
total_matches_df.head(5)

Unnamed: 0,state,items
0,WA,11915
1,WA,11915
2,WA,14716
3,DE,11913
4,DE,14717


In [74]:
total_matches_df.groupby('state')['items'].mean().describe()

count       46.000000
mean     13488.605072
std        805.595800
min      11913.000000
25%      12847.083333
50%      13639.166667
75%      14269.750000
max      14715.666667
Name: items, dtype: float64

# Positions per state

In [102]:
def make_dataframe(state_to_results, asins):
    def make_row(state, page, item, pos):
        return [
            item['asin'], state, pos, page['total_items'],
            item.get('get_it_by_date'),
            item.get('availability_message'),
            item.get('stock_count')
        ]

    # TODO: It will not return any rows, when search results are missing for a
    # certain state due to proxy issues.
    def generate_rows():
        for state, search_samples in state_to_results.items():
            unmachted_asins_across_samples = set(asins)
            max_pos = 0

            for sample_id, paged_results in search_samples.items():
                pos = 0

                unmachted_asins_within_sample = set(asins)

                for page_num, page in sorted(paged_results.items(), key=lambda item: item[0]):
                    for item in page['items']:
                        pos += 1
                        max_pos = max(pos, max_pos)
                        if item['asin'] in unmachted_asins_within_sample:
                            unmachted_asins_within_sample.remove(item['asin'])
                            unmachted_asins_across_samples.discard(item['asin'])
                            yield make_row(state, page, item, pos)

            for missied_asin in unmachted_asins_across_samples:
                yield make_row(state, page, {'asin': missied_asin}, -max_pos)

    headers = ['asin', 'state', 'pos', 'total', 'get_by', 'stock_msg', 'stock_count']
    df = pd.DataFrame(generate_rows(), columns=headers)
    # df.set_index([df.pop('asin'), df.pop('state')], inplace=True)
    # df.sort_index(inplace=True)
    return df
            
asins = set(['B0719B4LNH', 'B07P2VF8DN'])

pos_df = make_dataframe(state_to_results, asins)
# pos_df
pos_df.sort_values(by=['asin', 'state']).to_dict(orient='records')

[{'asin': u'B0719B4LNH',
  'get_by': Timestamp('2019-08-23 00:00:00'),
  'pos': 1,
  'state': u'AK',
  'stock_count': None,
  'stock_msg': None,
  'total': 14280},
 {'asin': u'B0719B4LNH',
  'get_by': Timestamp('2019-08-23 00:00:00'),
  'pos': 1,
  'state': u'AK',
  'stock_count': None,
  'stock_msg': None,
  'total': 14703},
 {'asin': u'B0719B4LNH',
  'get_by': Timestamp('2019-08-23 00:00:00'),
  'pos': 1,
  'state': u'AK',
  'stock_count': None,
  'stock_msg': None,
  'total': 14703},
 {'asin': u'B0719B4LNH',
  'get_by': NaT,
  'pos': 1,
  'state': u'AL',
  'stock_count': None,
  'stock_msg': None,
  'total': 12778},
 {'asin': u'B0719B4LNH',
  'get_by': NaT,
  'pos': 1,
  'state': u'AL',
  'stock_count': None,
  'stock_msg': None,
  'total': 10714},
 {'asin': u'B0719B4LNH',
  'get_by': NaT,
  'pos': 1,
  'state': u'AL',
  'stock_count': None,
  'stock_msg': None,
  'total': 13178},
 {'asin': u'B0719B4LNH',
  'get_by': Timestamp('2019-08-19 00:00:00'),
  'pos': 1,
  'state': u'AR',
  

In [104]:
df = pos_df.groupby(['asin', 'state']).agg({'pos': ['min', 'max', 'mean', 'std'], 'get_by': ['min'], 'total': ['min']})
df.columns = ['_'.join(col).strip() for col in df.columns.values]
df

df.reset_index().astype(object).where(pd.notnull(), None).to_dict(orient='records')
#df.reset_index().to_dict(orient='records')
#asin_filter = df.index.get_level_values('asin') == 'B075D83YY5'
#df[asin_filter]

TypeError: notna() takes exactly 1 argument (0 given)

In [107]:
asin_df = pos_df.groupby(['asin']).agg({'pos': ['min', 'max', 'mean', 'std'], 'get_by': ['min'], 'total': ['min']})
asin_df.columns = ['_'.join(col).strip() for col in asin_df.columns.values]
asin_df

Unnamed: 0_level_0,total_min,pos_min,pos_max,pos_mean,pos_std,get_by_min
asin,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
B0719B4LNH,10714,1,21,1.903704,3.46383,2019-08-18
B07P2VF8DN,10716,-145,136,73.036036,71.589096,NaT


# Outliers

In [23]:
means_df = pos_df.groupby('state').mean()
means_df[means_df['pos'] > means_df['pos'].mean() + 1 * means_df['pos'].std()]

Unnamed: 0_level_0,pos,total
state,Unnamed: 1_level_1,Unnamed: 2_level_1
AL,144.0,54333.666667
NH,144.0,54344.0
NJ,140.0,54588.0
NY,145.0,54469.0
RI,142.666667,54499.333333
