In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from datetime import datetime, timedelta, timezone
from matplotlib.backends.backend_pdf import PdfPages

In [2]:
# Load IPython's autoreload extension.
# Docs: https://ipython.readthedocs.io/en/stable/config/extensions/autoreload.html
# NOTE(review): only the extension is loaded here; no `%autoreload` mode is set
# in this cell, so reloading presumably has to be triggered manually — confirm.
%load_ext autoreload

In [3]:
#%autoreload now

In [4]:
# Local modules
from db import getData
from pd_helpers import get_modified_isd_df, get_filtered_by_time, get_filtered_by_listed_time_and_quant, get_filtered_by_sellers, get_filtered_by_min_3_price_avg
from pd_styling import set_caption

In [None]:
# Fetch every column of the isd_detailed_daily table through the project's DB helper
isd_query = "SELECT t.* FROM isd_detailed_daily t"
df = getData(isd_query)

In [None]:
# Peek at five randomly chosen rows of the raw data
df.sample(n=5)

In [None]:
# Run the project's ISD modification helper over the frame.
# NOTE: `df` is rebound to the helper's output, so re-running this cell feeds
# the already-modified frame back into the helper.
df = df.pipe(get_modified_isd_df)

In [None]:
# Show the column labels of the current `df`
df.columns

In [None]:
# These are the columns we can perform aggregations on: every column positioned
# from "number_of_sellers" through "avg_listed_time_new_3" (label slice, inclusive).
# The slice is taken over the columns directly rather than through row label 0,
# so it does not depend on the row index containing a unique label 0
# (a missing label would raise KeyError; a duplicated one would return row labels).
agg_cols = df.loc[:, "number_of_sellers":"avg_listed_time_new_3"].columns.to_list()
agg_cols

In [None]:
# Set -> component mapping, read from a feather file and keyed by item_url_name
set_components_raw = pd.read_feather("set_components.ftr")
sc_df = set_components_raw.set_index("item_url_name")
sc_df.sample(n=3)

In [None]:
# Candidate sets: take the rows flagged as sets, apply the time filter (days=3),
# reduce each item_url_name_rank group to per-column medians, then apply the
# price / seller-count / listed-time-and-quantity filters in sequence.
sets_df = (
    get_filtered_by_time(df[df["set"].eq(True)], days=3)
    .groupby("item_url_name_rank")
    .agg(dict.fromkeys(agg_cols, "median"))
    .pipe(get_filtered_by_min_3_price_avg, min_price=50)
    .pipe(get_filtered_by_sellers, min_sellers=2)
    .pipe(get_filtered_by_listed_time_and_quant, max_listed=500, min_quant=2)
)

# Index labels of the sets that survived all filters
sets_indices = sets_df.index.values

In [None]:
# Display the filtered sets frame
sets_df

In [None]:
# Build the result df for the Set-Component price difference.
#
# One record is collected per set and the frame is constructed once at the end.
# Filling a pre-allocated empty frame row by row leaves every column with
# object dtype, which turns later numeric operations such as `.round(1)`
# into no-ops; building from records yields proper numeric columns.
sc_diff_cols = ["comp_total", "set_total", "quantity", "sc_diff", "comp_listed_time", "set_listed_time"]

# get a time filtered df used to get recent prices/times
dff = get_filtered_by_time(df, days=3).groupby("item_url_name_rank").agg(dict.fromkeys(agg_cols, "median"))

sc_diff_rows = []

# loop through each set in the filtered group of sets meeting our criteria
for set_name in sets_indices:
    # Get the components of each set (with their quantity, i.e. how many are needed)
    set_components = sc_df[sc_df["set_root_url_name"] == set_name][["quantity"]]
    # Add some stats for each component. `reindex` is used instead of `.loc` so a
    # component that is absent from the time-filtered frame yields NaN instead of
    # raising KeyError and aborting the whole loop.
    set_components = set_components.assign(
        min_3_price_avg=lambda x: dff["min_3_price_avg"].reindex(x.index) * x.quantity,
        listed_time=lambda x: dff["avg_listed_time_new_3"].reindex(x.index),
    )

    # skipna=False: if any component price is missing, the total is NaN rather
    # than a silently understated sum (an empty component list still sums to 0).
    comp_total = set_components["min_3_price_avg"].sum(skipna=False)
    set_total = sets_df.loc[set_name, "min_3_price_avg"]

    # Make a row for the result df (the set-components df)
    sc_diff_rows.append({
        "comp_total": comp_total,
        "set_total": set_total,
        "quantity": set_components["quantity"].sum(),
        "sc_diff": set_total - comp_total,
        "comp_listed_time": round(set_components["listed_time"].mean(), 1),
        "set_listed_time": dff.loc[set_name, "avg_listed_time_new_3"],
    })

# Rows were appended in the order of `sets_indices`, so that same sequence is the index.
sc_diff_df = pd.DataFrame(
    sc_diff_rows,
    index=pd.Index(sets_indices, name=sets_df.index.name),
    columns=sc_diff_cols,
)

In [None]:
def style_worthy(v, props='', threshold=30):
    """Return a CSS string per element, set only where the value exceeds ``threshold``.

    Usable as a ``Styler.apply`` callback, e.g.
    ``.style.apply(style_worthy, props='color:green;', axis=0, subset="sc_diff")``.

    Parameters
    ----------
    v : array-like
        Numeric values to test (e.g. a pandas Series or a numpy array).
    props : str, default ''
        CSS properties emitted for elements strictly greater than ``threshold``.
    threshold : float, default 30
        Cutoff value; the default keeps the previously hard-coded limit.

    Returns
    -------
    numpy.ndarray
        Array shaped like ``v`` holding ``props`` where ``v > threshold`` and
        the empty string elsewhere.
    """
    return np.where(v > threshold, props, "")

In [None]:
# Ten rows with the largest sc_diff among those with quantity > 0, rounded to one decimal
top_sc_diff = (
    sc_diff_df
    .loc[sc_diff_df["quantity"] > 0]
    .sort_values(by="sc_diff", ascending=False)
    .head(10)
    .round(1)
)

# Colour the sc_diff and set_listed_time columns with gradients
s = (
    top_sc_diff.style
    # alternative highlight, currently disabled:
    # .apply(style_worthy, props='color:green;', axis=0, subset="sc_diff")
    .background_gradient(cmap="GnBu_r", subset="sc_diff", low=0.7, high=0.3)
    .background_gradient(cmap="GnBu", subset="set_listed_time", low=0.2, high=.8)
)
set_caption(s, "Price differences between the total cost of components and the cost of the set.", loc="top")
s

In [None]:
# Filtered sets ordered by avg_listed_time_new_3, highest first
sets_df.sort_values(by="avg_listed_time_new_3", ascending=False)