In [2]:
import pandas as pd
import pickle as pkl
import matplotlib.pyplot as plt
import plotly.express as px
import seaborn as sns
# Render every float in DataFrame/Series displays with exactly two decimal places.
pd.set_option('display.float_format', '{:.2f}'.format)

In [3]:
# Read the pickled full_df object from disk.
# NOTE(review): unpickling can execute arbitrary code — only load files produced by a trusted pipeline.
full_df_path = 'data/spring_2023/full_df.pkl'
with open(full_df_path, 'rb') as pickle_handle:
    full_df = pkl.load(pickle_handle)

# Number of rows in the loaded frame (displayed as the cell output).
len(full_df)

1247140

In [4]:

# The same item might be listed under multiple categories for one store,
# so keep a single row per (item_name, store_id) pair.
full_df = full_df.drop_duplicates(subset=['item_name', 'store_id'])

# Derive state and city from segments 3 and 4 of location_url split on '/'.
# NOTE(review): presumably URLs look like scheme://host/<state>/<city>/... — confirm against the data.
# Element-wise .str[i] yields NaN for a missing or too-short URL, whereas the
# previous zip(*...) unpacking raised as soon as a single row lacked both
# segments (and also on an empty frame).
url_parts = full_df['location_url'].str.split('/')
full_df['state'] = url_parts.str[3]
full_df['city'] = url_parts.str[4]

# Row count after de-duplication; kept as the last expression so the notebook
# actually displays it (mid-cell it was silently discarded).
len(full_df)

In [5]:

# Items included in the analysis. Only the first entry is active; the
# commented-out names are alternative candidates that can be re-enabled.
core_items = [
    'Supreme Taco Party Pack',
    # 'Chicken Quesadilla',
    # 'cinnabon delights 12 pack',
    # 'Crunchwrap SupremeÂ®',
    # 'beefy 5 layer burrito',
    # 'Nacho cheese doritos locos taco',
    # '3 doritos locos tacos combo'
]

# Keep only the rows whose item_name is one of the selected items.
is_core_item = full_df['item_name'].isin(core_items)
working = full_df.loc[is_core_item]

In [6]:
# Named aggregations applied to the 'value' column:
# output column name -> pandas reducer.
agg_funcs = {
    'mean': 'mean',
    'median': 'median',
    'num_stores': 'size',
    'nunique_prices': 'nunique',
    'max': 'max',
    'min': 'min',
    'sd': 'std'
}

########## in aggregate

# Column of the aggregated table used as the secondary sort key below.
# metric = 'median'
metric = 'mean'
assert metric in agg_funcs, f"metric must be one of {list(agg_funcs)}"

# Per-state summary across all selected items. Previously this expression was
# evaluated and thrown away (neither assigned nor the cell's last expression);
# it is now bound to a name and displayed at the end of the cell.
state_summary = working.groupby('state')['value'].agg(**agg_funcs)

# Per-(state, item) summary, sorted descending by item name and then by
# `metric`, with the group keys restored as ordinary columns.
state_distro = (
    working.groupby(['state', 'item_name'])['value']
    .agg(**agg_funcs)
    .sort_values(by=['item_name', metric], ascending=False)
    .reset_index()
)

state_summary


In [13]:
### median and percentiles

# Histogram of the 'mean' column of state_distro (one entry per row).
state_means = state_distro['mean']
fig = px.histogram(state_distro, x='mean', nbins=20, title='Histogram of Mean Values by State')

# Summary statistics of that column.
mean_value = state_means.mean()  # only needed if the disabled "Mean" marker below is re-enabled
median_value = state_means.median()
percentile_25, percentile_75 = state_means.quantile(0.25), state_means.quantile(0.75)

# Each reference line is drawn as a dashed vertical rule plus an arrow
# annotation anchored on the x axis (y=0) carrying the same label.
reference_lines = [
    # (mean_value, 'Mean', 'blue'),
    (median_value, 'Median', 'green'),
    (percentile_25, '25th Percentile', 'red'),
    (percentile_75, '75th Percentile', 'purple'),
]
for x_pos, label, colour in reference_lines:
    fig.add_vline(x=x_pos, line=dict(color=colour, dash='dash'), name=label)
    fig.add_annotation(x=x_pos, y=0, text=label, showarrow=True, arrowhead=1, ax=0, ay=-40)

fig.show()

In [19]:
### mean and percents away

# Histogram of the 'mean' column of state_distro with the overall mean and a
# symmetric +/- band around it marked, plus the share of rows inside the band.

# Half-width of the band as a fraction of the overall mean (0.20 -> mean +/- 20%).
band_fraction = 0.20

state_means = state_distro['mean']
fig = px.histogram(state_distro, x='mean', nbins=20, title='Histogram of Mean Values by State')

# Centre and edges of the band.
mean_value = state_means.mean()
plus_val = mean_value * (1 + band_fraction)
minus_val = mean_value * (1 - band_fraction)

# Fraction of rows with minus_val <= mean <= plus_val (both ends inclusive).
# The mean of the boolean mask equals count-in-range / total and gives NaN
# instead of a ZeroDivisionError when state_distro is empty.
pct_data_in_range = float(state_means.between(minus_val, plus_val).mean())

# Each entry: (x position, shape name, annotation text, line colour).
# The centre line is drawn at mean_value; it previously used `median_value`
# left over from an earlier cell, and the band edges carried percentile names.
band_label = f'{band_fraction:.0%}'
reference_lines = [
    (mean_value, 'Mean', 'Mean', 'green'),
    (plus_val, f'Mean +{band_label}', 'plus value', 'red'),
    (minus_val, f'Mean -{band_label}', 'minus value', 'purple'),
]
for x_pos, line_name, note, colour in reference_lines:
    fig.add_vline(x=x_pos, line=dict(color=colour, dash='dash'), name=line_name)
    fig.add_annotation(x=x_pos, y=0, text=note, showarrow=True, arrowhead=1, ax=0, ay=-40)

fig.show()

# Display the in-band share as the cell's output value (it was computed but never shown).
pct_data_in_range