In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import gmaps
import gmaps.datasets
import ipyleaflet
from IPython.display import display

In [None]:
# Load the key/value pairs stored in the local 'Ignore.env' file.
# env_variables is indexed by key later on (env_variables['API_KEY2']) when gmaps is configured.
from dotenv import dotenv_values
env_variables = dotenv_values('Ignore.env')

In [None]:
# Read the wrangled data set into `df`, the frame every later cell works on.
# NOTE(review): this is an absolute, machine-specific path; it must be changed to run elsewhere.
df = pd.read_csv('/Users/emilydanielbowser/Documents/Iowa Food Coop/Data/Intermediate Data/Wrangled_data')

In [None]:
# Preview the first rows of the freshly loaded frame.
df.head()

In [None]:
# Remove the 'Unnamed: 0' column (presumably a saved index from an earlier export -- TODO confirm).
df = df.drop(columns=['Unnamed: 0'])


In [None]:
# (rows, columns) of the frame after the 'Unnamed: 0' column was dropped.
df.shape

In [None]:
# List the column names available for the steps below.
df.columns

In [None]:
# Replace missing values with 0 in SaleNom and in every per-category sales column.
sales_columns = [
    'SaleNom',
    'Baked Goods',
    'Beverages',
    'Classes and Events',
    'Condiments + Sauces',
    'Dairy',
    'Dried Herbs + Spices',
    'Eggs',
    'Grains, Flours, Cereal + Pastas',
    'Handmade Home Goods + Gifts',
    'Honey, Syrups, Jams + Jellies',
    'Iowa Food Co-op Shop',
    'Local Produce',
    'Meat - Beef',
    'Meat - Chicken + Capon',
    'Meat - Pork',
    'Meats - Other',
    'Non-Food Items',
    'Nuts',
    'Other Protein Sources',
    'Personal Care',
    'Pet + Animal Care',
    'Prepared Foods',
    'Snacks',
    'The Garden Center',
]
df[sales_columns] = df[sales_columns].fillna(0)

In [None]:
# Sort rows by cycle id (oldest to newest) so the cumulative columns built below accumulate in time order.
df = df.sort_values('IDCyc')

In [None]:
# Running total of each member's sales, row by row; needed because total sales can't be used in the final model.
df['Cumulative_Sum'] = df.groupby('IDMemb')['SaleNom'].cumsum()

In [None]:
# 1 when the member placed an order (SaleNom > 0) during that ordering period, otherwise 0.
df['Ordered'] = df['SaleNom'].gt(0).astype('int64')

In [None]:
# Running count, per member, of the ordering periods in which they ordered (sum of the Ordered flag).
df['Cumulative_Number_of_Orders'] = df.groupby('IDMemb')['Ordered'].cumsum()

In [None]:
# Average order size over the cycles in which the member actually ordered.
# Rows where Cumulative_Number_of_Orders is still 0 divide by zero and come out as NaN/inf.
df['order_per_cycle_when_ordering'] = df['Cumulative_Sum'].div(df['Cumulative_Number_of_Orders'])

In [None]:
# Re-sort by cycle id, then member id, both ascending.
df = df.sort_values(['IDCyc', 'IDMemb'])

In [None]:
# 1-based running count of the rows (cycles) seen so far for each member.
df['Cycles_as_member'] = df.groupby('IDMemb').cumcount().add(1)

In [None]:
# Average spend per cycle: cumulative sales divided by the number of cycles the member has been in the data.
df['order_per_cycle'] = df['Cumulative_Sum'].div(df['Cycles_as_member'])

In [None]:
# Re-check the column list now that the cumulative and per-cycle columns have been added.
df.columns

In [None]:
#Dropping people who are out of state and rarely order since they are outliers
# Rows with a missing Distance_to_pickup are dropped as well, because NaN < 200 is False.
# .copy() makes the result an independent frame, so the column assignments in later
# cells do not raise SettingWithCopyWarning.
df = df[df['Distance_to_pickup']<200].copy()

In [None]:
# Count the missing values in each column after the distance filter.
df.isna().sum()

In [None]:
# Members with no orders yet have no per-order average; record it as 0 instead of NaN.
df = df.fillna({'order_per_cycle_when_ordering': 0})

In [None]:
# Show the rows that have no latitude value.
df[df['latitude'].isna()]

In [None]:
# Members 1347 and 1351 are test accounts; exclude all of their rows.
df = df[~df['IDMemb'].isin([1347, 1351])]

In [None]:
# Most members have never ordered, so keep only those whose cumulative sales are above 0 by cycle 334.
# Consequence: the model will not predict a first order -- it only predicts order amounts for
# members who have ordered at least once. The threshold could be raised (e.g. to 4 orders) so
# that more history is available before predicting.
has_ordered_by_334 = df['IDCyc'].eq(334) & df['Cumulative_Sum'].gt(0)
orderers_only = df.loc[has_ordered_by_334, 'IDMemb']
df = df[df['IDMemb'].isin(orderers_only)]

In [None]:
# Summary statistics of the filtered frame.
df.describe()

In [None]:
from collections.abc import Iterable

In [None]:
# Heatmap of member locations for the rows belonging to cycle 334.
# NOTE(review): the only filter here is IDCyc == 334 (no SaleNom > 0 condition), so this shows
# every remaining member in that cycle, not only those who ordered during it.
# The Google Maps key is read from env_variables under 'API_KEY2'.
gmaps.configure(env_variables['API_KEY2'])
fig = gmaps.figure()
p = df[df['IDCyc']==334]
locations = p[['latitude','longitude']]
heatmap_layer = gmaps.heatmap_layer(locations)
fig.add_layer(heatmap_layer)
# Disabled marker layer; pickup_locs is not defined anywhere in this notebook.
#markers = gmaps.marker_layer(pickup_locs)
#fig.add_layer(markers)

# Display tuning for the heatmap layer.
heatmap_layer.max_intensity = 50
heatmap_layer.point_radius = 3

# Last expression of the cell: renders the map.
fig

In [None]:
# Map of all customers who have ordered since June of 2021
# Every row of df contributes one point; presumably a member who appears in several
# cycles is therefore counted once per cycle -- TODO confirm this weighting is intended.
gmaps.configure(env_variables['API_KEY2'])
fig = gmaps.figure()
locations = df[['latitude','longitude']]
heatmap_layer = gmaps.heatmap_layer(locations)
fig.add_layer(heatmap_layer)
# Disabled marker layer; pickup_locs is not defined anywhere in this notebook.
#markers = gmaps.marker_layer(pickup_locs)
#fig.add_layer(markers)

# Display tuning for the heatmap layer.
heatmap_layer.max_intensity = 50
heatmap_layer.point_radius = 3

# Last expression of the cell: renders the map.
fig

In [None]:
# Column names available for the plots below.
df.columns

In [None]:
# Box plot of cycle-334 sales per pick-up location, limited to rows with a positive sale.
plt.subplots(figsize=(12, 8))
sns.boxplot(
    data=df[df['IDCyc'].eq(334) & df['SaleNom'].gt(0)],
    x='CDLocLast',
    y='SaleNom',
)
plt.xticks(rotation='vertical')
plt.ylabel('Sales During Cycle 334')
plt.xlabel('Pick up Location');

In [None]:
# Box plot of each member's cumulative sales as of cycle 334, grouped by pick-up location.
plt.subplots(figsize=(12, 8))
sns.boxplot(
    data=df[df['IDCyc'].eq(334)],
    x='CDLocLast',
    y='Cumulative_Sum',
)
plt.xticks(rotation='vertical')
plt.ylabel('Sales over the Past Two Years')
plt.xlabel('Pick up Location');

In [None]:
# Column names, for reference while building the membership plots.
df.columns

In [None]:
# Histogram of Years_member over the cycle-334 rows: how long current members have belonged.
plt.hist(x='Years_member', data=df[df['IDCyc'].eq(334)])
plt.ylabel('Number of Active Members')
plt.xlabel('Years as a Member')

In [None]:
#Histogram of who is ordering recently--new members vs. old members.
# Both comparisons are parenthesised: `&` binds tighter than `>`, so without the second
# pair of parentheses the mask is evaluated as ((IDCyc == 334) & SaleNom) > 0 instead of
# "cycle 334 AND SaleNom > 0".
plt.hist(data=df[(df['IDCyc']==334) & (df['SaleNom']>0)], x = 'Years_member')
plt.ylabel('Number of Members Ordered for Cycle 334')
plt.xlabel('Years as a Member')

In [None]:
# Cycle-334 order amount against the number of times the member has ordered so far.
sns.scatterplot(
    x='Cumulative_Number_of_Orders',
    y='SaleNom',
    data=df[df['IDCyc'].eq(334)],
)
plt.ylabel('Cycle 334 Sales')
plt.xlabel('# of Times Ordered')

In [None]:
# Cycle-334 order amount against distance to pick-up, for members less than 50 miles away.
sns.scatterplot(
    x='Distance_to_pickup',
    y='SaleNom',
    data=df[df['Distance_to_pickup'].lt(50) & df['IDCyc'].eq(334)],
)
plt.ylabel('Cycle 334 Sales')
plt.xlabel('Distance to pickup (miles)')

In [None]:
# Cycle-334 order amount against distance to pick-up, Franklin location (CDLocLast == 'FRAN') only.
sns.scatterplot(
    x='Distance_to_pickup',
    y='SaleNom',
    data=df[df['CDLocLast'].eq('FRAN') & df['IDCyc'].eq(334)],
)
plt.ylabel('Cycle 334 Sales')
plt.xlabel('Distance to pickup (miles)')

In [None]:
# Cycle-334 order amount against distance to pick-up, West Des Moines location (CDLocLast == 'WDM') only.
sns.scatterplot(
    x='Distance_to_pickup',
    y='SaleNom',
    data=df[df['CDLocLast'].eq('WDM') & df['IDCyc'].eq(334)],
)
plt.ylabel('Cycle 334 Sales')
plt.xlabel('Distance to pickup (miles)')

In [None]:
# Cycle-334 order amount against distance to pick-up, Ankeny location (CDLocLast == 'ANK') only.
sns.scatterplot(
    x='Distance_to_pickup',
    y='SaleNom',
    data=df[df['CDLocLast'].eq('ANK') & df['IDCyc'].eq(334)],
)
plt.ylabel('Cycle 334 Sales')
plt.xlabel('Distance to pickup (miles)')

In [None]:
# Cycle-334 order amount against the member's cumulative sales.
sns.scatterplot(
    x='Cumulative_Sum',
    y='SaleNom',
    data=df[df['IDCyc'].eq(334)],
)
plt.ylabel('Cycle 334 Sales')
plt.xlabel('Total Sales')

In [None]:
# Years of membership against cumulative purchases, one point per cycle-334 row.
sns.scatterplot(
    x='Years_member',
    y='Cumulative_Sum',
    data=df[df['IDCyc'].eq(334)],
);
plt.ylabel('Total Purchases Since Cycle 285');
plt.xlabel('Years as a Member');

In [None]:
# Column names, for reference while plotting the per-cycle averages.
df.columns

In [None]:
# Average spend per cycle for each member, taken from the cycle-334 rows.
sns.scatterplot(
    x='IDMemb',
    y='order_per_cycle',
    data=df[df['IDCyc'].eq(334)],
)

In [None]:
# Average spend per order (only counting cycles with an order) for each member, from the cycle-334 rows.
sns.scatterplot(
    x='IDMemb',
    y='order_per_cycle_when_ordering',
    data=df[df['IDCyc'].eq(334)],
)

In [None]:
# Distribution of average spend per order as of cycle 334, 20 bins over the 0-500 range.
plt.hist(
    x='order_per_cycle_when_ordering',
    data=df[df['IDCyc'].eq(334)],
    bins=20,
    range=[0,500],
);

In [None]:
# Distribution of average spend per cycle as of cycle 334, 20 bins over the 0-250 range.
plt.hist(
    x='order_per_cycle',
    data=df[df['IDCyc'].eq(334)],
    bins=20,
    range=[0,250],
);

In [None]:
#checking correlations between each variable and the most recent ordering period.
# Restrict to numeric/boolean columns before correlating: the frame still holds text
# columns (e.g. CDLocLast), and DataFrame.corr() raises on those in pandas >= 2.0,
# where numeric_only defaults to False. select_dtypes works on older versions too.
Series_2 = (
    df[df['IDCyc']==334]
    .select_dtypes(include=['number', 'bool'])
    .corr()['SaleNom']
)
print(Series_2)

In [None]:
# Turn the correlation Series into a one-column frame and display up to 60 rows of it.
df2 = Series_2.to_frame()
df2.head(60)

In [None]:
# Plot each feature's correlation with cycle-334 sales; feature names go on the x axis.
fig, ax = plt.subplots(figsize=(8, 6));
sns.scatterplot(data=df2, ax=ax);
ax.set_xlabel('Feature');
ax.set_ylabel('Correlations');
plt.xticks(rotation=90);

In [None]:
# NumLogins is a single lifetime total per customer rather than a value that accumulates
# cycle by cycle, so it can't be used in the modelling.
df = df.drop(columns=['NumLogins'])

In [None]:
# Renumber the index 0..n-1 after the sorts and row filters above.
# reset_index returns a new frame, so the result must be assigned back to take effect.
df = df.reset_index(drop=True)

In [None]:
# head(-10) returns every row except the last 10.
# NOTE(review): if the goal was to look at the last 10 rows, df.tail(10) is needed -- confirm.
df.head(-10)

In [None]:
# Column names after NumLogins was removed.
df.columns

In [None]:
# Missing-value count per column, checked again before the frame is saved.
df.isna().sum()

In [None]:
# Sanity check: cycle-334 rows whose cumulative sales are exactly 0.
# Presumably empty, since the orderers-only filter kept members with Cumulative_Sum > 0 at cycle 334 -- verify.
df[(df['Cumulative_Sum']==0) & (df['IDCyc']==334)]

In [None]:
# Column names, to pick out the address fields removed in the next cell.
df.columns

In [None]:
# Remove the address columns and CDRegMemb before the frame is saved.
address_columns = ['Addr1', 'Addr2', 'City', 'St', 'Zip', 'CDRegMemb', 'full_address']
df = df.drop(columns=address_columns)

In [None]:
# Preview the frame as it will be written out.
df.head()

In [None]:
# Write the EDA output frame to the intermediate-data folder as CSV.
# NOTE(review): absolute, machine-specific path. to_csv is called with its defaults, so the
# index is written as an extra leading column -- confirm downstream readers expect that.
filepath = '/Users/emilydanielbowser/Documents/Iowa Food Coop/Data/Intermediate Data/EDA'
df.to_csv(filepath)