In [1]:
import pandas as pd

In [2]:
# Read the raw retail transactions export into memory; every later cell
# derives its frame from og_df.
og_df = pd.read_csv(filepath_or_buffer='../data/retail_data.csv')

# Preview the first rows to check the columns that were loaded.
og_df.head()

Unnamed: 0,InvoiceNo,StockCode,Description,Quantity,InvoiceDate,UnitPrice,CustomerID,Country
0,536365,85123A,WHITE HANGING HEART T-LIGHT HOLDER,6,12/1/2010 8:26,2.55,17850.0,United Kingdom
1,536365,71053,WHITE METAL LANTERN,6,12/1/2010 8:26,3.39,17850.0,United Kingdom
2,536365,84406B,CREAM CUPID HEARTS COAT HANGER,8,12/1/2010 8:26,2.75,17850.0,United Kingdom
3,536365,84029G,KNITTED UNION FLAG HOT WATER BOTTLE,6,12/1/2010 8:26,3.39,17850.0,United Kingdom
4,536365,84029E,RED WOOLLY HOTTIE WHITE HEART.,6,12/1/2010 8:26,3.39,17850.0,United Kingdom


### Minimum restrictions

In [9]:
# Keep only lines whose Quantity AND UnitPrice are strictly positive, so zero
# values are dropped along with negative ones (the negative-quantity lines are
# what the original author labelled as returns).
# .copy() detaches the result from og_df so later edits never touch the raw frame.
df_minimums = og_df.loc[og_df['Quantity'].gt(0) & og_df['UnitPrice'].gt(0)].copy()

# Sanity check: both minimums printed below should be greater than zero.
print('Minimun Quantity in dataframe: ', df_minimums['Quantity'].min())
print('Minimun UnitPrice in dataframe: ', df_minimums['UnitPrice'].min())

Minimun Quantity in dataframe:  1
Minimun UnitPrice in dataframe:  0.001


In [10]:
# Persist the positive-quantity / positive-price lines for reuse outside this notebook.
# NOTE(review): to_csv writes the index as an unnamed first column by default;
# pass index=False if downstream readers should not see it.
df_minimums.to_csv(path_or_buf='../data/retail_0minimums.csv')

### Totals

In [25]:
# Build one row per (CustomerID, InvoiceDate, Country) purchase:
#   TotalQuantity    - sum of Quantity over the group's lines
#   UniqueStockCodes - number of distinct StockCode values in the group
#   TotalPrice       - sum of the per-line Quantity * UnitPrice
# df_minimums already excludes lines with non-positive Quantity or UnitPrice.
#
# .assign() returns a new frame, so the TotalPrice column is added to a copy
# and df_minimums itself is left exactly as it was written to disk.
# NOTE(review): groupby drops rows whose key is NaN by default (dropna=True),
# so lines without a CustomerID are excluded here - confirm that is intended.
df_totals = (
    df_minimums
    .assign(TotalPrice=df_minimums['Quantity'] * df_minimums['UnitPrice'])
    .groupby(['CustomerID', 'InvoiceDate', 'Country'])
    .agg({'Quantity': 'sum', 'StockCode': 'nunique', 'TotalPrice': 'sum'})
    .reset_index()
    .rename(columns={'Quantity': 'TotalQuantity', 'StockCode': 'UniqueStockCodes'})
)

df_totals.head()

Unnamed: 0,CustomerID,InvoiceDate,Country,TotalQuantity,UniqueStockCodes,TotalPrice
0,12346.0,1/18/2011 10:01,United Kingdom,74215,1,77183.6
1,12347.0,1/26/2011 14:30,Iceland,315,29,475.39
2,12347.0,10/31/2011 12:25,Iceland,676,47,1294.32
3,12347.0,12/7/2010 14:57,Iceland,319,31,711.79
4,12347.0,12/7/2011 15:52,Iceland,192,11,224.82


In [26]:
# Persist the per-purchase totals for reuse outside this notebook.
# NOTE(review): to_csv writes the index as an unnamed first column by default;
# pass index=False if downstream readers should not see it.
df_totals.to_csv(path_or_buf='../data/retail_totals.csv')

### Removing Outliers

In [None]:
import numpy as np

def identify_outliers(series, factor=1.5):
    """Flag the values of ``series`` that fall outside the IQR fences.

    The fences are ``Q1 - factor * IQR`` and ``Q3 + factor * IQR``, where Q1
    and Q3 are the 25th and 75th percentiles of ``series`` and
    ``IQR = Q3 - Q1``.

    Parameters
    ----------
    series : pandas.Series
        Numeric values to test.
    factor : float, default 1.5
        How many IQRs beyond the quartiles a value may lie before it is
        flagged. The default reproduces the conventional 1.5 * IQR rule.

    Returns
    -------
    pandas.Series of bool
        Same index as ``series``; True where the value is strictly below the
        lower fence or strictly above the upper fence. Missing values are
        never flagged, because comparisons against NaN evaluate to False.
    """
    first_quartile = series.quantile(0.25)
    third_quartile = series.quantile(0.75)
    spread = third_quartile - first_quartile

    lower_fence = first_quartile - factor * spread
    upper_fence = third_quartile + factor * spread

    # Strict inequalities: values sitting exactly on a fence are kept.
    return (series < lower_fence) | (series > upper_fence)

# Drop line items whose TotalPrice is an IQR outlier, then aggregate what is
# left into one row per (CustomerID, InvoiceDate, Country) purchase.
#
# This cell previously read from `df`, a name no cell in this notebook
# defines, so it raised NameError on a fresh kernel. df_minimums is the
# line-level frame that carries the Quantity, UnitPrice and StockCode columns
# needed below.
# NOTE(review): confirm df_minimums is the intended input for outlier removal.
# TotalPrice is computed here so this cell does not rely on any earlier cell
# having added that column.
line_items = df_minimums.assign(
    TotalPrice=df_minimums['Quantity'] * df_minimums['UnitPrice']
)

# True for every line whose TotalPrice lies outside the IQR fences.
outliers_mask = identify_outliers(line_items['TotalPrice'])

# Keep only the non-outlier lines.
df_filtered = line_items[~outliers_mask]

# Same aggregation as the totals section, applied to the filtered lines.
df_customer_purchase = (
    df_filtered
    .groupby(['CustomerID', 'InvoiceDate', 'Country'])
    .agg({'Quantity': 'sum', 'StockCode': 'nunique', 'TotalPrice': 'sum'})
    .reset_index()
    .rename(columns={'Quantity': 'TotalQuantity', 'StockCode': 'UniqueStockCodes'})
)

# Preview the first few aggregated rows (rich display, like the other cells).
df_customer_purchase.head()