In [42]:
import pandas as pd
import numpy as np
import time
from datetime import datetime

from random import seed
from random import randint
from sklearn.cluster import KMeans

import seaborn as sns
import matplotlib.pyplot as plt

In [48]:
# Load the raw transactions, parse the invoice timestamps, and derive the
# monetary value of every invoice line (quantity x unit price).
df = (
    pd.read_csv('../retail_data/data.csv', encoding="ISO-8859-1")
    .assign(
        InvoiceDate=lambda raw: pd.to_datetime(raw['InvoiceDate']),
        monetary_value=lambda raw: raw['Quantity'] * raw['UnitPrice'],
    )
    # NOTE(review): 'frequency' is simply the renamed per-line Quantity, i.e.
    # a unit count rather than a number of purchases -- confirm this is the
    # intended definition of RFM frequency.
    .rename(columns={'Quantity': 'frequency'})
)

Just by looking at this dataframe, I can see that it could be normalized into several tables:
- Product table: StockCode, Description, productId PK, UnitPrice
- Invoice table: InvoiceNo, InvoiceDate, monetary_value 
- Orders table: InvoiceNo PK, productId, frequency, CustomerID, CountryID
- Time table: InvoiceDate, minutes, hours, day, week, month, quarter, year, dayofweek.
- Location: Country, CountryID PK
- Customers: CustomerID PK , CountryID


I'll take a short deep dive into the customers to see how much we can learn about them, just as a real company would.

### Exploring customers with RFM

In [49]:
# One row per customer: total monetary value, total "frequency" (summed
# quantity) and recency in days.
# Aggregations are given as strings: passing the builtins max/sum to
# groupby.agg is deprecated in recent pandas and emits a FutureWarning.
customer_panel = (
    df.groupby('CustomerID')
    .agg({'InvoiceDate': 'max',
          'monetary_value': 'sum',
          'frequency': 'sum'})
    .reset_index()
    # recency = whole days between a customer's last invoice and the most
    # recent last-invoice date across all customers.
    .assign(recency=lambda panel: ((panel['InvoiceDate'].max() - panel['InvoiceDate'])
                                   / np.timedelta64(1, 'D')).round(0))
    # The raw timestamp is only needed to derive recency.
    .drop(columns=['InvoiceDate'])
)

In [50]:
def order_cluster(cluster_field_name, target_field_name, df, ascending):
    """Renumber cluster labels so they follow the order of a per-cluster mean.

    The mean of ``target_field_name`` is computed for every distinct value of
    ``cluster_field_name``. Clusters are sorted by that mean and relabelled
    0, 1, 2, ... in sorted order.

    Parameters
    ----------
    cluster_field_name : str
        Column of ``df`` holding the current (arbitrary) cluster labels.
    target_field_name : str
        Numeric column whose per-cluster mean decides the new ordering.
    df : pandas.DataFrame
        Input frame; it is not modified.
    ascending : bool
        If True the cluster with the smallest mean becomes 0; if False the
        cluster with the largest mean becomes 0.

    Returns
    -------
    pandas.DataFrame
        A new frame with a fresh 0..n-1 index, rows in their input order, and
        ``cluster_field_name`` replaced by the ordered labels and placed as
        the last column.

    Notes
    -----
    Rows whose cluster label is missing receive a missing ordered label.
    """
    cluster_means = df.groupby(cluster_field_name)[target_field_name].mean()
    ordered_labels = cluster_means.sort_values(ascending=ascending).index
    new_label_of = {old: new for new, old in enumerate(ordered_labels)}

    # Relabel with a direct mapping rather than merging on a helper column
    # named 'index': that merge silently lost the cluster column whenever
    # ``df`` already contained a column called 'index'.
    relabelled = df[cluster_field_name].map(new_label_of)

    # Drop-then-append keeps the historical layout (cluster column last).
    df_final = df.drop(columns=[cluster_field_name])
    df_final[cluster_field_name] = relabelled
    return df_final.reset_index(drop=True)

In [51]:
# Segment customers into 4 groups by recency with 1-D k-means.
# random_state pins the centroid initialisation so that Restart & Run All
# reproduces the same segments; n_init is spelled out because its default
# differs between scikit-learn releases.
kmeans = KMeans(n_clusters=4, n_init=10, random_state=42)
customer_panel['RecencyCluster'] = kmeans.fit_predict(customer_panel[['recency']])

In [52]:
# Renumber the recency clusters by descending mean recency, so the cluster
# with the largest mean recency becomes 0.
customer_panel = order_cluster(
    cluster_field_name='RecencyCluster',
    target_field_name='recency',
    df=customer_panel,
    ascending=False,
)

In [53]:
# Sanity check: mean recency (in days) of each ordered recency cluster.
customer_panel.groupby('RecencyCluster')['recency'].agg('mean')

RecencyCluster
0    309.381423
1    190.251603
2     79.707834
3     17.916551
Name: recency, dtype: float64

In [54]:
# Segment customers into 4 groups by frequency with 1-D k-means.
# random_state pins the centroid initialisation so that Restart & Run All
# reproduces the same segments; n_init is spelled out because its default
# differs between scikit-learn releases.
kmeans = KMeans(n_clusters=4, n_init=10, random_state=42)
customer_panel['FrequencyCluster'] = kmeans.fit_predict(customer_panel[['frequency']])

In [55]:
# Renumber the frequency clusters by ascending mean frequency, so the
# cluster with the smallest mean frequency becomes 0.
customer_panel = order_cluster(
    cluster_field_name='FrequencyCluster',
    target_field_name='frequency',
    df=customer_panel,
    ascending=True,
)

In [58]:
# Sanity check: mean frequency of each ordered frequency cluster.
customer_panel.groupby('FrequencyCluster')['frequency'].agg('mean')

FrequencyCluster
0       817.166513
1     22696.576923
2     64091.555556
3    196719.000000
Name: frequency, dtype: float64

In [59]:
# Segment customers into 4 groups by monetary value with 1-D k-means.
# random_state pins the centroid initialisation so that Restart & Run All
# reproduces the same segments; n_init is spelled out because its default
# differs between scikit-learn releases.
kmeans = KMeans(n_clusters=4, n_init=10, random_state=42)
customer_panel['MonetaryCluster'] = kmeans.fit_predict(customer_panel[['monetary_value']])

In [60]:
# Renumber the monetary clusters by ascending mean monetary value, so the
# cluster with the smallest mean spend becomes 0.
customer_panel = order_cluster(
    cluster_field_name='MonetaryCluster',
    target_field_name='monetary_value',
    df=customer_panel,
    ascending=True,
)

In [61]:
# Sanity check: mean monetary value of each ordered monetary cluster.
customer_panel.groupby('MonetaryCluster')['monetary_value'].agg('mean')

MonetaryCluster
0      1150.834871
1     14989.430833
2     71423.516000
3    241136.560000
Name: monetary_value, dtype: float64

In [65]:
# Overall RFM score: the three ordered cluster ranks added together.
customer_panel['RFMScore'] = (
    customer_panel['RecencyCluster']
    .add(customer_panel['FrequencyCluster'])
    .add(customer_panel['MonetaryCluster'])
)

In [77]:
# Profile each RFM score: average recency / frequency / monetary value and the
# number of customers holding that score.
# 'mean' is passed as a string: handing np.mean to groupby.agg is deprecated
# in recent pandas and emits a FutureWarning.
customer_panel.groupby('RFMScore').agg({'recency': 'mean',
                                        'frequency': 'mean',
                                        'monetary_value': 'mean',
                                        'CustomerID': 'count'})

Unnamed: 0_level_0,recency,frequency,monetary_value,CustomerID
RFMScore,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
0,309.381423,192.144269,359.771245,506
1,190.180064,323.369775,539.891063,622
2,79.946396,606.779113,967.916499,1082
3,18.6634,1006.726605,1663.241843,2041
4,8.494253,6476.195402,12437.214368,87
5,6.3125,21680.4375,28965.4375,16
6,8.125,26675.5,54267.19125,8
7,6.571429,63381.857143,91030.744286,7
8,4.0,66575.5,221960.33,2
9,1.0,196719.0,279489.02,1
