In [1]:
import pandas as pd
import numpy as np

# Step 0: Define the acceptance criteria and import data

First, we will define the acceptance criteria and import the data. This model will accomplish the following goal:

We will be able to derive the RK diagrams of various purchasers. These will be mapped to a unified space, in which similar purchaser topologies are closer together and different topologies are farther apart.

In [2]:
# Parse the workbook once and take all three sheets from the same read,
# instead of re-opening and re-parsing the file for every sheet.
STORE_DATA_PATH = "data/store_data.xls"
store_sheets = pd.read_excel(STORE_DATA_PATH, sheet_name=["Orders", "Returns", "People"])
orders = store_sheets["Orders"]
returns = store_sheets["Returns"]
people = store_sheets["People"]

# Step 1: Choose a lens

We will start with the lens of looking at topology with respect to the people dataset. Each person will have some metrics associated with them, which will form a topology.

In [3]:
# Lay the distinct customer names out as a single wide row for display.
df = pd.DataFrame(orders["Customer Name"].unique()).transpose()
display(df)
# After the transpose there is one column per distinct customer.
print("There are {} People in the orders".format(df.shape[1]))

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,783,784,785,786,787,788,789,790,791,792
0,Claire Gute,Darrin Van Huff,Sean O'Donnell,Brosina Hoffman,Andrew Allen,Irene Maddox,Harold Pawlan,Pete Kriz,Alejandro Grove,Zuschuss Donatelli,...,Fred Wasserman,Lindsay Castell,Harold Engle,Brendan Dodson,Harold Dahlen,Carl Jackson,Roy Skaria,Sung Chung,Ricardo Emerson,Susan MacKendrick


There are 793 People in the orders


# Step 2: Preprocess and Feature Extract

For each person, we will derive measures for extraction. This is done here.

In [4]:
# Left-join the Returns sheet onto the orders by 'Order ID': every order row is
# kept and picks up the columns of its matching return record, if there is one.
all_orders = orders.join(
    other=returns.set_index(keys='Order ID'),
    on='Order ID',
    how='left',
)

In [5]:
import scipy.stats
from sklearn.preprocessing import OneHotEncoder, LabelBinarizer

def encode_and_concat(df, column):
    """One-hot encode ``column`` of ``df`` and append the indicator columns.

    Missing values in ``column`` are replaced with the string "No" before
    encoding, so they receive their own indicator column.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame that contains ``column``. It is not modified.
    column : str
        Name of the categorical column to encode.

    Returns
    -------
    pandas.DataFrame
        ``df`` with one extra float column per category, named
        ``x0_<category>``.
    """
    enc = OneHotEncoder()
    # Encode the frame that was passed in (not the notebook-level `all_orders`).
    # Fitting on a bare array keeps the generated names in the "x0_<category>"
    # form regardless of the scikit-learn version.
    values = df[[column]].fillna("No").to_numpy()
    v = enc.fit_transform(values).toarray()
    # `get_feature_names` was removed in scikit-learn 1.2; prefer its
    # replacement and fall back only on releases that predate it.
    name_getter = getattr(enc, "get_feature_names_out", None) or enc.get_feature_names
    cols = list(name_getter())
    # Reuse df's index so the column-wise concat lines rows up one-to-one even
    # when df does not carry a default RangeIndex.
    ret = pd.DataFrame(v, columns=cols, index=df.index)
    return pd.concat([df, ret], axis=1)
    
def extract_features(all_orders):
    """Build one row of summary statistics per customer.

    The order lines are enriched with a binarised ``Returned`` flag, an encoded
    ``Country`` column and one-hot indicators for ``Category``,
    ``Sub-Category``, ``Segment`` and ``Ship Mode``. Each customer's rows are
    then summarised with ``DataFrame.describe`` and flattened into a single
    feature vector whose entries are labelled ``"<statistic>-<column>"``.

    Parameters
    ----------
    all_orders : pandas.DataFrame
        Order lines containing at least the columns ``Returned``, ``Country``,
        ``Category``, ``Sub-Category``, ``Segment``, ``Ship Mode`` and
        ``Customer Name``. The frame is not modified.

    Returns
    -------
    pandas.DataFrame
        One row per customer (indexed by customer name), one column per
        ``"<statistic>-<column>"`` label. Empty if there are no customers.
    """
    # Work on a copy so the helper columns do not leak into the caller's frame.
    all_orders = all_orders.copy()
    filled = all_orders.fillna("No")
    all_orders["ReturnedBin"] = LabelBinarizer().fit_transform(filled["Returned"])
    # NOTE(review): the encoder output has one column per distinct country, so
    # this single-column assignment only works while the data holds exactly one
    # country - confirm that this is the intended constraint.
    all_orders["EncodedCountry"] = OneHotEncoder().fit_transform(filled[["Country"]]).toarray()
    for categorical in ("Category", "Sub-Category", "Segment", "Ship Mode"):
        all_orders = encode_and_concat(all_orders, categorical)

    per_customer = []
    for customer, group in all_orders.groupby('Customer Name'):
        # Restrict to numeric columns explicitly: newer pandas also summarises
        # datetime columns by default, which would add non-numeric statistics.
        stats = group.select_dtypes(include=[np.number]).describe()
        # Row-major order (statistic outer, column inner) matches flatten() below.
        labels = [
            "{}-{}".format(stat, col)
            for stat in stats.index
            for col in stats.columns.get_level_values(0)
        ]
        per_customer.append(
            pd.Series(stats.to_numpy().flatten(), index=labels, name=customer)
        )

    if not per_customer:
        return pd.DataFrame()
    # Assemble once instead of growing a frame inside the loop.
    return pd.concat(per_customer, axis=1).T

In [6]:
# Build the per-customer feature table from the joined order lines and report
# its (rows, columns) shape.
data = extract_features(all_orders)
print("Shape of the data after feature extraction {}".format(data.shape))

Shape of the data after feature extraction (793, 280)


In [8]:
def feature_selection(data, thresh=0.95):
    """Drop one column out of every pair of highly correlated columns.

    The absolute pairwise correlation matrix is computed and, for each pair
    whose correlation exceeds ``thresh``, the later of the two columns is
    dropped. Pairs with an undefined (NaN) correlation are never dropped.

    Parameters
    ----------
    data : pandas.DataFrame
        Feature table. It is not modified.
    thresh : float, default 0.95
        Absolute-correlation threshold above which a column is dropped.

    Returns
    -------
    pandas.DataFrame
        Copy of ``data`` without the dropped columns. The number of dropped
        columns is printed as a side effect.
    """
    cmat = data.corr().abs()
    # Keep only the strict upper triangle so each pair is inspected once and a
    # column is never compared with itself. The builtin `bool` replaces the
    # `np.bool` alias, which NumPy removed in 1.24.
    upper_mask = np.triu(np.ones(cmat.shape, dtype=bool), k=1)
    upper_tri = cmat.where(upper_mask)
    to_drop = [column for column in upper_tri.columns if (upper_tri[column] > thresh).any()]
    data = data.drop(columns=to_drop)
    print("Dropping {} columns".format(len(to_drop)))
    return data

# Prune highly correlated features, report the new shape and persist the result.
# NOTE(review): `data` is overwritten with its own filtered version, so this
# cell is not idempotent - re-running it filters the already-filtered table.
# The recorded output ("Dropping 0 columns" next to a 210-column shape, after
# 280 columns were reported earlier) looks like such a re-run - confirm.
data = feature_selection(data)
print("Shape of the data after feature selection {}".format(data.shape))
data.to_csv("data/transformed.csv")

Dropping 0 columns
Shape of the data after feature selection (793, 210)


# Step 3: Define Hierarchical Embedding

In this step, we generate a Hierarchical Embedding of the sales data. This eventually creates a Hierarchical Graph, which can be used to generate the RK Diagram. The HFE is derived through ontology.

In [226]:
# Placeholders for the variable groupings; both start out empty and are two
# separate list objects.
independent_variables, dependent_variables = [], []

Unnamed: 0,count-Row ID,mean-Row ID,mean-Postal Code,mean-Sales,mean-Quantity,mean-Discount,mean-Profit,mean-ReturnedBin,mean-EncodedCountry,mean-x0_Furniture,...,max-x0_Art,max-x0_Binders,max-x0_Chairs,max-x0_Furnishings,max-x0_Paper,max-x0_Phones,max-x0_Storage,max-x0_First Class,max-x0_Second Class,max-x0_Standard Class
Aaron Bergman,6.0,6786.166667,86094.333333,147.692667,2.166667,0.066667,21.557750,0.000000,1.0,0.333333,...,1.0,0.0,1.0,0.0,0.0,1.0,1.0,1.0,0.0,1.0
Aaron Hawkins,11.0,6786.454545,51402.545455,158.609091,4.909091,0.090909,33.201382,0.000000,1.0,0.181818,...,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0
Aaron Smayling,10.0,3533.400000,55831.900000,305.069200,4.800000,0.355000,-25.357460,0.000000,1.0,0.200000,...,0.0,1.0,0.0,0.0,1.0,0.0,1.0,1.0,0.0,1.0
Adam Bellavance,18.0,4229.111111,42354.000000,430.867778,3.111111,0.044444,114.143806,0.000000,1.0,0.166667,...,0.0,1.0,1.0,1.0,1.0,0.0,0.0,1.0,0.0,1.0
Adam Hart,20.0,6847.300000,40937.150000,162.516850,3.750000,0.135000,14.059450,0.000000,1.0,0.300000,...,1.0,1.0,1.0,1.0,1.0,0.0,0.0,1.0,1.0,1.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
Xylona Preis,28.0,5588.142857,63617.071429,84.809214,3.571429,0.046429,22.186786,0.107143,1.0,0.142857,...,1.0,1.0,0.0,1.0,1.0,0.0,1.0,1.0,1.0,1.0
Yana Sorensen,12.0,5854.666667,67699.500000,560.037000,4.833333,0.050000,148.191025,0.000000,1.0,0.250000,...,1.0,1.0,1.0,1.0,0.0,0.0,1.0,1.0,0.0,1.0
Yoseph Carroll,8.0,3604.875000,57716.750000,681.793750,3.875000,0.075000,163.203625,0.125000,1.0,0.125000,...,1.0,0.0,0.0,1.0,1.0,1.0,1.0,1.0,0.0,1.0
Zuschuss Carroll,31.0,2981.580645,74785.645161,258.893774,3.387097,0.254839,-33.295129,0.129032,1.0,0.258065,...,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0


In [227]:
# Show the feature names; transposing a one-dimensional Index is a no-op, so
# the Index is displayed directly.
data.columns

Index(['count-Row ID', 'mean-Row ID', 'mean-Postal Code', 'mean-Sales',
       'mean-Quantity', 'mean-Discount', 'mean-Profit', 'mean-ReturnedBin',
       'mean-EncodedCountry', 'mean-x0_Furniture',
       ...
       'max-x0_Art', 'max-x0_Binders', 'max-x0_Chairs', 'max-x0_Furnishings',
       'max-x0_Paper', 'max-x0_Phones', 'max-x0_Storage', 'max-x0_First Class',
       'max-x0_Second Class', 'max-x0_Standard Class'],
      dtype='object', length=210)

# Build RK Model

With the hierarchy, we now build filters and links. Filters represent a criterion algorithm which will remove nodes from the RK Model, and links define an edge between nodes.

### Defining the filter

Need some math here for the filter

### Defining the links

Maybe euclidean distance for the links?

In [231]:
# Building the RK Diagram here with the defined functions.
# NOTE: everything between the triple quotes is a string literal, not executed
# code - it is a design sketch of the intended RKPipeline configuration.
# As written it would not run if uncommented: the brackets do not balance, and
# several names it uses (MinMaxNormalizerNode, MaxLocalizer,
# HierarchicalFeatureExtractor1, StaticFilter, SimpleLinkage) are not imported
# by the sketch.
'''
from rk_diagram.models import RKPipeline, LocalizationAlgorithm, TransformNode
from rk_diagram.visualize import RKModelVisualizer
from rk_diagram.models.graph import EdgeType, Edge

example_pipeline = RKPipeline(preprocess_nodes=[MinMaxNormalizerNode()],
                              localization_algorithm=MaxLocalizer(),
                              hierarchical_embedding_nodes= [ {
                                          "HFeatureExtractor1": HierarchicalFeatureExtractor1()
                                      }
                             },
                             filter_functions=[
                             {
                              "HFeatureExtractor1" :
                                          {
                                              'range_measure': StaticFilter(min=.2, max=.8),
                                              'max_measure': StaticFilter(min=0, max=1)
                                          }
                                       }
                                  ], # question: how to define which limits for which measure. Each filter and linkage has to be BY CLUSTER
                                  linkage_function=SimpleLinkage(threshold=.8))
)
'''
# No-op statement; the cell performs no work.
pass

# Show RK Diagram 

Display an untuned RK diagram here across multiple purchasers

# Tune RK Filters and Links

This incorporates a feedback loop to maximize divergence across different RK Diagrams. I need a distance function here for RK models.

### Define Distance Function Between RK Topologies

# Display Optimized RK Diagrams