In [2]:
%matplotlib inline
%load_ext autoreload

%autoreload 2

import pandas as pd
import numpy as np
import pyDOE


# number of simulations we can afford

In [3]:
# Simulation budget: total number of DOE rows (one row = one simulation run).
n_sample = 100

# Simple case

In [4]:
## dimensions we want to cover

In [5]:
# Load the candidate values of every DOE parameter (one column per parameter)
# from the "simple_case" sheet of the workbook, then display the table.
doe_params = pd.read_excel(
    "doe_parameters.xlsx",
    sheet_name="simple_case",
)
doe_params

Unnamed: 0,demand,fiscal_regime,fiscal_pressure
0,low_1.5,a,high_taxes
1,med_2,b,low_taxes
2,high_3,,


In [6]:
# Every column of doe_params is one dimension of the design.
n_dim = len(doe_params.columns)
n_dim

3

## count number of possibilities per parameter

In [6]:
# Per-parameter mapping {row index: value} with the NaN padding removed
# (parameters with fewer choices than the longest column are padded with NaN).
# Kept for inspection only: nothing else in this section reads it.
#
# DataFrame.items() is used instead of pd.compat.iteritems, a Python 2
# compatibility shim that no longer exists in current pandas releases.
doe_params_dict = {
    name: column.dropna().to_dict()
    for name, column in doe_params.items()
}
doe_params_dict

{'demand': {0: 'low_1.5', 1: 'med_2', 2: 'high_3'},
 'fiscal_regime': {0: 'a', 1: 'b'},
 'fiscal_pressure': {0: 'high_taxes', 1: 'low_taxes'}}

In [7]:
# Number of distinct, non-missing choices available for each DOE parameter
# (nunique ignores NaN by default).
possibilities_per_dim = doe_params.nunique()
possibilities_per_dim

demand             3
fiscal_regime      2
fiscal_pressure    2
dtype: int64

## generate normalized DOE

In [8]:
# Seed NumPy's global RNG so the design is identical on every
# "Restart Kernel -> Run All".
# NOTE(review): this assumes pyDOE.lhs draws from np.random's global state
# (it exposes no seed argument here) -- confirm for the installed pyDOE version.
np.random.seed(42)

# Latin Hypercube Sample: one row per simulation, one column per DOE
# dimension. criterion="corr" asks pyDOE to search, over `iterations` random
# candidates, for the design with the smallest maximum correlation coefficient.
# NOTE(review): the logged "max,abs corrcoef" is ~1.0 and only one candidate is
# ever reported, so check that this criterion is actually reducing the
# correlation between columns.
norm_doe = pd.DataFrame(
    pyDOE.lhs(n_dim, samples=n_sample, criterion="corr", iterations=10000),
    columns=doe_params.columns,
)
norm_doe.head(10)

new candidate solution found with max,abs corrcoef = 0.9999999753722308


Unnamed: 0,demand,fiscal_regime,fiscal_pressure
0,0.41929,0.353127,0.944905
1,0.457847,0.039758,0.099585
2,0.966099,0.50619,0.290736
3,0.392735,0.988729,0.913856
4,0.388577,0.63994,0.762169
5,0.834199,0.539405,0.11886
6,0.156826,0.775701,0.815746
7,0.692198,0.87142,0.127967
8,0.903412,0.227079,0.133102
9,0.556808,0.676477,0.740663


The output of the DOE is a matrix with n_sample rows and n_dim columns, with values in [0,1]. Each row is one simulation; each column holds the normalized value of one dimension for that simulation.

In [9]:
# Sanity check: expect (n_sample, n_dim), i.e. one row per simulation and
# one column per dimension.
norm_doe.shape

(100, 3)

In [10]:
# Check that the sampled dimensions are not correlated with each other:
# the off-diagonal entries of the pairwise correlation matrix should be close to 0.
norm_doe.corr()

Unnamed: 0,demand,fiscal_regime,fiscal_pressure
demand,1.0,0.141176,-0.085587
fiscal_regime,0.141176,1.0,0.02297
fiscal_pressure,-0.085587,0.02297,1.0


## OUTPUT: cut the normalized DOE into discrete choices

In [15]:
# Map every normalized coordinate in [0, 1] onto one of the discrete choices
# of its parameter.
#
# * The bin edges are fixed, equal-width intervals of the normalized domain
#   [0, 1] (np.linspace), not of the sampled min/max that pd.cut would use
#   with an integer `bins`; the mapping therefore does not depend on which
#   values happened to be drawn.
# * include_lowest=True keeps a coordinate of exactly 0.0 in the first bin.
# * Labels are de-duplicated so that their number always matches
#   possibilities_per_dim (which counts unique values).
df = pd.DataFrame(
    {
        col: pd.cut(
            norm_doe[col],
            bins=np.linspace(0.0, 1.0, possibilities_per_dim[col] + 1),
            labels=doe_params[col].dropna().unique(),
            include_lowest=True,
        )
        for col in norm_doe
    }
)

df.sample(10)

Unnamed: 0,demand,fiscal_regime,fiscal_pressure
98,high_3,b,high_taxes
76,high_3,a,high_taxes
9,med_2,b,low_taxes
81,high_3,a,high_taxes
11,high_3,b,high_taxes
62,high_3,a,high_taxes
5,high_3,b,high_taxes
4,med_2,b,low_taxes
52,med_2,a,high_taxes
8,high_3,a,high_taxes


# function for simple case

In [7]:
from generate_doe import generate_doe
    



In [8]:
# Build the discrete design in a single call to the project's generate_doe
# helper (presumably packaging the manual steps of the previous section --
# verify against generate_doe.py). Called with verbose=True, it prints its
# intermediate results.
df = generate_doe(n_sample, doe_params, verbose=True)

hi
ndim=3
possibilities_per_dim demand             3
fiscal_regime      2
fiscal_pressure    2
dtype: int64
new candidate solution found with max,abs corrcoef = 0.9999997234913477
correlation                    demand  fiscal_regime  fiscal_pressure
demand           1.000000      -0.022413         0.192779
fiscal_regime   -0.022413       1.000000         0.120851
fiscal_pressure  0.192779       0.120851         1.000000


## double check output

In [9]:
# Per-parameter summary of the design: number of rows, number of distinct
# choices, most frequent choice and how often it occurs.
df.describe()

Unnamed: 0,demand,fiscal_regime,fiscal_pressure
count,100,100,100
unique,3,2,2
top,low_1.5,b,low_taxes
freq,34,50,50


In [10]:
# Count how often each choice was drawn, over all parameters.
# value_counts is applied column by column, which yields a table indexed by
# choice with NaN where a choice does not belong to a column; summing across
# columns collapses it to one total per choice (hence the float dtype).
# pd.Series.value_counts replaces the deprecated top-level pd.value_counts.
df.apply(pd.Series.value_counts).sum(axis=1).sort_values()

high_3        33.0
med_2         33.0
low_1.5       34.0
a             50.0
b             50.0
high_taxes    50.0
low_taxes     50.0
dtype: float64

The counts above show that each choice of a parameter with 3 possibilities appears in about 33% of the simulations, and each choice of a parameter with 2 possibilities appears in 50% of them.

In [11]:
# Distinct combinations of choices that were actually drawn
# (only the first occurrence of each combination is kept).
df.drop_duplicates()

Unnamed: 0,demand,fiscal_regime,fiscal_pressure
0,high_3,b,low_taxes
1,med_2,a,low_taxes
2,low_1.5,b,high_taxes
3,med_2,b,low_taxes
4,low_1.5,a,high_taxes
5,low_1.5,b,low_taxes
6,high_3,a,low_taxes
7,high_3,a,high_taxes
10,med_2,a,high_taxes
12,med_2,b,high_taxes


# Combining LHS dimensions in main dimensions