In [1]:
# Preamble to run notebook in context of source package.
import sys
sys.path.insert(0, '../')

### Toy example

In [2]:
import pandas as pd

types = [
    "integer",
    "string",
    "float",
    "boolean",
    "date-iso-8601",
    "date-eu",
    "date-non-std-subtype",
    "date-non-std",
]

df_trainings, y_trainings = [], []

x = ['aaaa', 'aaaa', 'aaaa', 'aaaa', 'aaaa']
column = 'string'

df_training = pd.DataFrame(x, dtype='str', columns=[column])
y_training = temp = [key + 1 for key, value in enumerate(types) if value == 'integer']

df_trainings.append(df_training)
y_trainings.append(y_training)

df_training.head()

Unnamed: 0,string
0,aaaa
1,aaaa
2,aaaa
3,aaaa
4,aaaa


In [3]:
def print_params(uniformly, initial, final, t):
    print("\nuniformly is", uniformly)
    
    # initial probs should not change
    machine_index = initial.types.index(t) + 2
    
    print("\tInitial I", initial.machines[machine_index].I)
    print("\tFinal I  ", final.machines[machine_index].I, '\n')

    # final probs should change
    print("\tInitial T", initial.machines[machine_index].F)
    print("\tFinal T  ", final.machines[machine_index].F, '\n')

    # transition probs should change
    print("\tInitial F", initial.machines[machine_index].T[1]['a'])
    print("\tFinal F  ", final.machines[machine_index].T[1]['a'], '\n')

make sure that normalization changes the results

In [4]:
# NBVAL_IGNORE_OUTPUT
# to ignore convergence warning

from ptype.Ptype import Ptype
from ptype.Trainer import Trainer

for uniformly in [True, False]:
    ptype = Ptype(_types=types)
    trainer = Trainer(ptype.machines, df_trainings, y_trainings)
    initial, final, training_error = trainer.train(20, uniformly)

    
    print_params(uniformly, initial, final, "string")

[356.08108481216993, 2.0794418555424183]
[356.08108481216993, 2.0794418555424183, 2.0794418554875715]

uniformly is True
	Initial I {0: 0.0, 1: -1e+150}
	Final I   {0: 0.0, 1: -1e+150} 

	Initial T {0: -1e+150, 1: -4.2626798770413155}
	Final T   {0: -1e+150, 1: -90.02593332788025} 

	Initial F {1: -4.2626798770413155}
	Final F   



{1: -4.248495242049359} 

[2.0794418554875715, 2.0794418554327314]
[2.0794418554875715, 2.0794418554327314, 2.0794418553779344]

uniformly is False
	Initial I {0: 0.0, 1: -1e+150}
	Final I   {0: 0.0, 1: -1e+150} 

	Initial T {0: -1e+150, 1: -90.02593332788025}
	Final T   {0: -1e+150, 1: -90.02628276459015} 

	Initial F {1: -4.248495242049359}
	Final F   {1: -4.248495242049359} 



### Real-world Datasets

In [6]:
from tests.test_ptype import get_inputs
dfs, ys = [], []
for dataset_name in ["accident2016", "auto", "data_gov_3397_1"]:
    df, y = get_inputs(dataset_name, annotations_file="../annotations/annotations.json", data_folder="../data/")
    dfs.append(df)
    ys.append(y)


In [7]:
# NBVAL_IGNORE_OUTPUT
# to ignore convergence warning

for uniformly in [True, False]:
    ptype = Ptype(_types=types)
    trainer = Trainer(ptype.machines, df_trainings, y_trainings)
    initial, final, training_error = trainer.train(20, uniformly)
    
    print_params(uniformly, initial, final, "string")

[356.08108481216993, 2.0794418555424183]
[356.08108481216993, 2.0794418555424183, 2.0794418554875715]

uniformly is True
	Initial I {0: 0.0, 1: -1e+150}
	Final I   {0: 0.0, 1: -1e+150} 

	Initial T {0: -1e+150, 1: -4.2626798770413155}
	Final T   {0: -1e+150, 1: -90.02593332788025} 

	Initial F {1: -4.2626798770413155}
	Final F   {1: -4.248495242049359} 





[2.0794418554875715, 2.0794418554327314]
[2.0794418554875715, 2.0794418554327314, 2.0794418553779344]

uniformly is False
	Initial I {0: 0.0, 1: -1e+150}
	Final I   {0: 0.0, 1: -1e+150} 

	Initial T {0: -1e+150, 1: -90.02593332788025}
	Final T   {0: -1e+150, 1: -90.02628276459015} 

	Initial F {1: -4.248495242049359}
	Final F   {1: -4.248495242049359} 

