In this notebook, we present several use cases for interacting with ptype to handle its missing data predictions.

In [None]:
# Preamble to run notebook in context of source package.
# NBVAL_IGNORE_OUTPUT
import sys
sys.path.insert(0, '../')
!{sys.executable} -m pip install -r ../requirements.txt


In [None]:
from IPython.core.display import display, HTML

import matplotlib.pyplot as plt
%matplotlib inline
plt.rcdefaults()

from ptype.Column import Column2ARFF
from ptype.Ptype import Ptype
from ptype.utils import evaluate_types
import pandas as pd
import numpy as np
from utils import *

In [None]:
# Single Ptype instance built with default arguments; the cells below call
# fit_schema / replace_missing on it and read its `cols` and `types`.
ptype = Ptype()

# 1. Incorrect Missing Data Prediction

In [None]:
# Load the 'auto' dataset and subsample it on column 0 (sample_num=10).
df = read_data(dataset_name="auto")
column = 0  # column examined by the following cells of this section
df_subsample = subsample_df(
    df,
    column_to_sample_from=column,
    sample_num=10,
)
df_subsample  # last expression of the cell, so the frame is rendered as output

In [None]:
# Fit ptype on the subsample, then plot the results for the selected column.
ptype.fit_schema(_data_frame=df_subsample)

# Column-level view: the column's type posterior `p_t` against ptype's types.
plot_column_type_posterior(
    p_t=ptype.cols[column].p_t,
    types=ptype.types.items(),
)

# Disabled ARFF posterior plot, kept for reference.
# NOTE(review): `column2ARFF` is not defined in the cells shown here, so these
# lines would not run as written if uncommented.
# features = ptype.features[column]
# arff_type, arff_post = column2ARFF.get_arff(features)
# plot_arff_type_posterior(arff_post)

# Row-level view for the 'missing' type.
plot_row_type_posterior(ptype.cols[column], t="missing")

In [None]:
# Pass the value '-1' to the column's `reclassify_normal`, then redraw the
# row-level 'missing' plot so it can be compared with the previous cell.
# NOTE(review): presumably this marks '-1' as a normal (non-missing) value --
# confirm against Column.reclassify_normal.
ptype.cols[column].reclassify_normal(["-1"])

plot_row_type_posterior(ptype.cols[column], t="missing")

# 2. Multiple Missing Data Encodings

In [None]:
# Load the 'mass_6' dataset (header=0) and subsample it on the chosen column.
df = read_data("mass_6", header=0)

column = "LRE Ages 3-5 - Full Incl #"
df_subsample = subsample_df(df, column_to_sample_from=column, sample_num=20)
display(df_subsample)

# Count the occurrences of each distinct value, compared by string form.
values_as_text = list(map(str, df_subsample[column].tolist()))
unique_values, counts = np.unique(values_as_text, return_counts=True)

plot_bar(
    unique_values,
    counts,
    title="counts of the unique data values",
    y_lim_max=None,
    xlabel="Unique Value",
    ylabel="Counts",
)

In [None]:
# Re-fit ptype on the current subsample and plot the selected column.
ptype.fit_schema(_data_frame=df_subsample)

# Column-level view: the column's type posterior `p_t` against ptype's types.
plot_column_type_posterior(
    p_t=ptype.cols[column].p_t,
    types=ptype.types.items(),
)

# Disabled ARFF posterior plot, kept for reference.
# NOTE(review): `column2ARFF` is not defined in the cells shown here, so these
# lines would not run as written if uncommented.
# features = ptype.features[column]
# arff_type, arff_post = column2ARFF.get_arff(features)
# plot_arff_type_posterior(arff_post)

# Row-level view for the 'missing' type.
plot_row_type_posterior(ptype.cols[column], t="missing")

In [None]:
# Hand the replacement token to ptype for this column.
# NOTE(review): presumably this rewrites the entries ptype treats as missing to
# `new_encoding` -- confirm against Ptype.replace_missing.
new_encoding = "NA"
ptype.replace_missing(column, new_encoding)

# Recount the distinct values (by string form) in ptype.model.data[column].
# NOTE(review): `unique_values` / `counts` are not used in the lines shown here.
unique_values, counts = np.unique(
    list(map(str, ptype.model.data[column].tolist())),
    return_counts=True,
)

plot_row_type_posterior(ptype.cols[column], t="missing")