This demo shows how ptype can be used. It covers two tasks:

- running ptype on a data frame and printing a summary of the results,
- showing the interactions ptype offers its users when its predictions need to be changed.

### Imports

In [1]:
from IPython.core.display import display, HTML
display(HTML("<style>.container {width:100% !important;}</style>"))

import matplotlib.pyplot as plt
%matplotlib inline
plt.rcdefaults()

import sys
sys.path.insert(0, '../')

from src.Ptype import Ptype
from src.utils import evaluate_types
import pandas as pd
import numpy as np

  from collections import Sequence


## 1. Using ptype
### 1.a Create a ptype assistant

In [2]:
# Create a Ptype instance with its default arguments; this single object is
# reused by every later cell in the notebook.
ptype = Ptype()

### Loading data

In [3]:
# Read the 'auto' dataset with every cell kept as a raw string so that ptype
# sees the original text: dtype=str disables numeric parsing and
# keep_default_na=False stops pandas converting markers such as '' or 'NA' to NaN.
# NOTE(review): header=None is not passed here, so the first record of the file
# is used as the column names (see the output below) — confirm this is intended.
dataset_name = 'auto'
dataset_path = '../data/{}.csv'.format(dataset_name)

df = pd.read_csv(
    dataset_path,
    sep=',',
    encoding='ISO-8859-1',
    dtype=str,
    keep_default_na=False,
    skipinitialspace=True,
)
print(df.shape)
df.head(5)

(204, 26)


Unnamed: 0,3,?,alfa-romero,gas,std,two,convertible,rwd,front,88.60,...,130,mpfi,3.47,2.68,9.00,111,5000,21,27,13495
0,3,?,alfa-romero,gas,std,two,convertible,rwd,front,88.6,...,130,mpfi,3.47,2.68,9.0,111,5000,21,27,16500
1,1,?,alfa-romero,gas,std,two,hatchback,rwd,front,94.5,...,152,mpfi,2.68,3.47,9.0,154,5000,19,26,16500
2,2,164,audi,gas,std,four,sedan,fwd,front,99.8,...,109,mpfi,3.19,3.4,10.0,102,5500,24,30,13950
3,2,164,audi,gas,std,four,sedan,4wd,front,99.4,...,136,mpfi,3.19,3.4,8.0,115,5500,18,22,17450
4,2,?,audi,gas,std,two,sedan,fwd,front,99.8,...,136,mpfi,3.19,3.4,8.5,110,5500,19,25,15250


### 1.b Run ptype

In [4]:
# Run ptype on the loaded data frame; the results are kept on the `ptype`
# object and are inspected by the cells below.
ptype.run_inference(_data_frame=df)

### 1.c Report the results

In [5]:
# Print the correct/total score of ptype's column-type predictions for this dataset.
# NOTE(review): presumably the predictions are compared against stored
# annotations looked up by dataset_name — confirm in src.utils.
evaluate_types(dataset_name, ptype)

correct/total =  1.0 (26/26)


#### Show the results for all of the columns

In [12]:
# Called without arguments, show_results prints the per-column report
# (predicted type, posterior probabilities, normal/missing values) for the columns.
ptype.show_results()

col: 3
	predicted type: integer
	posterior probs:  [9.99999647e-01 0.00000000e+00 3.53431906e-07 0.00000000e+00
 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00
 0.00000000e+00]
	types:  ['integer', 'string', 'float', 'boolean', 'gender', 'date-iso-8601', 'date-eu', 'date-non-std-subtype', 'date-non-std'] 

	some normal data values:  ['-2', '0', '1', '2', '3']
	their counts:  [3, 67, 54, 32, 26]
	fraction of normal: 0.89 

	missing values: ['-1']
	their counts:  [22]
	fraction of missing: 0.11 

col: ?
	predicted type: integer
	posterior probs:  [1.00000000e+00 0.00000000e+00 4.73609772e-47 0.00000000e+00
 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00
 0.00000000e+00]
	types:  ['integer', 'string', 'float', 'boolean', 'gender', 'date-iso-8601', 'date-eu', 'date-non-std-subtype', 'date-non-std'] 

	some normal data values:  ['101', '102', '103', '104', '106', '107', '108', '110', '113', '115', '118', '119', '121', '122', '125', '128', '129', '134', '137', '142'

#### Show the results for the columns with missing data

In [13]:
# Restrict the report to the columns in which ptype found missing data.
column_names = ptype.get_columns_with_missing()
ptype.show_results(column_names)

# columns with missing data: 8 

col: 3
	predicted type: integer
	posterior probs:  [9.99999647e-01 0.00000000e+00 3.53431906e-07 0.00000000e+00
 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00
 0.00000000e+00]
	types:  ['integer', 'string', 'float', 'boolean', 'gender', 'date-iso-8601', 'date-eu', 'date-non-std-subtype', 'date-non-std'] 

	some normal data values:  ['-2', '0', '1', '2', '3']
	their counts:  [3, 67, 54, 32, 26]
	fraction of normal: 0.89 

	missing values: ['-1']
	their counts:  [22]
	fraction of missing: 0.11 

col: ?
	predicted type: integer
	posterior probs:  [1.00000000e+00 0.00000000e+00 4.73609772e-47 0.00000000e+00
 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00
 0.00000000e+00]
	types:  ['integer', 'string', 'float', 'boolean', 'gender', 'date-iso-8601', 'date-eu', 'date-non-std-subtype', 'date-non-std'] 

	some normal data values:  ['101', '102', '103', '104', '106', '107', '108', '110', '113', '115', '118', '119', '121', '122', '125', 

#### Show the results for the columns with anomalies

In [14]:
# Restrict the report to the columns in which ptype found anomalies
# (none for this dataset, per the output below).
column_names = ptype.get_columns_with_anomalies()
ptype.show_results(column_names)

# columns with anomalies: 0 



## 2. User Interactions
- changing the column type predictions,
- changing the anomaly type predictions,
- changing the missing type predictions.


### 2.a Change the column type predictions

In [6]:
# Switch to the 'data_gov_10151_1' dataset, again reading every value as a raw
# string (dtype=str) and leaving NA-like markers unconverted (keep_default_na=False).
# This rebinds `dataset_name`, `dataset_path` and `df` for the cells that follow.
dataset_name = 'data_gov_10151_1'
dataset_path = '../data/{}.csv'.format(dataset_name)
df = pd.read_csv(
    dataset_path,
    sep=',',
    encoding='ISO-8859-1',
    dtype=str,
    keep_default_na=False,
    skipinitialspace=True,
)
print(df.shape)
df.head(2)

(99, 21)


Unnamed: 0,OBJECTID,Loc_name,Status,Score,Match_type,Match_addr,Side,Ref_ID,X,Y,...,Addr_type,ARC_Street,ARC_City,ARC_State,ARC_ZIP,Name,Municipali,Address,Municipa_1,ZipCodes
0,1,DW_Addressing_,M,95.42,A,"1710 PACIFIC AVE, HARRISON, PA, 15065",,3150892,1420542.434568,475246.110473,...,StreetAddress,1710 Pacific Avenue,Natrona Heights,,15065,Community Market,Natrona Heights,1710 Pacific Avenue,Natrona Heights,15065
1,2,DW_Addressing_,M,93.91,A,"1117 MILLTOWN RD, PENN HILLS, PA, 15147",,3148048,1401019.429449,431068.460889,...,StreetAddress,1117 Mill Town Road,Verona,,15147,Community Market,Verona,1117 Mill Town Road,Verona,15147


In [7]:
# Re-run inference on the newly loaded data frame, reusing the same ptype object.
ptype.run_inference(_data_frame=df)

#### checking columns annotated with the gender type

In [8]:
# Collect the columns whose predicted type is 'gender' and print their report.
# `gender_columns` is reused by the following cells.
gender_columns = ptype.get_columns_with_type('gender')
ptype.show_results(gender_columns)

col: Status
	predicted type: gender
	posterior probs:  [0. 0. 0. 0. 1. 0. 0. 0. 0.]
	types:  ['integer', 'string', 'float', 'boolean', 'gender', 'date-iso-8601', 'date-eu', 'date-non-std-subtype', 'date-non-std'] 

	some normal data values:  ['M']
	their counts:  [86]
	fraction of normal: 0.87 

	anomalies: ['T', 'U']
	their counts: [5, 8]
	fraction of anomalies: 0.13 



In [9]:
# Override ptype's prediction for the gender-typed columns with 'string',
# then re-print their report to show the new type.
# NOTE(review): the second argument presumably holds one new type per column
# in gender_columns — confirm against Ptype.change_column_types.
ptype.change_column_types(gender_columns, ['string',])
ptype.show_results(gender_columns)

The column type of Status is changed from gender to string
col: Status
	predicted type: string
	posterior probs:  [0. 0. 0. 0. 1. 0. 0. 0. 0.]
	types:  ['integer', 'string', 'float', 'boolean', 'gender', 'date-iso-8601', 'date-eu', 'date-non-std-subtype', 'date-non-std'] 

	some normal data values:  ['M']
	their counts:  [86]
	fraction of normal: 0.87 

	anomalies: ['T', 'U']
	their counts: [5, 8]
	fraction of anomalies: 0.13 



### 2.b Change the anomaly annotations
Note that 'T' and 'U' are still annotated as anomalies, even though the column type is now string. We update the anomaly annotations so that these values are treated as normal.

In [7]:
# Update the anomaly annotations for 'T' and 'U' in the 'Status' column; the
# re-printed report below lists them among the normal values.
ptype.change_anomalies('Status', ['T', 'U'])
ptype.show_results(gender_columns)

col: Status
	predicted type: string
	posterior probs:  [0. 0. 0. 0. 1. 0. 0. 0. 0.]
	types:  ['integer', 'string', 'float', 'boolean', 'gender', 'date-iso-8601', 'date-eu', 'date-non-std-subtype', 'date-non-std'] 

	some normal data values:  ['M', 'T', 'U']
	their counts:  [86, 5, 8]
	fraction of normal: 1.0 



### 2.c Change missing data encodings

In [17]:
# Reload the 'auto' dataset, this time with header=None so that the first
# record is kept as data and the columns get default integer labels
# (shape (205, 26) in the output below). Values are still read as raw strings.
dataset_name = 'auto'
dataset_path = '../data/{}.csv'.format(dataset_name)
df = pd.read_csv(
    dataset_path,
    sep=',',
    encoding='ISO-8859-1',
    dtype=str,
    header=None,
    keep_default_na=False,
    skipinitialspace=True,
)
print(df.shape)
df.head(2)

(205, 26)


Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,16,17,18,19,20,21,22,23,24,25
0,3,?,alfa-romero,gas,std,two,convertible,rwd,front,88.6,...,130,mpfi,3.47,2.68,9.0,111,5000,21,27,13495
1,3,?,alfa-romero,gas,std,two,convertible,rwd,front,88.6,...,130,mpfi,3.47,2.68,9.0,111,5000,21,27,16500


In [18]:
# Run inference again on the header-less version of the 'auto' data frame.
ptype.run_inference(_data_frame=df)

#### checking the columns with missing data

In [19]:
# List the columns in which ptype found missing data and print their report,
# to see which values were treated as missing-data encodings.
column_names = ptype.get_columns_with_missing()
ptype.show_results(column_names)

# columns with missing data: 8 

col: 0
	predicted type: integer
	posterior probs:  [9.99999674e-01 0.00000000e+00 3.26244845e-07 0.00000000e+00
 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00
 0.00000000e+00]
	types:  ['integer', 'string', 'float', 'boolean', 'gender', 'date-iso-8601', 'date-eu', 'date-non-std-subtype', 'date-non-std'] 

	some normal data values:  ['-2', '0', '1', '2', '3']
	their counts:  [3, 67, 54, 32, 27]
	fraction of normal: 0.89 

	missing values: ['-1']
	their counts:  [22]
	fraction of missing: 0.11 

col: 1
	predicted type: integer
	posterior probs:  [1.00000000e+00 0.00000000e+00 4.73609772e-47 0.00000000e+00
 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00
 0.00000000e+00]
	types:  ['integer', 'string', 'float', 'boolean', 'gender', 'date-iso-8601', 'date-eu', 'date-non-std-subtype', 'date-non-std'] 

	some normal data values:  ['101', '102', '103', '104', '106', '107', '108', '110', '113', '115', '118', '119', '121', '122', '125', 

In [20]:
# Tell ptype that '-1' in column '0' is a regular value rather than a
# missing-data encoding, then re-print that column's report (the output below
# lists '-1' among the normal values).
column_name = '0'
ptype.change_missing_data(column_name, ['-1'])
# Pass the name wrapped in a list, matching the other show_results calls in this
# notebook, which pass a collection of column names rather than a bare string.
ptype.show_results([column_name])

col: 0
	predicted type: integer
	posterior probs:  [9.99999674e-01 0.00000000e+00 3.26244845e-07 0.00000000e+00
 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00
 0.00000000e+00]
	types:  ['integer', 'string', 'float', 'boolean', 'gender', 'date-iso-8601', 'date-eu', 'date-non-std-subtype', 'date-non-std'] 

	some normal data values:  ['-1', '-2', '0', '1', '2', '3']
	their counts:  [22, 3, 67, 54, 32, 27]
	fraction of normal: 1.0 

