In [1]:
# `display` and `HTML` belong to the public `IPython.display` API; importing
# them from `IPython.core.display` is deprecated.
from IPython.display import display, HTML
# Inject CSS so the notebook container spans the full browser width.
display(HTML("<style>.container {width:100% !important;}</style>"))

import matplotlib.pyplot as plt
%matplotlib inline
plt.rcdefaults()

import sys
# Put the parent directory first on the module search path; the later
# `from src.Ptype import Ptype` presumably relies on this — confirm the
# notebook is always launched from a direct sub-directory of the project.
sys.path.insert(0, '../')

In [2]:
from src.Ptype import Ptype
import pandas as pd
import numpy as np

  from collections import Sequence


In [3]:
def _print_value_group(label, indices, unique_vals, unique_vals_counts):
    """Print up to 20 of the unique values selected by ``indices`` and their counts."""
    print('\t' + label, [unique_vals[ind] for ind in indices][:20])
    print('\ttheir counts: ', [unique_vals_counts[ind] for ind in indices][:20])


def evaluate_types(dataset_name, ptype):
    """Compare ptype's predicted column types with the annotated ground truth.

    Reads ``../data/<dataset_name>.csv`` (no header row, every field kept as a
    string) and ``../annotations/<dataset_name>.csv`` (needs a ``Type``
    column). For every column whose prediction disagrees with its annotation
    a diagnostic report is printed; the overall accuracy is printed last.

    A prediction counts as correct when it equals the annotation, or when it
    contains ``'positive'`` and the annotation is a substring of it.

    Note:
        Predictions and annotations are paired positionally with ``zip``. If
        there are fewer annotations than predicted columns, the surplus
        columns are not compared but still count in the accuracy denominator.

    Args:
        dataset_name: File stem shared by the data and annotation CSV files.
        ptype: Object exposing the mappings ``predicted_types``,
            ``normal_types``, ``missing_types``, ``anomaly_types``,
            ``p_t_columns`` and ``types`` as they are read below.

    Returns:
        None. All results are written to stdout.
    """
    predicted_types = ptype.predicted_types
    dataset_path = '../data/' + dataset_name + '.csv'
    df = pd.read_csv(dataset_path, sep=',', encoding='ISO-8859-1', dtype=str, header=None, keep_default_na=False, skipinitialspace=True)

    annotation_path = '../annotations/' + dataset_name + '.csv'
    annotations = pd.read_csv(annotation_path, sep=',', encoding='ISO-8859-1', keep_default_na=False)
    # Only the part of an annotation before its first '-' is compared.
    true_values = [true_value.split('-')[0] for true_value in annotations['Type'].values.tolist()]

    # Collapse every date sub-type into plain 'date'. The order matters:
    # 'date-non-std-subtype' has to be rewritten before its prefix 'date-non-std'.
    predictions = [
        prediction.replace('date-eu', 'date')
        .replace('date-iso-8601', 'date')
        .replace('date-non-std-subtype', 'date')
        .replace('date-non-std', 'date')
        for prediction in predicted_types.values()
    ]

    column_names = list(predicted_types.keys())

    correct_, false_ = 0., 0.
    for i, (column_name, prediction, true_value) in enumerate(zip(column_names, predictions, true_values)):
        if (prediction == true_value) or (('positive' in prediction) and (true_value in prediction)):
            correct_ += 1
            continue

        false_ += 1
        # The unique values are only needed for the mismatch report, so they
        # are computed here instead of for every column.
        unique_vals, unique_vals_counts = np.unique(
            [str(element) for element in df[df.columns[i]].tolist()], return_counts=True)

        print('column name : ', column_name)
        _print_value_group('some normal data values: ', ptype.normal_types[column_name],
                           unique_vals, unique_vals_counts)

        indices = ptype.missing_types[column_name]
        if len(indices) != 0:
            _print_value_group('some missing data values: ', indices, unique_vals, unique_vals_counts)

        indices = ptype.anomaly_types[column_name]
        if len(indices) != 0:
            _print_value_group('some anomalous data values: ', indices, unique_vals, unique_vals_counts)

        print('\ttrue/annotated type : ', true_value, '\n\tpredicted type : ', prediction)
        # Posterior probabilities are looked up by position in p_t_columns.
        print('\tposterior probs: ', ptype.p_t_columns[list(ptype.p_t_columns.keys())[i]])
        print('\ttypes: ', list(ptype.types.values()), '\n')

    total = len(column_names)
    # Report 0.0 when there are no columns, instead of dividing by zero.
    accuracy = round(correct_ / total, 2) if total else 0.0
    print('correct/total = ', accuracy, '(' + str(int(correct_)) + '/' + str(total) + ')')

### creating ptype assistant

In [4]:
# Build a Ptype instance with no constructor arguments; the cells below reuse it.
ptype = Ptype()

### loading data

In [5]:
# Dataset to analyse; its CSV file is looked up under ../data/.
dataset_name = 'auto'
dataset_path = '../data/' + dataset_name + '.csv'

# Read every field as a string: no header row, no NaN conversion, and
# whitespace after a delimiter is skipped.
df = pd.read_csv(
    dataset_path,
    sep=',',
    encoding='ISO-8859-1',
    dtype=str,
    header=None,
    keep_default_na=False,
    skipinitialspace=True,
)
print(df.shape)
df.head(5)

(205, 26)


Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,16,17,18,19,20,21,22,23,24,25
0,3,?,alfa-romero,gas,std,two,convertible,rwd,front,88.6,...,130,mpfi,3.47,2.68,9.0,111,5000,21,27,13495
1,3,?,alfa-romero,gas,std,two,convertible,rwd,front,88.6,...,130,mpfi,3.47,2.68,9.0,111,5000,21,27,16500
2,1,?,alfa-romero,gas,std,two,hatchback,rwd,front,94.5,...,152,mpfi,2.68,3.47,9.0,154,5000,19,26,16500
3,2,164,audi,gas,std,four,sedan,fwd,front,99.8,...,109,mpfi,3.19,3.4,10.0,102,5500,24,30,13950
4,2,164,audi,gas,std,four,sedan,4wd,front,99.4,...,136,mpfi,3.19,3.4,8.0,115,5500,18,22,17450


### running ptype

In [6]:
# Hand the loaded DataFrame and its name to ptype, then process all columns
# (presumably per-column type inference — see the report printed below).
ptype.set_data(_data_frame=df, _dataset_name=dataset_name)
ptype.run_all_columns()
# Uncomment to inspect the predictions directly:
# ptype.predicted_types

### reporting the results

In [7]:
# Print the per-column report: predicted type, posterior probabilities and
# the normal / missing values with their counts.
ptype.show_results()

col: 0
	predicted type: integer
	posterior probs:  [9.99999674e-01 0.00000000e+00 3.26244845e-07 0.00000000e+00
 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00
 0.00000000e+00]
	types:  ['integer', 'string', 'float', 'boolean', 'gender', 'date-iso-8601', 'date-eu', 'date-non-std-subtype', 'date-non-std'] 

	some normal data values:  ['-2', '0', '1', '2', '3']
	their counts:  [3, 67, 54, 32, 27]
	percentage of normal: 0.89 

	missing values: ['-1']
	their counts:  [22]
	percentage of missing: 0.11 

col: 1
	predicted type: integer
	posterior probs:  [1.00000000e+00 0.00000000e+00 4.73609772e-47 0.00000000e+00
 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00
 0.00000000e+00]
	types:  ['integer', 'string', 'float', 'boolean', 'gender', 'date-iso-8601', 'date-eu', 'date-non-std-subtype', 'date-non-std'] 

	some normal data values:  ['101', '102', '103', '104', '106', '107', '108', '110', '113', '115', '118', '119', '121', '122', '125', '128', '129', '134', '137', '

### evaluating the results

In [8]:
# Score the predictions against the annotation file of the same dataset;
# mismatching columns are printed, followed by the overall accuracy.
evaluate_types(dataset_name, ptype)

correct/total =  1.0 (26/26)


### get the columns with missing data

In [9]:
# Restrict the report to the columns ptype returns from
# get_columns_with_missing() (presumably those containing missing values).
column_names = ptype.get_columns_with_missing()
ptype.show_results(column_names)

# columns with missing data: 8 

col: 0
	predicted type: integer
	posterior probs:  [9.99999674e-01 0.00000000e+00 3.26244845e-07 0.00000000e+00
 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00
 0.00000000e+00]
	types:  ['integer', 'string', 'float', 'boolean', 'gender', 'date-iso-8601', 'date-eu', 'date-non-std-subtype', 'date-non-std'] 

	some normal data values:  ['-2', '0', '1', '2', '3']
	their counts:  [3, 67, 54, 32, 27]
	percentage of normal: 0.89 

	missing values: ['-1']
	their counts:  [22]
	percentage of missing: 0.11 

col: 1
	predicted type: integer
	posterior probs:  [1.00000000e+00 0.00000000e+00 4.73609772e-47 0.00000000e+00
 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00
 0.00000000e+00]
	types:  ['integer', 'string', 'float', 'boolean', 'gender', 'date-iso-8601', 'date-eu', 'date-non-std-subtype', 'date-non-std'] 

	some normal data values:  ['101', '102', '103', '104', '106', '107', '108', '110', '113', '115', '118', '119', '121', '122', '12

### get the columns with anomalies

In [10]:
# Restrict the report to the columns ptype returns from
# get_columns_with_anomalies() (presumably those containing anomalous values).
column_names = ptype.get_columns_with_anomalies()
ptype.show_results(column_names)

# columns with anomalies: 0 

