# Incorrect Column Type Prediction

In [1]:
# Preamble to run notebook in context of source package.
# NBVAL_IGNORE_OUTPUT
import sys
# Prepend the parent directory to the module search path so it is searched
# before any other location (presumably so `ptype` is imported from the
# source tree rather than an installed copy -- confirm).
sys.path.insert(0, '../')

In [2]:
from IPython.core.display import display
from sklearn.linear_model import LinearRegression

import matplotlib.pyplot as plt
%matplotlib inline
plt.rcdefaults()
import numpy as np
import pandas as pd

from ptype.Ptype import Ptype
from utils import plot_column_type_posterior, plot_arff_type_posterior, subsample_df

### Toy Example

In [3]:
# Toy column: the years 1991-1998 written as four-digit strings, followed by
# a single two-digit entry ('90').
column = 'year'
year_strings = [str(year) for year in range(1991, 1999)] + ['90']

# Single-column frame with every entry kept as a string.
df = pd.DataFrame(data=year_strings, columns=[column], dtype='str')
df

Unnamed: 0,year
0,1991
1,1992
2,1993
3,1994
4,1995
5,1996
6,1997
7,1998
8,90


In [4]:
# Fit a schema on the toy DataFrame and display the per-column summary
# (inferred type plus normal / missing / anomalous values and their ratios).
ptype = Ptype()

ptype.fit_schema(df)
ptype.show_schema()

Unnamed: 0,year
type,integer
normal values,"[1991, 1992, 1993, 1994, 1995, 1996, 1997, 199..."
ratio of normal values,1
missing values,[]
ratio of missing values,0
anomalous values,[]
ratio of anomalous values,0


In [5]:
# Inspect the full entry for the column; it carries fields the summary table
# does not show (e.g. 'dtype', 'arff_type', 'categorical_values').
ptype.cols[column]

{'type': 'integer', 'dtype': 'Int64', 'arff_type': 'numeric', 'normal_values': ['1991', '1992', '1993', '1994', '1995', '1996', '1997', '1998', '90'], 'missing_values': [], 'missingness_ratio': 0.0, 'anomalies': [], 'anomalous_ratio': 0.0, 'categorical_values': None}

In [6]:
# Manually override the inferred type ('integer') of the 'year' column with
# the ISO-8601 date type.
ptype.reclassify_column(column, 'date-iso-8601')

In [7]:
# Re-display the schema after the override: in the recorded output the column
# is typed 'date-iso-8601' and the value '90' is reported as anomalous.
ptype.show_schema()

Unnamed: 0,year
type,date-iso-8601
normal values,"[1991, 1992, 1993, 1994, 1995, 1996, 1997, 1998]"
ratio of normal values,0.89
missing values,[]
ratio of missing values,0
anomalous values,[90]
ratio of anomalous values,0.11


In [8]:
# Inspect the column entry again after the override.
# NOTE(review): in the recorded output 'arff_type' is still 'numeric' after
# reclassifying to a date type -- confirm this is intended.
ptype.cols[column]

{'type': 'date-iso-8601', 'dtype': 'datetime64', 'arff_type': 'numeric', 'normal_values': ['1991', '1992', '1993', '1994', '1995', '1996', '1997', '1998'], 'missing_values': [], 'missingness_ratio': 0.0, 'anomalies': ['90'], 'anomalous_ratio': 0.11, 'categorical_values': None}

### Real-world Example

In [9]:
# Load the grub-damage dataset, reading every column as a string, and preview
# the first rows.
df = pd.read_csv(
    '../data/grub-damage.csv',
    encoding="ISO-8859-1",
    dtype='str',
)
df.head()

Unnamed: 0,year_zone,year,strip,pdk,damage_rankRJT,damage_rankALL,dry_or_irr,zone,GG_new
0,6f,86,3,1,1,0,D,F,low
1,6f,86,3,2,0,0,D,F,high
2,6f,86,3,3,1,1,D,F,high
3,6f,86,3,4,1,0,D,F,high
4,6f,86,3,5,0,0,D,F,low


In [10]:
# Start from a fresh Ptype instance, fit a schema on the grub-damage
# DataFrame, and display the per-column summary.
ptype = Ptype()

ptype.fit_schema(df)
ptype.show_schema()

Unnamed: 0,year_zone,year,strip,pdk,damage_rankRJT,damage_rankALL,dry_or_irr,zone,GG_new
type,string,integer,integer,integer,integer,integer,string,boolean,string
normal values,"[0c, 0f, 0m, 1c, 1f, 1m, 2c, 2f, 2m, 6c, 6f, 6...","[86, 87, 88, 89, 90, 91, 92]","[1, 10, 2, 3, 4, 5, 6, 7, 9]","[0, 1, 2, 3, 4, 5]","[0, 1, 2, 3, 4, 5]","[0, 1, 2, 3, 4, 5]","[B, D, O]",[F],"[average, high, low, veryhigh]"
ratio of normal values,1,1,1,1,1,1,1,0.46,1
missing values,[],[],[],[],[],[],[],[],[]
ratio of missing values,0,0,0,0,0,0,0,0,0
anomalous values,[],[],[],[],[],[],[],"[C, M]",[]
ratio of anomalous values,0,0,0,0,0,0,0,0.54,0


In [11]:
# Focus on the 'zone' column: in the schema above it is inferred as 'boolean',
# with only 'F' treated as normal and 'C' / 'M' flagged as anomalies.
column = 'zone'
ptype.cols[column]

{'type': 'boolean', 'dtype': 'bool', 'arff_type': 'nominal', 'normal_values': ['F'], 'missing_values': [], 'missingness_ratio': 0.0, 'anomalies': ['C', 'M'], 'anomalous_ratio': 0.54, 'categorical_values': ['F']}

In [12]:
# Manually override the inferred type ('boolean') of the 'zone' column with
# 'string'.
ptype.reclassify_column(column, 'string')

In [13]:
# Re-display the schema after the override: in the recorded output 'zone' is
# typed 'string' and 'C', 'F', 'M' are all listed as normal values.
ptype.show_schema()

Unnamed: 0,year_zone,year,strip,pdk,damage_rankRJT,damage_rankALL,dry_or_irr,zone,GG_new
type,string,integer,integer,integer,integer,integer,string,string,string
normal values,"[0c, 0f, 0m, 1c, 1f, 1m, 2c, 2f, 2m, 6c, 6f, 6...","[86, 87, 88, 89, 90, 91, 92]","[1, 10, 2, 3, 4, 5, 6, 7, 9]","[0, 1, 2, 3, 4, 5]","[0, 1, 2, 3, 4, 5]","[0, 1, 2, 3, 4, 5]","[B, D, O]","[C, F, M]","[average, high, low, veryhigh]"
ratio of normal values,1,1,1,1,1,1,1,1,1
missing values,[],[],[],[],[],[],[],[],[]
ratio of missing values,0,0,0,0,0,0,0,0,0
anomalous values,[],[],[],[],[],[],[],[],[]
ratio of anomalous values,0,0,0,0,0,0,0,0,0


In [14]:
# Inspect the column entry again after the override.
# NOTE(review): in the recorded output 'categorical_values' is still ['F']
# even though 'C', 'F' and 'M' are now normal values -- confirm this is intended.
ptype.cols[column]

{'type': 'string', 'dtype': 'string', 'arff_type': 'nominal', 'normal_values': ['C', 'F', 'M'], 'missing_values': [], 'missingness_ratio': 0.0, 'anomalies': [], 'anomalous_ratio': 0.0, 'categorical_values': ['F']}

In [15]:
# plot_column_type_posterior(ptype.cols[column].p_t)

In [16]:
# arff_type = ptype.cols[column].arff_type

# plot_arff_type_posterior(ptype.cols[column].arff_posterior)

In [17]:
# ptype.reclassify_column(column, 'string')

# plot_column_type_posterior(ptype.cols[column].p_t)

# TODO: do the same thing for the arff type