# Classifying values as anomalous

Sometimes we’re happy with the type that ptype infers for a column, but discover that it has treated as legitimate some values that we know to be anomalies. We can remedy this by extending the set of values that ptype treats as anomalous and then rerunning the analysis. Note that this set is currently global: it applies to every column, not only to the one we are interested in. We illustrate this with a simple data analysis task.

In [6]:
# Preamble: run the notebook in the context of the source package by putting
# the parent directory at the front of the module search path.
# NBVAL_IGNORE_OUTPUT
import sys

sys.path.insert(0, "../")

### Mental Health in Tech Survey
This dataset is used to measure attitudes towards mental health and the frequency of mental health disorders in the tech workplace. The dataset source is https://www.kaggle.com/osmi/mental-health-in-tech-2016. We read the dataset in the usual way for working with ptype, passing `dtype=str` and `keep_default_na=False` to `read_csv` so that every value is loaded as a raw string and pandas does not convert any of them to `NaN`.

In [7]:
import pandas as pd
from ptype.Ptype import Ptype

# Load every column as raw strings and switch off pandas' default NaN
# markers, so that no value is converted before ptype analyses it.
df = pd.read_csv(
    "../data/survey.csv",
    encoding="ISO-8859-1",
    dtype=str,
    keep_default_na=False,
)
df.head()

Unnamed: 0,Timestamp,Age,Gender,Country,state,self_employed,family_history,treatment,work_interfere,no_employees,...,leave,mental_health_consequence,phys_health_consequence,coworkers,supervisor,mental_health_interview,phys_health_interview,mental_vs_physical,obs_consequence,comments
0,2014-08-27 11:29:31,37,Female,United States,IL,,No,Yes,Often,6-25,...,Somewhat easy,No,No,Some of them,Yes,No,Maybe,Yes,No,
1,2014-08-27 11:29:37,44,M,United States,IN,,No,No,Rarely,More than 1000,...,Don't know,Maybe,No,No,No,No,No,Don't know,No,
2,2014-08-27 11:29:44,32,Male,Canada,,,No,No,Rarely,6-25,...,Somewhat difficult,No,No,Yes,Yes,Yes,Yes,No,No,
3,2014-08-27 11:29:46,31,Male,United Kingdom,,,Yes,Yes,Often,26-100,...,Somewhat difficult,Yes,Yes,Some of them,No,Maybe,Maybe,No,Yes,
4,2014-08-27 11:30:22,31,Male,United States,TX,,No,No,Never,100-500,...,Don't know,No,No,Some of them,Yes,Yes,Yes,Don't know,No,


In [9]:
# Fit ptype to the raw string frame. show() renders the table below, which
# lists for each column the inferred type together with the values classified
# as normal, missing and anomalous.
ptype = Ptype()
schema = ptype.schema_fit(df)
schema.show()

Unnamed: 0,Timestamp,Age,Gender,Country,state,self_employed,family_history,treatment,work_interfere,no_employees,...,leave,mental_health_consequence,phys_health_consequence,coworkers,supervisor,mental_health_interview,phys_health_interview,mental_vs_physical,obs_consequence,comments
type,date-non-std,integer,string,string,string,boolean,boolean,boolean,string,string,...,string,boolean,boolean,string,string,boolean,boolean,boolean,boolean,string
normal values,"[2014-08-27 11:29:31, 2014-08-27 11:29:37, 201...","[-1, -1726, -29, 11, 18, 19, 20, 21, 22, 23, 2...","[A little about you, Agender, All, Androgyne, ...","[Australia, Austria, Bahamas, The, Belgium, Bo...","[AL, AZ, CA, CO, CT, DC, FL, GA, IA, ID, IL, I...","[No, Yes]","[No, Yes]","[No, Yes]","[Never, Often, Rarely, Sometimes]","[1-5, 100-500, 26-100, 500-1000, 6-25, More th...",...,"[Somewhat difficult, Somewhat easy, Very diffi...","[No, Yes]","[No, Yes]","[No, Some of them, Yes]","[No, Some of them, Yes]","[No, Yes]","[No, Yes]","[No, Yes]","[No, Yes]",[A strong mind goes a long way. Stay strong. T...
missing values,[],[],[],[],[NA],[NA],[],[],[NA],[],...,[],[],[],[],[],[],[],[],[],"[ , -, NA]"
anomalous values,[],[],"[Female (cis), Female (trans), Guy (-ish) ^_^,...",[],[],[],[],[],[],[],...,[Don't know],[Maybe],[Maybe],[],[],[Maybe],[Maybe],[Don't know],[],[(yes but the situation was unusual and involv...


In [10]:
# List the distinct raw Age strings to spot entries that cannot be real ages.
df["Age"].unique()

array(['37', '44', '32', '31', '33', '35', '39', '42', '23', '29', '36',
       '27', '46', '41', '34', '30', '40', '38', '50', '24', '18', '28',
       '26', '22', '19', '25', '45', '21', '-29', '43', '56', '60', '54',
       '329', '55', '99999999999', '48', '20', '57', '58', '47', '62',
       '51', '65', '49', '-1726', '5', '53', '61', '8', '11', '-1', '72'],
      dtype=object)

In [11]:
# Age strings that the first fit listed among the normal values but that
# cannot be real ages (negative or absurdly large).
extra_anomalies = ['99999999999', '-1', '-1726', '-29']

# Build a fresh list instead of extending the getter's return value in place,
# and skip values that are already registered: re-running this cell then no
# longer appends the same entries a second time.
# NOTE(review): whether get_anomalous_values() hands back ptype's internal list
# or a copy is not visible here -- copying is safe in either case.
known_anomalies = ptype.machines.get_anomalous_values()
anomalous_values = list(known_anomalies) + [
    value for value in extra_anomalies if value not in known_anomalies
]
ptype.machines.set_anomalous_values(anomalous_values)

# TODO: consider making this list column-specific rather than global, similar
# to how pandas.read_csv handles it, e.g.
# keep_default_na=False, na_values={'species': ['']}

# Refit so that the schema reflects the extended anomaly list.
schema = ptype.schema_fit(df)
schema.show()

Unnamed: 0,Timestamp,Age,Gender,Country,state,self_employed,family_history,treatment,work_interfere,no_employees,...,leave,mental_health_consequence,phys_health_consequence,coworkers,supervisor,mental_health_interview,phys_health_interview,mental_vs_physical,obs_consequence,comments
type,date-non-std,integer,string,string,string,boolean,boolean,boolean,string,string,...,string,boolean,boolean,string,string,boolean,boolean,boolean,boolean,string
normal values,"[2014-08-27 11:29:31, 2014-08-27 11:29:37, 201...","[11, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 2...","[A little about you, Agender, All, Androgyne, ...","[Australia, Austria, Bahamas, The, Belgium, Bo...","[AL, AZ, CA, CO, CT, DC, FL, GA, IA, ID, IL, I...","[No, Yes]","[No, Yes]","[No, Yes]","[Never, Often, Rarely, Sometimes]","[1-5, 100-500, 26-100, 500-1000, 6-25, More th...",...,"[Somewhat difficult, Somewhat easy, Very diffi...","[No, Yes]","[No, Yes]","[No, Some of them, Yes]","[No, Some of them, Yes]","[No, Yes]","[No, Yes]","[No, Yes]","[No, Yes]",[A strong mind goes a long way. Stay strong. T...
missing values,[],[],[],[],[NA],[NA],[],[],[NA],[],...,[],[],[],[],[],[],[],[],[],"[ , -, NA]"
anomalous values,[],"[-1, -1726, -29, 99999999999]","[Female (cis), Female (trans), Guy (-ish) ^_^,...",[],[],[],[],[],[],[],...,[Don't know],[Maybe],[Maybe],[],[],[Maybe],[Maybe],[Don't know],[],[(yes but the situation was unusual and involv...


In [12]:
# Apply the refitted schema to the raw string frame. In the output below,
# cells whose value the schema lists as anomalous (e.g. "Don't know" in the
# `leave` column) are displayed empty.
# NOTE(review): presumably transform() replaces them with a missing-value
# marker -- confirm against the ptype documentation.
df2 = schema.transform(df)
df2

Unnamed: 0,Timestamp,Age,Gender,Country,state,self_employed,family_history,treatment,work_interfere,no_employees,...,leave,mental_health_consequence,phys_health_consequence,coworkers,supervisor,mental_health_interview,phys_health_interview,mental_vs_physical,obs_consequence,comments
0,2014-08-27 11:29:31,37,Female,United States,IL,,,,Often,6-25,...,Somewhat easy,,,Some of them,Yes,,,,,
1,2014-08-27 11:29:37,44,M,United States,IN,,,,Rarely,More than 1000,...,,,,No,No,,,,,
2,2014-08-27 11:29:44,32,Male,Canada,,,,,Rarely,6-25,...,Somewhat difficult,,,Yes,Yes,,,,,
3,2014-08-27 11:29:46,31,Male,United Kingdom,,,,,Often,26-100,...,Somewhat difficult,,,Some of them,No,,,,,
4,2014-08-27 11:30:22,31,Male,United States,TX,,,,Never,100-500,...,,,,Some of them,Yes,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1254,2015-09-12 11:17:21,26,male,United Kingdom,,,,,,26-100,...,Somewhat easy,,,Some of them,Some of them,,,,,
1255,2015-09-26 01:07:35,32,Male,United States,IL,,,,Often,26-100,...,Somewhat difficult,,,Some of them,Yes,,,,,
1256,2015-11-07 12:36:58,34,male,United States,CA,,,,Sometimes,More than 1000,...,Somewhat difficult,,,No,No,,,,,
1257,2015-11-30 21:25:06,46,f,United States,NC,,,,,100-500,...,,,,No,No,,,,,
