In [94]:
%matplotlib inline

import numpy as np
import pandas as pd
import matplotlib as mpl
import matplotlib.pyplot as plt

from scipy.io import arff
from pandas import Series, DataFrame
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.model_selection import train_test_split
from sklearn.svm import SVC


from sklearn.ensemble import BaggingClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.metrics import classification_report, accuracy_score, confusion_matrix

### Preprocessing

In [36]:
data, meta = arff.loadarff('/Users/anilthapa/Downloads/autism+screening+adult/Autism-Adult-Data.arff')

In [40]:
df = DataFrame(data)
df.head(5)

Unnamed: 0,A1_Score,A2_Score,A3_Score,A4_Score,A5_Score,A6_Score,A7_Score,A8_Score,A9_Score,A10_Score,...,gender,ethnicity,jundice,austim,contry_of_res,used_app_before,result,age_desc,relation,Class/ASD
0,b'1',b'1',b'1',b'1',b'0',b'0',b'1',b'1',b'0',b'0',...,b'f',b'White-European',b'no',b'no',b'United States',b'no',6.0,b'18 and more',b'Self',b'NO'
1,b'1',b'1',b'0',b'1',b'0',b'0',b'0',b'1',b'0',b'1',...,b'm',b'Latino',b'no',b'yes',b'Brazil',b'no',5.0,b'18 and more',b'Self',b'NO'
2,b'1',b'1',b'0',b'1',b'1',b'0',b'1',b'1',b'1',b'1',...,b'm',b'Latino',b'yes',b'yes',b'Spain',b'no',8.0,b'18 and more',b'Parent',b'YES'
3,b'1',b'1',b'0',b'1',b'0',b'0',b'1',b'1',b'0',b'1',...,b'f',b'White-European',b'no',b'yes',b'United States',b'no',6.0,b'18 and more',b'Self',b'NO'
4,b'1',b'0',b'0',b'0',b'0',b'0',b'0',b'1',b'0',b'0',...,b'f',b'?',b'no',b'no',b'Egypt',b'no',2.0,b'18 and more',b'?',b'NO'


In [41]:
df = df.select_dtypes([object])
df = df.stack().str.decode('utf-8').unstack()

In [42]:
df.head()

Unnamed: 0,A1_Score,A2_Score,A3_Score,A4_Score,A5_Score,A6_Score,A7_Score,A8_Score,A9_Score,A10_Score,gender,ethnicity,jundice,austim,contry_of_res,used_app_before,age_desc,relation,Class/ASD
0,1,1,1,1,0,0,1,1,0,0,f,White-European,no,no,United States,no,18 and more,Self,NO
1,1,1,0,1,0,0,0,1,0,1,m,Latino,no,yes,Brazil,no,18 and more,Self,NO
2,1,1,0,1,1,0,1,1,1,1,m,Latino,yes,yes,Spain,no,18 and more,Parent,YES
3,1,1,0,1,0,0,1,1,0,1,f,White-European,no,yes,United States,no,18 and more,Self,NO
4,1,0,0,0,0,0,0,1,0,0,f,?,no,no,Egypt,no,18 and more,?,NO


In [59]:
df.columns

Index(['A1_Score', 'A2_Score', 'A3_Score', 'A4_Score', 'A5_Score', 'A6_Score',
       'A7_Score', 'A8_Score', 'A9_Score', 'A10_Score', 'gender', 'ethnicity',
       'jundice', 'austim', 'contry_of_res', 'used_app_before', 'age_desc',
       'relation', 'Class/ASD'],
      dtype='object')

In [103]:
for i, feature in enumerate(df.columns):
    print('Column: ',i+1, df[feature].unique())

Column:  1 ['1' '0']
Column:  2 ['1' '0']
Column:  3 ['1' '0']
Column:  4 ['1' '0']
Column:  5 ['0' '1']
Column:  6 ['0' '1']
Column:  7 ['1' '0']
Column:  8 ['1' '0']
Column:  9 ['0' '1']
Column:  10 ['0' '1']
Column:  11 ['f' 'm']
Column:  12 ['White-European' 'Latino' '?' 'Others' 'Black' 'Asian' 'Middle Eastern '
 'Pasifika' 'South Asian' 'Hispanic' 'Turkish']
Column:  13 ['no' 'yes']
Column:  14 ['no' 'yes']
Column:  15 ['United States' 'Brazil' 'Spain' 'Egypt' 'New Zealand' 'Bahamas'
 'Burundi' 'Austria' 'Argentina' 'Jordan' 'Ireland' 'United Arab Emirates'
 'Afghanistan' 'Lebanon' 'United Kingdom' 'South Africa' 'Italy'
 'Pakistan' 'Bangladesh' 'Chile' 'France' 'China' 'Australia' 'Canada'
 'Saudi Arabia' 'Netherlands' 'Romania' 'Sweden' 'Tonga' 'Oman' 'India'
 'Philippines' 'Sri Lanka' 'Sierra Leone' 'Ethiopia' 'Viet Nam' 'Iran'
 'Costa Rica' 'Germany' 'Mexico' 'Russia' 'Armenia' 'Iceland' 'Nicaragua'
 'Hong Kong' 'Japan' 'Ukraine' 'Kazakhstan' 'AmericanSamoa' 'Uruguay'
 'Serbi

In [63]:
df[df['ethnicity'] == 'others']

Unnamed: 0,A1_Score,A2_Score,A3_Score,A4_Score,A5_Score,A6_Score,A7_Score,A8_Score,A9_Score,A10_Score,gender,ethnicity,jundice,austim,contry_of_res,used_app_before,age_desc,relation,Class/ASD
657,1,1,0,0,1,0,1,0,1,0,m,others,no,no,United States,no,18 and more,Self,NO


In [84]:
df.drop(df[df['ethnicity'] == 'others'].index, inplace = True)

There is only one values that seems to have no ethinicity. So, we can go ahead and drop it

In [64]:
df[df['ethnicity'] == '?']

Unnamed: 0,A1_Score,A2_Score,A3_Score,A4_Score,A5_Score,A6_Score,A7_Score,A8_Score,A9_Score,A10_Score,gender,ethnicity,jundice,austim,contry_of_res,used_app_before,age_desc,relation,Class/ASD
4,1,0,0,0,0,0,0,1,0,0,f,?,no,no,Egypt,no,18 and more,?,NO
12,0,1,1,1,1,1,0,0,1,0,f,?,no,no,Bahamas,no,18 and more,?,NO
13,1,0,0,0,0,0,1,1,0,1,m,?,no,no,Austria,no,18 and more,?,NO
14,1,0,0,0,0,0,1,1,0,1,f,?,no,no,Argentina,no,18 and more,?,NO
19,0,0,0,0,0,0,1,1,0,1,m,?,yes,no,United Arab Emirates,no,18 and more,?,NO
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
652,0,0,0,0,0,0,0,0,0,1,f,?,no,no,United States,no,18 and more,?,NO
658,0,0,1,1,0,0,1,0,0,0,m,?,no,no,Azerbaijan,no,18 and more,?,NO
659,1,1,1,1,1,1,0,0,1,1,m,?,no,no,Pakistan,no,18 and more,?,YES
666,0,0,0,0,0,0,0,1,0,0,m,?,no,no,Iraq,no,18 and more,?,NO


Here we can see that `ethnicity` columns contains a huge amount of missing data marked with `?`. We can also see another column (`relation`) on the far right, where it seems to have missing values marked as `?` as well. Now, we check to see if they are related to each other or not.

In [85]:
df[df['relation'] == '?'].equals(df[df['ethnicity'] == '?'])

True

We can conclude that both the missing values in `relation` and `ethnicity` columns seems to be related to each another

In [43]:
meta

Dataset: adult-weka.filters.unsupervised.attribute.NumericToNominal-Rfirst-10
	A1_Score's type is nominal, range is ('0', '1')
	A2_Score's type is nominal, range is ('0', '1')
	A3_Score's type is nominal, range is ('0', '1')
	A4_Score's type is nominal, range is ('0', '1')
	A5_Score's type is nominal, range is ('0', '1')
	A6_Score's type is nominal, range is ('0', '1')
	A7_Score's type is nominal, range is ('0', '1')
	A8_Score's type is nominal, range is ('0', '1')
	A9_Score's type is nominal, range is ('0', '1')
	A10_Score's type is nominal, range is ('0', '1')
	age's type is numeric
	gender's type is nominal, range is ('f', 'm')
	ethnicity's type is nominal, range is ('White-European', 'Latino', 'Others', 'Black', 'Asian', 'Middle Eastern ', 'Pasifika', 'South Asian', 'Hispanic', 'Turkish', 'others')
	jundice's type is nominal, range is ('no', 'yes')
	austim's type is nominal, range is ('no', 'yes')
	contry_of_res's type is nominal, range is ('United States', 'Brazil', 'Spain', 'Egyp

In [44]:
df.shape

(704, 19)

In [45]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 704 entries, 0 to 703
Data columns (total 19 columns):
 #   Column           Non-Null Count  Dtype 
---  ------           --------------  ----- 
 0   A1_Score         704 non-null    object
 1   A2_Score         704 non-null    object
 2   A3_Score         704 non-null    object
 3   A4_Score         704 non-null    object
 4   A5_Score         704 non-null    object
 5   A6_Score         704 non-null    object
 6   A7_Score         704 non-null    object
 7   A8_Score         704 non-null    object
 8   A9_Score         704 non-null    object
 9   A10_Score        704 non-null    object
 10  gender           704 non-null    object
 11  ethnicity        704 non-null    object
 12  jundice          704 non-null    object
 13  austim           704 non-null    object
 14  contry_of_res    704 non-null    object
 15  used_app_before  704 non-null    object
 16  age_desc         704 non-null    object
 17  relation         704 non-null    object


In [48]:
df.describe()

Unnamed: 0,A1_Score,A2_Score,A3_Score,A4_Score,A5_Score,A6_Score,A7_Score,A8_Score,A9_Score,A10_Score,gender,ethnicity,jundice,austim,contry_of_res,used_app_before,age_desc,relation,Class/ASD
count,704,704,704,704,704,704,704,704,704,704,704,704,704,704,704,704,704,704,704
unique,2,2,2,2,2,2,2,2,2,2,2,12,2,2,67,2,1,6,2
top,1,0,0,0,0,0,0,1,0,1,m,White-European,no,no,United States,no,18 and more,Self,NO
freq,508,385,382,355,353,504,410,457,476,404,367,233,635,613,113,692,704,522,515


In [49]:
df.columns

Index(['A1_Score', 'A2_Score', 'A3_Score', 'A4_Score', 'A5_Score', 'A6_Score',
       'A7_Score', 'A8_Score', 'A9_Score', 'A10_Score', 'gender', 'ethnicity',
       'jundice', 'austim', 'contry_of_res', 'used_app_before', 'age_desc',
       'relation', 'Class/ASD'],
      dtype='object')

In [50]:
X, y = df[['A1_Score', 'A2_Score', 'A3_Score', 'A4_Score', 'A5_Score', 'A6_Score',
       'A7_Score', 'A8_Score', 'A9_Score', 'A10_Score', 'gender', 'ethnicity',
       'jundice', 'austim', 'contry_of_res', 'used_app_before', 'age_desc',
       'relation']], df['Class/ASD']

In [87]:
df['austim'] == df['Class/ASD']

0      False
1      False
2      False
3      False
4      False
       ...  
699    False
700    False
701    False
702    False
703    False
Length: 703, dtype: bool

In [53]:
y.value_counts()

Class/ASD
NO     515
YES    189
Name: count, dtype: int64

In [None]:
It seems tthat t

## Pipeline
### Creating a pipeline for preprocessing of categorical and numerical features seperately

In [58]:
X.columns

Index(['A1_Score', 'A2_Score', 'A3_Score', 'A4_Score', 'A5_Score', 'A6_Score',
       'A7_Score', 'A8_Score', 'A9_Score', 'A10_Score', 'gender', 'ethnicity',
       'jundice', 'austim', 'contry_of_res', 'used_app_before', 'age_desc',
       'relation'],
      dtype='object')

In [96]:
numeric_features = []

categorical_features = []

In [92]:
numeric_transformer = Pipeline(steps=[
    ('scaler', StandardScaler())
])

In [93]:
categorical_transformer = Pipeline(steps=[
    ('onehot', OneHotEncoder(handle_unknown='ignore'))
])

In [97]:
preprocessor = ColumnTransformer(
                transformers=[
                    ('num', numeric_transformer, numeric_features),
                    ('cat', categorical_transformer, categorical_features)
                ]
)

In [None]:
pipeline = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('classifier', )
])

### Importing the Model and training the model

### Metrics

### Importing the Model and training the model

### Metrics