In [3]:

import numpy as np
import pandas as pd
import sklearn
import plotly.express as px
import os
from sklearn.pipeline import make_pipeline
from sklearn.utils import check_array
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import RandomizedSearchCV
from sklearn.model_selection import KFold
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split


# To ignore warnings
import warnings
warnings.filterwarnings("ignore")





#   EXPLORATORY DATA ANALYSIS   


In [5]:
# os.chdir('...\\AUTISM_SCREENING_FOR_TODDLERS\\archive')
df = pd.read_csv('../../data/Toddler_Autism_dataset_July_2018.csv')

In [6]:
df.columns

Index(['Case_No', 'A1', 'A2', 'A3', 'A4', 'A5', 'A6', 'A7', 'A8', 'A9', 'A10',
       'Age_Mons', 'Qchat-10-Score', 'Sex', 'Ethnicity', 'Jaundice',
       'Family_mem_with_ASD', 'Who completed the test', 'Class/ASD Traits '],
      dtype='object')

In [7]:
df.head()

Unnamed: 0,Case_No,A1,A2,A3,A4,A5,A6,A7,A8,A9,A10,Age_Mons,Qchat-10-Score,Sex,Ethnicity,Jaundice,Family_mem_with_ASD,Who completed the test,Class/ASD Traits
0,1,0,0,0,0,0,0,1,1,0,1,28,3,f,middle eastern,yes,no,family member,No
1,2,1,1,0,0,0,1,1,0,0,0,36,4,m,White European,yes,no,family member,Yes
2,3,1,0,0,0,0,0,1,1,0,1,36,4,m,middle eastern,yes,no,family member,Yes
3,4,1,1,1,1,1,1,1,1,1,1,24,10,m,Hispanic,no,no,family member,Yes
4,5,1,1,0,1,1,1,1,1,1,1,20,9,f,White European,no,yes,family member,Yes


In [8]:
df.shape
print(f"The df has {df.shape[0]} rows and {df.shape[1]} columns")

The df has 1054 rows and 19 columns


In [9]:
df.rename(columns ={"Class/ASD Traits ":"target"}, inplace=True)
df.head()

Unnamed: 0,Case_No,A1,A2,A3,A4,A5,A6,A7,A8,A9,A10,Age_Mons,Qchat-10-Score,Sex,Ethnicity,Jaundice,Family_mem_with_ASD,Who completed the test,target
0,1,0,0,0,0,0,0,1,1,0,1,28,3,f,middle eastern,yes,no,family member,No
1,2,1,1,0,0,0,1,1,0,0,0,36,4,m,White European,yes,no,family member,Yes
2,3,1,0,0,0,0,0,1,1,0,1,36,4,m,middle eastern,yes,no,family member,Yes
3,4,1,1,1,1,1,1,1,1,1,1,24,10,m,Hispanic,no,no,family member,Yes
4,5,1,1,0,1,1,1,1,1,1,1,20,9,f,White European,no,yes,family member,Yes


In [10]:
# Encode the label as 1 for 'Yes' and 0 for anything else.
# The dtype guard makes the cell safe to re-run: once the column is already
# numeric, comparing it with 'Yes' again would silently turn every label into 0.
if not pd.api.types.is_numeric_dtype(df['target']):
    df['target'] = np.where(df['target'] == 'Yes', 1, 0)
df.head()

Unnamed: 0,Case_No,A1,A2,A3,A4,A5,A6,A7,A8,A9,A10,Age_Mons,Qchat-10-Score,Sex,Ethnicity,Jaundice,Family_mem_with_ASD,Who completed the test,target
0,1,0,0,0,0,0,0,1,1,0,1,28,3,f,middle eastern,yes,no,family member,0
1,2,1,1,0,0,0,1,1,0,0,0,36,4,m,White European,yes,no,family member,1
2,3,1,0,0,0,0,0,1,1,0,1,36,4,m,middle eastern,yes,no,family member,1
3,4,1,1,1,1,1,1,1,1,1,1,24,10,m,Hispanic,no,no,family member,1
4,5,1,1,0,1,1,1,1,1,1,1,20,9,f,White European,no,yes,family member,1


In [11]:
yes = df[df['target']==1]
yes.describe()

Unnamed: 0,Case_No,A1,A2,A3,A4,A5,A6,A7,A8,A9,A10,Age_Mons,Qchat-10-Score,target
count,728.0,728.0,728.0,728.0,728.0,728.0,728.0,728.0,728.0,728.0,728.0,728.0,728.0,728.0
mean,543.781593,0.730769,0.603022,0.535714,0.681319,0.712912,0.76511,0.82967,0.601648,0.682692,0.645604,28.223901,6.788462,1.0
std,291.080202,0.443865,0.489608,0.499066,0.466286,0.452714,0.424222,0.376181,0.489895,0.465748,0.478659,7.491995,1.921937,0.0
min,2.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,12.0,4.0,1.0
25%,302.75,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,24.0,5.0,1.0
50%,544.5,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,30.0,7.0,1.0
75%,789.25,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,36.0,8.0,1.0
max,1054.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,36.0,10.0,1.0


In [12]:
yes = df[df['target']==1]
yes.describe()

Unnamed: 0,Case_No,A1,A2,A3,A4,A5,A6,A7,A8,A9,A10,Age_Mons,Qchat-10-Score,target
count,728.0,728.0,728.0,728.0,728.0,728.0,728.0,728.0,728.0,728.0,728.0,728.0,728.0,728.0
mean,543.781593,0.730769,0.603022,0.535714,0.681319,0.712912,0.76511,0.82967,0.601648,0.682692,0.645604,28.223901,6.788462,1.0
std,291.080202,0.443865,0.489608,0.499066,0.466286,0.452714,0.424222,0.376181,0.489895,0.465748,0.478659,7.491995,1.921937,0.0
min,2.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,12.0,4.0,1.0
25%,302.75,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,24.0,5.0,1.0
50%,544.5,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,30.0,7.0,1.0
75%,789.25,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,36.0,8.0,1.0
max,1054.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,36.0,10.0,1.0


In [13]:
no = df[df['target']==0]
no.describe()

Unnamed: 0,Case_No,A1,A2,A3,A4,A5,A6,A7,A8,A9,A10,Age_Mons,Qchat-10-Score,target
count,326.0,326.0,326.0,326.0,326.0,326.0,326.0,326.0,326.0,326.0,326.0,326.0,326.0,326.0
mean,491.141104,0.190184,0.104294,0.101227,0.134969,0.104294,0.156442,0.248466,0.141104,0.058282,0.453988,27.070552,1.693252,0.0
std,329.8223,0.39305,0.306112,0.302093,0.342216,0.306112,0.363832,0.432788,0.348664,0.234636,0.498644,8.936593,1.066014,0.0
min,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,12.0,0.0,0.0
25%,173.25,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,19.0,1.0,0.0
50%,472.5,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,30.0,2.0,0.0
75%,796.75,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,36.0,3.0,0.0
max,1053.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,36.0,3.0,0.0


In [14]:
"""
The minimum value of Qchat-10-Score variable for yes(1) (autistic) is 4.
The maximum value of Qchat-10-Score variable for no(0) (non-autistic) is 3.
"""
df1 = df[['target','Qchat-10-Score']]
df1.describe()

Unnamed: 0,target,Qchat-10-Score
count,1054.0,1054.0
mean,0.690702,5.212524
std,0.462424,2.907304
min,0.0,0.0
25%,0.0,3.0
50%,1.0,5.0
75%,1.0,8.0
max,1.0,10.0


In [15]:
"""
The minimum value of Qchat-10-Score variable for yes(1) (autistic) is 4.
The maximum value of Qchat-10-Score variable for no(0) (non-autistic) is 3.
"""
df1 = df[['target','Qchat-10-Score']]
df1.describe()

Unnamed: 0,target,Qchat-10-Score
count,1054.0,1054.0
mean,0.690702,5.212524
std,0.462424,2.907304
min,0.0,0.0
25%,0.0,3.0
50%,1.0,5.0
75%,1.0,8.0
max,1.0,10.0


In [16]:
df1.corr()# 81 % correlation

Unnamed: 0,target,Qchat-10-Score
target,1.0,0.810423
Qchat-10-Score,0.810423,1.0


In [17]:
df1_yes = df1[df1['Qchat-10-Score'] >3]
df1_no = df1[df1['Qchat-10-Score'] <=3]

In [18]:
df1_yes.describe()

Unnamed: 0,target,Qchat-10-Score
count,728.0,728.0
mean,1.0,6.788462
std,0.0,1.921937
min,1.0,4.0
25%,1.0,5.0
50%,1.0,7.0
75%,1.0,8.0
max,1.0,10.0


In [19]:
df1_no.describe()

Unnamed: 0,target,Qchat-10-Score
count,326.0,326.0
mean,0.0,1.693252
std,0.0,1.066014
min,0.0,0.0
25%,0.0,1.0
50%,0.0,2.0
75%,0.0,3.0
max,0.0,3.0


In [20]:
df1_yes.target.unique()

array([1])

In [21]:
df1_no.target.unique()

array([0])

#### The result shows that for:
    Qchat-10-Score >3 target variable is 1 i.e. patient has autism.
    Qchat-10-Score <=3 target variable is 0 i.e. patient does not have autism.
    
So by just using this information, i.e. with just the Qchat-10-Score variable 
and the cut-off value of 3, we can correctly predict whether a toddler has autism 
or not. This is a classic case of data leakage: the independent 
variable contains information about the target variable, so whenever we train
the model and the train/test split lets the model pick up
this information, we will almost always get a very high score. We may be
happy that our model is doing so well, but in real life a new dataset may
not contain this information in the Qchat-10-Score variable, and hence our model
may not perform well. 
To check this, let us:
    1. First, train the model with the Qchat-10-Score variable on the toddler 
    dataset and validate our model on the GitHub dataset. 
    2. Secondly, train the model without the Qchat-10-Score variable on the
    toddler dataset and validate our model on the GitHub dataset.
    3. Thirdly, train the model with the Qchat-10-Score variable on the combined 
    dataset and validate our model on the combined dataset. 
    4. Fourthly, train the model without the Qchat-10-Score variable on the combined 
    dataset and validate our model on the combined dataset. 
***

In [22]:

df1_yes.target.value_counts(), df1_no.target.value_counts()


(1    728
 Name: target, dtype: int64,
 0    326
 Name: target, dtype: int64)

In [25]:
def data_preprocess(df):
    """Align the toddler and GitHub datasets and save them as CSV files.

    Reads the GitHub dataset from ``../../data/data_csv.csv``, keeps only the
    rows with ``Age_Years <= 3``, renames and reorders its columns to match
    the toddler frame, and writes three files to ``../../data/pre_processed/``:

    * ``original_data.csv`` - the selected toddler columns,
    * ``github_data.csv``   - the filtered, renamed GitHub columns,
    * ``combined_data.csv`` - toddler rows followed by GitHub rows.

    Every output has an extra ``dataset`` column ('toddler' or 'github').

    Parameters
    ----------
    df : pandas.DataFrame
        Toddler dataset. Must contain the columns selected below, including
        ``target``. The frame is not modified.

    Returns
    -------
    None
    """
    out_dir = '../../data/pre_processed'

    # Get the new dataset from github
    dff = pd.read_csv('../../data/data_csv.csv')
    # Every value other than exactly 'F' (missing values included) becomes 'm'.
    dff['Sex'] = np.where(dff['Sex'] == 'F', 'f', 'm')

    github_cols = ['A1', 'A2', 'A3', 'A4', 'A5', 'A6', 'A7', 'A8', 'A9',
                   'A10_Autism_Spectrum_Quotient', 'Age_Years', 'Qchat_10_Score',
                   'Sex', 'Ethnicity', 'Jaundice', 'Family_mem_with_ASD',
                   'Who_completed_the_test', 'ASD_traits']
    final_cols = ['A1', 'A2', 'A3', 'A4', 'A5', 'A6', 'A7', 'A8', 'A9', 'A10',
                  'Age_Mons', 'Qchat_10_Score', 'Sex', 'Ethnicity', 'Jaundice',
                  'Family_mem_with_ASD', 'Who_completed_the_test', 'target']

    # Max age in the toddler dataset is 36 months i.e. 3 years, so keep only
    # GitHub records with 'Age_Years' <= 3. The explicit copy means the
    # assignments below act on an independent frame, not on a slice of `dff`.
    github = dff.loc[dff['Age_Years'] <= 3, github_cols].copy()
    github = github.rename(columns={'A10_Autism_Spectrum_Quotient': 'A10',
                                    'ASD_traits': 'target'})
    github['Age_Mons'] = github['Age_Years'] * 12
    github['target'] = np.where(github['target'] == 'Yes', 1, 0)
    github = github[final_cols].copy()  # also drops 'Age_Years'
    # NOTE(review): missing scores are stored as 0, which cannot be told apart
    # from a genuine score of 0 afterwards - confirm this is intended.
    github['Qchat_10_Score'] = github['Qchat_10_Score'].fillna(0).astype(np.int64)
    github['dataset'] = 'github'

    toddler = df[['A1', 'A2', 'A3', 'A4', 'A5', 'A6', 'A7', 'A8', 'A9', 'A10',
                  'Age_Mons', 'Qchat-10-Score', 'Sex', 'Ethnicity', 'Jaundice',
                  'Family_mem_with_ASD', 'Who completed the test', 'target']]
    toddler = toddler.rename(columns={'Who completed the test': 'Who_completed_the_test',
                                      'Qchat-10-Score': 'Qchat_10_Score'}).copy()
    toddler['dataset'] = 'toddler'

    # NOTE(review): the two sources appear to spell some category labels with
    # different casing (e.g. 'yes' vs 'Yes'); nothing here normalises them.
    # DataFrame.append was removed in pandas 2.0; pd.concat stacks the rows
    # the same way (toddler first, original index labels kept).
    combined = pd.concat([toddler, github])

    ### save the preprocessed files
    os.makedirs(out_dir, exist_ok=True)  # to_csv does not create missing folders
    toddler.to_csv(r'../../data/pre_processed/original_data.csv', index=False)
    github.to_csv(r'../../data/pre_processed/github_data.csv', index=False)
    combined.to_csv(r'../../data/pre_processed/combined_data.csv', index=False)


In [26]:
data_preprocess(df)

#### Read the files saved from before

In [27]:
aa = pd.read_csv(r'../../data/pre_processed/original_data.csv')
bb = pd.read_csv(r'../../data/pre_processed/github_data.csv')
cc = pd.read_csv(r'../../data/pre_processed/combined_data.csv')
aa.columns, bb.columns, cc.columns

(Index(['A1', 'A2', 'A3', 'A4', 'A5', 'A6', 'A7', 'A8', 'A9', 'A10', 'Age_Mons',
        'Qchat_10_Score', 'Sex', 'Ethnicity', 'Jaundice', 'Family_mem_with_ASD',
        'Who_completed_the_test', 'target', 'dataset'],
       dtype='object'),
 Index(['A1', 'A2', 'A3', 'A4', 'A5', 'A6', 'A7', 'A8', 'A9', 'A10', 'Age_Mons',
        'Qchat_10_Score', 'Sex', 'Ethnicity', 'Jaundice', 'Family_mem_with_ASD',
        'Who_completed_the_test', 'target', 'dataset'],
       dtype='object'),
 Index(['A1', 'A2', 'A3', 'A4', 'A5', 'A6', 'A7', 'A8', 'A9', 'A10', 'Age_Mons',
        'Qchat_10_Score', 'Sex', 'Ethnicity', 'Jaundice', 'Family_mem_with_ASD',
        'Who_completed_the_test', 'target', 'dataset'],
       dtype='object'))

#### Create dummy variable for categorical variables

In [29]:
def get_dummy(df):
    """Return a one-hot encoded copy of ``df``.

    The categorical columns ``Sex``, ``Ethnicity``, ``Jaundice``,
    ``Family_mem_with_ASD`` and ``Who_completed_the_test`` are replaced by
    dummy columns named ``<column>_<value>``. The first (sorted) category of
    each column is dropped. All other columns are kept, in their original
    order, ahead of the dummy columns.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing the five categorical columns. It is not modified, so
        the function can be called repeatedly on the same frame.

    Returns
    -------
    pandas.DataFrame
        New frame with the categorical columns replaced by dummy columns.
    """
    categorical_cols = ["Sex", "Ethnicity", "Jaundice",
                        "Family_mem_with_ASD", "Who_completed_the_test"]
    # `columns=` encodes exactly these columns and appends the dummies after
    # the untouched ones, without mutating the caller's frame.
    return pd.get_dummies(df, columns=categorical_cols, drop_first=True)

def save_files():
    """One-hot encode the module-level frames ``aa``, ``bb`` and ``cc`` and save them.

    Each frame is passed through ``get_dummy`` and written, without the index,
    to its ``*_one_hot_encoded.csv`` file under ``../../data/pre_processed/``.
    """
    jobs = (
        (aa, r'../../data/pre_processed/original_data_one_hot_encoded.csv'),
        (bb, r'../../data/pre_processed/github_data_one_hot_encoded.csv'),
        (cc, r'../../data/pre_processed/combined_data_one_hot_encoded.csv'),
    )
    # Encode all three frames before writing anything, as the original did.
    encoded_frames = [get_dummy(frame) for frame, _ in jobs]
    for encoded, (_, out_path) in zip(encoded_frames, jobs):
        encoded.to_csv(out_path, index=False)

In [30]:
save_files()

# END OF EXPLORATORY DATA ANALYSIS  

# -------------------------------------------------------------------------------------

# Train the model to see the effect of Qchat-10-Score variable

#### Read the saved one-hot encoded files

In [31]:
aa = pd.read_csv(r'../../data/pre_processed/original_data_one_hot_encoded.csv')
bb = pd.read_csv(r'../../data/pre_processed/github_data_one_hot_encoded.csv')
cc = pd.read_csv(r'../../data/pre_processed/combined_data_one_hot_encoded.csv')

In [32]:
aa.columns, bb.columns, cc.columns

(Index(['A1', 'A2', 'A3', 'A4', 'A5', 'A6', 'A7', 'A8', 'A9', 'A10', 'Age_Mons',
        'Qchat_10_Score', 'target', 'dataset', 'Sex_m', 'Ethnicity_Latino',
        'Ethnicity_Native Indian', 'Ethnicity_Others', 'Ethnicity_Pacifica',
        'Ethnicity_White European', 'Ethnicity_asian', 'Ethnicity_black',
        'Ethnicity_middle eastern', 'Ethnicity_mixed', 'Ethnicity_south asian',
        'Jaundice_yes', 'Family_mem_with_ASD_yes',
        'Who_completed_the_test_Health care professional',
        'Who_completed_the_test_Others', 'Who_completed_the_test_Self',
        'Who_completed_the_test_family member'],
       dtype='object'),
 Index(['A1', 'A2', 'A3', 'A4', 'A5', 'A6', 'A7', 'A8', 'A9', 'A10', 'Age_Mons',
        'Qchat_10_Score', 'target', 'dataset', 'Sex_m', 'Ethnicity_Black',
        'Ethnicity_Hispanic', 'Ethnicity_Latino', 'Ethnicity_Middle Eastern',
        'Ethnicity_Mixed', 'Ethnicity_Native Indian', 'Ethnicity_Others',
        'Ethnicity_South Asian', 'Ethnicity_White

In [33]:
aa.shape, bb.shape, cc.shape # (147, 32)

((1054, 31), (147, 32), (1201, 42))

In [34]:
## define functions to train the models
def train_model1(df1, test_size, random_state=None):
    """Train on a random split of ``df1`` and print scores for the held-out part.

    A ``StandardScaler`` + ``RandomForestClassifier`` pipeline is fitted on the
    training part of a random train/test split, and the F1 and accuracy scores
    on the test part are printed.

    Parameters
    ----------
    df1 : pandas.DataFrame
        Feature columns plus a ``target`` column. A ``dataset`` column, if
        present, is ignored. The frame is not modified, so it can be reused
        across calls.
    test_size : float or int
        Forwarded to ``train_test_split`` as ``test_size``.
    random_state : int or None, optional
        Seed forwarded to both the split and the forest. The default ``None``
        keeps the previous unseeded behaviour; pass an integer for
        reproducible scores.

    Returns
    -------
    None
        Scores are printed, not returned.
    """
    from sklearn import metrics

    # Drop the bookkeeping column on a new frame instead of in place, so the
    # caller's DataFrame is untouched.
    data = df1.drop(columns='dataset', errors='ignore')
    # Putting feature variables to X
    X = data.drop(columns='target')
    # Putting response variable to y (1-D, as scikit-learn classifiers expect)
    y = data['target']

    # Splitting the data into train and test with the requested test size
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=test_size, random_state=random_state
    )

    # Pipeline estimator
    pipeline = make_pipeline(
        StandardScaler(),
        RandomForestClassifier(n_jobs=-1, verbose=0, random_state=random_state),
    )
    # fit model on training data
    pipeline.fit(X_train, y_train)

    # Predict the target of the test data
    y_pred = pipeline.predict(X_test)
    # testing score
    score = metrics.f1_score(y_test, y_pred, labels=None, pos_label=1)

    print("F1 score for test data is : ",score)
    print("Accuracy score for test data is : ",metrics.accuracy_score(y_test, y_pred))
    print('train_test_split_ratio is : ', test_size)


def train_model2(df1, df2, random_state=None):
    """Train on all of ``df1``, evaluate on all of ``df2``, and print the scores.

    A ``StandardScaler`` + ``RandomForestClassifier`` pipeline is fitted on
    ``df1``, and the F1 and accuracy scores of its predictions on ``df2`` are
    printed.

    Parameters
    ----------
    df1 : pandas.DataFrame
        Training data: feature columns plus a ``target`` column.
    df2 : pandas.DataFrame
        Test data with a ``target`` column. It should share the feature
        columns of ``df1``, since the fitted pipeline is applied to it as is.
    random_state : int or None, optional
        Seed forwarded to the forest. The default ``None`` keeps the previous
        unseeded behaviour; pass an integer for reproducible scores.

    Returns
    -------
    None
        Scores are printed, not returned.
    """
    from sklearn import metrics

    # Features / response for training (response kept 1-D, as scikit-learn
    # classifiers expect)
    X_train = df1.drop(columns='target')
    y_train = df1['target']

    # Features / response for testing
    X_test = df2.drop(columns='target')
    y_test = df2['target']

    pipeline = make_pipeline(
        StandardScaler(),
        RandomForestClassifier(n_jobs=-1, verbose=0, random_state=random_state),
    )
    # fit model on training data
    pipeline.fit(X_train, y_train)

    # Predict the target of the test data
    y_pred = pipeline.predict(X_test)
    # testing score
    score = metrics.f1_score(y_test, y_pred, labels=None, pos_label=1)

    print("F1 score for test data is : ",score)
    print("Accuracy score for test data is : ",metrics.accuracy_score(y_test, y_pred))




In [35]:
######################################################################################################
# 1. First train the model with the Qchat-10-Score variable on the toddler 
#    dataset and validate our model on the test data of the toddler data set. 
###########################################################################################################################
df1 = pd.read_csv(r'../../data/pre_processed/original_data_one_hot_encoded.csv')
train_model1(df1, test_size=0.25)

#Result
"""
F1 score for test data is :  1.0
Accuracy score for test data is :  1.0
"""

F1 score for test data is :  1.0
Accuracy score for test data is :  1.0
train_test_split_ratio is :  0.25


'\nF1 score for test data is :  1.0\nAccuracy score for test data is :  1.0\n'

In [37]:
###########################################################################################################################
# 2. Secondly train the model with the Qchat-10-Score variable on the toddler 
#    dataset and validate our model on the git hub data set. 
###########################################################################################################################

df = pd.read_csv(r'../../data/pre_processed/combined_data_one_hot_encoded.csv')
# Split by source, then drop the bookkeeping column with a non-mutating call.
# An in-place drop on a boolean-filtered slice is chained assignment and
# raises SettingWithCopyWarning.
df1 = df[df['dataset'] == 'toddler'].drop(columns='dataset')
df2 = df[df['dataset'] == 'github'].drop(columns='dataset')

train_model2(df1, df2)
#Result
"""
F1 score for test data is :  0.8488372093023255
Accuracy score for test data is :  0.8231292517006803
"""

F1 score for test data is :  0.8488372093023255
Accuracy score for test data is :  0.8231292517006803


'\nF1 score for test data is :  0.8488372093023255\nAccuracy score for test data is :  0.8231292517006803\n'

In [38]:
###########################################################################################################################
# 3. Thirdly train the model without the Qchat-10-Score variable on the toddler 
#    dataset and validate our model on the git hub data set. 
###########################################################################################################################

df = pd.read_csv(r'../../data/pre_processed/combined_data_one_hot_encoded.csv')
# Remove the leaking score before splitting so neither part contains it.
df = df.drop(columns='Qchat_10_Score')
# Split by source, then drop the bookkeeping column with a non-mutating call.
# An in-place drop on a boolean-filtered slice is chained assignment and
# raises SettingWithCopyWarning.
df1 = df[df['dataset'] == 'toddler'].drop(columns='dataset')
df2 = df[df['dataset'] == 'github'].drop(columns='dataset')

train_model2(df1, df2)
# Result
"""
F1 score for test data is :  0.9466666666666668
Accuracy score for test data is :  0.9455782312925171
"""

F1 score for test data is :  0.9403973509933775
Accuracy score for test data is :  0.9387755102040817


'\nF1 score for test data is :  0.9466666666666668\nAccuracy score for test data is :  0.9455782312925171\n'

In [39]:

###########################################################################################################################
# 4. Fourthly train the model with the Qchat-10-Score variable on the combined 
#    dataset and validate our model on the combined data set. 
###########################################################################################################################

# Read the file once; every iteration works on its own copy, so the sweep does
# not depend on whether train_model1 modifies the frame it receives.
combined_encoded = pd.read_csv(r'../../data/pre_processed/combined_data_one_hot_encoded.csv')
for i in range(10,50):
    df1 = combined_encoded.copy()
    # i / 100 gives the intended ratio without the float noise of i*0.01
    train_model1(df1, test_size=i / 100)
    print(" Value of i is :", i)



F1 score for test data is :  1.0
Accuracy score for test data is :  1.0
train_test_split_ratio is :  0.1
 Value of i is : 10
F1 score for test data is :  1.0
Accuracy score for test data is :  1.0
train_test_split_ratio is :  0.11
 Value of i is : 11
F1 score for test data is :  1.0
Accuracy score for test data is :  1.0
train_test_split_ratio is :  0.12
 Value of i is : 12
F1 score for test data is :  0.9949238578680203
Accuracy score for test data is :  0.9936305732484076
train_test_split_ratio is :  0.13
 Value of i is : 13
F1 score for test data is :  0.995433789954338
Accuracy score for test data is :  0.9940828402366864
train_test_split_ratio is :  0.14
 Value of i is : 14
F1 score for test data is :  0.9960159362549801
Accuracy score for test data is :  0.994475138121547
train_test_split_ratio is :  0.15
 Value of i is : 15
F1 score for test data is :  0.9925925925925926
Accuracy score for test data is :  0.9896373056994818
train_test_split_ratio is :  0.16
 Value of i is : 16
F

In [40]:
###########################################################################################################################
# 5. Fifth train the model without the Qchat-10-Score variable on the combined
#    dataset and validate our model on the combined data set.
###########################################################################################################################

# Read the file and drop the leaking score once; every iteration works on its
# own copy, so the sweep does not depend on whether train_model1 modifies the
# frame it receives.
combined_without_score = pd.read_csv(
    r'../../data/pre_processed/combined_data_one_hot_encoded.csv'
).drop(columns='Qchat_10_Score')
for i in range(10,50):
    df1 = combined_without_score.copy()
    # i / 100 gives the intended ratio without the float noise of i*0.01
    train_model1(df1, test_size=i / 100)
    print(" Value of i is :", i)

###########################################################################################################33


F1 score for test data is :  0.9689440993788819
Accuracy score for test data is :  0.9586776859504132
train_test_split_ratio is :  0.1
 Value of i is : 10
F1 score for test data is :  0.96045197740113
Accuracy score for test data is :  0.9473684210526315
train_test_split_ratio is :  0.11
 Value of i is : 11
F1 score for test data is :  0.9852216748768472
Accuracy score for test data is :  0.9793103448275862
train_test_split_ratio is :  0.12
 Value of i is : 12
F1 score for test data is :  0.9636363636363636
Accuracy score for test data is :  0.9490445859872612
train_test_split_ratio is :  0.13
 Value of i is : 13
F1 score for test data is :  0.9865470852017937
Accuracy score for test data is :  0.9822485207100592
train_test_split_ratio is :  0.14
 Value of i is : 14
F1 score for test data is :  0.962962962962963
Accuracy score for test data is :  0.9502762430939227
train_test_split_ratio is :  0.15
 Value of i is : 15
F1 score for test data is :  0.9617021276595744
Accuracy score for t

In [41]:
cc =  pd.read_csv(r'../../data/pre_processed/combined_data_one_hot_encoded.csv')

In [42]:
cc.columns

Index(['A1', 'A2', 'A3', 'A4', 'A5', 'A6', 'A7', 'A8', 'A9', 'A10', 'Age_Mons',
       'Qchat_10_Score', 'target', 'dataset', 'Sex_m', 'Ethnicity_Black',
       'Ethnicity_Hispanic', 'Ethnicity_Latino', 'Ethnicity_Middle Eastern',
       'Ethnicity_Mixed', 'Ethnicity_Native Indian', 'Ethnicity_Others',
       'Ethnicity_Pacifica', 'Ethnicity_South Asian',
       'Ethnicity_White European', 'Ethnicity_asian', 'Ethnicity_black',
       'Ethnicity_middle eastern', 'Ethnicity_mixed', 'Ethnicity_south asian',
       'Jaundice_Yes', 'Jaundice_no', 'Jaundice_yes',
       'Family_mem_with_ASD_Yes', 'Family_mem_with_ASD_no',
       'Family_mem_with_ASD_yes', 'Who_completed_the_test_Family member',
       'Who_completed_the_test_Health Care Professional',
       'Who_completed_the_test_Health care professional',
       'Who_completed_the_test_Others', 'Who_completed_the_test_Self',
       'Who_completed_the_test_family member'],
      dtype='object')

In [43]:
df = cc[['Qchat_10_Score','target']]
df_1 = df[df['target']==1]
df_0 = df[df['target']==0]

In [44]:
df_1.describe(), df_0.describe()

(       Qchat_10_Score  target
 count      807.000000   807.0
 mean         6.785626     1.0
 std          1.987527     0.0
 min          0.000000     1.0
 25%          5.000000     1.0
 50%          7.000000     1.0
 75%          8.000000     1.0
 max         10.000000     1.0,
        Qchat_10_Score  target
 count      394.000000   394.0
 mean         2.101523     0.0
 std          1.906157     0.0
 min          0.000000     0.0
 25%          1.000000     0.0
 50%          2.000000     0.0
 75%          3.000000     0.0
 max         10.000000     0.0)

In [45]:
cc.target.value_counts()

1    807
0    394
Name: target, dtype: int64

In [46]:
df_1['Qchat_10_Score'].value_counts(), df_0['Qchat_10_Score'].value_counts()

(7     150
 5     128
 4     115
 8     109
 6     108
 9     106
 10     85
 0       4
 1       2
 Name: Qchat_10_Score, dtype: int64,
 3     117
 1     103
 2      99
 0      55
 10     12
 6       3
 9       3
 4       1
 7       1
 Name: Qchat_10_Score, dtype: int64)

#  Further actions to investigate.

#### 1. Remove outliers from 'Qchat_10_Score' variable for both yes(1) and no (0).
#### 2. Read the criteria for the scoring 'Qchat_10_Score' variable in both the datasets and see if we can do any further data cleaning.
#### 3. Include the data up to maybe 5 years (60 months) instead of the present 3 years (36 months), as one participant pointed out that it's important to observe the child up to 5 years. Need to verify this.

# ----------------------------------------------  Thank You!   -------------------------------------------