In [167]:
import pandas as pd
import os
from collections import Counter
import matplotlib.pyplot as plt

def load_transactions():
    """
    Load transactions from the CSV file
    """
    current_dir = os.getcwd()
    filepath = os.path.join(current_dir, 'database', 'Autism_Data.csv')
    
    try:
        df = pd.read_csv(filepath)
        return df
    except FileNotFoundError:
        print(f"Error: database.csv not found in {current_dir}")
        return None

In [168]:

print("Loading database...")
df = load_transactions()
# Get columns that don't contain just 0s and 1s
non_binary_cols = []
for col in df.columns:
    unique_vals = df[col].unique()
    if not all(val in [0, 1] for val in unique_vals if pd.notna(val)):
        non_binary_cols.append(col)

print("Columns that are not binary (0/1):")
print(non_binary_cols)


Loading database...
Columns that are not binary (0/1):
['age', 'gender', 'ethnicity', 'jundice', 'austim', 'contry_of_res', 'used_app_before', 'result', 'age_desc', 'relation', 'Class']


In [169]:
# Count frequency of each age value
unique_ages = df["age"].unique()
unique_ages.sort()

print("Unique ages:")
for age in unique_ages:
    print(f"Age {age}")


Unique ages:
Age 17
Age 18
Age 19
Age 20
Age 21
Age 22
Age 23
Age 24
Age 25
Age 26
Age 27
Age 28
Age 29
Age 30
Age 31
Age 32
Age 33
Age 34
Age 35
Age 36
Age 37
Age 38
Age 383
Age 39
Age 40
Age 41
Age 42
Age 43
Age 44
Age 45
Age 46
Age 47
Age 48
Age 49
Age 50
Age 51
Age 52
Age 53
Age 54
Age 55
Age 56
Age 58
Age 59
Age 60
Age 61
Age 64
Age ?


In [170]:
# Drop rows where age is '?' or 383
df = df[~df['age'].isin(['?', '383'])]
# Count frequency of each age value
unique_ages = df["age"].unique()
unique_ages.sort()

print("Unique ages:")
for age in unique_ages:
    print(f"Age {age}")


Unique ages:
Age 17
Age 18
Age 19
Age 20
Age 21
Age 22
Age 23
Age 24
Age 25
Age 26
Age 27
Age 28
Age 29
Age 30
Age 31
Age 32
Age 33
Age 34
Age 35
Age 36
Age 37
Age 38
Age 39
Age 40
Age 41
Age 42
Age 43
Age 44
Age 45
Age 46
Age 47
Age 48
Age 49
Age 50
Age 51
Age 52
Age 53
Age 54
Age 55
Age 56
Age 58
Age 59
Age 60
Age 61
Age 64


In [171]:
# Convert age column to numeric type
df['age'] = pd.to_numeric(df['age'])

# Count frequency of each age value
unique_ages = df["age"].unique()
unique_ages.sort()

print("Unique ages:")
for age in unique_ages:
    print(f"Age {age}")


Unique ages:
Age 17
Age 18
Age 19
Age 20
Age 21
Age 22
Age 23
Age 24
Age 25
Age 26
Age 27
Age 28
Age 29
Age 30
Age 31
Age 32
Age 33
Age 34
Age 35
Age 36
Age 37
Age 38
Age 39
Age 40
Age 41
Age 42
Age 43
Age 44
Age 45
Age 46
Age 47
Age 48
Age 49
Age 50
Age 51
Age 52
Age 53
Age 54
Age 55
Age 56
Age 58
Age 59
Age 60
Age 61
Age 64


In [172]:
df['gender'] = df['gender'].apply(lambda x: 1 if x.upper() == 'M' else 0)


In [173]:
df.head()

Unnamed: 0,A1_Score,A2_Score,A3_Score,A4_Score,A5_Score,A6_Score,A7_Score,A8_Score,A9_Score,A10_Score,...,gender,ethnicity,jundice,austim,contry_of_res,used_app_before,result,age_desc,relation,Class
0,1,1,1,1,0,0,1,1,0,0,...,0,White-European,no,no,'United States',no,6,'18 and more',Self,NO
1,1,1,0,1,0,0,0,1,0,1,...,1,Latino,no,yes,Brazil,no,5,'18 and more',Self,NO
2,1,1,0,1,1,0,1,1,1,1,...,1,Latino,yes,yes,Spain,no,8,'18 and more',Parent,YES
3,1,1,0,1,0,0,1,1,0,1,...,0,White-European,no,yes,'United States',no,6,'18 and more',Self,NO
4,1,0,0,0,0,0,0,1,0,0,...,0,?,no,no,Egypt,no,2,'18 and more',?,NO


In [174]:
# Get unique values in ethnicity column
unique_ethnicities = df['ethnicity'].unique()

unique_ethnicities

array(['White-European', 'Latino', '?', 'Others', 'Black', 'Asian',
       "'Middle Eastern '", 'Pasifika', "'South Asian'", 'Hispanic',
       'Turkish', 'others'], dtype=object)

In [175]:
# Replace 'Middle Eastern ' with 'Middle Eastern' (removing extra space and quote)
df['ethnicity'] = df['ethnicity'].replace("'Middle Eastern '", 'Middle Eastern')
df['ethnicity'] = df['ethnicity'].replace("'South Asian'", 'South Asian')

# Verify the change
print("\nUnique ethnicities after cleaning Middle Eastern values:")
print(df['ethnicity'].unique())



Unique ethnicities after cleaning Middle Eastern values:
['White-European' 'Latino' '?' 'Others' 'Black' 'Asian' 'Middle Eastern'
 'Pasifika' 'South Asian' 'Hispanic' 'Turkish' 'others']


In [176]:
# Get unique values in ethnicity column
unique_ethnicities = df['ethnicity'].unique()

unique_ethnicities

array(['White-European', 'Latino', '?', 'Others', 'Black', 'Asian',
       'Middle Eastern', 'Pasifika', 'South Asian', 'Hispanic', 'Turkish',
       'others'], dtype=object)

In [177]:
# Drop rows where ethnicity is '?'
df = df[df['ethnicity'] != '?']


In [178]:
df['ethnicity'].value_counts()


ethnicity
White-European    233
Asian             123
Middle Eastern     92
Black              43
South Asian        36
Others             30
Latino             20
Hispanic           13
Pasifika           11
Turkish             6
others              1
Name: count, dtype: int64

In [179]:
df=df[df['ethnicity'] != 'others']


In [180]:
df['ethnicity'] = df['ethnicity'].str.strip().str.lower()


In [181]:
df['ethnicity'].value_counts()


ethnicity
white-european    233
asian             123
middle eastern     92
black              43
south asian        36
others             30
latino             20
hispanic           13
pasifika           11
turkish             6
Name: count, dtype: int64

In [182]:
# Count occurrences
ethnicity_counts = df['ethnicity'].value_counts()

# Keep ethnicities with at least 50 samples (example threshold)
main_ethnicities = ethnicity_counts[ethnicity_counts >= 40].index

# Group all others as 'other'
df['ethnicity_grouped'] = df['ethnicity'].apply(lambda x: x if x in main_ethnicities else 'other')


In [183]:
df['ethnicity_grouped'].value_counts()


ethnicity_grouped
white-european    233
asian             123
other             116
middle eastern     92
black              43
Name: count, dtype: int64

In [184]:
df.head()

Unnamed: 0,A1_Score,A2_Score,A3_Score,A4_Score,A5_Score,A6_Score,A7_Score,A8_Score,A9_Score,A10_Score,...,ethnicity,jundice,austim,contry_of_res,used_app_before,result,age_desc,relation,Class,ethnicity_grouped
0,1,1,1,1,0,0,1,1,0,0,...,white-european,no,no,'United States',no,6,'18 and more',Self,NO,white-european
1,1,1,0,1,0,0,0,1,0,1,...,latino,no,yes,Brazil,no,5,'18 and more',Self,NO,other
2,1,1,0,1,1,0,1,1,1,1,...,latino,yes,yes,Spain,no,8,'18 and more',Parent,YES,other
3,1,1,0,1,0,0,1,1,0,1,...,white-european,no,yes,'United States',no,6,'18 and more',Self,NO,white-european
5,1,1,1,1,1,0,1,1,1,1,...,others,yes,no,'United States',no,9,'18 and more',Self,YES,other


In [185]:
df.drop(columns=['ethnicity'], inplace=True)


In [186]:
df.head()


Unnamed: 0,A1_Score,A2_Score,A3_Score,A4_Score,A5_Score,A6_Score,A7_Score,A8_Score,A9_Score,A10_Score,...,gender,jundice,austim,contry_of_res,used_app_before,result,age_desc,relation,Class,ethnicity_grouped
0,1,1,1,1,0,0,1,1,0,0,...,0,no,no,'United States',no,6,'18 and more',Self,NO,white-european
1,1,1,0,1,0,0,0,1,0,1,...,1,no,yes,Brazil,no,5,'18 and more',Self,NO,other
2,1,1,0,1,1,0,1,1,1,1,...,1,yes,yes,Spain,no,8,'18 and more',Parent,YES,other
3,1,1,0,1,0,0,1,1,0,1,...,0,no,yes,'United States',no,6,'18 and more',Self,NO,white-european
5,1,1,1,1,1,0,1,1,1,1,...,1,yes,no,'United States',no,9,'18 and more',Self,YES,other


In [188]:
# Convert to dummies with 0/1 values, then replace with True/False
df = pd.get_dummies(df, columns=['ethnicity_grouped'], prefix='ethnicity', drop_first=True)
df = df.replace({True: 1, False: 0})
df.head()


  df = df.replace({True: 1, False: 0})


Unnamed: 0,A1_Score,A2_Score,A3_Score,A4_Score,A5_Score,A6_Score,A7_Score,A8_Score,A9_Score,A10_Score,...,contry_of_res,used_app_before,result,age_desc,relation,Class,ethnicity_black,ethnicity_middle eastern,ethnicity_other,ethnicity_white-european
0,1,1,1,1,0,0,1,1,0,0,...,'United States',no,6,'18 and more',Self,NO,0,0,0,1
1,1,1,0,1,0,0,0,1,0,1,...,Brazil,no,5,'18 and more',Self,NO,0,0,1,0
2,1,1,0,1,1,0,1,1,1,1,...,Spain,no,8,'18 and more',Parent,YES,0,0,1,0
3,1,1,0,1,0,0,1,1,0,1,...,'United States',no,6,'18 and more',Self,NO,0,0,0,1
5,1,1,1,1,1,0,1,1,1,1,...,'United States',no,9,'18 and more',Self,YES,0,0,1,0


In [189]:
df.drop(columns=['age_desc'], inplace=True)


In [193]:
# Display unique values in country_of_res column
print("Unique countries in the dataset:")
print(df['contry_of_res'].unique())

# Show value counts to see distribution
print("\nDistribution of countries:")
print(df['contry_of_res'].value_counts())
# Get value counts
country_counts = df['contry_of_res'].value_counts()

# Drop countries with less than 66 occurrences
df = df[df['contry_of_res'].isin(country_counts[country_counts > 66].index)]

print("\nAfter dropping less frequent countries:")
print(df['contry_of_res'].value_counts())


Unique countries in the dataset:
["'United States'" 'other' "'New Zealand'" "'United Arab Emirates'"
 "'United Kingdom'" 'India']

Distribution of countries:
contry_of_res
other                     197
'United States'           111
India                      81
'United Kingdom'           76
'New Zealand'              75
'United Arab Emirates'     67
Name: count, dtype: int64

After dropping less frequent countries:
contry_of_res
other                     197
'United States'           111
India                      81
'United Kingdom'           76
'New Zealand'              75
'United Arab Emirates'     67
Name: count, dtype: int64


In [195]:
df['contry_of_res'].value_counts()
# Drop rows where contry_of_res is 'other'
df = df[df['contry_of_res'] != 'other']
df['contry_of_res'].value_counts()



contry_of_res
'United States'           111
India                      81
'United Kingdom'           76
'New Zealand'              75
'United Arab Emirates'     67
Name: count, dtype: int64

In [196]:
df['contry_of_res'].value_counts()


contry_of_res
'United States'           111
India                      81
'United Kingdom'           76
'New Zealand'              75
'United Arab Emirates'     67
Name: count, dtype: int64

In [197]:
# Replace specific country values with proper formatting
df['contry_of_res'] = df['contry_of_res'].replace({
    "'United States'": "United States",
    "'United Kingdom'": "United Kingdom", 
    "'New Zealand'": "New Zealand",
    "'United Arab Emirates'": "United Arab Emirates"
})

# Verify the changes
print("Updated country values:")
print(df['contry_of_res'].value_counts())


Updated country values:
contry_of_res
United States           111
India                    81
United Kingdom           76
New Zealand              75
United Arab Emirates     67
Name: count, dtype: int64


In [198]:
df['contry_of_res'].value_counts()


contry_of_res
United States           111
India                    81
United Kingdom           76
New Zealand              75
United Arab Emirates     67
Name: count, dtype: int64

In [199]:
df.head(2)


Unnamed: 0,A1_Score,A2_Score,A3_Score,A4_Score,A5_Score,A6_Score,A7_Score,A8_Score,A9_Score,A10_Score,...,austim,contry_of_res,used_app_before,result,relation,Class,ethnicity_black,ethnicity_middle eastern,ethnicity_other,ethnicity_white-european
0,1,1,1,1,0,0,1,1,0,0,...,no,United States,no,6,Self,NO,0,0,0,1
3,1,1,0,1,0,0,1,1,0,1,...,yes,United States,no,6,Self,NO,0,0,0,1


In [200]:
df['austim'] = df['austim'].replace({'no': 0, 'yes':1})

df['austim'].value_counts()

df.head()



  df['austim'] = df['austim'].replace({'no': 0, 'yes':1})


Unnamed: 0,A1_Score,A2_Score,A3_Score,A4_Score,A5_Score,A6_Score,A7_Score,A8_Score,A9_Score,A10_Score,...,austim,contry_of_res,used_app_before,result,relation,Class,ethnicity_black,ethnicity_middle eastern,ethnicity_other,ethnicity_white-european
0,1,1,1,1,0,0,1,1,0,0,...,0,United States,no,6,Self,NO,0,0,0,1
3,1,1,0,1,0,0,1,1,0,1,...,1,United States,no,6,Self,NO,0,0,0,1
5,1,1,1,1,1,0,1,1,1,1,...,0,United States,no,9,Self,YES,0,0,1,0
6,0,1,0,0,0,0,0,1,0,0,...,0,United States,no,2,Self,NO,1,0,0,0
7,1,1,1,1,0,0,0,0,1,0,...,0,New Zealand,no,5,Parent,NO,0,0,0,1


In [201]:
df['used_app_before'] = df['used_app_before'].replace({'no': 0, 'yes':1})

df['used_app_before'].value_counts()

df.head()

  df['used_app_before'] = df['used_app_before'].replace({'no': 0, 'yes':1})


Unnamed: 0,A1_Score,A2_Score,A3_Score,A4_Score,A5_Score,A6_Score,A7_Score,A8_Score,A9_Score,A10_Score,...,austim,contry_of_res,used_app_before,result,relation,Class,ethnicity_black,ethnicity_middle eastern,ethnicity_other,ethnicity_white-european
0,1,1,1,1,0,0,1,1,0,0,...,0,United States,0,6,Self,NO,0,0,0,1
3,1,1,0,1,0,0,1,1,0,1,...,1,United States,0,6,Self,NO,0,0,0,1
5,1,1,1,1,1,0,1,1,1,1,...,0,United States,0,9,Self,YES,0,0,1,0
6,0,1,0,0,0,0,0,1,0,0,...,0,United States,0,2,Self,NO,1,0,0,0
7,1,1,1,1,0,0,0,0,1,0,...,0,New Zealand,0,5,Parent,NO,0,0,0,1


In [202]:
df.drop(columns=['result'], inplace=True)


In [208]:
df['jundice'].value_counts()

jundice
0    370
1     40
Name: count, dtype: int64

In [207]:
df['jundice'] = df['jundice'].replace({'no': 0, 'yes':1})


  df['jundice'] = df['jundice'].replace({'no': 0, 'yes':1})


In [210]:
df.columns


Index(['A1_Score', 'A2_Score', 'A3_Score', 'A4_Score', 'A5_Score', 'A6_Score',
       'A7_Score', 'A8_Score', 'A9_Score', 'A10_Score', 'age', 'gender',
       'jundice', 'austim', 'contry_of_res', 'used_app_before', 'relation',
       'Class', 'ethnicity_black', 'ethnicity_middle eastern',
       'ethnicity_other', 'ethnicity_white-european'],
      dtype='object')

In [215]:
df['contry_of_res'].value_counts()



contry_of_res
United States           111
India                    81
United Kingdom           76
New Zealand              75
United Arab Emirates     67
Name: count, dtype: int64

In [216]:
top_n = 3
top_countries = df['contry_of_res'].value_counts().nlargest(top_n).index

In [217]:
df['country_grouped'] = df['contry_of_res'].apply(lambda x: x if x in top_countries else 'Other')


In [218]:
dummies = pd.get_dummies(df['country_grouped'], prefix='country')


In [237]:
df = pd.concat([df, dummies], axis=1)


In [238]:
df.sample(10)


Unnamed: 0,A1_Score,A2_Score,A3_Score,A4_Score,A5_Score,A6_Score,A7_Score,A8_Score,A9_Score,A10_Score,...,Class,ethnicity_black,ethnicity_middle eastern,ethnicity_other,ethnicity_white-european,country_grouped,country_India,country_Other,country_United Kingdom,country_United States
394,1,0,0,0,1,1,0,1,0,1,...,0,1,0,0,0,Other,False,True,False,False
633,1,1,0,0,1,0,0,1,1,1,...,0,0,1,0,0,India,True,False,False,False
473,1,0,0,0,0,0,0,1,0,1,...,0,0,0,0,0,India,True,False,False,False
454,1,1,0,0,0,0,1,0,0,0,...,0,1,0,0,0,Other,False,True,False,False
31,1,0,0,1,1,1,1,1,0,1,...,1,0,0,0,1,United States,False,False,False,True
3,1,1,0,1,0,0,1,1,0,1,...,0,0,0,0,1,United States,False,False,False,True
240,1,0,0,1,1,0,1,1,0,0,...,0,0,0,0,0,Other,False,True,False,False
635,0,1,1,0,0,0,1,0,0,1,...,0,0,0,0,1,United Kingdom,False,False,True,False
125,1,1,0,0,1,1,1,1,0,1,...,1,0,0,0,1,Other,False,True,False,False
516,1,1,0,1,0,0,1,0,0,0,...,0,0,0,0,1,United States,False,False,False,True


In [239]:
df.replace({'NO': 0, 'YES': 1, 'no': 0, 'yes': 1}, inplace=True)


In [240]:
df.head()

Unnamed: 0,A1_Score,A2_Score,A3_Score,A4_Score,A5_Score,A6_Score,A7_Score,A8_Score,A9_Score,A10_Score,...,Class,ethnicity_black,ethnicity_middle eastern,ethnicity_other,ethnicity_white-european,country_grouped,country_India,country_Other,country_United Kingdom,country_United States
0,1,1,1,1,0,0,1,1,0,0,...,0,0,0,0,1,United States,False,False,False,True
3,1,1,0,1,0,0,1,1,0,1,...,0,0,0,0,1,United States,False,False,False,True
5,1,1,1,1,1,0,1,1,1,1,...,1,0,0,1,0,United States,False,False,False,True
6,0,1,0,0,0,0,0,1,0,0,...,0,1,0,0,0,United States,False,False,False,True
7,1,1,1,1,0,0,0,0,1,0,...,0,0,0,0,1,Other,False,True,False,False


In [244]:
df = df.replace({True: 1, False: 0})

  df = df.replace({True: 1, False: 0})


In [245]:
df

Unnamed: 0,A1_Score,A2_Score,A3_Score,A4_Score,A5_Score,A6_Score,A7_Score,A8_Score,A9_Score,A10_Score,...,Class,ethnicity_black,ethnicity_middle eastern,ethnicity_other,ethnicity_white-european,country_grouped,country_India,country_Other,country_United Kingdom,country_United States
0,1,1,1,1,0,0,1,1,0,0,...,0,0,0,0,1,United States,0,0,0,1
3,1,1,0,1,0,0,1,1,0,1,...,0,0,0,0,1,United States,0,0,0,1
5,1,1,1,1,1,0,1,1,1,1,...,1,0,0,1,0,United States,0,0,0,1
6,0,1,0,0,0,0,0,1,0,0,...,0,1,0,0,0,United States,0,0,0,1
7,1,1,1,1,0,0,0,0,1,0,...,0,0,0,0,1,Other,0,1,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
690,0,0,0,0,1,0,0,0,0,0,...,0,0,0,0,0,India,1,0,0,0
691,1,0,0,1,0,0,1,1,0,0,...,0,1,0,0,0,United States,0,0,0,1
692,1,1,1,0,1,1,1,1,0,1,...,1,0,0,0,1,United States,0,0,0,1
693,1,0,0,1,0,0,0,1,0,1,...,0,0,0,0,1,United Kingdom,0,0,1,0


In [246]:
df.columns

Index(['A1_Score', 'A2_Score', 'A3_Score', 'A4_Score', 'A5_Score', 'A6_Score',
       'A7_Score', 'A8_Score', 'A9_Score', 'A10_Score', 'age', 'gender',
       'jundice', 'austim', 'contry_of_res', 'used_app_before', 'relation',
       'Class', 'ethnicity_black', 'ethnicity_middle eastern',
       'ethnicity_other', 'ethnicity_white-european', 'country_grouped',
       'country_India', 'country_Other', 'country_United Kingdom',
       'country_United States'],
      dtype='object')

In [253]:
# Group relation into Parent, Self, or Other
df['relation'] = df['relation'].apply(lambda x: x if x in ['Parent', 'Self'] else 'Other')

# Get dummies for the grouped relation
relation_dummies = pd.get_dummies(df['relation'], prefix='relation')

# Add dummies to dataframe
df = pd.concat([df, relation_dummies], axis=1)

# Show the counts of the grouped relation
df['relation'].value_counts()

relation
Self      358
Parent     26
Other      26
Name: count, dtype: int64

In [258]:
df = df.replace({True: 1, False: 0})


In [260]:
df = df.drop(columns=['relation'])


In [261]:
df = df.drop(columns=['country_grouped'])


In [263]:
df.columns

Index(['A1_Score', 'A2_Score', 'A3_Score', 'A4_Score', 'A5_Score', 'A6_Score',
       'A7_Score', 'A8_Score', 'A9_Score', 'A10_Score', 'age', 'gender',
       'jundice', 'austim', 'contry_of_res', 'used_app_before', 'Class',
       'ethnicity_black', 'ethnicity_middle eastern', 'ethnicity_other',
       'ethnicity_white-european', 'country_India', 'country_Other',
       'country_United Kingdom', 'country_United States', 'relation_Other',
       'relation_Parent', 'relation_Self'],
      dtype='object')

In [269]:
df.rename(columns={'Class': 'class'}, inplace=True)


In [270]:
df['class'].value_counts()


class
0    306
1    104
Name: count, dtype: int64

In [272]:
df.applymap(type).eq(str).any().any()



  df.applymap(type).eq(str).any().any()


np.True_