In [1]:
import pandas as pd

In [2]:
df = pd.read_csv('Autism_Data.csv')
df.head()

Unnamed: 0,A1_Score,A2_Score,A3_Score,A4_Score,A5_Score,A6_Score,A7_Score,A8_Score,A9_Score,A10_Score,...,Gender,Ethnicity,Jaundice,Autism,Country,Used_app_before,Result,Age_desc,Relation,Class/ASD
0,1,1,1,1,0,0,1,1,0,0,...,f,White-European,no,no,'United States',no,6,'18 and more',Self,NO
1,1,1,0,1,0,0,0,1,0,1,...,m,Latino,no,yes,Brazil,no,5,'18 and more',Self,NO
2,1,1,0,1,1,0,1,1,1,1,...,m,Latino,yes,yes,Spain,no,8,'18 and more',Parent,YES
3,1,1,0,1,0,0,1,1,0,1,...,f,White-European,no,yes,'United States',no,6,'18 and more',Self,NO
4,1,0,0,0,0,0,0,1,0,0,...,f,?,no,no,Egypt,no,2,'18 and more',?,NO


# Simplification

In [4]:
df.replace('?', pd.NA, inplace=True)
df.replace("'Health care professional'", 'Professional', inplace=True)

In [6]:
print(df.shape)
print(df.dtypes)
print(df.isnull().sum())

(704, 21)
A1_Score            int64
A2_Score            int64
A3_Score            int64
A4_Score            int64
A5_Score            int64
A6_Score            int64
A7_Score            int64
A8_Score            int64
A9_Score            int64
A10_Score           int64
Age                object
Gender             object
Ethnicity          object
Jaundice           object
Autism             object
Country            object
Used_app_before    object
Result              int64
Age_desc           object
Relation           object
Class/ASD          object
dtype: object
A1_Score            0
A2_Score            0
A3_Score            0
A4_Score            0
A5_Score            0
A6_Score            0
A7_Score            0
A8_Score            0
A9_Score            0
A10_Score           0
Age                 2
Gender              0
Ethnicity          95
Jaundice            0
Autism              0
Country             0
Used_app_before     0
Result              0
Age_desc            0
Relation    

In [42]:
# Convert 'Age' to numeric, setting errors='coerce' will turn non-numeric values into NaN
df['Age'] = pd.to_numeric(df['Age'], errors='coerce')

print(df['Age'].head())

print(df['Age'].isnull().sum())

0    26.0
1    24.0
2    27.0
3    35.0
4    40.0
Name: Age, dtype: float64
2


# Data Imputation

In [55]:
# Fill missing values in 'Ethnicity' and 'Relation' with 'Unknown'
df['Ethnicity'].fillna('Unknown', inplace=True)
df['Relation'].fillna('Unknown', inplace=True)

# Fill missing values in 'Age' with the median value
df['Age'].fillna(df['Age'].median(), inplace=True)

# Verify there are no missing values left
print(df.isnull().sum())

A1_Score           0
A2_Score           0
A3_Score           0
A4_Score           0
A5_Score           0
A6_Score           0
A7_Score           0
A8_Score           0
A9_Score           0
A10_Score          0
Age                0
Gender             0
Ethnicity          0
Jaundice           0
Autism             0
Country            0
Used_app_before    0
Result             0
Age_desc           0
Relation           0
Class/ASD          0
dtype: int64


The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df['Ethnicity'].fillna('Unknown', inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df['Age'].fillna(df['Age'].median(), inplace=True)


In [56]:
# Function to clean and format columns
def clean_format_column(df, column_name):
    if df[column_name].dtype == 'object':  # Only apply to object type columns
        df[column_name] = df[column_name].str.strip("'").str.strip().str.title()
    return df

# Columns to clean and format
columns_to_clean = ['Gender', 'Ethnicity', 'Jaundice', 'Autism', 'Country', 'Used_app_before', 'Relation', 'Class/ASD','Age_desc']

# Apply cleaning/formatting function to each column
for col in columns_to_clean:
    df = clean_format_column(df, col)

In [57]:
for column in df.columns:
    print(f"Unique values in {column}: {df[column].unique()}")

Unique values in A1_Score: [1 0]
Unique values in A2_Score: [1 0]
Unique values in A3_Score: [1 0]
Unique values in A4_Score: [1 0]
Unique values in A5_Score: [0 1]
Unique values in A6_Score: [0 1]
Unique values in A7_Score: [1 0]
Unique values in A8_Score: [1 0]
Unique values in A9_Score: [0 1]
Unique values in A10_Score: [0 1]
Unique values in Age: [ 26.  24.  27.  35.  40.  36.  17.  64.  29.  33.  18.  31.  30.  34.
  38.  42.  43.  48.  37.  55.  50.  53.  20.  28.  21. 383.  47.  32.
  44.  19.  58.  45.  22.  39.  25.  23.  54.  60.  41.  46.  56.  61.
  59.  52.  49.  51.]
Unique values in Gender: ['F' 'M']
Unique values in Ethnicity: ['White-European' 'Latino' 'Unknown' 'Others' 'Black' 'Asian'
 'Middle Eastern' 'Pasifika' 'South Asian' 'Hispanic' 'Turkish']
Unique values in Jaundice: ['No' 'Yes']
Unique values in Autism: ['No' 'Yes']
Unique values in Country: ['United States' 'Brazil' 'Spain' 'Egypt' 'New Zealand' 'Bahamas'
 'Burundi' 'Austria' 'Argentina' 'Jordan' 'Ireland' 

In [58]:
df.to_csv('Autism_Data_visual.csv', index=False, header=True)