In [17]:
import pandas as pd
import pickle
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler

In [18]:
# 1. Load the Data
# Read the CSV into `df`, then print a heading followed by the first rows.
df = pd.read_csv('Autism_Screening_Data_Combined.csv')
print("Data preview:", df.head(), sep="\n")

Data preview:
   A1  A2  A3  A4  A5  A6  A7  A8  A9  A10  Age Sex Jauundice Family_ASD Class
0   1   1   0   1   0   0   1   1   0    0   15   m        no         no    NO
1   0   1   1   1   0   1   1   0   1    0   15   m        no         no    NO
2   1   1   1   0   1   1   1   1   1    1   15   f        no        yes   YES
3   1   1   1   1   1   1   1   1   0    0   16   f        no         no   YES
4   1   1   1   1   1   1   1   1   1    1   15   f        no         no   YES


In [19]:
# 2. Preprocess the Data
# Map the target column 'Class' from 'NO'/'YES' to 0/1.
# Series.map turns every value that is not a key of the dict into NaN, so the
# encoding is guarded in two ways:
#   * it is skipped when the column is already numeric, which keeps this cell
#     safe to re-run (a second plain .map would replace every 0/1 with NaN);
#   * a label other than 'NO'/'YES' raises instead of silently becoming NaN.
# Values that were already missing are left as NaN, exactly as before.
if not pd.api.types.is_numeric_dtype(df['Class']):
    class_encoded = df['Class'].map({'NO': 0, 'YES': 1})
    class_unexpected = df.loc[
        df['Class'].notna() & class_encoded.isna(), 'Class'
    ].unique()
    if len(class_unexpected) > 0:
        raise ValueError(f"Unexpected values in 'Class': {list(class_unexpected)}")
    df['Class'] = class_encoded

In [20]:
# Encode categorical columns:
# For Sex, map 'm' to 0 and 'f' to 1.
# Series.map turns every value that is not a key of the dict into NaN, so:
#   * the encoding is skipped when the column is already numeric, which keeps
#     this cell safe to re-run (a second plain .map would blank the column);
#   * a label other than 'm'/'f' raises instead of silently becoming NaN.
# Values that were already missing are left as NaN, exactly as before.
if not pd.api.types.is_numeric_dtype(df['Sex']):
    sex_encoded = df['Sex'].map({'m': 0, 'f': 1})
    sex_unexpected = df.loc[
        df['Sex'].notna() & sex_encoded.isna(), 'Sex'
    ].unique()
    if len(sex_unexpected) > 0:
        raise ValueError(f"Unexpected values in 'Sex': {list(sex_unexpected)}")
    df['Sex'] = sex_encoded

In [21]:
# For Jauundice and Family_ASD, map 'no' to 0 and 'yes' to 1.
# (Note: Even if one of these columns appears constant in a subset, keep them for consistency.)
# Series.map turns every value that is not a key of the dict into NaN, so for
# each column:
#   * the encoding is skipped when the column is already numeric, which keeps
#     this cell safe to re-run (a second plain .map would blank the column);
#   * a label other than 'no'/'yes' raises instead of silently becoming NaN.
# Values that were already missing are left as NaN, exactly as before.
for yes_no_column in ('Jauundice', 'Family_ASD'):
    if pd.api.types.is_numeric_dtype(df[yes_no_column]):
        continue
    yes_no_encoded = df[yes_no_column].map({'no': 0, 'yes': 1})
    yes_no_unexpected = df.loc[
        df[yes_no_column].notna() & yes_no_encoded.isna(), yes_no_column
    ].unique()
    if len(yes_no_unexpected) > 0:
        raise ValueError(
            f"Unexpected values in '{yes_no_column}': {list(yes_no_unexpected)}"
        )
    df[yes_no_column] = yes_no_encoded


In [22]:
# Define features and target
# 'Class' is the label (y); every other column of df is a feature (X).
y = df['Class']
X = df.drop(columns=['Class'])

In [23]:
# 3. Split the Data into Training and Test Sets
# 20% of the rows are held out for testing; random_state fixes the shuffle so
# the same split is produced on every run.
split_parts = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)
X_train, X_test, y_train, y_test = split_parts

# 4. Create a Pipeline for Scaling and Model Building
# Two named steps: a StandardScaler followed by a RandomForestClassifier with a
# fixed seed. The step names ('scaler', 'classifier') are what the
# hyperparameter grid uses to address each step's parameters.
scaler_step = ('scaler', StandardScaler())
classifier_step = ('classifier', RandomForestClassifier(random_state=42))
pipeline = Pipeline(steps=[scaler_step, classifier_step])

In [24]:
# 5. Hyperparameter Tuning
# Candidate values for the pipeline's 'classifier' step; each key follows the
# '<step name>__<parameter name>' convention used for pipeline parameters.
param_grid = dict(
    classifier__n_estimators=[50, 100, 200],
    classifier__max_depth=[None, 5, 10],
)

In [None]:
# Search every parameter combination with 5-fold cross-validation on the
# training split, ranking candidates by accuracy. fit() returns the fitted
# search object, so construction and fitting are chained.
grid_search = GridSearchCV(
    estimator=pipeline,
    param_grid=param_grid,
    scoring='accuracy',
    cv=5,
).fit(X_train, y_train)

# Keep the best pipeline found by the search and report its parameters
best_model = grid_search.best_estimator_
print("Best parameters:", grid_search.best_params_)

Best parameters: {'classifier__max_depth': None, 'classifier__n_estimators': 200}


In [26]:
# 6. Save the Model using Pickle
# Serialize the best pipeline to disk in binary mode; the file is closed when
# the `with` block exits.
model_path = 'autism_model.pkl'
with open(model_path, 'wb') as model_file:
    pickle.dump(best_model, model_file)

print("Model saved successfully as autism_model.pkl")

Model saved successfully as autism_model.pkl


In [None]:
# 7. Evaluate on Test Data
# Predict on the held-out split and compute all metrics up front.
y_pred = best_model.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
report_text = classification_report(y_test, y_pred)
confusion = confusion_matrix(y_test, y_pred)

# Report: overall accuracy, per-class precision/recall/F1, then the confusion
# matrix (per scikit-learn's convention rows are true labels, columns are
# predicted labels).
print("\nTest Accuracy:", accuracy)
print("\nClassification Report:")
print(report_text)
print("\nConfusion Matrix:")
print(confusion)


Test Accuracy: 0.988477366255144

Classification Report:
              precision    recall  f1-score   support

           0       0.99      0.99      0.99       848
           1       0.99      0.98      0.98       367

    accuracy                           0.99      1215
   macro avg       0.99      0.98      0.99      1215
weighted avg       0.99      0.99      0.99      1215


Confusion Matrix:
[[843   5]
 [  9 358]]
