In [60]:
import numpy as np
import pandas as pd

# Set random seed for reproducibility
np.random.seed(42)

# Number of samples for each class
n_samples = 500

# Helper function to generate data
def generate_data(n, condition):
    data = []
    for _ in range(n):
        if condition == "dyslexia":
            reading_speed = np.random.uniform(50, 70)  # Words per minute (lower)
            reading_accuracy = np.random.uniform(60, 75)  # Percentage (lower)
            math_speed = np.random.uniform(80, 99)  # Math problems per hour (slightly lower)
            math_accuracy = np.random.uniform(85, 95)  # Math accuracy percentage (normal)
        elif condition == "dyscalculia":
            reading_speed = np.random.uniform(90, 110)  # Words per minute (normal)
            reading_accuracy = np.random.uniform(90, 99)  # Percentage (normal)
            math_speed = np.random.uniform(30, 50)  # Math problems per hour (lower)
            math_accuracy = np.random.uniform(40, 60)  # Math accuracy percentage (lower)
        elif condition == "both":
            reading_speed = np.random.uniform(40, 60)  # Words per minute (lower)
            reading_accuracy = np.random.uniform(50, 70)  # Percentage (lower)
            math_speed = np.random.uniform(25, 40)  # Math problems per hour (lower)
            math_accuracy = np.random.uniform(30, 50)  # Math accuracy percentage (lower)
        else:  # Normal (control group)
            reading_speed = np.random.uniform(99, 120)  # Words per minute
            reading_accuracy = np.random.uniform(95, 99)  # Percentage
            math_speed = np.random.uniform(85, 99)  # Math problems per hour
            math_accuracy = np.random.uniform(90, 98)  # Math accuracy percentage

        # attention_span = np.random.uniform(70 if condition in ["dyslexia", "dyscalculia", "both"] else 90, 95)  # Percentage
        memory_score = np.random.uniform(60 if condition in ["dyslexia", "dyscalculia", "both"] else 85, 95)  # Percentage

        data.append([
            reading_speed,
            reading_accuracy,
            math_speed,
            math_accuracy,
            # attention_span,
            memory_score,
            condition
        ])
    return data

# Generate synthetic data for each condition
data_normal = generate_data(n_samples, "normal")
data_dyslexia = generate_data(n_samples, "dyslexia")
data_dyscalculia = generate_data(n_samples, "dyscalculia")
data_both = generate_data(n_samples, "both")

# Combine all data
columns = [
    "Reading_Speed", 
    "Reading_Accuracy", 
    "Math_Speed", 
    "Math_Accuracy", 
    # "Attention_Span", 
    "Memory_Score", 
    "Condition"
]
all_data = pd.DataFrame(data_normal + data_dyslexia + data_dyscalculia + data_both, columns=columns)

# Shuffle the dataset
all_data = all_data.sample(frac=1).reset_index(drop=True)

# Save to a CSV file
all_data.to_csv("adjusted_synthetic_dyslexia_dyscalculia_dataset.csv", index=False)

print("Adjusted synthetic dataset generated and saved as 'adjusted_synthetic_dyslexia_dyscalculia_dataset.csv'.")
print(all_data.head())


Adjusted synthetic dataset generated and saved as 'adjusted_synthetic_dyslexia_dyscalculia_dataset.csv'.
   Reading_Speed  Reading_Accuracy  Math_Speed  Math_Accuracy  Memory_Score  \
0      55.748192         65.516790   81.103747      86.115122     78.055167   
1      50.027020         50.867944   30.227170      33.583593     82.647456   
2     102.547636         96.114361   87.478147      90.709620     86.206359   
3      59.792882         66.231645   80.456096      88.353716     66.197879   
4      64.999556         73.623427   94.415591      90.971553     82.894646   

  Condition  
0  dyslexia  
1      both  
2    normal  
3  dyslexia  
4  dyslexia  


In [143]:
import numpy as np
import pandas as pd

# Set random seed for reproducibility
np.random.seed(42)

# Number of samples for each class
n_samples = 3000

# Helper function to generate data
def generate_data(n, condition):
    data = []
    for _ in range(n):
        if condition == "dyslexia":
            reading_speed = np.random.uniform(10, 60)  # Overlap with normal
            reading_accuracy = np.random.uniform(10, 75)  # Overlap with normal
            math_speed = np.random.uniform(65, 90)  # Overlap with normal
            math_accuracy = np.random.uniform(70, 95)  # Overlap with dyscalculia
        elif condition == "dyscalculia":
            reading_speed = np.random.uniform(70, 140)  # Overlap with normal
            reading_accuracy = np.random.uniform(80, 99)  # Overlap with normal
            math_speed = np.random.uniform(10, 70)  # Overlap with both
            math_accuracy = np.random.uniform(10, 70)  # Overlap with both
        elif condition == "both":
            reading_speed = np.random.uniform(10, 70)  # Overlap with dyslexia
            reading_accuracy = np.random.uniform(10, 75)  # Overlap with dyslexia
            math_speed = np.random.uniform(10, 65)  # Overlap with dyscalculia
            math_accuracy = np.random.uniform(5, 60)  # Overlap with dyscalculia
        else:  # Normal (control group)
            reading_speed = np.random.uniform(80, 140)  # Slight overlap with dyslexia
            reading_accuracy = np.random.uniform(85, 99)  # Slight overlap with dyslexia and dyscalculia
            math_speed = np.random.uniform(70, 140)  # Slight overlap with dyslexia
            math_accuracy = np.random.uniform(80, 98)  # Slight overlap with dyslexia

        # Slight overlap for attention span and memory score
        # attention_span = np.random.uniform(75 if condition in ["dyslexia", "dyscalculia", "both"] else 85, 95)
        memory_score = np.random.uniform(65 if condition in [ "dyscalculia", "both"] else 70, 100)

        data.append([
            reading_speed,
            reading_accuracy,
            math_speed,
            math_accuracy,
            # attention_span,
            memory_score,
            condition
        ])
    return data

# Generate synthetic data for each condition
data_normal = generate_data(n_samples, "normal")
data_dyslexia = generate_data(n_samples, "dyslexia")
data_dyscalculia = generate_data(n_samples, "dyscalculia")
data_both = generate_data(n_samples, "both")

# Combine all data
columns = [
    "Reading_Speed", 
    "Reading_Accuracy", 
    "Math_Speed", 
    "Math_Accuracy", 
    # "Attention_Span", 
    "Memory_Score", 
    "Condition"
]
all_data = pd.DataFrame(data_normal + data_dyslexia + data_dyscalculia + data_both, columns=columns)

# Shuffle the dataset
all_data = all_data.sample(frac=1).reset_index(drop=True)

# Save to a CSV file
all_data.to_csv("adjusted_synthetic_dyslexia_dyscalculia_dataset.csv", index=False)

print("Adjusted synthetic dataset with overlaps generated and saved as 'adjusted_synthetic_dyslexia_dyscalculia_dataset.csv'.")
print(all_data.head())


Adjusted synthetic dataset with overlaps generated and saved as 'adjusted_synthetic_dyslexia_dyscalculia_dataset.csv'.
   Reading_Speed  Reading_Accuracy  Math_Speed  Math_Accuracy  Memory_Score  \
0     118.428563         98.705736  112.244517      86.440654     89.434523   
1      44.784194         71.740328   16.722199      32.627295     89.047384   
2     104.100113         83.325139   16.083636      28.797541     81.282234   
3      60.125712         70.720012   52.274211      49.248349     95.495792   
4      50.879727         45.376231   65.804911      85.473059     71.661722   

     Condition  
0       normal  
1         both  
2  dyscalculia  
3         both  
4     dyslexia  


In [144]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import GaussianNB
from sklearn.metrics import classification_report, accuracy_score, confusion_matrix
from sklearn.feature_selection import SelectKBest, f_classif
import warnings

warnings.filterwarnings("ignore")

# Load the dataset
data = pd.read_csv("adjusted_synthetic_dyslexia_dyscalculia_dataset.csv")

# Preprocessing
# Separate features and target
X = data.drop(columns=["Condition"])
y = data["Condition"]

# Encode target labels
label_encoder = LabelEncoder()
y_encoded = label_encoder.fit_transform(y)

# Train-test split
X_train, X_test, y_train, y_test = train_test_split(X, y_encoded, test_size=0.99, random_state=42)

# Feature selection
selector = SelectKBest(score_func=f_classif, k=5)  # Select top 5 features
X_train_selected = selector.fit_transform(X_train, y_train)
X_test_selected = selector.transform(X_test)

# Standardize features
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train_selected)
X_test_scaled = scaler.transform(X_test_selected)

# Hyperparameter tuning for Random Forest
rf_params = {
    "n_estimators": [50, 100, 150],
    "max_depth": [None, 10, 20],
    "min_samples_split": [2, 5],
}
rf_model = RandomForestClassifier(random_state=42)
rf_grid = GridSearchCV(rf_model, rf_params, cv=3, scoring="accuracy")
rf_grid.fit(X_train_scaled, y_train)

# Hyperparameter tuning for SVM
svm_params = {
    "C": [0.1, 1, 10],
    "kernel": ["linear", "rbf"],
}
svm_model = SVC(random_state=42)
svm_grid = GridSearchCV(svm_model, svm_params, cv=3, scoring="accuracy")
svm_grid.fit(X_train_scaled, y_train)

# Initialize other models
knn_model = KNeighborsClassifier(n_neighbors=5)
lr_model = LogisticRegression(random_state=42, penalty='l2', C=0.5)
nb_model = GaussianNB()

# Train and evaluate all models
models = {
    "Random Forest": rf_grid.best_estimator_,
    "SVM": svm_grid.best_estimator_,
    "K-Nearest Neighbors": knn_model,
    "Logistic Regression": lr_model,
    "Naive Bayes": nb_model,
}

results = []
for model_name, model in models.items():
    print(f"Training {model_name}...")
    model.fit(X_train_scaled, y_train)
    y_pred = model.predict(X_test_scaled)
    
    acc = accuracy_score(y_test, y_pred)
    print(f"Accuracy for {model_name}: {acc:.4f}")
    print(f"Classification Report for {model_name}:\n{classification_report(y_test, y_pred, target_names=label_encoder.classes_)}")
    
    results.append((model_name, acc))

# Summary of results
results_df = pd.DataFrame(results, columns=["Model", "Accuracy"])
print("\nSummary of Model Performance:")
print(results_df)

# Save the results
results_df.to_csv("model_performance_summary.csv", index=False)


Training Random Forest...
Accuracy for Random Forest: 0.9998
Classification Report for Random Forest:
              precision    recall  f1-score   support

        both       1.00      1.00      1.00      2968
 dyscalculia       1.00      1.00      1.00      2970
    dyslexia       1.00      1.00      1.00      2974
      normal       1.00      1.00      1.00      2968

    accuracy                           1.00     11880
   macro avg       1.00      1.00      1.00     11880
weighted avg       1.00      1.00      1.00     11880

Training SVM...
Accuracy for SVM: 0.9978
Classification Report for SVM:
              precision    recall  f1-score   support

        both       1.00      0.99      1.00      2968
 dyscalculia       0.99      1.00      1.00      2970
    dyslexia       1.00      1.00      1.00      2974
      normal       1.00      1.00      1.00      2968

    accuracy                           1.00     11880
   macro avg       1.00      1.00      1.00     11880
weighted av

In [145]:
import joblib

# Save the KNN model to a file
knn_model_filename = "knn_model.pkl"
joblib.dump(knn_model, knn_model_filename)
print(f"KNN model saved as {knn_model_filename}")


# Save the scaler
scaler_filename = "scaler.pkl"
joblib.dump(scaler, scaler_filename)
print(f"Scaler saved as {scaler_filename}")

# Save the feature selector
selector_filename = "selector.pkl"
joblib.dump(selector, selector_filename)
print(f"Feature selector saved as {selector_filename}")


# Save the label encoder to a file
joblib.dump(label_encoder, "label_encoder.pkl")

print("Label encoder saved successfully!")


KNN model saved as knn_model.pkl
Scaler saved as scaler.pkl
Feature selector saved as selector.pkl
Label encoder saved successfully!


In [148]:
import joblib
import numpy as np

# Load the saved KNN model, scaler, and feature selector
knn_model = joblib.load("knn_model.pkl")
scaler = joblib.load("scaler.pkl")
selector = joblib.load("selector.pkl")

# Define the function for prediction
def predict_condition(features):
    """
    Predict the condition using the KNN model.

    Parameters:
    - features: list or np.ndarray
        A list or 1D array containing feature values in the same order as the training data.

    Returns:
    - prediction: str
        Predicted condition label.
    """
    # Ensure the input is a numpy array
    features = np.array(features).reshape(1, -1)

    # Apply feature selection
    selected_features = selector.transform(features)

    # Scale the features
    scaled_features = scaler.transform(selected_features)

    # Make a prediction
    prediction_encoded = knn_model.predict(scaled_features)

    # Decode the prediction to its original label
    prediction_label = label_encoder.inverse_transform(prediction_encoded)
    
    return prediction_label[0]


    # Example input values (ensure they are in the correct order)
# example_features = [117.66188132791392,100.06116117124812,80.13248169221794,99.27262613533308,89.54033049802344,98.90247500836509]  # Replace with actual feature values
example_features = [50,76,40,50,95]  # Replace with actual feature values

    # Get the prediction
predicted_condition = predict_condition(example_features)
print(f"Predicted Condition: {predicted_condition}")


Predicted Condition: dyscalculia
