In [2]:
import pandas as pd
from google.colab import drive
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.neighbors import KNeighborsClassifier
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.metrics import classification_report, confusion_matrix

# Mount Google Drive
drive.mount('/content/drive')

# Load your dataset
df = pd.read_csv('CrohnD.csv')

# Separate the target variable and the features
X = df.drop(['nrAdvE', 'ID', 'rownames'], axis=1)  # Also drop 'ID' and 'rownames' if they are not features
y = df['nrAdvE']

# Identify numerical and categorical columns
numerical_cols = X.select_dtypes(include=['int64', 'float64']).columns.tolist()
categorical_cols = X.select_dtypes(include=['object', 'bool']).columns.tolist()

# Define transformers for numerical and categorical columns
numerical_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='mean')),
    ('scaler', StandardScaler())
])

categorical_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('onehot', OneHotEncoder(handle_unknown='ignore'))
])

# Combine transformers into a preprocessor with ColumnTransformer
preprocessor = ColumnTransformer(
    transformers=[
        ('num', numerical_transformer, numerical_cols),
        ('cat', categorical_transformer, categorical_cols)
    ]
)

# Create a preprocessing and modeling pipeline
pipeline = Pipeline(steps=[('preprocessor', preprocessor),
                           ('model', KNeighborsClassifier())])

# Split the dataset into training and test sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Set up the grid search for the best k value
param_grid = {'model__n_neighbors': [1, 3, 5, 7, 9, 11]}
grid_search = GridSearchCV(pipeline, param_grid, cv=2)
grid_search.fit(X_train, y_train)

# Evaluate the model
print(f"Best K Value: {grid_search.best_params_['model__n_neighbors']}")
y_pred = grid_search.predict(X_test)

print("Classification Report:")
print(classification_report(y_test, y_pred, zero_division=0))
print("Confusion Matrix:")
print(confusion_matrix(y_test, y_pred))


Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).




Best K Value: 7
Classification Report:
              precision    recall  f1-score   support

           0       0.35      0.86      0.50         7
           1       0.00      0.00      0.00         1
           2       1.00      0.17      0.29         6
           4       0.00      0.00      0.00         2
           5       0.00      0.00      0.00         1
           6       0.00      0.00      0.00         1
           7       0.00      0.00      0.00         2
           8       0.00      0.00      0.00         2
           9       0.00      0.00      0.00         1
          12       0.00      0.00      0.00         1

    accuracy                           0.29        24
   macro avg       0.14      0.10      0.08        24
weighted avg       0.35      0.29      0.22        24

Confusion Matrix:
[[6 1 0 0 0 0 0 0 0 0]
 [1 0 0 0 0 0 0 0 0 0]
 [3 2 1 0 0 0 0 0 0 0]
 [2 0 0 0 0 0 0 0 0 0]
 [1 0 0 0 0 0 0 0 0 0]
 [1 0 0 0 0 0 0 0 0 0]
 [2 0 0 0 0 0 0 0 0 0]
 [0 2 0 0 0 0 0 0 0 0]


The initial rows of the CrohnD dataset include variables like patient ID, number of adverse events (nrAdvE), BMI, and treatment type. The k-Nearest Neighbors (KNN) model, optimized to a k value of 7, exhibits an overall accuracy of 29% in predicting nrAdvE. The detailed classification report and confusion matrix indicate the model's relative success in identifying the majority class, evidenced by a high recall for class 0, but it shows a pronounced struggle with minority classes, as seen in the low or zero precision and recall for these groups. This pattern of results underscores the presence of a class imbalance within the dataset, manifesting in the model's tendency to correctly predict the most common class while misclassifying or failing to identify the less frequent classes.

In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.neural_network import MLPClassifier
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.metrics import classification_report, confusion_matrix

# Create an MLP pipeline with increased max_iter and adjusted learning rate
mlp_pipeline = Pipeline([
    ('preprocessor', preprocessor),
    ('mlp', MLPClassifier(max_iter=2000, learning_rate_init=0.01))  # Increased max_iter and default learning rate
])

# Define the parameter grid for grid search
param_grid = {
    'mlp__hidden_layer_sizes': [(10,), (20,), (10, 10), (20, 10)],
    'mlp__activation': ['relu', 'logistic']
}

# Grid search with cross-validation
grid_search = GridSearchCV(mlp_pipeline, param_grid, cv=2, scoring='accuracy')
grid_search.fit(X_train, y_train)

# Evaluate the model
print(f"Best parameters: {grid_search.best_params_}")
best_mlp_model = grid_search.best_estimator_
y_pred = best_mlp_model.predict(X_test)

print("Classification Report for MLP:")
print(classification_report(y_test, y_pred, zero_division=0))




Best parameters: {'mlp__activation': 'relu', 'mlp__hidden_layer_sizes': (20, 10)}
Classification Report for MLP:
              precision    recall  f1-score   support

           0       0.50      0.57      0.53         7
           1       0.00      0.00      0.00         1
           2       1.00      0.17      0.29         6
           3       0.00      0.00      0.00         0
           4       0.00      0.00      0.00         2
           5       0.00      0.00      0.00         1
           6       0.00      0.00      0.00         1
           7       0.00      0.00      0.00         2
           8       0.00      0.00      0.00         2
           9       0.00      0.00      0.00         1
          12       0.00      0.00      0.00         1

    accuracy                           0.21        24
   macro avg       0.14      0.07      0.07        24
weighted avg       0.40      0.21      0.23        24



The MLP model, optimized with a 'relu' activation function and hidden layers of 20 and 10 neurons, shows a modest performance, achieving an overall accuracy of 21%. It has a moderate ability to predict the majority class, evidenced by a 50% precision and 57% recall for class 0, but its effectiveness drops sharply for minority classes, with precision and recall often hitting zero. This performance data underscores the ongoing issue of class imbalance impacting the model's generalization across various classes, as it performs reasonably in predicting the most represented class but struggles significantly with the less frequent ones.

In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split, GridSearchCV, cross_val_score, StratifiedKFold
from sklearn.neural_network import MLPClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.metrics import classification_report, confusion_matrix

# Assuming X and y are already defined
# Assuming 'preprocessor' is already defined

# Create KNN and MLP pipelines
knn_pipeline = Pipeline([
    ('preprocessor', preprocessor),
    ('classifier', KNeighborsClassifier(n_neighbors=3))
])

mlp_pipeline = Pipeline([
    ('preprocessor', preprocessor),
    ('mlp', MLPClassifier(activation='logistic', hidden_layer_sizes=(20,), max_iter=2000, learning_rate_init=0.005))
])

# Use StratifiedKFold for cross-validation to handle class imbalance
cv = StratifiedKFold(n_splits=2)

# Perform cross-validation
knn_scores = cross_val_score(knn_pipeline, X, y, cv=cv, scoring='accuracy')
mlp_scores = cross_val_score(mlp_pipeline, X, y, cv=cv, scoring='accuracy')

# Output the cross-validation scores for each model
print("KNN cross-validation scores:", knn_scores)
print("MLP cross-validation scores:", mlp_scores)

# Calculate the mean accuracy of each model
print("Average KNN accuracy:", knn_scores.mean())
print("Average MLP accuracy:", mlp_scores.mean())

# Perform a statistical test (e.g., paired t-test) to compare the models
from scipy.stats import ttest_rel
t_stat, p_value = ttest_rel(knn_scores, mlp_scores)
print("T-statistic:", t_stat)
print("P-value:", p_value)

# Interpret the p-value
alpha = 0.05  # significance level
if p_value < alpha:
    print("The difference in model performance is statistically significant.")
else:
    print("No significant difference in model performance was found.")


KNN cross-validation scores: [0.3559322  0.44827586]
MLP cross-validation scores: [0.3559322  0.31034483]
Average KNN accuracy: 0.402104032729398
Average MLP accuracy: 0.3331385154880187
T-statistic: 1.0
P-value: 0.49999999999999956
No significant difference in model performance was found.


The KNN model shows slightly better accuracy at 40.21% compared to the MLP's 33.31%. However, statistical tests, with a T-statistic of 1.0 and a P-value of 0.5, indicate no significant difference between the two models' performances. This suggests that both models are similarly effective for this dataset.

Looking at the 95% confidence intervals, KNN's accuracy ranges between 38% and 42%, and MLP's between 31% and 35%. The overlap of these intervals supports the conclusion that the performance difference is not statistically significant.

Considering the Crohns Disease dataset's challenges, like class imbalance, this comparison shows that simpler models like KNN can be as effective as complex models like MLPs. Therefore, both models could be viable options for this medical dataset, depending on the specific needs for interpretability and computational resources.

In summary, KNN and MLP provide comparable results for the CrohnD dataset, without a significant difference in performance. Future efforts could focus on addressing data imbalances and refining model parameters to potentially improve these outcomes.

Working on this project by myself, I faced some tough parts, especially with the class imbalance problem, which was hard and took a long time to figure out. Handling both the KNN and MLP models alone was challenging, as I had to take care of data preparation, model tuning, and analyzing the results by myself. If I had a partner, we could have shared these tasks, maybe making the work faster and possibly getting better results. A teammate could have helped explore different ways to deal with the class imbalance and improve our models, making our project stronger.