In [103]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report

In [104]:
# Load the raw dataset; the relative path is resolved against the working directory.
DATA_PATH = 'data.csv'
data = pd.read_csv(DATA_PATH)

In [105]:
# Feature matrix: every column except the identifier, the target and the
# 'Unnamed: 32' column.
non_feature_columns = ['id', 'diagnosis', 'Unnamed: 32']
X = data.drop(columns=non_feature_columns)

# Target vector: encode the diagnosis labels as integers (M/B -> 1/0).
label_encoder = LabelEncoder()
y = label_encoder.fit_transform(data['diagnosis'])

In [106]:
# Hold out 20% of the rows for testing; the fixed seed makes the split repeatable.
split_parts = train_test_split(X, y, test_size=0.2, random_state=42)
X_train, X_test, y_train, y_test = split_parts

In [107]:
# Learn the scaling statistics from the training rows only, then apply the
# same fitted transformation to both partitions.
scaler = StandardScaler().fit(X_train)
X_train = scaler.transform(X_train)
X_test = scaler.transform(X_test)

# Decision Trees

In [108]:
from sklearn.model_selection import GridSearchCV
from sklearn.tree import DecisionTreeClassifier

In [109]:

# Search space for the decision tree: 6 * 3 * 3 * 2 = 108 combinations.
depth_options = [None, 10, 20, 30, 40, 50]
split_options = [2, 5, 10]
leaf_options = [1, 2, 4]
criterion_options = ['gini', 'entropy']

param_grid = {
    'max_depth': depth_options,
    'min_samples_split': split_options,
    'min_samples_leaf': leaf_options,
    'criterion': criterion_options,
}

In [110]:
# Untuned decision tree used as the base estimator for the grid search;
# the seed is fixed at 42.
dtree = DecisionTreeClassifier(random_state=42)

In [111]:
# Exhaustive search over param_grid with 5-fold cross-validation, run with
# n_jobs=-1 (all available processors) and verbose=2 progress messages.
grid_search = GridSearchCV(
    estimator=dtree,
    param_grid=param_grid,
    cv=5,
    n_jobs=-1,
    verbose=2,
)

In [112]:
# Run the cross-validated search using the training split only.
grid_search.fit(X_train, y_train)

Fitting 5 folds for each of 108 candidates, totalling 540 fits


In [113]:
# Parameter combination selected by the grid search.
best_params = grid_search.best_params_

In [114]:
# GridSearchCV (refit=True by default) has already retrained a clone of `dtree`
# with the winning parameters on the whole training split, so reuse that fitted
# estimator instead of constructing and fitting an identical tree a second time.
best_tree = grid_search.best_estimator_
best_tree

In [115]:
# Score the tuned tree on the held-out test split.
y_pred_best = best_tree.predict(X_test)
tree_eval_args = (y_test, y_pred_best)
accuracy_best = accuracy_score(*tree_eval_args)
report_best = classification_report(*tree_eval_args)

In [116]:
# Emit the tuning summary one line group at a time; each tuple holds the
# positional arguments of a single print call.
tree_summary = (
    ("Best Parameters:", best_params),
    (f"Accuracy with Best Parameters: {accuracy_best}",),
    ("Classification Report with Best Parameters:",),
    (report_best,),
    ("Even with hyperparameter tuning we see significant upgrade in recall but f1-score remains the same",),
)
for print_args in tree_summary:
    print(*print_args)

Best Parameters: {'criterion': 'entropy', 'max_depth': None, 'min_samples_leaf': 1, 'min_samples_split': 10}
Accuracy with Best Parameters: 0.9473684210526315
Classification Report with Best Parameters:
              precision    recall  f1-score   support

           0       0.93      0.99      0.96        71
           1       0.97      0.88      0.93        43

    accuracy                           0.95       114
   macro avg       0.95      0.93      0.94       114
weighted avg       0.95      0.95      0.95       114

Even with hyperparameter tuning we see significant upgrade in recall but f1-score remains the same


# SVM Classifier

In [117]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score, classification_report

In [118]:
# X_train / X_test were already standardized right after the train/test split,
# using a StandardScaler fitted on the raw training features. Fitting a second
# scaler on that already-scaled data is numerically a no-op for the arrays, but
# it would rebind `scaler` to statistics of the scaled data (mean ~0, std ~1),
# losing the mapping needed to transform new raw samples. The standardized
# splits are therefore reused as-is.

In [119]:
# Create an SVC with scikit-learn's default settings and train it on the
# training split.
svm_classifier = SVC()
svm_classifier.fit(X_train, y_train)

In [120]:
# Score the SVM on the held-out test split.
y_pred = svm_classifier.predict(X_test)
svm_eval_args = (y_test, y_pred)
accuracy = accuracy_score(*svm_eval_args)
report = classification_report(*svm_eval_args)

In [121]:
# Print the SVM accuracy followed by the per-class report.
for line in (f"Accuracy: {accuracy}", "Classification Report:", report):
    print(line)

Accuracy: 0.9824561403508771
Classification Report:
              precision    recall  f1-score   support

           0       0.97      1.00      0.99        71
           1       1.00      0.95      0.98        43

    accuracy                           0.98       114
   macro avg       0.99      0.98      0.98       114
weighted avg       0.98      0.98      0.98       114



# Logistic Regression

In [122]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report

In [123]:
# Create a logistic-regression classifier with scikit-learn's default settings.
logistic_regression_classifier = LogisticRegression()

In [124]:
# Train the classifier on the training split.
logistic_regression_classifier.fit(X_train, y_train)

In [125]:
# Predict labels for the held-out test split.
y_pred = logistic_regression_classifier.predict(X_test)

In [126]:
# Score the logistic-regression predictions once and reuse the results when
# printing, rather than recomputing the accuracy and the report a second time.
accuracy = accuracy_score(y_test, y_pred)
classification_rep = classification_report(y_test, y_pred)

print("Accuracy:", accuracy)
print("Classification Report:\n", classification_rep)

Accuracy: 0.9736842105263158
Classification Report:
               precision    recall  f1-score   support

           0       0.97      0.99      0.98        71
           1       0.98      0.95      0.96        43

    accuracy                           0.97       114
   macro avg       0.97      0.97      0.97       114
weighted avg       0.97      0.97      0.97       114



# K-Nearest Neighbors

In [127]:
from sklearn.neighbors import KNeighborsClassifier

In [128]:
# k-nearest-neighbours classifier with n_neighbors=5, trained on the
# training split (fit returns the estimator itself).
knn = KNeighborsClassifier(n_neighbors=5).fit(X_train, y_train)

# Predict the held-out split and report the scores.
y_pred = knn.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
knn_report = classification_report(y_test, y_pred)

print("Accuracy:", accuracy)
print("Classification Report:\n", knn_report)

Accuracy: 0.9473684210526315
Classification Report:
               precision    recall  f1-score   support

           0       0.96      0.96      0.96        71
           1       0.93      0.93      0.93        43

    accuracy                           0.95       114
   macro avg       0.94      0.94      0.94       114
weighted avg       0.95      0.95      0.95       114



# Random Forest Classifier

In [129]:
# Baseline random forest: only the seed is set, every other hyperparameter
# keeps its scikit-learn default. Trained on the training split.
model = RandomForestClassifier(random_state=42)
model.fit(X_train, y_train)

In [130]:
# Score the baseline forest on the held-out test split.
y_pred = model.predict(X_test)
forest_eval_args = (y_test, y_pred)
accuracy = accuracy_score(*forest_eval_args)
report = classification_report(*forest_eval_args)

for line in (f"Accuracy: {accuracy}", report):
    print(line)

Accuracy: 0.9649122807017544
              precision    recall  f1-score   support

           0       0.96      0.99      0.97        71
           1       0.98      0.93      0.95        43

    accuracy                           0.96       114
   macro avg       0.97      0.96      0.96       114
weighted avg       0.97      0.96      0.96       114



In [131]:
from sklearn.model_selection import GridSearchCV

In [132]:

# Deliberately small search space for the random forest: two options per
# hyperparameter, i.e. 2 * 2 * 2 * 2 = 16 combinations.
tree_count_options = [100, 200]
forest_depth_options = [None, 20]
forest_split_options = [2, 5]
forest_leaf_options = [1, 2]

param_grid_reduced = {
    'n_estimators': tree_count_options,
    'max_depth': forest_depth_options,
    'min_samples_split': forest_split_options,
    'min_samples_leaf': forest_leaf_options,
}

In [133]:
# Grid search over the reduced space: 3-fold cross-validation scored by
# accuracy, with n_jobs=-1 (all available processors) and verbose=2 messages.
grid_search_reduced = GridSearchCV(
    estimator=model,
    param_grid=param_grid_reduced,
    cv=3,
    n_jobs=-1,
    verbose=2,
    scoring='accuracy',
)

In [134]:
# Run the reduced grid search using the training split only.
grid_search_reduced.fit(X_train, y_train)

Fitting 3 folds for each of 16 candidates, totalling 48 fits


In [135]:
# Keep both the refitted winning estimator and the parameters that produced it.
best_model_reduced = grid_search_reduced.best_estimator_
best_params_reduced = grid_search_reduced.best_params_

In [136]:
# Predict the held-out test split with the best estimator from the grid search.
y_pred_best_reduced = best_model_reduced.predict(X_test)

In [137]:
# Score the tuned forest on the test split and print the summary.
tuned_eval_args = (y_test, y_pred_best_reduced)
accuracy_best_reduced = accuracy_score(*tuned_eval_args)
report_best_reduced = classification_report(*tuned_eval_args)

tuned_summary = (
    ("Best Model Parameters:", best_params_reduced),
    (f"Accuracy of Best Model: {accuracy_best_reduced}",),
    (report_best_reduced,),
)
for print_args in tuned_summary:
    print(*print_args)


Best Model Parameters: {'max_depth': None, 'min_samples_leaf': 1, 'min_samples_split': 5, 'n_estimators': 100}
Accuracy of Best Model: 0.9649122807017544
              precision    recall  f1-score   support

           0       0.96      0.99      0.97        71
           1       0.98      0.93      0.95        43

    accuracy                           0.96       114
   macro avg       0.97      0.96      0.96       114
weighted avg       0.97      0.96      0.96       114



# Best Model and Re-evaluation

In [138]:
# Retrain a default-settings SVC on the training split (fit returns the
# estimator itself) and predict the held-out test split.
svm_classifier = SVC().fit(X_train, y_train)
y_pred = svm_classifier.predict(X_test)

# Score the predictions.
svm_eval_args = (y_test, y_pred)
accuracy = accuracy_score(*svm_eval_args)
report = classification_report(*svm_eval_args)

# Print the accuracy followed by the per-class report.
for line in (f"Accuracy: {accuracy}", "Classification Report:", report):
    print(line)

Accuracy: 0.9824561403508771
Classification Report:
              precision    recall  f1-score   support

           0       0.97      1.00      0.99        71
           1       1.00      0.95      0.98        43

    accuracy                           0.98       114
   macro avg       0.99      0.98      0.98       114
weighted avg       0.98      0.98      0.98       114

