In [15]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.model_selection import train_test_split, RandomizedSearchCV, RepeatedStratifiedKFold
from sklearn.metrics import accuracy_score, confusion_matrix, precision_score, recall_score, f1_score

# Load the breast-cancer dataset into a pandas DataFrame.
# Source: https://www.kaggle.com/datasets/yasserh/breast-cancer-dataset/data
breast_cancer = pd.read_csv('breast-cancer.csv')

# Encode the target as integers: 'M' -> 1 and 'B' -> 0.
# An explicit mapping replaces the two substring replacements; any label other
# than 'M'/'B' becomes NaN and makes the integer cast fail loudly.
breast_cancer['diagnosis'] = breast_cancer['diagnosis'].map({'M': 1, 'B': 0}).astype(int)

# Split the data into X (features) and y (labels); 'id' is dropped because it
# is not used as a feature.
X = breast_cancer.drop(columns=['id', 'diagnosis'])
y = breast_cancer['diagnosis']

# Split into training and testing sets BEFORE scaling so that no statistics
# from the test rows can leak into the training features.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.20, random_state=5)

# Standardise the features: the scaler is fitted on the training rows only and
# the same fitted transformation is then applied to the test rows.
# The results are wrapped back into DataFrames to keep column names and index.
# NOTE: `breast_cancer` and `X` themselves are left unscaled.
scaler = StandardScaler()
X_train = pd.DataFrame(scaler.fit_transform(X_train), columns=X_train.columns, index=X_train.index)
X_test = pd.DataFrame(scaler.transform(X_test), columns=X_test.columns, index=X_test.index)

# Baseline classifier: logistic regression with default settings, fitted on
# the training split (fit() returns the estimator itself, so it can be chained).
model = LogisticRegression().fit(X_train, y_train)

# Predicted labels for the held-out test split.
y_pred = model.predict(X_test)

# Print, in this order: accuracy, precision, recall, confusion matrix.
# In the recorded output of this cell the first three values were
# 0.9737, 0.9787 and 0.9583.
for metric in (accuracy_score, precision_score, recall_score, confusion_matrix):
    print(metric(y_test, y_pred))

# Build and train an optimized SVM model.
# probability=True is required so that predict_proba() is available on the
# fitted estimator further down.
SVM_model = SVC(probability=True)

# Hyper-parameter search space for the RBF kernel: 6 values of C x 6 values of
# gamma = 36 candidates, which equals n_iter below.
svm_param = {"C": [.01, .1, 1, 5, 10, 100],
             "gamma": [.01, .1, 1, 5, 10, 100],
             "kernel": ["rbf"],
             "random_state": [1]}

# Repeated stratified 10-fold cross-validation (3 repeats, fixed seed).
cv = RepeatedStratifiedKFold(n_splits=10, n_repeats=3, random_state=1)

# Pass the splitter defined above to the search; previously it was created but
# never used because the search was given cv=5.
RCV = RandomizedSearchCV(SVM_model, svm_param, n_iter=36, scoring='roc_auc', n_jobs=-1, cv=cv, random_state=1)

# Keep the best estimator found by the search (refitted on the full training split).
SVM_model = RCV.fit(X_train, y_train).best_estimator_

# Make predictions for X_test: hard labels and class probabilities.
# The probabilities must come from the fitted SVM_model; the name `SVM` used
# before is not defined anywhere in this notebook.
SVM_y_pred = SVM_model.predict(X_test)
pred_prob = SVM_model.predict_proba(X_test)

# Print accuracy, F1 score and the confusion matrix for the tuned SVM.
print(accuracy_score(y_test, SVM_y_pred))
print(f1_score(y_test, SVM_y_pred))
print(confusion_matrix(y_test, SVM_y_pred))


0.9736842105263158
0.9787234042553191
0.9583333333333334
[[65  1]
 [ 2 46]]
0.9736842105263158
0.967741935483871
[[66  0]
 [ 3 45]]


In [None]:
# Preview the first rows instead of dumping the entire DataFrame as cell output.
breast_cancer.head()