In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory tree and print the full path of every file in it.
for root, _subdirs, files in os.walk('/kaggle/input'):
    for name in files:
        print(os.path.join(root, name))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

import warnings
warnings.filterwarnings('ignore')

In this notebook, you will find code for predicting breast cancer in a dataset using the Support Vector Machine (SVM) classifier. The algorithm finds a separating hyperplane between two classes, using a kernel function to work in a higher-dimensional space. We will also look at an implementation of Logistic Regression, since both algorithms are designed for classification, and compare their accuracy scores. 

**Step 1: Import the necessary modules.**

In [None]:
import numpy as np # For efficiently carrying out array computations 
import pandas as pd # For CSV I/O 
import matplotlib.pyplot as plt # For data visualization. 
import seaborn as sns # For data visualization as well.

# Importing the necessary algorithms and error functions. 
from sklearn.preprocessing import MinMaxScaler, StandardScaler # For feature scaling
from sklearn.model_selection import train_test_split, GridSearchCV, StratifiedKFold
# For splitting the datasets and finding the best model using GridSearch
from sklearn.metrics import confusion_matrix, classification_report, accuracy_score, roc_auc_score
# For finding the error of the algorithm trained. 
from sklearn.svm import SVC
from sklearn.linear_model import LogisticRegression
# The algorithm itself. 


**Step 2: Loading the data and getting a feel for it.**

In [None]:
# Location of the breast-cancer CSV inside the Kaggle input directory.
DATA_PATH = '/kaggle/input/breast-cancer-dataset/breast-cancer.csv'

# Load the data and preview the first rows
# (df.describe() can be used instead for summary statistics).
df = pd.read_csv(DATA_PATH)
df.head()

**Step 3: Analyze the dataset.** 
* This involves various data visualizations to find the hidden relationships between each variable. 

In [None]:
# Encode the target as integers: malignant ("M") -> 1, anything else -> 0.
# Values that are already 1 stay 1, so re-running this cell on an encoded
# column leaves the labels unchanged instead of resetting them all to 0.
df['diagnosis'] = df['diagnosis'].isin(["M", 1]).astype(int)

# Preview the frame without the 'id' column. DataFrame.drop returns a new
# frame, so df itself still contains 'id'; that is kept on purpose because
# the feature-selection cell further down refers to the "id" column by name.
df.drop('id', axis=1).head()

In [None]:
# Pairwise correlation matrix of every column in df (reused by later cells).
corr = df.corr()

# Render the matrix as an annotated heatmap on a large canvas so the
# one-decimal cell labels remain readable.
fig, heatmap_ax = plt.subplots(figsize=(20, 16))
sns.heatmap(corr, annot=True, fmt=".1f", cmap='magma', ax=heatmap_ax)
plt.show()

In [None]:
# Compare the distributions of three "mean" features on one set of axes.
fig, ax = plt.subplots()

# Pass the axes explicitly rather than relying on the implicit "current axes".
sns.kdeplot(df['radius_mean'], label="Radius", ax=ax)
sns.kdeplot(df['perimeter_mean'], label="Perimeter", ax=ax)
sns.kdeplot(df['texture_mean'], label="Texture", ax=ax)

# Three different features share the x-axis, so label it generically and
# give the figure a title so it can be understood on its own.
ax.set(
    xlabel="Feature value (unscaled)",
    ylabel="Density",
    title="Distribution of radius, perimeter and texture (mean values)",
)
ax.legend()
plt.show()

**Step 4: Preprocess the data**
* This involves feature scaling so that the algorithm works better for the given dataset. 

In [None]:
# Absolute correlation of every column with the target.
corr_val = corr["diagnosis"].abs()

# Keep only the features whose |correlation| with the diagnosis exceeds the
# threshold. The *filtered* index has to be used here: taking corr_val.index
# instead would keep every column and silently discard the filter.
CORR_THRESHOLD = 0.4
strong = corr_val[corr_val > CORR_THRESHOLD]

# Exclude the target itself and the row identifier. A membership test is used
# rather than list.remove() so the cell does not raise when "id" is absent
# from the filtered list (for example because it falls below the threshold).
relevant = [col for col in strong.index if col not in ("id", "diagnosis")]

X = df[relevant]
Y = df["diagnosis"]

# print(X)
# print(Y)

# We got confirmation that by printing these lists, we have the values we need. 

In [None]:
# Standardise every feature in X to zero mean and unit standard deviation.
# fit_transform returns a plain NumPy array, so the result is wrapped back
# into a DataFrame that keeps the original column names and row index; this
# keeps the preview readable and keeps X aligned with Y.
# NOTE(review): the scaler is fitted on the full dataset before the
# train/test split, so test-set statistics leak into the scaling. Consider
# fitting it on the training split only (e.g. via a Pipeline).
X = pd.DataFrame(
    StandardScaler().fit_transform(X),
    columns=X.columns,
    index=X.index,
)
X.head()

**Step 5: Splitting the training and testing data, as well as finding the best model parameters for SVM.**
* This is the second-to-last step before completing the program, where the last step will be to find the model with the least amount of error and print that onto the console. 

In [None]:
# Hold out 25% of the samples for testing; the fixed seed makes the split repeatable.
x_train, x_test, y_train, y_test = train_test_split(
    X,
    Y,
    test_size=0.25,
    random_state=42,
)

In [None]:
# Shuffled, seeded 5-fold stratified cross-validation; this splitter is
# reused by the later grid search as well.
cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

# Candidate hyper-parameters for the support vector classifier.
svc = SVC()
svc_args = dict(
    C=[0.1, 1, 10, 100],
    gamma=[1, 0.1, 0.01, 0.001],
    kernel=['rbf', 'poly', 'sigmoid'],
)

# Exhaustively evaluate every combination, ranking candidates by ROC-AUC.
best_model = GridSearchCV(
    svc,
    svc_args,
    scoring='roc_auc',
    cv=cv,
    verbose=1,
)

result_svc = best_model.fit(x_train, y_train)
result_svc.best_params_

The result: 
* C: 100
* gamma: 0.001
* kernel: rbf (radial basis function)

are the three best-fitting parameters for the Support Vector Classifier. It was much easier to use GridSearchCV than to manually alter each C, gamma and kernel value in 
order to get the right output. 

In [None]:
# Re-fit the SVC on the training split using the hyper-parameters that the
# grid search selected.
svc = svc.set_params(**result_svc.best_params_)
svc.fit(x_train, y_train)

# Hard class labels drive the classification report, confusion matrix and accuracy.
prediction = svc.predict(x_test)

# ROC-AUC measures how well the model *ranks* samples, so it needs continuous
# scores. Feeding it the 0/1 predictions collapses the ROC curve to a single
# operating point and misreports the metric.
svc_scores = svc.decision_function(x_test)

print(classification_report(y_test, prediction))
print(confusion_matrix(y_test, prediction))
print(f"ROC-AUC Score: {roc_auc_score(y_test, svc_scores)}")
print(f"Accuracy Score: {accuracy_score(y_test, prediction)}")

As you can see, we have ended up with a 97% accuracy score, which is quite good. The algorithm has done well. 

**COMPARISON: Logistic Regression vs. Support Vector Machine**
* This is an extra and isn't required. Both algorithms are used for classification problems, and to me at least, it's interesting to see which algorithm does the job better. 

In [None]:
log_regr = LogisticRegression()

# Inverse regularisation strengths to try.
c_values = [0.001, 0.01, 0.1, 0.9, 1, 2, 10, 100]

# The grid is split in two because not every solver supports every penalty:
# 'newton-cg', 'lbfgs' and 'sag' only handle the l2 penalty, while
# 'liblinear' and 'saga' handle both l1 and l2. Listing the solvers
# separately also fixes the missing comma that previously fused 'lbfgs' and
# 'liblinear' into the invalid solver name 'lbfgsliblinear'.
log_args = [
    {
        'penalty': ['l2'],
        'C': c_values,
        'solver': ['newton-cg', 'lbfgs', 'sag'],
    },
    {
        'penalty': ['l1', 'l2'],
        'C': c_values,
        'solver': ['liblinear', 'saga'],
    },
]

# Same cross-validation splitter and scoring metric as the SVC search, so
# the two models are selected under identical conditions.
grid_lgr = GridSearchCV(estimator = log_regr,
                        param_grid = log_args,
                        cv = cv,
                        verbose = 1,
                        scoring = 'roc_auc')
result_lgr = grid_lgr.fit(x_train, y_train)
result_lgr.best_params_

In [None]:
# Re-fit logistic regression on the training split using the hyper-parameters
# that the grid search selected.
log_regr = log_regr.set_params(**result_lgr.best_params_)
log_regr.fit(x_train, y_train)

# Hard class labels drive the classification report, confusion matrix and accuracy.
prediction = log_regr.predict(x_test)

# ROC-AUC needs a continuous score per sample: use the predicted probability
# of the positive class (label 1) rather than the 0/1 predictions.
lgr_scores = log_regr.predict_proba(x_test)[:, 1]

print(classification_report(y_test, prediction))
print(confusion_matrix(y_test, prediction))
print(f"ROC-AUC Score: {roc_auc_score(y_test, lgr_scores)}")
print(f"Accuracy Score: {accuracy_score(y_test, prediction)}")

**Final Results**: 
* Accuracy Score (SVC): 97.902%
* Accuracy Score (LGR): 97.902%

Both classification algorithms have the same accuracy score and very similar confusion matrices, with SVC showing slightly higher precision, though the difference is insignificant. 

* Notebook by Akshath Mangudi
* Inspiration and references drawn from: https://www.kaggle.com/code/gevorgakopyan/98-7-breast-cancer-dataset-svm-knn-randomforest
* **End of Notebook**