In [2]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import sklearn 
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn import svm
from sklearn.metrics import accuracy_score, recall_score, precision_score, f1_score, classification_report

In [3]:
## reading in the data from the data file
data = pd.read_csv('project4.data')

In [4]:
## treating "?" values in node-caps with "no" because that is the "mode" or most frequent value
data["node-caps"] = data["node-caps"].replace("?","no")

In [5]:
## treating "?" values in breast-quad with "left_low" because that is the "mode" or most frequent value
data["breast-quad"] = data["breast-quad"].replace("?","left_low")

In [6]:
## converting columns to category type for one-hot encoding
data['class'] = data['class'].astype("category")
data['menopause'] = data['menopause'].astype("category")
data['node-caps'] = data['node-caps'].astype("category")
data['breast'] = data['breast'].astype("category")
data['breast-quad'] = data['breast-quad'].astype("category")
data['irradiat'] = data['irradiat'].astype("category")

In [7]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 286 entries, 0 to 285
Data columns (total 10 columns):
 #   Column       Non-Null Count  Dtype   
---  ------       --------------  -----   
 0   class        286 non-null    category
 1   age          286 non-null    object  
 2   menopause    286 non-null    category
 3   tumor-size   286 non-null    object  
 4   inv-nodes    286 non-null    object  
 5   node-caps    286 non-null    category
 6   deg-malig    286 non-null    int64   
 7   breast       286 non-null    category
 8   breast-quad  286 non-null    category
 9   irradiat     286 non-null    category
dtypes: category(6), int64(1), object(3)
memory usage: 11.6+ KB


In [8]:
data = pd.get_dummies(data, columns=["class", "menopause", "node-caps", 'breast','breast-quad','irradiat'], drop_first=True)

In [9]:
## doing data type conversion on age, tumor-size, and inv-nodes by replacing ranges with their midpoint
data["age"] = data["age"].replace({'20-29':'25', '30-39':'35','40-49':'45','50-59':'55','60-69':'65','70-79':'75'})
data["tumor-size"] = data["tumor-size"].replace({'0-4':'2', '5-9':'7','10-14':'12','15-19':'17','20-24':'22','25-29':'27', '30-34':'32','35-39':'37','40-44':'42','45-49':'47','50-54':'52'})
data["inv-nodes"] = data["inv-nodes"].replace({'0-2':'1', '3-5':'4','6-8':'7','9-11':'10','12-14':'13','15-17':'16','24-26':'25'})

In [10]:
## converting the columns to ints
data["age"] = data["age"].astype(int)
data["tumor-size"] = data["tumor-size"].astype(int)
data["inv-nodes"] = data["inv-nodes"].astype(int)

In [11]:
X = data.drop('class_recurrence-events', axis=1)
y = data['class_recurrence-events']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, stratify=y, random_state=1)

In [12]:
clf = svm.SVC()
clf.fit(X_train,y_train)

In [13]:
accuracy_test = accuracy_score(y_test, clf.predict(X_test))
accuracy_test

0.6976744186046512

In [14]:
accuracy_train=accuracy_score(y_train, clf.predict(X_train))
accuracy_train

0.705

In [17]:
## attempting to improve accuracy using kernel functions
linear_svc = svm.SVC(kernel='linear')
linear_svc.fit(X_train,y_train)
linear_accuracy = accuracy_score(y_test, linear_svc.predict(X_test))
linear_train_accuracy = accuracy_score(y_test, linear_svc.predict(X_test))
linear_accuracy

0.6627906976744186

In [18]:
linear_train_accuracy

0.6627906976744186

In [19]:
poly_svc = svm.SVC(kernel='poly')
poly_svc.fit(X_train,y_train)
poly_accuracy = accuracy_score(y_test, poly_svc.predict(X_test))
poly_train_accuracy = accuracy_score(y_test, poly_svc.predict(X_test))
poly_accuracy

0.6976744186046512

In [20]:
poly_train_accuracy

0.6976744186046512

In [21]:
rbf_svc = svm.SVC(kernel='rbf')
rbf_svc.fit(X_train,y_train)
rbf_accuracy = accuracy_score(y_test, rbf_svc.predict(X_test))
rbf_train_accuracy = accuracy_score(y_test, rbf_svc.predict(X_test))
rbf_accuracy

0.6976744186046512

In [22]:
rbf_train_accuracy

0.6976744186046512

In [23]:
sig_svc = svm.SVC(kernel='sigmoid')
sig_svc.fit(X_train,y_train)
sig_accuracy = accuracy_score(y_test, sig_svc.predict(X_test))
sig_train_accuracy = accuracy_score(y_test, sig_svc.predict(X_test))
sig_accuracy

0.6976744186046512

In [24]:
sig_train_accuracy

0.6976744186046512