In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.svm import SVC
from sklearn.model_selection import GridSearchCV
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier

In [2]:
dataset = pd.read_csv('breastdata.csv',names=['id','thickness','size_uniformity',
                                                            'shape_uniformity','adhesion','cellsize',
                                                            'nuclei','chromatin','nucleoli','mitoses',
                                                            'type'])
dataset = dataset.drop('id',axis=1)

In [3]:
dataset.loc[dataset['nuclei']=='?','nuclei'] = np.nan
dataset = dataset.dropna()
dataset['nuclei'] = dataset['nuclei'].astype('int')

In [4]:
output_array = []

In [5]:
X = dataset.drop('type',axis = 1)
y= dataset['type']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.4,stratify = y)

In [6]:
y_test.describe()

count    274.000000
mean       2.700730
std        0.955914
min        2.000000
25%        2.000000
50%        2.000000
75%        4.000000
max        4.000000
Name: type, dtype: float64

In [7]:
param_grid = {'C' : [0.1, 0.5, 1, 5, 10, 50, 100]} 
grid = GridSearchCV(SVC(kernel='linear'),param_grid,refit=True)
grid.fit(X_train,y_train)
train_acc = grid.score(X_train,y_train)
test_acc = grid.score(X_test, y_test)
out_arr = ['svm_linear', train_acc, test_acc]
output_array.append(out_arr)



In [8]:
param_grid = {'C' : [0.1, 1, 3], 'gamma' : [0.1, 0.5], 'degree' : [4, 5, 6]} 
grid = GridSearchCV(SVC(kernel='poly'),param_grid,refit=True)
grid.fit(X_train,y_train)
train_acc = grid.score(X_train,y_train)
test_acc = grid.score(X_test, y_test)
out_arr = ['svm_polynomial', train_acc, test_acc]
output_array.append(out_arr)



In [9]:
param_grid = {'C':[0.1, 0.5, 1, 5, 10, 50, 100], 'gamma':[0.1, 0.5, 1, 3, 6, 10]} 
grid = GridSearchCV(SVC(kernel = 'rbf'),param_grid,refit=True)
grid.fit(X_train,y_train)
train_acc = grid.score(X_train,y_train)
test_acc = grid.score(X_test, y_test)
out_arr = ['svm_rbf', train_acc, test_acc]
output_array.append(out_arr)



In [10]:
#Logistic regression
svc = LogisticRegression()
clf = GridSearchCV(svc, parameters)
clf.fit(X_train,y_train)
train_acc = clf.score(X_train,y_train)
test_acc = clf.score(X_test, y_test)
out_arr = ['logistic', train_acc, test_acc]
output_array.append(out_arr)



In [11]:
x = list(range(1,51))
parameters = {'n_neighbors': x, "leaf_size":list(range(5,61,5))}
svc = KNeighborsClassifier()
clf = GridSearchCV(svc, parameters)
clf.fit(X_train,y_train)
train_acc = clf.score(X_train,y_train)
test_acc = clf.score(X_test, y_test)
out_arr = ['knn', train_acc, test_acc]
output_array.append(out_arr)



In [12]:
#Decision trees
parameters = {'max_depth':x, "min_samples_split":[2,3,4,5,6,7,8,9,10]}
svc = DecisionTreeClassifier()
clf = GridSearchCV(svc, parameters)
clf.fit(X_train,y_train)
train_acc = clf.score(X_train,y_train)
test_acc = clf.score(X_test, y_test)
out_arr = ['decision_tree', train_acc, test_acc]
output_array.append(out_arr)



In [13]:
#Random Forest
parameters = {'max_depth':x, 'min_samples_split':[2,3,4,5,6,7,8,9,10]}
svc = RandomForestClassifier()
clf = GridSearchCV(svc, parameters)
clf.fit(X_train,y_train)
train_acc = clf.score(X_train,y_train)
test_acc = clf.score(X_test, y_test)
out_arr = ['random_forest', train_acc, test_acc]
output_array.append(out_arr)







































































In [14]:
output_array

[['svm_linear', 0.9706601466992665, 0.9781021897810219],
 ['svm_polynomial', 1.0, 0.9416058394160584],
 ['svm_rbf', 1.0, 0.9708029197080292],
 ['logistic', 0.9682151589242054, 0.9671532846715328],
 ['knn', 0.9779951100244498, 0.9817518248175182],
 ['decision_tree', 1.0, 0.9708029197080292],
 ['random_forest', 1.0, 0.9708029197080292]]