## Import Data

In [156]:
import pandas as pd
import numpy as np

In [157]:
# use pandas to load data into a DataFrame
df = pd.read_csv("mf_df2_breast_cancer.csv")
df.shape # (rows, columns)

(699, 11)

In [158]:
df.head()

Unnamed: 0,Id,Clump_thickness,Uniformity_cell_size,Uniformity_cell_shape,Marginal_adhesion,Single_e_cell_size,Bare_nuclei,Bland_chromatin,Normal_nucleoli,Mitoses,Class
0,1000025,5,1,1,1,2,1.0,3,1,1,2
1,1002945,5,4,4,5,7,10.0,3,2,1,2
2,1015425,3,1,1,1,2,2.0,3,1,1,2
3,1016277,6,8,8,1,3,4.0,3,7,1,2
4,1017023,4,1,1,3,2,1.0,3,1,1,2


In [159]:
target_column = 'Class'

In [160]:
df[target_column].replace(to_replace={2:0, 4:1}, inplace=True)

In [161]:
df.head()

Unnamed: 0,Id,Clump_thickness,Uniformity_cell_size,Uniformity_cell_shape,Marginal_adhesion,Single_e_cell_size,Bare_nuclei,Bland_chromatin,Normal_nucleoli,Mitoses,Class
0,1000025,5,1,1,1,2,1.0,3,1,1,0
1,1002945,5,4,4,5,7,10.0,3,2,1,0
2,1015425,3,1,1,1,2,2.0,3,1,1,0
3,1016277,6,8,8,1,3,4.0,3,7,1,0
4,1017023,4,1,1,3,2,1.0,3,1,1,0


In [162]:
y_train = np.array(df[target_column]) # value dari label
df = df.drop(target_column, axis=1) # drop target
feature_list = list(df.columns) # list column
X_train = np.array(df) # df dalam bentuk numpy array

### Normalization

In [163]:
# from sklearn.preprocessing import MinMaxScaler

# scaler = MinMaxScaler()
# X_train_norm = scaler.fit_transform(X_train)

from sklearn.preprocessing import StandardScaler

scaler = StandardScaler()
X_train_norm = scaler.fit_transform(X_train)

## Split 8:2

In [164]:
# Using Skicit-learn to split data into training and testing sets
from sklearn.model_selection import train_test_split

TEST_SIZE = 0.2
RANDOM_SEED = 42

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X_train_norm, y_train, test_size = TEST_SIZE, random_state = RANDOM_SEED)

In [165]:
train = pd.DataFrame(X_train)
train

Unnamed: 0,0,1,2,3,4,5,6,7,8,9
0,0.117203,0.206936,-0.372048,-0.743299,-0.633247,-0.549561,-0.697334,-0.179662,-0.611825,-0.343912
1,0.059588,0.206936,-0.044102,-0.069850,0.418154,-0.549561,0.129764,-0.179662,0.371315,-0.343912
2,0.247505,-1.214667,-0.699995,-0.743299,-0.282780,-0.549561,-0.697334,-0.179662,-0.611825,-0.343912
3,0.405782,0.206936,-0.699995,-0.743299,-0.633247,-0.549561,-0.697334,-0.590067,-0.611825,-0.343912
4,0.186866,-0.859266,-0.699995,-0.069850,-0.282780,-0.549561,-0.697334,-0.590067,-0.611825,-0.343912
...,...,...,...,...,...,...,...,...,...,...
554,0.083283,0.562336,2.251526,-0.406574,1.820022,3.065906,-0.421634,1.461957,1.682167,4.907421
555,0.160081,1.983939,2.251526,2.287222,1.820022,-0.549561,1.783960,0.230743,-0.611825,-0.343912
556,-1.104126,1.273138,0.283845,1.277048,-0.633247,-0.097628,1.783960,-0.179662,2.009880,0.239570
557,0.361399,1.983939,1.595632,2.287222,-0.633247,-0.097628,1.783960,0.641147,-0.611825,-0.343912


In [166]:
test = pd.DataFrame(X_test)
test

Unnamed: 0,0,1,2,3,4,5,6,7,8,9
0,0.197808,-1.214667,-0.699995,-0.406574,-0.633247,-0.097628,-0.145935,-1.000471,-0.611825,-0.343912
1,0.237164,-0.148465,-0.699995,-0.743299,-0.633247,-0.549561,-0.697334,-0.590067,-0.611825,-0.343912
2,0.169435,-0.503866,-0.699995,-0.743299,-0.633247,-0.549561,-0.697334,-0.179662,-0.611825,-0.343912
3,0.195605,0.206936,0.611792,0.603599,1.119088,-0.097628,1.783960,-0.179662,-0.611825,-0.343912
4,-0.548225,-0.503866,-0.699995,-0.743299,-0.633247,-0.549561,-0.697334,-0.179662,-0.611825,-0.343912
...,...,...,...,...,...,...,...,...,...,...
135,-0.019707,-1.214667,-0.699995,-0.743299,-0.633247,-0.549561,-0.697334,-0.179662,-0.611825,-0.343912
136,0.246636,0.562336,2.251526,1.277048,1.469555,1.258172,0.129764,1.872361,2.337594,0.239570
137,0.331445,0.206936,-0.699995,-0.743299,0.067687,0.354305,-0.697334,-0.179662,-0.284112,-0.343912
138,-0.948736,-0.859266,-0.699995,-0.743299,-0.282780,-0.549561,-0.697334,-0.179662,-0.611825,-0.343912


In [167]:
from sklearn.svm import SVC
from sklearn.metrics import classification_report

# model = SVC(kernel='sigmoid')
model = SVC()
model.fit(X_train,y_train)

predictions = model.predict(X_test)
print(classification_report(y_test,predictions))

              precision    recall  f1-score   support

           0       0.98      0.98      0.98        95
           1       0.96      0.96      0.96        45

    accuracy                           0.97       140
   macro avg       0.97      0.97      0.97       140
weighted avg       0.97      0.97      0.97       140



In [180]:
from sklearn.model_selection import GridSearchCV
  
# defining parameter range
# param_grid = {'C': [0.1, 1, 10, 100, 1000], 
#               'gamma': [1, 0.1, 0.01, 0.001, 0.0001],
#               'kernel': ['rbf','sigmoid','poly','linear']} 
param_grid = {'C': [0.009], 
              'gamma': [1.2],
              'kernel': ['sigmoid']} 

# best: {'C': 0.1, 'gamma': 1, 'kernel': 'rbf'}

grid = GridSearchCV(SVC(), param_grid, refit = True, verbose = 3, cv=5)
  
# fitting the model for grid search
grid.fit(X_train, y_train)
# print best parameter after tuning
print(grid.best_params_)
  
# print how our model looks after hyper-parameter tuning
print(grid.best_estimator_)



Fitting 5 folds for each of 1 candidates, totalling 5 fits
[CV 1/5] END C=0.009, gamma=1.2, kernel=sigmoid;, score=0.973 total time=   0.0s
[CV 2/5] END C=0.009, gamma=1.2, kernel=sigmoid;, score=0.973 total time=   0.0s
[CV 3/5] END C=0.009, gamma=1.2, kernel=sigmoid;, score=0.982 total time=   0.0s
[CV 4/5] END C=0.009, gamma=1.2, kernel=sigmoid;, score=0.964 total time=   0.0s
[CV 5/5] END C=0.009, gamma=1.2, kernel=sigmoid;, score=0.937 total time=   0.0s
{'C': 0.009, 'gamma': 1.2, 'kernel': 'sigmoid'}
SVC(C=0.009, gamma=1.2, kernel='sigmoid')


In [169]:
grid_predictions = grid.predict(X_test)
  
# print classification report
print(classification_report(y_test, grid_predictions))

              precision    recall  f1-score   support

           0       0.68      1.00      0.81        95
           1       0.00      0.00      0.00        45

    accuracy                           0.68       140
   macro avg       0.34      0.50      0.40       140
weighted avg       0.46      0.68      0.55       140



  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


## Cross Validation

In [170]:
from numpy import mean
from numpy import std
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import RepeatedStratifiedKFold

In [171]:
FOLD = 15
ITERATIONS = 5

In [172]:
# param_grid = {'C': [0.009], 
            #   'gamma': [1.2],
            #   'kernel': ['rbf']}

model = SVC(C=0.009, gamma=1, kernel='rbf')
# cv = RepeatedStratifiedKFold(n_splits=FOLD, n_repeats=ITERATIONS, random_state=RANDOM_SEED)
n_scores = cross_val_score(model, X_train, y_train, scoring='f1_macro', cv=FOLD, n_jobs=-1, error_score='raise')
print('Accuracy: %.3f (%.3f)' % (mean(n_scores), std(n_scores)))

Accuracy: 0.394 (0.002)


In [173]:
print(cross_val_score(model, X_train, y_train, scoring="f1", cv = 5))
mean_score = cross_val_score(model, X_train, y_train, scoring="f1", cv = 5).mean()
std_score = cross_val_score(model, X_train, y_train, scoring="f1", cv = 5).std()
print(mean_score)
print(std_score)

[0. 0. 0. 0. 0.]
0.0
0.0


In [181]:
grid = GridSearchCV(SVC(), param_grid, refit = True, verbose = 3, cv=3)
grid.fit(X_train, y_train)

Fitting 3 folds for each of 1 candidates, totalling 3 fits
[CV 1/3] END C=0.009, gamma=1.2, kernel=sigmoid;, score=0.984 total time=   0.0s
[CV 2/3] END C=0.009, gamma=1.2, kernel=sigmoid;, score=0.968 total time=   0.0s
[CV 3/3] END C=0.009, gamma=1.2, kernel=sigmoid;, score=0.946 total time=   0.0s


In [182]:
preds = grid.predict(X_test)

In [183]:
print(classification_report(y_test, preds, digits=4))

              precision    recall  f1-score   support

           0     0.9787    0.9684    0.9735        95
           1     0.9348    0.9556    0.9451        45

    accuracy                         0.9643       140
   macro avg     0.9568    0.9620    0.9593       140
weighted avg     0.9646    0.9643    0.9644       140



In [177]:
print(classification_report(y_test, preds, digits=4))

              precision    recall  f1-score   support

           0     0.6786    1.0000    0.8085        95
           1     0.0000    0.0000    0.0000        45

    accuracy                         0.6786       140
   macro avg     0.3393    0.5000    0.4043       140
weighted avg     0.4605    0.6786    0.5486       140



  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


In [178]:
predictions = pd.DataFrame(preds)
predictions.to_csv('result.csv', index=False)
predictions

Unnamed: 0,0
0,0
1,0
2,0
3,0
4,0
...,...
135,0
136,0
137,0
138,0


In [None]:
from sklearn.naive_bayes import BernoulliNB

BernoulliNB()