## Import Data

In [1]:
import pandas as pd
import numpy as np

In [4]:
# use pandas to load data into a DataFrame
df = pd.read_csv("/home/danielbudi/Collage/comp-gammafest/datasets/iris.data", names=['sl', 'sw', 'pl', 'pw', 'class'])
df.shape # (rows, columns)

(150, 5)

In [5]:
df.head()

Unnamed: 0,sl,sw,pl,pw,class
0,5.1,3.5,1.4,0.2,Iris-setosa
1,4.9,3.0,1.4,0.2,Iris-setosa
2,4.7,3.2,1.3,0.2,Iris-setosa
3,4.6,3.1,1.5,0.2,Iris-setosa
4,5.0,3.6,1.4,0.2,Iris-setosa


In [6]:
target_column = 'class'

In [7]:
df[target_column].replace(to_replace={'Iris-setosa':0, 'Iris-versicolor':1, 'Iris-virginica':2}, inplace=True)

In [8]:
df.head()

Unnamed: 0,sl,sw,pl,pw,class
0,5.1,3.5,1.4,0.2,0
1,4.9,3.0,1.4,0.2,0
2,4.7,3.2,1.3,0.2,0
3,4.6,3.1,1.5,0.2,0
4,5.0,3.6,1.4,0.2,0


In [9]:
y_train = np.array(df[target_column]) # value dari label
df = df.drop(target_column, axis=1) # drop target
feature_list = list(df.columns) # list column
X_train = np.array(df) # df dalam bentuk numpy array

### Normalization

In [10]:
# from sklearn.preprocessing import MinMaxScaler

# scaler = MinMaxScaler()
# X_train_norm = scaler.fit_transform(X_train)

from sklearn.preprocessing import StandardScaler

scaler = StandardScaler()
X_train_norm = scaler.fit_transform(X_train)

## Split 8:2

In [11]:
# Using Skicit-learn to split data into training and testing sets
from sklearn.model_selection import train_test_split

TEST_SIZE = 0.2
RANDOM_SEED = 42

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X_train_norm, y_train, test_size = TEST_SIZE, random_state = RANDOM_SEED)

In [12]:
train = pd.DataFrame(X_train)
train

Unnamed: 0,0,1,2,3
0,-1.506521,1.263460,-1.568735,-1.312977
1,-0.173674,3.114684,-1.284407,-1.050031
2,1.038005,0.106445,0.364699,0.264699
3,-1.264185,0.800654,-1.227541,-1.312977
4,-1.748856,0.337848,-1.398138,-1.312977
...,...,...,...,...
115,0.310998,-0.587764,0.137236,0.133226
116,-1.143017,-1.281972,0.421564,0.659118
117,-0.052506,2.189072,-1.455004,-1.312977
118,-0.052506,-1.050569,0.137236,0.001753


In [13]:
test = pd.DataFrame(X_test)
test

Unnamed: 0,0,1,2,3
0,0.310998,-0.587764,0.535296,0.001753
1,-0.173674,1.726266,-1.170675,-1.181504
2,2.249683,-1.050569,1.786341,1.447956
3,0.18983,-0.356361,0.421564,0.396172
4,1.159173,-0.587764,0.592162,0.264699
5,-0.537178,0.800654,-1.284407,-1.050031
6,-0.294842,-0.356361,-0.090227,0.133226
7,1.28034,0.106445,0.762759,1.447956
8,0.432165,-1.976181,0.421564,0.396172
9,-0.052506,-0.819166,0.08037,0.001753


In [16]:
from sklearn.svm import SVC
from sklearn.metrics import classification_report

# model = SVC(kernel='sigmoid')
model = SVC(kernel='rbf', C=0.1, gamma=1)
model.fit(X_train,y_train)

predictions = model.predict(X_test)
print(classification_report(y_test,predictions))

              precision    recall  f1-score   support

           0       1.00      1.00      1.00        10
           1       1.00      1.00      1.00         9
           2       1.00      1.00      1.00        11

    accuracy                           1.00        30
   macro avg       1.00      1.00      1.00        30
weighted avg       1.00      1.00      1.00        30



In [180]:
from sklearn.model_selection import GridSearchCV
  
# defining parameter range
# param_grid = {'C': [0.1, 1, 10, 100, 1000], 
#               'gamma': [1, 0.1, 0.01, 0.001, 0.0001],
#               'kernel': ['rbf','sigmoid','poly','linear']} 
param_grid = {'C': [0.009], 
              'gamma': [1.2],
              'kernel': ['sigmoid']} 

# best: {'C': 0.1, 'gamma': 1, 'kernel': 'rbf'}

grid = GridSearchCV(SVC(), param_grid, refit = True, verbose = 3, cv=5)
  
# fitting the model for grid search
grid.fit(X_train, y_train)
# print best parameter after tuning
print(grid.best_params_)
  
# print how our model looks after hyper-parameter tuning
print(grid.best_estimator_)



Fitting 5 folds for each of 1 candidates, totalling 5 fits
[CV 1/5] END C=0.009, gamma=1.2, kernel=sigmoid;, score=0.973 total time=   0.0s
[CV 2/5] END C=0.009, gamma=1.2, kernel=sigmoid;, score=0.973 total time=   0.0s
[CV 3/5] END C=0.009, gamma=1.2, kernel=sigmoid;, score=0.982 total time=   0.0s
[CV 4/5] END C=0.009, gamma=1.2, kernel=sigmoid;, score=0.964 total time=   0.0s
[CV 5/5] END C=0.009, gamma=1.2, kernel=sigmoid;, score=0.937 total time=   0.0s
{'C': 0.009, 'gamma': 1.2, 'kernel': 'sigmoid'}
SVC(C=0.009, gamma=1.2, kernel='sigmoid')


In [169]:
grid_predictions = grid.predict(X_test)
  
# print classification report
print(classification_report(y_test, grid_predictions))

              precision    recall  f1-score   support

           0       0.68      1.00      0.81        95
           1       0.00      0.00      0.00        45

    accuracy                           0.68       140
   macro avg       0.34      0.50      0.40       140
weighted avg       0.46      0.68      0.55       140



  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


## Cross Validation

In [170]:
from numpy import mean
from numpy import std
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import RepeatedStratifiedKFold

In [171]:
FOLD = 15
ITERATIONS = 5

In [172]:
# param_grid = {'C': [0.009], 
            #   'gamma': [1.2],
            #   'kernel': ['rbf']}

model = SVC(C=0.009, gamma=1, kernel='rbf')
# cv = RepeatedStratifiedKFold(n_splits=FOLD, n_repeats=ITERATIONS, random_state=RANDOM_SEED)
n_scores = cross_val_score(model, X_train, y_train, scoring='f1_macro', cv=FOLD, n_jobs=-1, error_score='raise')
print('Accuracy: %.3f (%.3f)' % (mean(n_scores), std(n_scores)))

Accuracy: 0.394 (0.002)


In [173]:
print(cross_val_score(model, X_train, y_train, scoring="f1", cv = 5))
mean_score = cross_val_score(model, X_train, y_train, scoring="f1", cv = 5).mean()
std_score = cross_val_score(model, X_train, y_train, scoring="f1", cv = 5).std()
print(mean_score)
print(std_score)

[0. 0. 0. 0. 0.]
0.0
0.0


In [181]:
grid = GridSearchCV(SVC(), param_grid, refit = True, verbose = 3, cv=3)
grid.fit(X_train, y_train)

Fitting 3 folds for each of 1 candidates, totalling 3 fits
[CV 1/3] END C=0.009, gamma=1.2, kernel=sigmoid;, score=0.984 total time=   0.0s
[CV 2/3] END C=0.009, gamma=1.2, kernel=sigmoid;, score=0.968 total time=   0.0s
[CV 3/3] END C=0.009, gamma=1.2, kernel=sigmoid;, score=0.946 total time=   0.0s


In [182]:
preds = grid.predict(X_test)

In [183]:
print(classification_report(y_test, preds, digits=4))

              precision    recall  f1-score   support

           0     0.9787    0.9684    0.9735        95
           1     0.9348    0.9556    0.9451        45

    accuracy                         0.9643       140
   macro avg     0.9568    0.9620    0.9593       140
weighted avg     0.9646    0.9643    0.9644       140



In [177]:
print(classification_report(y_test, preds, digits=4))

              precision    recall  f1-score   support

           0     0.6786    1.0000    0.8085        95
           1     0.0000    0.0000    0.0000        45

    accuracy                         0.6786       140
   macro avg     0.3393    0.5000    0.4043       140
weighted avg     0.4605    0.6786    0.5486       140



  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


In [178]:
predictions = pd.DataFrame(preds)
predictions.to_csv('result.csv', index=False)
predictions

Unnamed: 0,0
0,0
1,0
2,0
3,0
4,0
...,...
135,0
136,0
137,0
138,0


In [None]:
from sklearn.naive_bayes import BernoulliNB

BernoulliNB()