In [97]:
# Importing libraries
import os
import time
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
import matplotlib.pyplot as plt
from sklearn.metrics import confusion_matrix
import seaborn as sns

In [98]:
# Importing data

# Define the column names
column_names = ['seq_name', 'mcg', 'gvh', 'lip', 'chg', 'aac', 'alm1', 'alm2', 'class']
# Read the file
df = pd.read_csv('data/ecoli.data', sep='\s+', names=column_names)

# Display the DataFrame
print(df.info())
print(df.head())
# print(df.describe())

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 336 entries, 0 to 335
Data columns (total 9 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   seq_name  336 non-null    object 
 1   mcg       336 non-null    float64
 2   gvh       336 non-null    float64
 3   lip       336 non-null    float64
 4   chg       336 non-null    float64
 5   aac       336 non-null    float64
 6   alm1      336 non-null    float64
 7   alm2      336 non-null    float64
 8   class     336 non-null    object 
dtypes: float64(7), object(2)
memory usage: 23.8+ KB
None
     seq_name   mcg   gvh   lip  chg   aac  alm1  alm2 class
0   AAT_ECOLI  0.49  0.29  0.48  0.5  0.56  0.24  0.35    cp
1  ACEA_ECOLI  0.07  0.40  0.48  0.5  0.54  0.35  0.44    cp
2  ACEK_ECOLI  0.56  0.40  0.48  0.5  0.49  0.37  0.46    cp
3  ACKA_ECOLI  0.59  0.49  0.48  0.5  0.52  0.45  0.36    cp
4   ADI_ECOLI  0.23  0.32  0.48  0.5  0.55  0.25  0.35    cp


In [99]:
# Splitting the data into train and test sets
X = df.drop(['seq_name', 'class'], axis=1)
y = df['class']

X = X.astype(float)

print("BINARY CLASSIFICATION DATASET")
pass

# binary_X_train, binary_X_test, binary_y_train, binary_y_test = train_test_split(binary_X, binary_y, test_size=0.2, random_state=42)
# print('\nbinary_X_train shape: ', binary_X_train.shape)
# print('binary_y_train shape: ', binary_y_train.shape)
# print('binary_X_test shape: ', binary_X_test.shape)
# print('binary_y_test shape: ', binary_y_test.shape)

# print("binary_X_train:\n", binary_X_train.head())
# print("binary_y_train:\n", binary_y_train.head())
# print("binary_X_test:\n", binary_X_test.head())
# print("binary_y_test:\n", binary_y_test.head())

print("MULTI-CLASS CLASSIFICATION DATASET")
# MULTI-CLASS CLASSIFICATION DATASET
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
print('\nX_train shape: ', X_train.shape)
print('y_train shape: ', y_train.shape)
print('X_test shape: ', X_test.shape)
print('y_test shape: ', y_test.shape)

print("X_train:\n", X_train.head())
print("y_train:\n", y_train.head())
print("X_test:\n", X_test.head())
print("y_test:\n", y_test.head())



BINARY CLASSIFICATION DATASET
MULTI-CLASS CLASSIFICATION DATASET

X_train shape:  (268, 7)
y_train shape:  (268,)
X_test shape:  (68, 7)
y_test shape:  (68,)
X_train:
       mcg   gvh   lip  chg   aac  alm1  alm2
60   0.44  0.56  0.48  0.5  0.50  0.46  0.54
227  0.69  0.43  0.48  0.5  0.59  0.74  0.77
322  0.62  0.78  0.48  0.5  0.47  0.49  0.54
318  0.66  0.71  0.48  0.5  0.41  0.50  0.35
17   0.44  0.27  0.48  0.5  0.55  0.52  0.58
y_train:
 60      cp
227    imU
322     pp
318     pp
17      cp
Name: class, dtype: object
X_test:
       mcg   gvh   lip  chg   aac  alm1  alm2
72   0.37  0.50  0.48  0.5  0.42  0.36  0.45
110  0.32  0.33  0.48  0.5  0.60  0.06  0.20
298  0.64  0.72  0.48  0.5  0.49  0.42  0.19
108  0.49  0.43  0.48  0.5  0.49  0.30  0.40
277  0.71  0.71  0.48  0.5  0.68  0.43  0.36
y_test:
 72     cp
110    cp
298    pp
108    cp
277    om
Name: class, dtype: object


In [100]:
import numpy as np

class BinarySVM:
    def __init__(self, max_iter=1000, kernel_type='linear', C=1.0, epsilon=0.001, gamma=0.1, r=1):
        self.kernels = {
            'linear' : self.kernel_linear,
            'sigmoid': self.kernel_sigmoid,
            'rbf': self.kernel_rbf
        }
        self.max_iter = max_iter
        self.kernel_type = kernel_type
        self.C = C
        self.epsilon = epsilon
        self.gamma = gamma
        self.r = r

    def fit(self, X, y):
        # Initialization
        n, d = X.shape[0], X.shape[1]
        alpha = np.zeros((n))
        kernel = self.kernels[self.kernel_type]
        count = 0
        while True:
            count += 1
            alpha_prev = np.copy(alpha)
            for j in range(0, n):
                i = self.get_random_int(0, n-1, j) # Get random int i~=j
                x_i, x_j, y_i, y_j = X[i,:], X[j,:], y[i], y[j]
                k_ij = kernel(x_i, x_i) + kernel(x_j, x_j) - 2 * kernel(x_i, x_j)
                if k_ij == 0:
                    continue
                alpha_prime_j, alpha_prime_i = alpha[j], alpha[i]
                (L, H) = self.compute_L_H(self.C, alpha_prime_j, alpha_prime_i, y_j, y_i)

                # Compute model parameters
                self.w = self.calc_w(alpha, y, X)
                self.b = self.calc_b(X, y, self.w)

                # Compute E_i, E_j
                E_i = self.E(x_i, y_i, self.w, self.b)
                E_j = self.E(x_j, y_j, self.w, self.b)

                # Set new alpha values
                alpha[j] = alpha_prime_j + float(y_j * (E_i - E_j))/k_ij
                alpha[j] = max(alpha[j], L)
                alpha[j] = min(alpha[j], H)

                alpha[i] = alpha_prime_i + y_i*y_j * (alpha_prime_j - alpha[j])

            # Check convergence
            diff = np.linalg.norm(alpha - alpha_prev)
            if diff < self.epsilon:
                break

            if count >= self.max_iter:
                print("Iteration number exceeded the max of %d iterations" % (self.max_iter))
                return
        # Compute final model parameters
        self.b = self.calc_b(X, y, self.w)
        if self.kernel_type == 'linear':
            self.w = self.calc_w(alpha, y, X)
        # Get support vectors
        alpha_idx = np.where(alpha > 0)[0]
        support_vectors = X[alpha_idx, :]
        return support_vectors, count

    def predict(self, X):
        return self.h(X, self.w, self.b)

    def calc_b(self, X, y, w):
        b_tmp = y - np.dot(w.T, X.T)
        return np.mean(b_tmp)

    def calc_w(self, alpha, y, X):
        return np.dot(X.T, np.multiply(alpha,y))

    # Prediction
    def h(self, X, w, b):
        return np.sign(np.dot(w.T, X.T) + b).astype(int)

    # Prediction error
    def E(self, x_k, y_k, w, b):
        return self.h(x_k, w, b) - y_k

    def compute_L_H(self, C, alpha_prime_j, alpha_prime_i, y_j, y_i):
        if(y_i != y_j):
            return (max(0, alpha_prime_j - alpha_prime_i), min(C, C - alpha_prime_i + alpha_prime_j))
        else:
            return (max(0, alpha_prime_i + alpha_prime_j - C), min(C, alpha_prime_i + alpha_prime_j))

    def get_random_int(self, a, b, z):
        i = z
        cnt=0
        while i == z and cnt<1000:
            i = np.random.randint(a, b)
            cnt=cnt+1
        return i

    # Define kernels
    def kernel_linear(self, x1, x2):
        return np.dot(x1, x2.T)

    def kernel_sigmoid(self, x1, x2):
        return np.tanh(self.gamma * np.dot(x1, x2.T) + self.r)

    def kernel_rbf(self, x1, x2):
        distance = np.linalg.norm(x1 - x2) ** 2
        return np.exp(-self.gamma * distance)

class MultiClassSVM:
    def __init__(self, C=1.0, max_iter=1000, kernel_type='linear', epsilon=0.001, gamma=0.1, r=1):
        self.C = C
        self.max_iter = max_iter
        self.kernel_type = kernel_type
        self.epsilon = epsilon
        self.gamma = gamma
        self.r = r
        self.models = []

    def fit(self, X, y):
        self.classes = np.unique(y)
        for c in self.classes:
            # Create a binary label for this class
            binary_y = (y == c).astype(int)
            binary_y[binary_y == 0] = -1
            # Train a binary SVM
            svm = BinarySVM(C=self.C, max_iter=self.max_iter, kernel_type=self.kernel_type, epsilon=self.epsilon, gamma=self.gamma, r=self.r)
            svm.fit(X, binary_y)
            # Save the trained model
            self.models.append(svm)

    def predict(self, X):
        # Get the prediction from each binary SVM
        predictions = np.zeros((X.shape[0], len(self.models)))
        for i, svm in enumerate(self.models):
            predictions[:, i] = svm.predict(X)
        # Choose the class that got the most votes
        return self.classes[np.argmax(predictions, axis=1)]

In [101]:
print(X_train)
print(y_train)


      mcg   gvh   lip  chg   aac  alm1  alm2
60   0.44  0.56  0.48  0.5  0.50  0.46  0.54
227  0.69  0.43  0.48  0.5  0.59  0.74  0.77
322  0.62  0.78  0.48  0.5  0.47  0.49  0.54
318  0.66  0.71  0.48  0.5  0.41  0.50  0.35
17   0.44  0.27  0.48  0.5  0.55  0.52  0.58
..    ...   ...   ...  ...   ...   ...   ...
188  0.12  0.43  0.48  0.5  0.63  0.70  0.74
71   0.44  0.51  0.48  0.5  0.47  0.26  0.36
106  0.44  0.49  0.48  0.5  0.39  0.38  0.40
270  0.56  0.68  0.48  0.5  0.77  0.36  0.45
102  0.38  0.26  0.48  0.5  0.54  0.16  0.28

[268 rows x 7 columns]
60      cp
227    imU
322     pp
318     pp
17      cp
      ... 
188     im
71      cp
106     cp
270     om
102     cp
Name: class, Length: 268, dtype: object


In [102]:
# Training Multi-Class SVM
print("Training Multi-Class SVM")
svm = MultiClassSVM(C=100, kernel_type='linear', gamma=1, epsilon=1)
start_time = time.time()
svm.fit(X_train.values, y_train.values)
end_time = time.time()

print("Training time: ", end_time - start_time)

# Testing Multi-Class SVM
print("Testing Multi-Class SVM")
y_pred = svm.predict(X_test.values)

# Accuracy
from sklearn.metrics import accuracy_score
accuracy = accuracy_score(y_test, y_pred)
print("Accuracy: ", accuracy)

Training Multi-Class SVM
Training time:  0.6311080455780029
Testing Multi-Class SVM
Accuracy:  0.6617647058823529


In [103]:
# Testing against sklearn's SVM
from sklearn.svm import SVC
svc = SVC(kernel='linear', gamma=1, C=100)
start_time = time.time()
svc.fit(X_train.values, y_train.values)
end_time = time.time()
print("Training time: ", end_time - start_time)
y_pred = svc.predict(X_test.values)
accuracy = accuracy_score(y_test, y_pred)
print("Accuracy: ", accuracy)

Training time:  0.007405996322631836
Accuracy:  0.8529411764705882


In [104]:
from sklearn.model_selection import GridSearchCV
from sklearn.svm import SVC

param_grid = {'C': [0.1, 1, 10, 100], 'gamma': [1, 0.1, 0.01, 0.001], 'kernel': ['sigmoid', 'rbf', 'linear'], 'tol': [0.0001, 0.001, 0.01, 0.1, 1]}

grid = GridSearchCV(SVC(), param_grid, refit=True, verbose=2)
grid.fit(X_train, y_train)

print(grid.best_params_)

Fitting 5 folds for each of 240 candidates, totalling 1200 fits
[CV] END .........C=0.1, gamma=1, kernel=sigmoid, tol=0.0001; total time=   0.0s
[CV] END .........C=0.1, gamma=1, kernel=sigmoid, tol=0.0001; total time=   0.0s
[CV] END .........C=0.1, gamma=1, kernel=sigmoid, tol=0.0001; total time=   0.0s
[CV] END .........C=0.1, gamma=1, kernel=sigmoid, tol=0.0001; total time=   0.0s
[CV] END .........C=0.1, gamma=1, kernel=sigmoid, tol=0.0001; total time=   0.0s
[CV] END ..........C=0.1, gamma=1, kernel=sigmoid, tol=0.001; total time=   0.0s
[CV] END ..........C=0.1, gamma=1, kernel=sigmoid, tol=0.001; total time=   0.0s
[CV] END ..........C=0.1, gamma=1, kernel=sigmoid, tol=0.001; total time=   0.0s
[CV] END ..........C=0.1, gamma=1, kernel=sigmoid, tol=0.001; total time=   0.0s
[CV] END ..........C=0.1, gamma=1, kernel=sigmoid, tol=0.001; total time=   0.0s
[CV] END ...........C=0.1, gamma=1, kernel=sigmoid, tol=0.01; total time=   0.0s
[CV] END ...........C=0.1, gamma=1, kernel=si



[CV] END .........C=0.1, gamma=0.1, kernel=sigmoid, tol=0.01; total time=   0.0s
[CV] END ..........C=0.1, gamma=0.1, kernel=sigmoid, tol=0.1; total time=   0.0s
[CV] END ..........C=0.1, gamma=0.1, kernel=sigmoid, tol=0.1; total time=   0.0s
[CV] END ..........C=0.1, gamma=0.1, kernel=sigmoid, tol=0.1; total time=   0.0s
[CV] END ..........C=0.1, gamma=0.1, kernel=sigmoid, tol=0.1; total time=   0.0s
[CV] END ..........C=0.1, gamma=0.1, kernel=sigmoid, tol=0.1; total time=   0.0s
[CV] END ............C=0.1, gamma=0.1, kernel=sigmoid, tol=1; total time=   0.0s
[CV] END ............C=0.1, gamma=0.1, kernel=sigmoid, tol=1; total time=   0.0s
[CV] END ............C=0.1, gamma=0.1, kernel=sigmoid, tol=1; total time=   0.0s
[CV] END ............C=0.1, gamma=0.1, kernel=sigmoid, tol=1; total time=   0.0s
[CV] END ............C=0.1, gamma=0.1, kernel=sigmoid, tol=1; total time=   0.0s
[CV] END ...........C=0.1, gamma=0.1, kernel=rbf, tol=0.0001; total time=   0.0s
[CV] END ...........C=0.1, g