<a href="https://colab.research.google.com/github/buingohoanglong/SVM-kNN-DecisionTree/blob/main/SVM.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Load data

In [555]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [556]:
!ls

drive  sample_data


In [557]:
import pandas as pd

In [558]:
dataset = pd.read_csv('drive/MyDrive/desktop/ADA/extra/spam_ham_dataset.csv')

In [559]:
dataset

Unnamed: 0.1,Unnamed: 0,label,text,label_num
0,605,ham,Subject: enron methanol ; meter # : 988291\r\n...,0
1,2349,ham,"Subject: hpl nom for january 9 , 2001\r\n( see...",0
2,3624,ham,"Subject: neon retreat\r\nho ho ho , we ' re ar...",0
3,4685,spam,"Subject: photoshop , windows , office . cheap ...",1
4,2030,ham,Subject: re : indian springs\r\nthis deal is t...,0
...,...,...,...,...
5166,1518,ham,Subject: put the 10 on the ft\r\nthe transport...,0
5167,404,ham,Subject: 3 / 4 / 2000 and following noms\r\nhp...,0
5168,2933,ham,Subject: calpine daily gas nomination\r\n>\r\n...,0
5169,1409,ham,Subject: industrial worksheets for august 2000...,0


In [560]:
data_X = dataset['text']
data_y = dataset['label_num']

In [None]:
data_X

In [None]:
data_y

In [None]:
import nltk
nltk.download('punkt')
nltk.download('stopwords')

In [562]:
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
import re

In [563]:
def text_cleaner(text):
    text = text.lower() # lower case text
    text = re.sub(r"'s\b","",text)  # remove 's at the end of each word
    # remove punctuations
    text = re.sub("[^a-zA-Z]", " ", text) # how about numbers ???
    return text

In [564]:
def preprocess(sentence):
  sentence = text_cleaner(sentence)
  tokens = word_tokenize(sentence)
  stop_words = set(stopwords.words('english'))
  tokens = [tk for tk in tokens if not tk in stop_words]
  return tokens

In [565]:
sentences = [preprocess(s) for s in data_X]

In [566]:
labels = [l if l == 1 else -1 for l in data_y]

# Load google pretrained word2vec

In [None]:
import gensim.downloader as api
path = api.load("word2vec-google-news-300", return_path=True)

In [None]:
print(path)

/root/gensim-data/word2vec-google-news-300/word2vec-google-news-300.gz


In [None]:
import gensim
from gensim.models import Word2Vec
from gensim.utils import simple_preprocess

from gensim.models.keyedvectors import KeyedVectors

word_vectors = KeyedVectors.load_word2vec_format(path, binary=True)

In [None]:
word_vectors['facebook'].shape

(300,)

# Prepare data

In [567]:
import numpy as np

In [568]:
EMBEDDING_DIM = 300

In [569]:
X = np.zeros((len(sentences),EMBEDDING_DIM))

In [570]:
index = 0
for sentence in sentences:
  for token in sentence:
    try:
      embedding_vector = word_vectors[token]
      X[index] += embedding_vector
    except KeyError:
      X[index] += np.random.normal(0,np.sqrt(0.25),EMBEDDING_DIM)
  index += 1

In [None]:
print(X[0])

In [571]:
from sklearn.model_selection import train_test_split

In [572]:
X_train, X_test, y_train, y_test = train_test_split(X, np.array(labels))
X_train = X_train.T
X_test = X_test.T
y_train = y_train.reshape(1, y_train.shape[0])
y_test = y_test.reshape(1, y_test.shape[0])

In [573]:
print(X_train.shape)
print(y_train.shape)
print(X_test.shape)
print(y_test.shape)

(300, 3878)
(1, 3878)
(300, 1293)
(1, 1293)


# Soft Margin SVM model

In [574]:
class SVM:
  def __init__(self):
    self.weights = None

  def calculate_accurracy(self, predicts, y):
    f = np.frompyfunc(lambda x: x if x == 1 else 0,1,1)
    num_correct_predicts = np.sum(f(predicts*y))
    num_predicts = y.shape[1]
    acc = num_correct_predicts / num_predicts
    return acc

  def has_positive_hinge_loss(self, w_bar, x_bar, y):
    f = np.frompyfunc(lambda x: 1 if x > 0 else 0, 1, 1)
    return f(1 - y*w_bar.T.dot(x_bar))

  def fit(self, X, y, learning_rate=0.001, ld=1, loops=1000):
    X_bar = np.concatenate((X, np.ones((1, X.shape[1]))), axis=0) # extend
    w_bar = np.ones((X_bar.shape[0], 1)) # extend

    for i in range(loops):
      if i%100 == 0:
        predicts, acc = svm_predict(X,y,w_bar)
        print(f"Acc: {acc}")
        
      temp = has_positive_hinge_loss(w_bar, X_bar, y)
      regularization_gradient = ld * np.concatenate((w_bar[:-1], np.array([[0]])), axis=0)
      hinge_gradient = np.sum(-temp*y*X_bar, axis=1).reshape(X_bar.shape[0], 1)
      w_bar = w_bar - learning_rate * ( hinge_gradient + regularization_gradient )

    self.weights = w_bar

  def predict(self, X, y=None):
    X_bar = np.concatenate((X, np.ones((1, X.shape[1]))), axis=0) # extend
    predicts = np.sign(self.weights.T.dot(X_bar))

    if y is not None: # used for test purpose
      # calculate accurracy
      acc = self.calculate_accurracy(predicts, y)
      return predicts, acc
    else:
      return predicts

  def save(self, file_name):
    with open(file_name, 'wb') as f:
      np.save(f, self.weights)

  def load(self, file_name):
    with open(file_name, 'rb') as f:
      self.weights = np.load(f, allow_pickle=True)
  

In [575]:
svm_model = SVM()
svm_model.fit(X_train, y_train)

Acc: 0.6330582774626096
Acc: 0.9203197524497163
Acc: 0.7926766374419804
Acc: 0.9195461578133058
Acc: 0.944043321299639
Acc: 0.9154203197524498
Acc: 0.9342444559051057
Acc: 0.9406910778751933
Acc: 0.9448169159360496
Acc: 0.9448169159360496


In [576]:
svm_model.save('drive/MyDrive/desktop/ADA/extra/weights.npy')

In [577]:
svm_model.load('drive/MyDrive/desktop/ADA/extra/weights.npy')

In [578]:
predicts, acc = svm_model.predict(X_test, y_test)

In [579]:
acc

0.925754060324826

# kNN model

In [580]:
class kNN:
  def __init__(self):
    self.X_train = None
    self.y_train = None

  def calculate_accurracy(self, predicts, y):
    f = np.frompyfunc(lambda x: x if x == 1 else 0,1,1)
    num_correct_predicts = np.sum(f(predicts*y))
    num_predicts = y.shape[1]
    acc = num_correct_predicts / num_predicts
    return acc
  
  def euclidean_distance(self, x, X):
    return np.sum((x - X)**2, axis=0)

  def voting(self, k_nearest, distances, labels):
    scores = np.zeros((2,1))
    for neighbour in k_nearest:
      d = distances[neighbour]
      if d == 0:
        return labels[:,neighbour]

      if labels[:,neighbour] == 1:
        scores[0] += 1/d
      else:
        scores[1] += 1/d

    return 1 if scores[0] > scores[1] else -1

  def fit(self, X, y):
    self.X = X
    self.y = y

  def predict(self, X_test, y_test=None, k=10):
    predicts = np.zeros((1, y_test.shape[1]))
    for i in range(X_test.shape[1]):
      x = X_test[:,i].reshape((X_test.shape[0], 1))
      distances = euclidean_distance(x, self.X)
      k_nearest = np.argpartition(distances, k)
      predicts[:,i] = voting(k_nearest, distances, self.y)

    if y_test is not None:  # used for test purpose
      acc = self.calculate_accurracy(predicts, y_test)
      return predicts, acc
    else:
      return predicts
  

In [581]:
knn_model = kNN()
knn_model.fit(X_train, y_train)
predicts, acc = knn_model.predict(X_test, y_test)

In [582]:
acc

0.7146171693735499

# Decision Tree model

In [583]:
class Node:
  def __init__(self, ids, depth):
    self.ids = ids # index of datapoint in dataset
    self.entropy = None

    # leaf node attribute
    self.label = None

    # internal node attribute
    self.split_attribute = None # index of attribute to split at this node
    self.split_value = None # value to be split at self.slit_attribute
    self.children = []  # list of child nodes (left and right)

    self.depth = depth

  def set_state(self, split_attribute, split_value, children):
    self.split_attribute = split_attribute
    self.split_value = split_value
    self.children = children

In [584]:
from functools import *
import math

class DecisionTree:
  def __init__(self, max_depth):
    self.X = [] # list of data point, each data point is a list of attributes (write in row)
    self.y = []  # list of labels

    self.root = None

    self.max_depth = max_depth

  def entropy(self, node):
    positive_count = 0
    negative_count = 0
    for index in node.ids:
      if self.y[index] == 1:
        positive_count += 1
      else:
        negative_count += 1
    
    positive_prob = positive_count/len(node.ids)
    negative_prob = negative_count/len(node.ids)

    if positive_count == len(node.ids) or negative_count == len(node.ids):
      node.entropy = 0
    else:
      node.entropy = -(positive_prob*math.log(positive_prob) + negative_prob*math.log(negative_prob))

    return node.entropy

  def set_label(self, node):
    node.label = 1 if sum([self.y[index] for index in node.ids]) >= 0 else -1


  def split(self, node, attribute):
    split_value = None
    best_information_gain = 0
    children = []

    split_value_set = set(list(filter(lambda index: self.X[index][attribute], node.ids)))
    for value in split_value_set:
      left_child = Node(ids=list(filter(lambda index: self.X[index][attribute] < value, node.ids)), depth=node.depth+1)
      right_child = Node(ids=list(filter(lambda index: self.X[index][attribute] >= value, node.ids)), depth=node.depth+1)

      if len(left_child.ids) == len(node.ids) or len(right_child.ids) == len(node.ids):
        information_gain = 0
      else:
        information_gain = node.entropy - (len(left_child.ids)*self.entropy(left_child) + len(right_child.ids)*self.entropy(right_child)) / len(node.ids)

      if information_gain > best_information_gain:
        best_information_gain = information_gain
        children = [left_child, right_child]
        split_value = value

    return best_information_gain, attribute, split_value, children



  def split_node(self, node):
    best_information_gain = 0
    split_attribute = None
    split_value = None
    children = []

    for attribute in range(len(self.X[0])):
      information_gain, current_split_attribute, current_split_value, current_children = self.split(node, attribute)
      if information_gain > best_information_gain:
        best_information_gain = information_gain
        split_attribute = current_split_attribute
        split_value = current_split_value
        children = current_children

    node.set_state(split_attribute, split_value, children)


  def build_tree(self):
    # build tree in bfs manner
    queue = [self.root]
    while len(queue) != 0:
      node = queue.pop(0)
      if self.entropy(node) == 0 or node.depth >= self.max_depth: # leaf node
        self.set_label(node)
      else: # internal node
        self.split_node(node)
        if node.children == []:
          self.set_label(node)
        else:
          for child in node.children:
            queue.append(child)


  def fit(self, X, y):
    self.X = X.T.tolist()
    self.y = y.tolist()[0]

    ids = list(range(len(self.y)))
    self.root = Node(ids=ids, depth=0)
    
    self.build_tree()

  def __predict(self, x, node):
    if node.label is not None:
      return node.label

    if x[node.split_attribute] >= node.split_value:
      return self.__predict(x, node.children[1])
    else:
      return self.__predict(x, node.children[0])


  def predict(self, x):
    return self.__predict(x, self.root)

  def test(self, X, y):
    X_test = X.T.tolist()
    predicts = np.zeros(y.shape)

    for index in range(len(X_test)):
      x = X_test[index]
      predicts[0, index] = self.predict(x)

    # calculate accurracy
    f = np.frompyfunc(lambda x: x if x == 1 else 0,1,1)
    num_correct_predicts = np.sum(f(predicts*y))
    num_predicts = y.shape[1]
    acc = num_correct_predicts / num_predicts

    return predicts, acc

        



# Test Decision Tree model with banknote authentication dataset

In [585]:
banknote_dataset = pd.read_csv('drive/MyDrive/desktop/ADA/extra/data_banknote_authentication.txt', delimiter = ",", header=None)

In [586]:
banknote_dataset

Unnamed: 0,0,1,2,3,4
0,3.62160,8.66610,-2.8073,-0.44699,0
1,4.54590,8.16740,-2.4586,-1.46210,0
2,3.86600,-2.63830,1.9242,0.10645,0
3,3.45660,9.52280,-4.0112,-3.59440,0
4,0.32924,-4.45520,4.5718,-0.98880,0
...,...,...,...,...,...
1367,0.40614,1.34920,-1.4501,-0.55949,1
1368,-1.38870,-4.87730,6.4774,0.34179,1
1369,-3.75030,-13.45860,17.5932,-2.77710,1
1370,-3.56370,-8.38270,12.3930,-1.28230,1


In [587]:
banknote_X = banknote_dataset.iloc[:,:-1].to_numpy()
banknote_y = banknote_dataset.iloc[:,-1].to_numpy()
banknote_y = np.where(banknote_y == 0, -1, banknote_y)

In [588]:
banknote_X_train, banknote_X_test, banknote_y_train, banknote_y_test = train_test_split(banknote_X, banknote_y)
banknote_X_train = banknote_X_train.T
banknote_X_test = banknote_X_test.T
banknote_y_train = banknote_y_train.reshape(1, banknote_y_train.shape[0])
banknote_y_test = banknote_y_test.reshape(1, banknote_y_test.shape[0])

In [589]:
print(banknote_X_train.shape)
print(banknote_y_train.shape)
print(banknote_X_test.shape)
print(banknote_y_test.shape)

(4, 1029)
(1, 1029)
(4, 343)
(1, 343)


In [590]:
dt_model = DecisionTree(max_depth=10)
dt_model.fit(banknote_X_train, banknote_y_train)

In [591]:
predicts, acc = dt_model.test(banknote_X_test, banknote_y_test)

In [592]:
acc

0.8892128279883382

In [593]:
model = SVM()
model.fit(banknote_X_train, banknote_y_train)

Acc: 0.15743440233236153
Acc: 0.9883381924198251
Acc: 0.9883381924198251
Acc: 0.9912536443148688
Acc: 0.9912536443148688
Acc: 0.989310009718173
Acc: 0.9902818270165209
Acc: 0.9902818270165209
Acc: 0.9883381924198251
Acc: 0.9883381924198251


In [594]:
predicts, acc = model.predict(banknote_X_test, banknote_y_test)

In [595]:
acc

0.9825072886297376

In [596]:
model = kNN()
model.fit(banknote_X_train, banknote_y_train)
predicts, acc = model.predict(banknote_X_test, banknote_y_test)

In [597]:
acc

0.9970845481049563