### Decision Tree classification

In [1]:
import os
from collections import Counter

import numpy as np
import pandas as pd
from sklearn.metrics import accuracy_score
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import train_test_split

In [124]:
# Location of the PlayTennis CSV. The default is the path the notebook was
# originally written against; set the PLAYTENNIS_CSV environment variable to
# run the notebook on another machine without editing this cell.
PLAYTENNIS_CSV = os.environ.get("PLAYTENNIS_CSV", "C:/Users/hp/Downloads/PlayTennis.csv")

playTennis_df = pd.read_csv(PLAYTENNIS_CSV)
playTennis_df.head(15)

Unnamed: 0,outlook,temp,humidity,windy,play
0,sunny,hot,high,False,no
1,sunny,hot,high,True,no
2,overcast,hot,high,False,yes
3,rainy,mild,high,False,yes
4,rainy,cool,normal,False,yes
5,rainy,cool,normal,True,no
6,overcast,cool,normal,True,yes
7,sunny,mild,high,False,no
8,sunny,cool,normal,False,yes
9,rainy,mild,normal,False,yes


In [3]:
from sklearn.preprocessing import LabelEncoder

# A single encoder is re-fitted on each column in turn, so every categorical
# column is replaced in place by integer codes. After the loop the encoder only
# holds the mapping of the last column it was fitted on ('play').
label_encoder = LabelEncoder()
for column_name in ('outlook', 'temp', 'humidity', 'windy', 'play'):
    playTennis_df[column_name] = label_encoder.fit_transform(playTennis_df[column_name])

In [4]:
# Preview the first rows to check that the columns now hold integer codes.
playTennis_df.head()

Unnamed: 0,outlook,temp,humidity,windy,play
0,2,1,0,0,0
1,2,1,0,1,0
2,0,1,0,0,1
3,1,2,0,0,1
4,1,0,1,0,1


In [5]:
# Show the distinct integer codes present in each encoded column,
# one line of output per column.
for encoded_column in ('outlook', 'temp', 'humidity', 'windy', 'play'):
    print(playTennis_df[encoded_column].unique())

[2 0 1]
[1 2 0]
[0 1]
[0 1]
[0 1]


In [159]:
class Tree_node:
    """A single node of a binary decision tree.

    A node plays one of two roles, depending on which arguments are supplied:

    * decision node: ``feature_index``/``threshold`` describe the test, and
      ``left``/``right`` hold the child nodes; ``info_gain`` records the gain
      of the split.
    * leaf node: ``value`` holds the predicted label.

    Every argument defaults to ``None``, so a leaf is built with
    ``Tree_node(value=label)``.
    """

    def __init__(self, feature_index=None, threshold=None, left_subtree=None, right_subtree=None, info_gain=None, value=None):
        # Leaf payload (stays None unless a value is passed).
        self.value = value

        # Split description used by decision nodes.
        self.feature_index, self.threshold = feature_index, threshold
        self.info_gain = info_gain

        # Children: note the attributes are named `left`/`right`,
        # not `left_subtree`/`right_subtree`.
        self.left, self.right = left_subtree, right_subtree

In [160]:
class Classifier:
    """Decision-tree classifier grown greedily with binary ``<= threshold`` splits.

    Parameters
    ----------
    min_samples_split : int
        A node is only considered for splitting when it holds at least this
        many samples.
    max_depth : int
        A node at depth ``d`` (root is depth 0) may be split while
        ``d <= max_depth``, so leaves can sit at depth ``max_depth + 1``.
    criterion : {"gini", "entropy"}
        Impurity measure used to score candidate splits. Defaults to "gini".

    Raises
    ------
    ValueError
        If ``criterion`` is not one of the supported names.
    """

    def __init__(self, min_samples_split=3, max_depth=2, criterion="gini"):
        if criterion not in ("gini", "entropy"):
            raise ValueError("criterion must be 'gini' or 'entropy', got {!r}".format(criterion))

        self.root = None

        self.min_samples_split = min_samples_split
        self.max_depth = max_depth
        self.criterion = criterion

    def split(self, dataset, feature_index, threshold):
        """Partition ``dataset`` rows into (feature <= threshold, feature > threshold)."""
        dataset = np.asarray(dataset)
        column = dataset[:, feature_index]
        # Two explicit masks (rather than mask / ~mask) so that rows whose value
        # compares False both ways, such as NaN, end up in neither side.
        return dataset[column <= threshold], dataset[column > threshold]

    def gini_index(self, y):
        """Return the Gini impurity ``1 - sum(p_c ** 2)`` of the labels ``y``."""
        _, counts = np.unique(y, return_counts=True)
        proportions = counts / counts.sum()
        return 1 - np.sum(proportions ** 2)

    def entropy(self, y):
        """Return the Shannon entropy (base 2) of the labels ``y``."""
        _, counts = np.unique(y, return_counts=True)
        proportions = counts / counts.sum()
        return -np.sum(proportions * np.log2(proportions))

    def information_gain(self, parent, left_child, right_child, mode="entropy"):
        """Return parent impurity minus the size-weighted impurity of the children.

        ``mode == "gini"`` uses the Gini index; any other value uses entropy.
        """
        impurity = self.gini_index if mode == "gini" else self.entropy
        weight_l = len(left_child) / len(parent)
        weight_r = len(right_child) / len(parent)
        return impurity(parent) - (weight_l * impurity(left_child) + weight_r * impurity(right_child))

    def find_best_split(self, dataset, num_samples, num_features):
        """Search every (feature, observed value) pair for the highest-gain split.

        The last column of ``dataset`` is the label. Returns a dict with keys
        ``feature_index``, ``threshold``, ``dataset_left``, ``dataset_right`` and
        ``info_gain``; the dict is EMPTY when no candidate produces two
        non-empty sides (for example when all feature rows are identical).
        """
        best_split = {}
        max_info_gain = -float("inf")

        for feature_index in range(num_features):
            for threshold in np.unique(dataset[:, feature_index]):
                dataset_left, dataset_right = self.split(dataset, feature_index, threshold)
                if len(dataset_left) == 0 or len(dataset_right) == 0:
                    continue
                cur_info_gain = self.information_gain(
                    dataset[:, -1], dataset_left[:, -1], dataset_right[:, -1], self.criterion
                )
                # Strict '>' keeps the first candidate found among equal gains.
                if cur_info_gain > max_info_gain:
                    best_split = {
                        "feature_index": feature_index,
                        "threshold": threshold,
                        "dataset_left": dataset_left,
                        "dataset_right": dataset_right,
                        "info_gain": cur_info_gain,
                    }
                    max_info_gain = cur_info_gain

        return best_split

    def find_leaf_value(self, Y):
        """Return the most frequent label in ``Y`` (first seen wins on ties)."""
        Y = list(Y)
        return max(Y, key=Y.count)

    def create_tree(self, dataset, cur_depth=0):
        """Recursively build and return the subtree for ``dataset``.

        ``dataset`` holds the features in all but the last column and the label
        in the last column. A leaf carrying the majority label is returned when
        the node is too small, too deep, or no split has positive gain.
        """
        X = dataset[:, :-1]
        Y = dataset[:, -1]
        num_samples, num_features = np.shape(X)

        if num_samples >= self.min_samples_split and cur_depth <= self.max_depth:
            best_split = self.find_best_split(dataset, num_samples, num_features)
            # .get(): find_best_split returns {} when no valid split exists;
            # indexing the key directly raised KeyError in that case.
            if best_split.get("info_gain", 0) > 0:
                left_subtree = self.create_tree(best_split["dataset_left"], cur_depth + 1)
                right_subtree = self.create_tree(best_split["dataset_right"], cur_depth + 1)
                return Tree_node(best_split["feature_index"], best_split["threshold"],
                                 left_subtree, right_subtree, best_split["info_gain"])

        return Tree_node(value=self.find_leaf_value(Y))

    def fit(self, X, Y):
        """Grow the tree from features ``X`` (n_samples, n_features) and labels ``Y``.

        ``Y`` may be a column vector of shape (n_samples, 1) or a flat
        (n_samples,) array; a flat array is reshaped to a column first.
        """
        X = np.asarray(X)
        Y = np.asarray(Y)
        if Y.ndim == 1:
            Y = Y.reshape(-1, 1)
        dataset = np.concatenate((X, Y), axis=1)
        self.root = self.create_tree(dataset)

    def prediction(self, x, tree):
        """Walk ``tree`` for one sample ``x`` and return the label of the leaf reached."""
        # Identity check: a leaf is any node whose value was set, including 0.
        if tree.value is not None:
            return tree.value
        if x[tree.feature_index] <= tree.threshold:
            return self.prediction(x, tree.left)
        return self.prediction(x, tree.right)

    def predict(self, X):
        """Return a list with one predicted label per row of ``X``."""
        return [self.prediction(x, self.root) for x in X]
     
    
    
    

In [6]:
# Features: every column except the last. Labels: the last column, kept as an
# (n_samples, 1) column vector so it can be concatenated next to X.
X = playTennis_df.iloc[:, :-1].to_numpy()
Y = playTennis_df.iloc[:, [-1]].to_numpy()

In [162]:
# Hold out 20% of the rows for testing; random_state fixes the shuffle so the
# same rows land in the test set on every run.
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.2, random_state=65)

In [163]:
# Grow the decision tree on the training rows only.
classifier = Classifier(min_samples_split=3, max_depth=3)
classifier.fit(X_train,Y_train)

In [164]:
# Predict the held-out rows and report the fraction classified correctly.
Y_pred = classifier.predict(X_test) 
accuracy_score(Y_test, Y_pred)

1.0

In [165]:
# MSE and RMSE computed on the encoded class labels. With labels coded 0/1 the
# squared error of a row is 1 when it is misclassified and 0 otherwise, so the
# MSE equals the misclassification rate.
print(mean_squared_error(Y_test,Y_pred))
print(np.sqrt(mean_squared_error(Y_test,Y_pred)))

0.0
0.0


### Classification of the above Tennis data using KNN model

In [29]:
def euclidean_distance(x1, x2):
    """Return the Euclidean (L2) distance between ``x1`` and ``x2``.

    The inputs must support ``x1 - x2`` (e.g. NumPy arrays of the same shape).
    """
    delta = x1 - x2
    squared_total = np.sum(delta ** 2)
    return np.sqrt(squared_total)


In [34]:
class KNN:
    """k-nearest-neighbours classifier (Euclidean distance, majority vote).

    Parameters
    ----------
    k : int
        Number of nearest training samples that vote on each prediction.

    Raises
    ------
    ValueError
        If ``k`` is smaller than 1.
    """

    def __init__(self, k=3):
        if k < 1:
            raise ValueError("k must be at least 1, got {!r}".format(k))
        self.k = k

    def fit(self, X, y):
        """Memorise the training samples ``X`` and their labels ``y``.

        ``y`` is flattened, so both (n,) and (n, 1) label arrays are accepted;
        labels must be hashable scalars for the vote count.
        """
        self.X_train = np.asarray(X)
        # Flatten so each label is a hashable scalar; a column vector would
        # otherwise yield one-element arrays that Counter cannot hash.
        self.y_train = np.asarray(y).ravel()
        if len(self.X_train) == 0:
            raise ValueError("cannot fit KNN on an empty training set")
        if len(self.X_train) != len(self.y_train):
            raise ValueError("X and y must contain the same number of samples")

    def predict(self, X):
        """Return an array with one predicted label per sample in ``X``."""
        return np.array([self.predict_each_elem(x) for x in X])

    def predict_each_elem(self, x):
        """Return the majority label among the ``k`` training samples nearest to ``x``."""
        # Distances to all training rows in one vectorised step.
        train_rows = self.X_train.reshape(len(self.X_train), -1)
        deltas = train_rows - np.asarray(x).ravel()
        distances = np.sqrt(np.sum(deltas ** 2, axis=1))

        nearest_idx = np.argsort(distances)[: self.k]
        # Labels are counted nearest-first; on a tied vote the label that was
        # seen first (i.e. belongs to the closer neighbour) is returned.
        votes = Counter(self.y_train[nearest_idx])
        return votes.most_common(1)[0][0]

    def accuracy(self, y_true, y_pred):
        """Return the fraction of positions where ``y_true`` equals ``y_pred``.

        Both inputs are flattened first so that comparing an (n, 1) column with
        an (n,) vector cannot silently broadcast into an (n, n) comparison.
        """
        y_true = np.asarray(y_true).ravel()
        y_pred = np.asarray(y_pred).ravel()
        if y_true.shape != y_pred.shape:
            raise ValueError("y_true and y_pred must contain the same number of labels")
        return np.sum(y_true == y_pred) / len(y_true)
    

In [35]:
from sklearn.model_selection import train_test_split
from collections import Counter

# Rebuild features and labels for the KNN experiment. This overwrites the X, Y,
# X_train and X_test bound in the decision-tree section; labels stay 1-D here
# (no reshape to a column vector).
X = playTennis_df.iloc[:, :-1].values
Y = playTennis_df.iloc[:, -1].values
# Same test_size and random_state as the decision-tree split.
X_train, X_test, y_train, y_test = train_test_split(X, Y, test_size=0.2, random_state=65)
k = 3
model = KNN(k=k)
model.fit(X_train, y_train)
predictions = model.predict(X_test)
# Fraction of held-out rows predicted correctly.
accuracy= model.accuracy(y_test, predictions)
print("KNN classification accuracy",accuracy)

KNN classification accuracy 1.0


In [40]:
from sklearn.metrics import mean_squared_error
from sklearn.metrics import confusion_matrix

# RMSE of the KNN predictions on the encoded labels.
rmse = np.sqrt(mean_squared_error(y_test, predictions))
# Print the value computed above; the name `mse` is never defined in this
# notebook, so the previous print raised NameError on a fresh kernel.
print("RMSE:", rmse)
print("confusion_matrix:", confusion_matrix(y_test, predictions))

RMSE: 0.0
confusion_matrix: [[3]]


### Classification of the above Tennis data using SVM model

In [108]:
class SVM:
    """Linear SVM trained by per-sample gradient steps on the regularised hinge loss.

    The decision function is ``w . x - b``; samples with a non-negative score
    are labelled +1, all others -1.

    Parameters
    ----------
    learning_rate : float
        Step size of every weight / bias update.
    lambda_param : float
        Weight of the L2 penalty on ``w``.
    n_iters : int
        Number of full passes over the training data.
    """

    def __init__(self, learning_rate=0.001, lambda_param=0.05, n_iters=500):
        self.lr = learning_rate
        self.lambda_param = lambda_param
        self.n_iters = n_iters
        self.w = None
        self.b = None

    def fit(self, X, y):
        """Learn ``w`` and ``b`` from samples ``X`` (n_samples, n_features) and labels ``y``.

        Labels ``<= 0`` are treated as class -1 and all others as class +1, so
        both 0/1 and -1/+1 encodings train the same model. ``y`` may be flat or
        a column vector.
        """
        X = np.asarray(X)
        _, n_features = X.shape
        # The margin condition below needs labels in {-1, +1}. Converting here
        # (a no-op for labels that are already -1/+1) prevents 0/1 labels from
        # silently zeroing out every update for the negative class.
        y_ = np.where(np.asarray(y).ravel() <= 0, -1, 1)

        self.w = np.zeros(n_features)
        self.b = 0
        for _ in range(self.n_iters):
            for idx, x_i in enumerate(X):
                label = y_[idx]
                margin_ok = label * (np.dot(x_i, self.w) - self.b) >= 1
                if margin_ok:
                    # Correct side with margin >= 1: only the L2 penalty acts.
                    self.w -= self.lr * (2 * self.lambda_param * self.w)
                else:
                    # Margin violated: penalty plus the hinge-loss gradient.
                    self.w -= self.lr * (
                        2 * self.lambda_param * self.w - np.dot(x_i, label)
                    )
                    self.b -= self.lr * label

    def predict(self, X):
        """Return +1.0 / -1.0 for each row of ``X`` and print the predictions.

        A score of exactly 0 is assigned to class +1; ``np.sign`` would return
        0 there, which is not one of the two class labels.
        """
        approx = np.dot(X, self.w) - self.b
        pred = np.where(approx >= 0, 1.0, -1.0)
        # The print is kept because the notebook's displayed output relies on it.
        print("Predicted output is ",pred)
        return pred

In [121]:
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
from sklearn.metrics import confusion_matrix

# Rebuild features and labels for the SVM experiment (overwrites X, Y and the
# train/test variables from the previous sections).
X = playTennis_df.iloc[:, :-1].values
Y = playTennis_df.iloc[:, -1].values
# Map encoded label 0 to -1 and any other value to +1, the -1/+1 coding used in
# the SVM margin condition.
y = np.where(Y == 0, -1, 1)
# Same test_size and random_state as the earlier splits.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=65)
model = SVM()
model.fit(X_train, y_train)
print("Updated weights are {} and \n updated bias value is {}".format(model.w, model.b))
print("Y_test value is :",y_test)
# predict() prints the predicted labels itself in addition to returning them.
predictions = model.predict(X_test)

Updated weights are [-0.54373958  0.11610805  1.12041139 -0.30945494] and 
 updated bias value is -0.18500000000000014
Y_test value is : [1 1 1]
Predicted output is  [1. 1. 1.]


In [122]:
def accuracy(y_true, y_pred):
    """Return the share of entries of ``y_true`` that equal ``y_pred``.

    The number of element-wise matches is divided by ``len(y_true)``.
    """
    n_matches = np.sum(y_true == y_pred)
    n_total = len(y_true)
    return n_matches / n_total

In [123]:
# Store the score under its own name. Assigning it to `accuracy` replaced the
# accuracy() function with a float, so running this cell a second time raised
# "TypeError: 'numpy.float64' object is not callable".
svm_accuracy = accuracy(y_test, predictions)
print("SVM classification accuracy", svm_accuracy)
rmse = np.sqrt(mean_squared_error(y_test, predictions))
print("RMSE:", rmse)
print("confusion_matrix:", confusion_matrix(y_test, predictions))

SVM classification accuracy 1.0
RMSE: 0.0
confusion_matrix: [[3]]
