# ML Lab Sheet - 4 📑

## Name: Anish Agarwal

## Question - 1

### Importing Relevant Libraries

In [1]:
import numpy as np
import pandas as pd
from sklearn import datasets

### Importing DataSet

In [2]:
data = datasets.load_iris()
data

{'data': array([[5.1, 3.5, 1.4, 0.2],
        [4.9, 3. , 1.4, 0.2],
        [4.7, 3.2, 1.3, 0.2],
        [4.6, 3.1, 1.5, 0.2],
        [5. , 3.6, 1.4, 0.2],
        [5.4, 3.9, 1.7, 0.4],
        [4.6, 3.4, 1.4, 0.3],
        [5. , 3.4, 1.5, 0.2],
        [4.4, 2.9, 1.4, 0.2],
        [4.9, 3.1, 1.5, 0.1],
        [5.4, 3.7, 1.5, 0.2],
        [4.8, 3.4, 1.6, 0.2],
        [4.8, 3. , 1.4, 0.1],
        [4.3, 3. , 1.1, 0.1],
        [5.8, 4. , 1.2, 0.2],
        [5.7, 4.4, 1.5, 0.4],
        [5.4, 3.9, 1.3, 0.4],
        [5.1, 3.5, 1.4, 0.3],
        [5.7, 3.8, 1.7, 0.3],
        [5.1, 3.8, 1.5, 0.3],
        [5.4, 3.4, 1.7, 0.2],
        [5.1, 3.7, 1.5, 0.4],
        [4.6, 3.6, 1. , 0.2],
        [5.1, 3.3, 1.7, 0.5],
        [4.8, 3.4, 1.9, 0.2],
        [5. , 3. , 1.6, 0.2],
        [5. , 3.4, 1.6, 0.4],
        [5.2, 3.5, 1.5, 0.2],
        [5.2, 3.4, 1.4, 0.2],
        [4.7, 3.2, 1.6, 0.2],
        [4.8, 3.1, 1.6, 0.2],
        [5.4, 3.4, 1.5, 0.4],
        [5.2, 4.1, 1.5, 0.1],
  

### Data PreProcessing

In [3]:
names = data['target_names']

In [4]:
X = data['data']
X = np.array(X, dtype = int)
y = data['target']

In [5]:
data_ = pd.DataFrame(X)
data_.columns = data['feature_names']
data_['Output'] = y

In [6]:
data['feature_names']

['sepal length (cm)',
 'sepal width (cm)',
 'petal length (cm)',
 'petal width (cm)']

### Define Entropy

In [7]:
def entropy(col):
    
    counts = np.unique(col,return_counts=True)
    N = float(col.shape[0])
    
    ent = 0.0
    
    for ix in counts[1]:
        p  = ix/N
        ent += (-1.0*p*np.log2(p))
    
    return ent

### Define Information Gain

In [8]:
def information_gain(x_data,key,val):
    left,right = divide_data(x_data,key,val)
    
    #% of total samples are on left and right
    l = float(left.shape[0])/x_data.shape[0]
    r = float(right.shape[0])/x_data.shape[0]
    
    #All examples come to one side!
    if left.shape[0] == 0 or right.shape[0] ==0:
        return -1000000 #Min Information Gain
    
    i_gain = entropy(x_data.Output) - (l*entropy(left.Output)+r*entropy(right.Output))
    return i_gain


### Dividing Data

In [9]:
def divide_data(x_data,key,val):
    x_right = pd.DataFrame([],columns=x_data.columns)
    x_left = pd.DataFrame([],columns=x_data.columns)
    
    for ix in range(x_data.shape[0]):
        val_ = x_data[key].loc[ix]
        
        if val_ > val:
            x_right = x_right.append(x_data.loc[ix])
        else:
            x_left = x_left.append(x_data.loc[ix])
            
    return x_left,x_right


### Finding Count Of 0, 1 And 2

In [10]:
def find_count(X_train):
        count = []
        count.append(X_train[X_train['Output'] == 0].shape[0])
        count.append(X_train[X_train['Output'] == 1].shape[0])
        count.append(X_train[X_train['Output'] == 2].shape[0])
        return count

### Decision Tree Class

In [11]:
class DecisionTree:

    def __init__(self,depth=0,max_depth=5):
        self.left = None
        self.right = None
        self.key = None
        self.val = None
        self.count = None
        self.max_depth = max_depth
        self.depth = depth
        self.target = None
    
    def train(self,X_train,names):
        
        features = ['sepal length (cm)','sepal width (cm)','petal length (cm)','petal width (cm)']
        info_gains = []
        
        for ix in features:
            i_gain = information_gain(X_train,ix,X_train[ix].mean())
            info_gains.append(i_gain)
            
        self.key = features[np.argmax(info_gains)]
        self.val = X_train[self.key].mean()
        print("Level " , self.depth)
        self.count = find_count(X_train)
        cnt = 0                            ##maintain to get if we have only 1 flower in this node
        for i in range(len(self.count)):
            if(self.count[i]):
                print("Count of " , names[i] , " = " , self.count[i])
                cnt += 1
        print("Current entropy = " , entropy(X_train.Output))
        if cnt != 1:
            print("Splitting on Tree Features ",self.key,"with information gain",np.argmax(info_gains))
        
        data_left,data_right = divide_data(X_train,self.key,self.val)
        data_left = data_left.reset_index(drop=True)
        data_right = data_right.reset_index(drop=True)

        if cnt == 1:
            if X_train.Output.mean() >= 1.5:
                self.target = names[2]
            elif X_train.Output.mean() <= 0.5:
                self.target = names[0]
            else:
                self.target = names[1]
            print("Reached leaf Node")
            print()
            print()
            return
        
        #Stop earyly when depth >=max depth
        if(self.depth>=self.max_depth):
            if X_train.Output.mean() >= 1.5:
                self.target = names[2]
            elif X_train.Output.mean() <= 0.5:
                self.target = names[0]
            else:
                self.target = names[1]
            print("Max depth Reached")
            print()
            print()
            return
        
        print()
        print()
        
        self.left = DecisionTree(depth=self.depth+1,max_depth=self.max_depth)
        self.left.train(data_left, names)
        
        self.right = DecisionTree(depth=self.depth+1,max_depth=self.max_depth)
        self.right.train(data_right, names)

        if X_train.Output.mean() >= 1.5:
            self.target = names[2]
        elif X_train.Output.mean() <= 0.5:
            self.target = names[0]
        else:
            self.target = names[1]
        return    

### Making Object Of The Class

In [12]:
dt = DecisionTree()

### Training Model On The DataSet

In [13]:
dt.train(data_, names)     

Level  0
Count of  setosa  =  50
Count of  versicolor  =  50
Count of  virginica  =  50
Current entropy =  1.584962500721156
Splitting on Tree Features  petal width (cm) with information gain 3


Level  1
Count of  setosa  =  50
Current entropy =  0.0
Reached leaf Node


Level  1
Count of  versicolor  =  50
Count of  virginica  =  50
Current entropy =  1.0
Splitting on Tree Features  petal length (cm) with information gain 2


Level  2
Count of  versicolor  =  48
Count of  virginica  =  6
Current entropy =  0.5032583347756457
Splitting on Tree Features  petal width (cm) with information gain 3


Level  3
Count of  versicolor  =  48
Count of  virginica  =  5
Current entropy =  0.45079138835466503
Splitting on Tree Features  petal length (cm) with information gain 2


Level  4
Count of  versicolor  =  11
Current entropy =  0.0
Reached leaf Node


Level  4
Count of  versicolor  =  37
Count of  virginica  =  5
Current entropy =  0.5266170655714281
Splitting on Tree Features  sepal length (

## Question- 2

### Importing Relevant Libraries

In [14]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

### Importing DataSet

In [15]:
dataset = pd.read_csv('https://raw.githubusercontent.com/anishaga/Machine-Learning/main/Lab%20Sheet%204/cancer.csv')
dataset

Unnamed: 0,Sample code number,Clump Thickness,Uniformity of Cell Size,Uniformity of Cell Shape,Marginal Adhesion,Single Epithelial Cell Size,Bare Nuclei,Bland Chromatin,Normal Nucleoli,Mitoses,Class
0,1000025,5,1,1,1,2,1,3,1,1,2
1,1002945,5,4,4,5,7,10,3,2,1,2
2,1015425,3,1,1,1,2,2,3,1,1,2
3,1016277,6,8,8,1,3,4,3,7,1,2
4,1017023,4,1,1,3,2,1,3,1,1,2
...,...,...,...,...,...,...,...,...,...,...,...
678,776715,3,1,1,1,3,2,1,1,1,2
679,841769,2,1,1,1,2,1,1,1,1,2
680,888820,5,10,10,3,7,3,8,10,2,4
681,897471,4,8,6,4,3,4,10,6,1,4


In [16]:
X = dataset.iloc[:,:-1].values
y = dataset.iloc[:,-1].values

### Splitting The DataSet Into Training And Testing

In [17]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X,y,test_size = 0.25, random_state = 0)

### Feature Scaling

In [18]:
from sklearn.preprocessing import StandardScaler
sc = StandardScaler()
X_train = sc.fit_transform(X_train)
X_test = sc.transform(X_test)

### Training Model On Testing Data

In [19]:
from sklearn.ensemble import RandomForestClassifier
classifier = RandomForestClassifier(n_estimators = 10, criterion = 'entropy', random_state = 0)
classifier.fit(X_train, y_train)

RandomForestClassifier(criterion='entropy', n_estimators=10, random_state=0)

### Predicting Test Set Results

In [20]:
y_pred = classifier.predict(X_test)

### Making Confusion Matrix

In [21]:
from sklearn.metrics import confusion_matrix
cm = confusion_matrix(y_test, y_pred)
print(cm)

[[102   5]
 [  6  58]]


### Finding Accuracy Score

In [22]:
from sklearn.metrics import accuracy_score
accuracy_score(y_test, y_pred)

0.935672514619883

## Question 3

### Importing Relevant Libraries

In [23]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

### Importing DataSet

In [24]:
dataset = pd.read_csv('https://raw.githubusercontent.com/anishaga/Machine-Learning/main/Lab%20Sheet%204/cancer.csv')
dataset

Unnamed: 0,Sample code number,Clump Thickness,Uniformity of Cell Size,Uniformity of Cell Shape,Marginal Adhesion,Single Epithelial Cell Size,Bare Nuclei,Bland Chromatin,Normal Nucleoli,Mitoses,Class
0,1000025,5,1,1,1,2,1,3,1,1,2
1,1002945,5,4,4,5,7,10,3,2,1,2
2,1015425,3,1,1,1,2,2,3,1,1,2
3,1016277,6,8,8,1,3,4,3,7,1,2
4,1017023,4,1,1,3,2,1,3,1,1,2
...,...,...,...,...,...,...,...,...,...,...,...
678,776715,3,1,1,1,3,2,1,1,1,2
679,841769,2,1,1,1,2,1,1,1,1,2
680,888820,5,10,10,3,7,3,8,10,2,4
681,897471,4,8,6,4,3,4,10,6,1,4


In [25]:
X = dataset.iloc[:,:-1].values
y = dataset.iloc[:,-1].values

### Splitting The DataSet Into Training And Testing

In [26]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X,y,test_size = 0.25, random_state = 0)

### Feature Scaling

In [27]:
from sklearn.preprocessing import StandardScaler
sc = StandardScaler()
X_train = sc.fit_transform(X_train)
X_test = sc.transform(X_test)

### Training Model On Testing Data

In [28]:
from sklearn.neighbors import KNeighborsClassifier
classifier = KNeighborsClassifier(n_neighbors = 5, metric = 'minkowski', p=2)
classifier.fit(X_train, y_train)

KNeighborsClassifier()

### Predicting Test Set Results

In [29]:
y_pred = classifier.predict(X_test)

### Making Confusion Matrix

In [30]:
from sklearn.metrics import confusion_matrix
cm = confusion_matrix(y_test, y_pred)
print(cm)

[[103   4]
 [  5  59]]


### Finding Accuracy Score

In [31]:
from sklearn.metrics import accuracy_score
accuracy_score(y_test, y_pred)

0.9473684210526315

## Question 4

### Importing Relevant Libraries

In [32]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

### Importing DataSet

In [33]:
dataset = pd.read_csv('https://raw.githubusercontent.com/anishaga/Machine-Learning/main/Lab%20Sheet%204/cancer.csv')
dataset

Unnamed: 0,Sample code number,Clump Thickness,Uniformity of Cell Size,Uniformity of Cell Shape,Marginal Adhesion,Single Epithelial Cell Size,Bare Nuclei,Bland Chromatin,Normal Nucleoli,Mitoses,Class
0,1000025,5,1,1,1,2,1,3,1,1,2
1,1002945,5,4,4,5,7,10,3,2,1,2
2,1015425,3,1,1,1,2,2,3,1,1,2
3,1016277,6,8,8,1,3,4,3,7,1,2
4,1017023,4,1,1,3,2,1,3,1,1,2
...,...,...,...,...,...,...,...,...,...,...,...
678,776715,3,1,1,1,3,2,1,1,1,2
679,841769,2,1,1,1,2,1,1,1,1,2
680,888820,5,10,10,3,7,3,8,10,2,4
681,897471,4,8,6,4,3,4,10,6,1,4


In [34]:
X = dataset.iloc[:,:-1].values
y = dataset.iloc[:,-1].values

### Splitting The DataSet Into Training And Testing

In [35]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X,y,test_size = 0.25, random_state = 0)

### Feature Scaling

In [36]:
from sklearn.preprocessing import StandardScaler
sc = StandardScaler()
X_train = sc.fit_transform(X_train)
X_test = sc.transform(X_test)

### Training Model On Testing Data

In [37]:
from sklearn.naive_bayes import GaussianNB
classifier = GaussianNB()
classifier.fit(X_train, y_train)

GaussianNB()

### Predicting Test Set Results

In [38]:
y_pred = classifier.predict(X_test)

### Making Confusion Matrix

In [39]:
from sklearn.metrics import confusion_matrix
cm = confusion_matrix(y_test, y_pred)
print(cm)

[[99  8]
 [ 2 62]]


### Finding Accuracy Score

In [40]:
from sklearn.metrics import accuracy_score
accuracy_score(y_test, y_pred)

0.9415204678362573