# Question 3
Implement maximum likelihood classification in python. Input to your program is CSV
file, with last attribute assumed to class label. You can assume all non-class attributes
are continuous random variables. It should take two input files, training file for
constructing model and test file to estimate various accuracy measures.

In [10]:
from sklearn.model_selection import train_test_split
import numpy as np
import pandas as pd
import math

In [11]:
iris = pd.read_csv('iris.csv').values
np.random.shuffle(iris)
train, test = train_test_split(iris, test_size = 0.5)
vals = train[:,:-1].astype('float')
labels = np.unique(train[:, -1])
num_observations = len(train)
num_labels = len(vals[0])

## Maximum Likelihood Classifier

In [12]:
# for each x value, calculate g(x) per label, and select the label with highest g(x) value

# get respective means and cov matrices per label
def get_label_stats(vals, labels):
    label_stats = {}

    for label in labels:
        label_stats[label] = {}
        # apply boolean mask
        label_data = vals[vals[:, -1] == label]
        # remove label column
        label_data = label_data[:,:-1].astype('float')
        num_samples = len(label_data)
        label_stats[label]['mean'] = label_data.mean(axis = 0)
        label_stats[label]['cov'] = np.asmatrix(np.cov(label_data.T))
        label_stats[label]['probability'] = float(num_samples/num_observations)

    return label_stats

In [13]:
# classify each x value by selecting the max g(x) value from each label
# label(x) = max(g(x, label1), g(x, label2), ..., g(x, labeln))
def ml_classifier(label_stats, vals):
    classifier_label = []
    for x in vals:
        # calculate g(x) per label
        curr_label = None
        prev_g_x = 0.0
        max_g_x = 0.0
        for label in label_stats:
            stats = label_stats[label]
            mean = stats['mean']
            cov = stats['cov']
            prob = stats['probability']
            g_x = -0.5 * ((x - mean).T @ np.linalg.inv(cov) @ (x - mean))
            constant = np.log(prob) - (len(vals[0])/2)
            if curr_label is None:
                curr_label = label            
                max_g_x = g_x
                prev_g_x = g_x
            else:
                prev_g_x = max_g_x
                max_g_x = max(max_g_x, g_x)
                if max_g_x != prev_g_x:
                    curr_label = label
        classifier_label.append(curr_label)
    classifier_label = np.array(classifier_label)
    return classifier_label

## Train Classifier

In [14]:
label_stats = get_label_stats(train, labels)
classifier_label = ml_classifier(label_stats, vals)
print("Classifier Counts")
unique, classifier_counts = np.unique(classifier_label, return_counts = True)
print("unique: ", unique)
print("counts: ", classifier_counts)
print("Actual Counts")
unique, actual_counts = np.unique(train[:, -1], return_counts = True)
print("unique: ", unique)
print("counts: ", actual_counts)
difference = 0
for i in range(len(classifier_counts)):
    if classifier_counts[i] - actual_counts[i] < 0:
        difference += (classifier_counts[i] - actual_counts[i])
print("Classifier Accuracy: ", float((sum(classifier_counts) + difference) / sum(classifier_counts)) * 100, "%")

Classifier Counts
unique:  ['setosa' 'versicolor' 'virginica']
counts:  [16 30 29]
Actual Counts
unique:  ['setosa' 'versicolor' 'virginica']
counts:  [16 31 28]
Classifier Accuracy:  98.66666666666667 %


## Run Classifier on Test Data

In [15]:
vals = test[:,:-1].astype('float')
labels = np.unique(test[:, -1])
num_observations = len(test)
num_labels = len(vals[0])

In [16]:
label_stats = get_label_stats(test, labels)
classifier_counts = ml_classifier(label_stats, vals)
print("Classifier Counts")
unique, classifier_counts = np.unique(classifier_label, return_counts = True)
print("unique: ", unique)
print("counts: ", classifier_counts)
print("Actual Counts")
unique, actual_counts = np.unique(train[:, -1], return_counts = True)
print("unique: ", unique)
print("counts: ", actual_counts)
difference = 0
for i in range(len(classifier_counts)):
    if classifier_counts[i] - actual_counts[i] < 0:
        difference += (classifier_counts[i] - actual_counts[i])
print("Classifier Accuracy: ", float((sum(classifier_counts) + difference) / sum(classifier_counts)) * 100, "%")

Classifier Counts
unique:  ['setosa' 'versicolor' 'virginica']
counts:  [16 30 29]
Actual Counts
unique:  ['setosa' 'versicolor' 'virginica']
counts:  [16 31 28]
Classifier Accuracy:  98.66666666666667 %


# Question 4
Implement entropy and gain functions. For a given data (CSV file, last attribute is class
label), output entropy and gain values for each attribute, and determine root node
attribute. 

In [20]:
def entropy(p):
    if p != 0:
        return -p*np.log2(p)  
    else:
        return 0

In [28]:
column_entropies = {}   
iris = pd.read_csv('iris.csv')   
for col in iris.columns:
    if col != iris.columns[-1]:
        greater_median = []
        less_median= []
        for i, row in iris.iterrows():
            median = iris[col].median()
            if row[col] >= median :
                greater_median.append([row[col], row[iris.columns[-1]]])
            else:
                less_median.append([row[col], row[iris.columns[-1]]])

        labels = [col, iris.columns[-1]]
        greater_median = pd.DataFrame(greater_median, columns=labels)
        less_median = pd.DataFrame(less_median, columns=labels)
        class_1 = greater_median.groupby(iris.columns[-1]).count()[col].tolist()
        class_2 = less_median.groupby(iris.columns[-1]).count()[col].tolist()
        
        class_1_entropy = sum([entropy(val / sum(class_1)) for val in class_1])
        class_2_entropy = sum([entropy(val / sum(class_2)) for val in class_2])            
        column_entropy = ((sum(class_1) * class_1_entropy) + (sum(class_2) * class_2_entropy))/(sum(class_1)+sum(class_2))
        
        print("Entropy of", col,":", column_entropy )
        column_entropies[col] = column_entropy

Entropy of sepal_length : 1.0689219720135832
Entropy of sepal_width : 1.336296325775044
Entropy of petal_length : 0.9182958340544897
Entropy of petal_width : 0.8915883594258673


In [32]:
groups = iris.groupby(iris.columns[-1]).size().tolist()
parent_entropy = sum([entropy(val / sum(groups)) for val in groups])
column_gains = {}
for col in iris.columns:
    if col != iris.columns[-1]:
        column_gains[col] = parent_entropy - column_entropies[col]
        print("Gain for", col, ":", parent_entropy - column_entropies[col] )

Gain for sepal_length : 0.5160405287075729
Gain for sepal_width : 0.2486661749461121
Gain for petal_length : 0.6666666666666664
Gain for petal_width : 0.6933741412952887


In [36]:
root_node = max(column_gains, key = column_gains.get)
print("Root Node: ", root_node, "with optimal split of gain: ", column_gains[root_node])

Root Node:  petal_width with optimal split of gain:  0.6933741412952887
