# OneR Classification Algorithm

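OneR (*One Rule*) is a simple classification algorithm. For each feature, it maps every feature value to the most frequent class among the training samples that have that value. It then keeps the single feature whose rule makes the fewest errors and uses that one rule to predict the class of new samples.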

In [54]:
import numpy as np

In [55]:
from sklearn.datasets import load_iris
# Load the Iris dataset: X is the feature matrix, y holds the target class of each sample
dataset = load_iris()
X, y = dataset.data, dataset.target

# Rows of X are samples, columns are features
n_samples, n_features = X.shape[0], X.shape[1]

print("Number of features = ", n_features)
print("Number of samples = ", n_samples)

Number of features =  4
Number of samples =  150


OneR requires discrete (categorical) features, but the Iris dataset has continuous features, so we need to convert the continuous features into categorical ones. This process is called **discretization**.

A simple way to discretize is to choose a threshold for each feature: any value below the threshold is given the value 0, and any value at or above the threshold is given the value 1. We use the mean of each feature as its threshold.

In [56]:
# Per-feature mean (averaging over the samples, i.e. along axis 0) is used as the threshold
attribute_means = X.mean(axis=0)
# Sanity check: there must be exactly one mean per feature
assert (attribute_means.shape == (n_features,)), "Shape not equal to number of features"
# Discretize: 1 where a value is at or above its feature's mean, 0 otherwise
# (same shape as X, one row per sample)
at_or_above_mean = X >= attribute_means
X_d = at_or_above_mean.astype('int')

In [57]:
# `sklearn.cross_validation` was deprecated in scikit-learn 0.18 and removed in 0.20;
# `train_test_split` now lives in `sklearn.model_selection`. Fall back to the old
# location only for very old installations.
try:
    from sklearn.model_selection import train_test_split
except ImportError:  # scikit-learn < 0.18
    from sklearn.cross_validation import train_test_split

# Fix the random state so the split (and therefore every result below) is reproducible
random_state = 14

# Split the discretized features and the labels into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X_d, y, random_state = random_state)
print("Number of training samples = {}".format(y_train.shape))
print("Number of testing samples = {}".format(y_test.shape))

Number of training samples = (112,)
Number of testing samples = (38,)


In [58]:
from collections import defaultdict
from operator import itemgetter

def train_feature_value(X, y_true, feature_index, value):
    """Find the majority class among the samples whose feature equals ``value``.

    Parameters
    ----------
    X : sequence of samples
        Each sample must be indexable by ``feature_index`` (e.g. the rows of a
        2-D array, or a list of lists).
    y_true : sequence
        Class label of each sample, aligned with ``X``.
    feature_index : int
        Index of the feature to inspect in every sample.
    value
        The feature value for which the rule is trained.

    Returns
    -------
    most_frequent_class
        The class that occurs most often among samples with this feature value.
        On ties, the class that was encountered first wins.
    error : int
        Number of samples that have this feature value but belong to a class
        other than ``most_frequent_class``.

    Raises
    ------
    IndexError
        If no sample in ``X`` has ``value`` at ``feature_index``.
    """
    # Count how often each class occurs among the samples with this feature value
    class_counts = defaultdict(int)
    for sample, y in zip(X, y_true):
        if sample[feature_index] == value:
            class_counts[y] += 1
    if not class_counts:
        # Same exception type the function has always raised here, now with a message
        raise IndexError(
            "no sample has value {!r} for feature {}".format(value, feature_index))
    # max() returns the first maximal entry, which is the same tie-breaking as a
    # stable descending sort followed by taking the first item.
    most_frequent_class, most_frequent_count = max(class_counts.items(), key=itemgetter(1))
    # The error is the number of samples that have the feature value but do not
    # belong to the most frequent class.
    error = sum(class_counts.values()) - most_frequent_count
    return most_frequent_class, error

In [59]:
def train_on_feature(X, y_true, feature_index):
    """Build the one-feature rule for ``feature_index``.

    For every distinct value the feature takes in ``X``, the most frequent
    class among the samples with that value is recorded.

    Returns a tuple ``(predictors, total_error)`` where ``predictors`` maps
    each feature value to its most frequent class and ``total_error`` is the
    sum of the per-value errors reported by ``train_feature_value``.
    """
    predictors = {}
    total_error = 0
    # Visit each distinct value observed in this feature's column
    for observed_value in set(X[:, feature_index]):
        majority_class, misclassified = train_feature_value(
            X, y_true, feature_index, observed_value)
        predictors[observed_value] = majority_class
        # Accumulate the error of using this feature to classify
        total_error += misclassified
    return predictors, total_error

In [60]:
# Train one candidate rule per feature: feature index -> (value-to-class mapping, total error)
all_predictors = {
    variable: train_on_feature(X_train, y_train, variable)
    for variable in range(X_train.shape[1])
}
# Keep just the total error of each candidate so they can be ranked
errors = {variable: result[1] for variable, result in all_predictors.items()}
# Rank the candidates by error, lowest first, and take the winner
ranked_errors = sorted(errors.items(), key=lambda item: item[1])
best_variable, best_error = ranked_errors[0]
print("The best model is based on variable {0} and has error {1:.2f}".format(best_variable, best_error))

# Choose the best model: the winning feature together with its value-to-class mapping
model = {'variable': best_variable,
         'predictor': all_predictors[best_variable][0]}
print(model)

The best model is based on variable 2 and has error 37.00
{'variable': 2, 'predictor': {0: 0, 1: 2}}


In [61]:
def predict(X_test, model):
    """Predict a class for every sample in ``X_test`` using a OneR ``model``.

    ``model`` is a dict with two entries: ``'variable'``, the index of the
    feature the rule is based on, and ``'predictor'``, a mapping from that
    feature's (integer) value to the predicted class.

    Returns a numpy array with one prediction per sample.
    """
    feature_idx = model['variable']
    value_to_class = model['predictor']
    predictions = []
    for row in X_test:
        # The lookup key is the sample's feature value converted to a plain int
        predictions.append(value_to_class[int(row[feature_idx])])
    return np.array(predictions)
# Show the predicted class of every test sample (displayed as the cell's output)
predict(X_test, model)

array([0, 0, 0, 2, 2, 2, 0, 2, 0, 2, 2, 0, 2, 2, 0, 2, 0, 2, 2, 2, 0, 0, 0,
       2, 0, 2, 0, 2, 2, 0, 0, 0, 2, 0, 2, 0, 2, 2])

In [62]:
# Keep the test-set predictions so they can be compared against y_test
y_predicted = predict(X_test, model)
print(y_predicted)

[0 0 0 2 2 2 0 2 0 2 2 0 2 2 0 2 0 2 2 2 0 0 0 2 0 2 0 2 2 0 0 0 2 0 2 0 2
 2]


In [63]:
# Compute the accuracy by taking the mean of the amounts that y_predicted is equal to y_test
accuracy = np.mean(y_predicted == y_test) * 100
print("The test accuracy is {:.1f}%".format(accuracy))

The test accuracy is 65.8%
