## Naïve Bayes from scratch

In [109]:
# Print the version of the Python interpreter that executes this notebook.
from platform import python_version
print(python_version())

3.7.5


In [9]:
import pandas as pd
import numpy as np
import math 
import pprint
import copy

In [1]:
# This function opens a CSV data file and transforms it into a usable format.
def load_data(path='data/student.csv', label='Grade'):
    """Read a CSV file and split it into integer-coded features and labels.

    Parameters
    ----------
    path : str
        Location of the comma-separated file. Defaults to the file the
        notebook was written for.
    label : str
        Name of the column that holds the class label. Defaults to 'Grade'.

    Returns
    -------
    X_data : pandas.DataFrame
        Every column except ``label``, with each column replaced by the
        integer codes produced by ``pd.factorize``.
    Y_data : pandas.Series
        The ``label`` column, left untouched.
    """
    data = pd.read_csv(path, sep=',')

    # The label column is the target; every other column is a feature.
    Y_data = data[label]
    X_data = data.drop(columns=[label])

    # factorize() is applied column by column, so the codes of one feature
    # are independent of the codes of any other feature.
    X_data = X_data.apply(lambda feature: pd.factorize(feature)[0])

    return X_data, Y_data

In [2]:
# This function splits a data set into a training set and a hold-out test set.
def split_data(X_data, Y_data, holdout, random_state=None):
    """Randomly partition features and labels into train and test parts.

    Each row is assigned independently: a uniform number in [0, 1) is drawn
    per row and the row goes to the training set when that number is below
    ``holdout``. The resulting sizes are therefore only approximately
    ``holdout`` / ``1 - holdout`` of the data.

    Parameters
    ----------
    X_data : pandas.DataFrame
        Feature rows.
    Y_data : pandas.Series
        Labels, one per row of ``X_data`` and in the same order.
    holdout : float
        Probability of a row being placed in the TRAINING set (despite the
        name, this is not the test fraction).
    random_state : int or None, optional
        Seed for a private generator, making the split reproducible. With the
        default ``None`` numpy's global generator is used, as before.

    Returns
    -------
    X_train, Y_train, X_test, Y_test
    """
    n_rows = len(X_data)
    if random_state is None:
        draws = np.random.rand(n_rows)
    else:
        # A dedicated generator keeps the global numpy random state untouched.
        draws = np.random.RandomState(random_state).rand(n_rows)

    # Boolean mask: True -> training row, False -> test row.
    mask = draws < holdout

    X_train = X_data[mask]
    X_test = X_data[~mask]
    Y_train = Y_data[mask]
    Y_test = Y_data[~mask]

    return X_train, Y_train, X_test, Y_test

In [3]:
# This function builds a supervised Naive Bayes model.
def train(X_train, Y_train, alpha, feature_values=None):
    """Estimate priors and smoothed likelihoods from categorical training data.

    Parameters
    ----------
    X_train : pandas.DataFrame
        Training features; every column is treated as categorical.
    Y_train : pandas.Series
        Training labels, indexed like ``X_train``.
    alpha : number
        Additive smoothing constant added to every count.
    feature_values : dict, optional
        ``{feature_name: iterable of possible values}``. It fixes the set of
        values that receive a (smoothed) likelihood. When omitted, the values
        are taken from a module-level ``X_data`` DataFrame if one exists
        (this keeps the notebook's original behaviour, where values that only
        occur in the test rows are still smoothed), otherwise from
        ``X_train`` itself.

    Returns
    -------
    likelihoods : dict
        ``{feature: {label: {value: P(value | label)}}}`` with
        ``P = (alpha + count) / (alpha * n_values + n_label)``.
    priors : dict
        ``{label: n_label / len(X_train)}``.
    count : dict
        Raw counts with the same nesting as ``likelihoods`` (reused by the
        One-R method).
    """
    train_rows = len(X_train)

    # Number of training rows per label.
    count_labels = Y_train.value_counts().to_dict()

    # Prior of each label: its relative frequency in the training set.
    priors = {label: n_label / train_rows
              for label, n_label in count_labels.items()}

    # Features and labels come from the arguments instead of notebook globals.
    feature_names = list(X_train.columns)
    labels = list(Y_train.unique())

    if feature_values is None:
        domain_source = globals().get('X_data')
        if domain_source is None:
            domain_source = X_train
        feature_values = {feature_name: domain_source[feature_name].unique()
                          for feature_name in feature_names}

    # Zero-initialised counts: {feature: {label: {value: 0}}}.
    count = {feature_name: {label: {attr_value: 0
                                    for attr_value in feature_values[feature_name]}
                            for label in labels}
             for feature_name in feature_names}

    # Labels lined up with the rows of X_train through the shared index.
    row_labels = Y_train.loc[X_train.index]
    for feature_name in feature_names:
        per_label = count[feature_name]
        for attr_value, label in zip(X_train[feature_name], row_labels):
            per_label[label][attr_value] += 1

    # Smoothed likelihoods; the denominator adds alpha once per possible value
    # so that the probabilities of a feature sum to one for each label.
    likelihoods = {
        feature_name: {
            label: {
                attr_value: (alpha + n_value)
                / (alpha * len(per_value) + count_labels[label])
                for attr_value, n_value in per_value.items()
            }
            for label, per_value in per_label.items()
        }
        for feature_name, per_label in count.items()
    }

    return likelihoods, priors, count

In [4]:
# This function predicts the class of one or more instances with a trained model.
def predict(likelihoods, priors, X_test):
    """Return the most probable label for every row of ``X_test``.

    Scores are computed in log space:
    ``log P(label) + sum over features of log P(value | label)``.

    Parameters
    ----------
    likelihoods : dict
        ``{feature: {label: {value: probability}}}``.
    priors : dict
        ``{label: probability}``.
    X_test : pandas.DataFrame
        Rows to classify; must contain every feature in ``likelihoods``.

    Returns
    -------
    pandas.DataFrame
        One column, ``'Prediction'``, indexed like ``X_test``. On ties the
        label that comes first in ``priors`` wins.
    """
    # The features to score are the ones the model holds likelihoods for.
    feature_names = list(likelihoods)

    predicted = []
    for _, row in X_test.iterrows():
        # Start from -inf: a sum of many log-probabilities has no fixed lower
        # bound, so any finite starting value could leave a row unlabelled.
        best_score = -math.inf
        best_label = ""

        for label, prior_prob in priors.items():
            score = math.log(prior_prob)
            for feature_name in feature_names:
                score += math.log(likelihoods[feature_name][label][row[feature_name]])

            if score > best_score:
                best_score = score
                best_label = label

        predicted.append(best_label)

    return pd.DataFrame({'Prediction': predicted}, index=X_test.index, dtype=object)

In [5]:
# Accuracy of a set of predictions against the true labels.
def evaluate(Y_test, prediction):
    """Return the fraction of correct predictions, rounded to two decimals.

    ``prediction`` is a DataFrame indexed like ``Y_test``; only its first
    column is scored, whatever that column is called (the One-R method passes
    frames named after a feature).
    """
    scored_column = prediction.columns[0]

    # One hit for every index whose true label equals the predicted one.
    hits = sum(
        1
        for idx in Y_test.index
        if Y_test[idx] == prediction.at[idx, scored_column]
    )

    return round(hits / len(Y_test), 2)

In [6]:
# This function predicts the class with the One-R method.
def one_r_prediction(count, X_test, Y_test):
    """Pick the single feature whose one-level rules classify best.

    For every feature a rule table is built that maps each attribute value to
    the label with the highest count for that value. Each feature's rules are
    then applied to ``X_test`` and scored with ``evaluate`` against
    ``Y_test``; the feature with the lowest error rate is returned.

    Note that the feature is chosen by its error on the rows passed in as
    ``X_test``/``Y_test``, so the returned error rate is measured on the same
    data that drove the selection.

    Parameters
    ----------
    count : dict
        ``{feature: {label: {value: count}}}`` as returned by ``train``.
        Features, labels and attribute values are all read from this
        structure.
    X_test : pandas.DataFrame
        Rows to classify; must contain every feature in ``count``.
    Y_test : pandas.Series
        True labels, indexed like ``X_test``.

    Returns
    -------
    selected_feature : str
        Name of the best feature; ``""`` if no feature has an error rate
        below 1.
    min_error_rate : float
        ``1 - accuracy`` of that feature's rules.
    """
    feature_names = list(count)

    # Rule table: {feature: {attribute value: predicted label}}.
    rules = {}
    for feature_name in feature_names:
        per_label = count[feature_name]
        attr_values = {attr_value
                       for per_value in per_label.values()
                       for attr_value in per_value}

        feature_rules = {}
        for attr_value in attr_values:
            # A value that was never counted keeps the empty label; on ties
            # the first label wins.
            best_count = 0
            best_label = ""
            for label, per_value in per_label.items():
                n_value = per_value.get(attr_value, 0)
                if n_value > best_count:
                    best_count = n_value
                    best_label = label
            feature_rules[attr_value] = best_label
        rules[feature_name] = feature_rules

    # One column of predictions per feature, each obtained from that feature's
    # rules alone.
    rule_predictions = pd.DataFrame(
        {feature_name: [rules[feature_name][attr_value]
                        for attr_value in X_test[feature_name]]
         for feature_name in feature_names},
        index=X_test.index,
        dtype=object,
    )

    # Keep the feature with the smallest error rate (first one wins on ties).
    min_error_rate = 1
    selected_feature = ""
    for feature_name in feature_names:
        # The evaluation function is reused to score each feature's column.
        error_rate = 1 - evaluate(Y_test, rule_predictions[feature_name].to_frame())
        if error_rate < min_error_rate:
            min_error_rate = error_rate
            selected_feature = feature_name

    return selected_feature, min_error_rate
    

In [7]:
# This function performs Repeated Random Subsampling with the functions defined above.
def repeated_random_subsampling(samples, holdout=0.8, alpha=1):
    """Train and evaluate the model on several independent random splits.

    The data is read from the module-level ``X_data`` and ``Y_data``; each
    iteration draws a new split with ``split_data``, fits a model with
    ``train``, and scores it with ``predict`` and ``evaluate``.

    Parameters
    ----------
    samples : int
        Number of random splits to evaluate.
    holdout : float, optional
        Value passed to ``split_data`` for every split (default 0.8).
    alpha : number, optional
        Smoothing constant passed to ``train`` (default 1).

    Returns
    -------
    accuracy_samples : list
        Accuracy of each split, in order.
    average : float
        Mean of ``accuracy_samples``.

    Raises
    ------
    ZeroDivisionError
        If ``samples`` is 0, because the mean of no samples is undefined.
    """
    accuracy_samples = []

    for _ in range(samples):
        X_train, Y_train, X_test, Y_test = split_data(X_data, Y_data, holdout)
        # The raw counts returned by train() are not needed here.
        likelihoods, priors, _ = train(X_train, Y_train, alpha)
        prediction = predict(likelihoods, priors, X_test)
        accuracy_samples.append(evaluate(Y_test, prediction))

    return accuracy_samples, sum(accuracy_samples) / len(accuracy_samples)

### Accuracy

In [15]:
# Load the data set: integer-coded features and the label column.
X_data, Y_data = load_data()
# Split the data set with an 80-20 hold-out strategy (0.8 goes to training).
X_train, Y_train, X_test, Y_test = split_data(X_data,Y_data,0.8)

# General variables. These are module-level names; other cells and functions
# in this notebook may read them, so this cell has to run before they are used.
rows = len(X_data)
labels = Y_data.unique()
feature_names = list(X_data.columns)

# Training (with smoothing constant 1), prediction and evaluation.
likelihoods,priors,count = train(X_train,Y_train,1)
prediction = predict(likelihoods,priors,X_test)
accuracy = evaluate(Y_test,prediction)

# Report the accuracy with two decimals.
print("Naive Bayes Model")
print("Accuracy:"+ "{:.2f}".format(accuracy))

Naive Bayes Model
Accuracy:0.39


### Discussion

- The accuracy achieved lies between 0.3 and 0.4 and varies from one execution to the next, because the hold-out split is drawn at random each time. Other evaluation strategies, such as cross-validation, reduce this variability.
- The manual inspection is carried out below. To be honest, it is very hard to find a consistent pattern. The only interesting observation is that the labels "A+" and "A" are not predicted properly, probably because there is little data for those classes.


### Prediction

In [16]:
# Put features, true grade and predicted grade side by side (joined on the index).
df = X_test.join(Y_test).join(prediction)
df_correct = df[df.Grade==df.Prediction]
df_wrong = df[df.Grade!=df.Prediction]

# sample() raises ValueError when asked for more rows than the frame holds,
# so the sample size is capped at the number of available rows.
print("Correct predictions")
print(df_correct.sample(min(10, len(df_correct))))
print("Wrong predictions")
print(df_wrong.sample(min(10, len(df_wrong))))

Correct predictions
     school  sex  address  famsize  Pstatus  Medu  Fedu  Mjob  Fjob  reason  \
55        1    1        0        0        0     2     2     1     1       1   
81        0    0        0        0        0     0     0     0     1       1   
60        0    0        0        0        0     1     2     0     2       3   
6         0    1        1        0        0     2     2     3     1       0   
546       0    0        0        0        0     1     2     1     1       3   
482       0    1        0        0        0     1     1     3     2       0   
229       1    1        0        1        0     0     0     3     4       1   
84        1    1        1        0        0     1     1     1     1       1   
542       0    0        1        0        0     0     1     2     2       2   
220       0    0        1        1        0     1     1     0     1       3   

     ...  romantic  famrel  freetime  goout  Dalc  Walc  health  absences  \
55   ...         0       4       

### Repeated Random Subsampling:

- This method works like the hold-out strategy, but repeated multiple times. The key point is that a new train/test split is drawn and a new model is estimated in each iteration. The only thing that remains constant is the hold-out proportion.
- It is preferable to a single hold-out evaluation because it produces less variable results, as the evaluation metric (accuracy in this case) is averaged across the iterations.
- The function called here is defined above (for consistency, all the functions are defined above). As expected, the averaged accuracy has less variability. As we can check, for 10 samples, the individual accuracies range between 0.31 and 0.42. After averaging, however, the Repeated Random Subsampling result has almost no variability, with values usually between 0.35 and 0.36.

In [13]:
# For 10 samples, this cell should last 10 seconds or less.
# Each sample is a fresh random split with a newly trained model; the cell
# prints the accuracy of every sample and their average (two decimals).
accuracy_samples, average = repeated_random_subsampling(10)
print("Accuracy for each sample: ")
print(accuracy_samples)
print("Average accuracy: "+ "{:.2f}".format(average) )

Accuracy for each sample: 
[0.36, 0.34, 0.42, 0.35, 0.32, 0.36, 0.34, 0.34, 0.31, 0.36]
Average accuracy: 0.35


### Model Comparison: One-R Method

- It consists of choosing the single best attribute for predicting an instance's class. For each attribute, all of its values are reviewed: a rule is created that assigns to each attribute value its most frequent class. Finally, the attribute with the smallest error rate is selected.
- The function called here is defined above. The Naive Bayes classifier has, in general, a better performance, even though the difference is not big (0.39 versus 0.36 in the execution shown here). The estimate for the One-R method still suffers from the variability of the hold-out strategy; even different features are chosen in each execution. However, the best features are usually "Medu" and "Fedu" (parents' education). In the context of this problem, this makes a lot of sense.


In [17]:
# Apply the One-R method using the counts from the trained model, scored on
# the held-out test rows; the result is reported as accuracy (1 - error rate).
selected_feature, error_rate = one_r_prediction(count,X_test,Y_test)
print("One R Method Feature: " + selected_feature)
# NOTE(review): the label below is misspelled ("Accuraccy"); it is left as is
# because it is part of the printed output.
print("Accuraccy:"+ "{:.2f}".format(1-error_rate) )

One R Method Feature: Fedu
Accuraccy:0.36


End.