In [29]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.naive_bayes import GaussianNB
from sklearn import metrics 
from sklearn.metrics import classification_report, confusion_matrix

In [30]:
# Read the fruit measurements (read_table splits on tabs by default) and
# preview the first rows.
fruit = pd.read_table("fruit_data_with_colors.txt")
fruit.head()

Unnamed: 0,fruit_label,fruit_name,fruit_subtype,mass,width,height,color_score
0,1,apple,granny_smith,192,8.4,7.3,0.55
1,1,apple,granny_smith,180,8.0,6.8,0.59
2,1,apple,granny_smith,176,7.4,7.2,0.6
3,2,mandarin,mandarin,86,6.2,4.7,0.8
4,2,mandarin,mandarin,84,6.0,4.6,0.79


In [31]:
# Name the columns once so the feature matrix and the label vector are
# selected by name instead of by position.
feature_columns = ["mass", "width", "height", "color_score"]
label_column = "fruit_name"

fruit_vector = fruit[feature_columns + [label_column]]
fruit_vector.head()

# X holds the numeric measurements, y the fruit name of each row.
X = fruit_vector[feature_columns].values
y = fruit_vector[label_column].values

In [36]:
# Hold out 30% of the rows for evaluation; the fixed random_state keeps the
# split the same on every run.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    random_state=4321,
)

In [37]:
# Fit the scaler on the training split only, then apply that same fitted
# transform to both splits.
scaler = StandardScaler().fit(X_train)

X_train, X_test = scaler.transform(X_train), scaler.transform(X_test)

In [38]:
# Baseline: scikit-learn's Gaussian Naive Bayes trained on the scaled features.
classifier = GaussianNB()
classifier.fit(X_train, y_train)
y_pred = classifier.predict(X_test)

# Per scikit-learn's convention, rows are true labels and columns are predictions.
print(confusion_matrix(y_test, y_pred))
# NOTE(review): the label text has an unbalanced ")"; kept as-is so the
# printed output does not change.
print("GNB %):", metrics.accuracy_score(y_test, y_pred)*100)

[[7 0 0 0]
 [0 2 0 0]
 [0 0 3 0]
 [2 0 0 4]]
GNB %): 88.88888888888889


In [39]:
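# Hand-written Gaussian Naive Bayes, in four steps:
#   1. Get the per-class means and variances of the training data and pair them up.
#   2. Calculate the prior probability of each class.
#   3. Calculate the Gaussian Naive Bayes values for the test data under each class.
#   4. Classify each test sample as the class with the highest value.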

def fit(x_train, y_train):
    """Compute per-class feature means and variances of the training data.

    Args:
        x_train: 2-D training features, one row per sample.
        y_train: Class label of each training row, used as the grouping key.

    Returns:
        A ``(mean, var)`` tuple of DataFrames, each with one row per class
        label and one column per feature. The variance is pandas' default
        (sample variance, ``ddof=1``).
    """
    # Group once and reuse the grouping for both statistics.
    by_class = pd.DataFrame(x_train).groupby(by=y_train)
    return by_class.mean(), by_class.var()

def mv_pairs_by_class(mean, var, y_train):
    """Pair every feature mean with its variance, grouped by class.

    Args:
        mean: Table of feature means, one row per class.
        var: Table of feature variances, laid out like ``mean``.
        y_train: Training labels; only the number of distinct labels is used.

    Returns:
        A list with one 2-D array per class. Each row of an array is a
        ``[mean, variance]`` pair for one feature.
    """
    mean_rows = np.array(mean)
    var_rows = np.array(var)

    # Flatten to one [mean, variance] row per (class, feature), class by class.
    pair_rows = [
        [mean_rows[row][col], var_rows[row][col]]
        for row in range(len(mean))
        for col in range(len(mean_rows[row]))
    ]

    # Cut the flat table back into equally sized per-class chunks.
    class_count = len(np.unique(y_train))
    return np.vsplit(np.array(pair_rows), class_count)

def calc_prior(classifier, y_train):
    """Return the fraction of training labels equal to one class label.

    Args:
        classifier: The class label whose prior is wanted (a label value,
            despite the parameter name).
        y_train: Training labels as a list, NumPy array or pandas Series.

    Returns:
        Number of labels equal to ``classifier`` divided by the total number
        of labels, as a float.

    Raises:
        ZeroDivisionError: If ``y_train`` is empty.
    """
    # Iterate over the values instead of indexing with 0..n-1: on a pandas
    # Series those integers are looked up as index labels, which fails once
    # the index has been shuffled (for example by train_test_split).
    labels = list(y_train)
    matches = sum(1 for label in labels if label == classifier)
    return matches / len(labels)

def gnb_calc(x_val, x_mean, x_var):
    """Evaluate the Gaussian probability density at a value.

    Args:
        x_val: Point at which the density is evaluated.
        x_mean: Mean of the Gaussian.
        x_var: Variance of the Gaussian.

    Returns:
        ``1 / sqrt(2*pi*x_var) * exp(-(x_val - x_mean)**2 / (2*x_var))``.
    """
    squared_deviation = (x_val - x_mean) ** 2
    normaliser = np.sqrt(2 * np.pi * x_var)
    return (1 / normaliser) * np.exp(-(squared_deviation / (2 * x_var)))

def classify(x_test, classes, mv_pairs, priors):
    """Pick the most probable class for one sample under Gaussian Naive Bayes.

    Scores are accumulated as log-probabilities. Multiplying the raw
    densities underflows to 0.0 for every class once a sample is far from all
    class means (or there are many features), and the result then falls back
    to the first class regardless of the data.

    Args:
        x_test: Feature values of a single sample, indexable by feature
            position.
        classes: Class labels, in the same order as ``mv_pairs`` and
            ``priors``.
        mv_pairs: For each class, a sequence of ``[mean, variance]`` rows,
            one row per feature.
        priors: Prior probability of each class.

    Returns:
        The entry of ``classes`` with the highest log-posterior score; the
        first such entry wins a tie.
    """
    log_scores = []
    for class_idx in range(len(classes)):
        # A zero prior maps to -inf, which is the intended "never pick this
        # class" score, so the divide-by-zero warning is silenced here.
        with np.errstate(divide="ignore"):
            log_score = np.log(priors[class_idx])

        for feature_idx, (mean, variance) in enumerate(mv_pairs[class_idx]):
            deviation = x_test[feature_idx] - mean
            # Log of the Gaussian density, computed directly so a tiny
            # density never has to be represented as a float.
            log_score += (
                -0.5 * np.log(2 * np.pi * variance)
                - (deviation ** 2) / (2 * variance)
            )
        log_scores.append(log_score)

    # max() over the indices keeps the first index when scores are tied.
    best = max(range(len(log_scores)), key=log_scores.__getitem__)
    return classes[best]

def GNB(x_train, x_test, y_train, y_test):
    """Train the hand-written Gaussian Naive Bayes and print its test results.

    Prints one "predicted actual" line per test sample, then the confusion
    matrix and the accuracy as a percentage. Nothing is returned.

    Args:
        x_train: Training features, one row per sample.
        x_test: Test features, indexable by row position.
        y_train: Training labels.
        y_test: True test labels, indexable by row position.
    """
    # Per-class statistics from the training data.
    mean, var = fit(x_train, y_train)
    mv_pairs = mv_pairs_by_class(mean, var, y_train)

    # One prior per distinct class label, in np.unique's sorted order.
    classes = np.unique(y_train)
    priors = [calc_prior(label, y_train) for label in classes]

    results = []
    hits = 0
    n_samples = len(x_test)
    for row in range(n_samples):
        predicted = classify(x_test[row], classes, mv_pairs, priors)
        results.append(predicted)
        print(predicted, y_test[row])
        if predicted == y_test[row]:
            hits += 1

    accuracy = (hits / n_samples) * 100
    print(confusion_matrix(y_test, results))
    print("accuracy: ", accuracy, "%")
    
# Run the hand-written classifier on the same scaled split that the
# scikit-learn GaussianNB above was trained and evaluated on.
GNB(X_train, X_test, y_train, y_test)

apple apple
orange orange
apple orange
apple apple
apple orange
mandarin mandarin
apple apple
lemon lemon
orange orange
orange orange
apple apple
orange orange
apple apple
lemon lemon
mandarin mandarin
apple apple
apple apple
mandarin mandarin
[[7 0 0 0]
 [0 2 0 0]
 [0 0 3 0]
 [2 0 0 4]]
accuracy:  88.88888888888889 %
