In [23]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import GaussianNB, MultinomialNB, BernoulliNB
from mapping import map_str_values
from sklearn import metrics


In [2]:
X = pd.read_csv("data/train_values.csv", index_col="building_id")
Y = pd.read_csv("data/train_labels.csv", index_col="building_id") - 1
# X = map_str_values(X)

In [3]:
np.unique(Y)

array([0, 1, 2], dtype=int64)

In [4]:
X = pd.get_dummies(X, dtype=int)
X.dtypes

geo_level_1_id              int64
geo_level_2_id              int64
geo_level_3_id              int64
count_floors_pre_eq         int64
age                         int64
                            ...  
plan_configuration_u        int32
legal_ownership_status_a    int32
legal_ownership_status_r    int32
legal_ownership_status_v    int32
legal_ownership_status_w    int32
Length: 68, dtype: object

In [5]:
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.25, random_state=192)
X_train.shape

(195450, 68)

In [6]:
np.where(Y["damage_grade"].to_numpy() == 0)

(array([     7,      9,     36, ..., 260557, 260567, 260589], dtype=int64),)

In [34]:
class MixedNB():
    def __init__(self, gauss_index=None, bernouli_index=None, alpha=1):
        self.gauss_index = gauss_index
        self.bernouli_index = bernouli_index
        self.alpha = alpha


    def fit(self, X, Y):
        self.classes = np.unique(np.array(Y))
        self.n_features = X.shape[1]
        n_class = len(self.classes)
        self.class_priors = np.empty(shape=(n_class))

        self._calculate_stuffs(X, Y)

    def _calculate_stuffs(self, X, Y):
        P_ic = []
        means = []
        vars_ = []

        for k, c in enumerate(self.classes):
            Xc = X.iloc[np.where(Y == c)]
            temp = []
            temp_mean = []
            temp_var = []
            self.class_priors[k] = (len(Xc) + self.alpha) / (len(X) + self.alpha * len(self.class_priors))
            
            for i in range(X.shape[1]):
                feat_probs_c = np.sum(Xc.iloc[:, i]) / len(Xc)

                temp.append(feat_probs_c)
                temp_mean.append(np.mean(Xc.iloc[:, i]))
                temp_var.append(np.var(Xc.iloc[:, i]))

            P_ic.append(temp)
            means.append(temp_mean)
            vars_.append(temp_var)

        self.P_ic = np.array(P_ic)
        self.means = np.array(means)
        self.vars = np.array(vars_)


    def _pdf_binomial(self, x, feat_probs_class):
        return (feat_probs_class ** x) * ((1 - feat_probs_class) ** (1 - x))
    

    def _pdf_gauss(self, x, mean, variance):
        exponent = np.exp(-((x - mean) ** 2) / (2 * variance))
        return (1 / np.sqrt(2 * np.pi * variance)) * exponent
    
    
    def _get_joint_probs(self, X):

        class_probs = []
        for x in X:
            # print(x)
            posts = []

            for i in range(len(self.class_priors)):
                post = np.log(self.class_priors[i])

                for j in range(X.shape[1]):
                    # print("buoi")
                    if j in self.gauss_index:
                        post += np.log(self._pdf_gauss(x=x[j], mean=self.means[i,j], variance=self.vars[i, j]))
                    elif j in self.bernouli_index:
                        # print("loz")
                        post += np.log(self._pdf_binomial(x=x[j], feat_probs_class=self.P_ic[i, j]))

                posts.append(post)
            class_probs.append(posts)

        return np.array(class_probs)
    
    
    def predict(self, X):
        probs = self._get_joint_probs(X)
        # print(probs)
        Y_preds = []
        for prob in probs:
            Y_preds.append(self.classes[np.argmax(prob)])

        return Y_preds



In [35]:
mixNB = MixedNB(range(0, 6), range(7,X_train.shape[1]))
mixNB.fit(X_train, Y_train["damage_grade"])


In [36]:
preds = mixNB.predict(X_test.to_numpy())
preds

  post += np.log(self._pdf_binomial(x=x[j], feat_probs_class=self.P_ic[i, j]))
  post += np.log(self._pdf_binomial(x=x[j], feat_probs_class=self.P_ic[i, j]))


[0,
 2,
 1,
 1,
 2,
 2,
 0,
 2,
 2,
 2,
 0,
 1,
 2,
 2,
 2,
 2,
 2,
 2,
 0,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 1,
 2,
 1,
 1,
 2,
 2,
 1,
 2,
 2,
 0,
 2,
 2,
 2,
 2,
 2,
 1,
 1,
 2,
 2,
 2,
 0,
 1,
 2,
 1,
 1,
 2,
 2,
 2,
 2,
 0,
 2,
 1,
 2,
 2,
 0,
 2,
 1,
 2,
 2,
 1,
 0,
 0,
 2,
 2,
 0,
 2,
 2,
 2,
 2,
 2,
 1,
 0,
 2,
 2,
 2,
 2,
 2,
 2,
 1,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 1,
 2,
 2,
 2,
 1,
 2,
 2,
 2,
 2,
 2,
 0,
 1,
 0,
 2,
 2,
 0,
 2,
 2,
 2,
 2,
 2,
 1,
 1,
 2,
 1,
 2,
 0,
 2,
 0,
 2,
 2,
 2,
 2,
 0,
 2,
 0,
 2,
 0,
 2,
 2,
 0,
 2,
 0,
 2,
 1,
 2,
 2,
 0,
 1,
 1,
 0,
 2,
 2,
 1,
 1,
 1,
 1,
 1,
 1,
 2,
 2,
 2,
 2,
 2,
 2,
 1,
 1,
 0,
 2,
 1,
 2,
 2,
 1,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 0,
 2,
 0,
 1,
 2,
 2,
 1,
 2,
 0,
 0,
 2,
 2,
 1,
 0,
 2,
 2,
 2,
 0,
 2,
 1,
 2,
 1,
 2,
 2,
 2,
 1,
 2,
 2,
 1,
 2,
 1,
 2,
 1,
 2,
 2,
 1,
 2,
 1,
 2,
 1,
 0,
 1,
 2,
 1,
 2,
 2,
 2,
 2,
 0,
 0,
 2,
 2,
 1,
 0,
 1,
 2,
 2,
 0,
 0,
 1,
 2,
 1,
 2,
 2,
 2,
 0,
 0,
 2,
 2,
 2,
 2,
 2,
 2,
 1,
 2,
 2,


In [37]:
print(metrics.classification_report(Y_test, preds))

              precision    recall  f1-score   support

           0       0.39      0.43      0.41      6275
           1       0.65      0.32      0.43     37165
           2       0.40      0.74      0.52     21711

    accuracy                           0.47     65151
   macro avg       0.48      0.50      0.45     65151
weighted avg       0.54      0.47      0.46     65151



In [10]:
gauss = GaussianNB()
gauss.fit(X_train.iloc[:, :6], Y_train["damage_grade"])


In [11]:
gauss.class_prior_
# print(gauss.theta_)
probs = gauss.predict_log_proba(X_test.iloc[:, :6])
probs.shape

(65151, 3)

In [13]:
temp = X_train[Y_train["damage_grade"] == 1]
np.mean(temp.iloc[:, 0])

13.3599384305183

In [14]:
means = gauss.theta_
vars_ = gauss.var_

def calculate_feature_probability(x, mean, variance):
    exponent = np.exp(-((x - mean) ** 2) / (2 * variance))
    return (1 / np.sqrt(2 * np.pi * variance)) * exponent

gauss_probs = calculate_feature_probability(1, means, vars_)
gauss_probs.shape

(3, 6)

In [15]:
mNB = MultinomialNB()
mNB.fit(X_train, Y_train)

  y = column_or_1d(y, warn=True)


In [16]:
np.exp(mNB.class_log_prior_)

array([0.09643899, 0.56840113, 0.33515989])

In [17]:
bNB = BernoulliNB()
bNB.fit(X_train, Y_train)


  y = column_or_1d(y, warn=True)


In [18]:
np.exp(bNB.class_log_prior_)

array([0.09643899, 0.56840113, 0.33515989])

In [19]:
preds = bNB.predict(X_test)
preds

array([0, 2, 1, ..., 2, 0, 0], dtype=int64)