## Machine Learning - Lab 04 - Naive Bayes Classification
* Full name: Đinh Anh Huy
* Student ID: 18110103

In [None]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import GaussianNB, CategoricalNB
from sklearn.preprocessing import LabelEncoder

In [None]:
def label_encoder(data):
  """Label-encode every object-dtype column of ``data``.

  The frame passed in is modified in place and is also returned. A single
  ``LabelEncoder`` instance is re-fitted for each object column in turn.
  """
  encoder = LabelEncoder()
  # Resolve the list of columns up front, before any column is overwritten.
  object_columns = [name for name, dtype in data.dtypes.items() if dtype == 'object']
  for name in object_columns:
    data[name] = encoder.fit_transform(data[name])
  return data

def classify_feature(X):
  """Split the column positions of ``X`` into categorical and numerical groups.

  A column counts as categorical when its dtype compares equal to 'object';
  every other column is treated as numerical.

  Returns
  -------
  (cat_features, num_features) : two lists of integer column positions,
      each in ascending order.
  """
  is_object = X.dtypes == 'object'
  object_names = is_object[is_object].index
  cat_features, num_features = [], []
  for position, name in enumerate(X.columns):
    bucket = cat_features if name in object_names else num_features
    bucket.append(position)
  return cat_features, num_features

def preprocessing(X, y, encoder=True):
  """Drop rows with missing feature values and optionally label-encode ``X``.

  Parameters
  ----------
  X : pandas.DataFrame
      Feature table. It is copied first, so the caller's frame is not changed.
  y : pandas.Series
      Labels whose index labels match those of ``X``. Copied as well.
  encoder : bool, default True
      When True, the cleaned features are passed through ``label_encoder``.

  Returns
  -------
  (X1, y1) : the cleaned features and the labels with the same rows removed.
  """
  X1 = X.copy()
  y1 = y.copy()
  # One vectorised mask replaces the former Python-level iterrows() scan.
  has_nan = X1.isna().any(axis=1)
  if has_nan.any():
    rows_with_nan = X1.index[has_nan.to_numpy()]
    X1 = X1.dropna()
    # Labels are removed by index label so that X1 and y1 stay aligned.
    y1 = y1.drop(index=rows_with_nan)
  if encoder:
    X1 = label_encoder(X1)
  return X1, y1

### Bài tập 1. Dùng các features: *Sex*, *Pclass*, và *Embarked* để xây dựng thuật toán *Naive Bayes Categorical*.

In [None]:
class Categorical_NB:
  """Naive Bayes classifier for categorical features with additive smoothing.

  For feature ``i``, value ``t`` and class ``c`` the likelihood is estimated as

      P(x_i = t | c) = (N_tic + alpha) / (N_c + alpha * n_i)

  where ``N_tic`` is the number of training rows of class ``c`` with
  ``x_i == t``, ``N_c`` is the number of training rows of class ``c`` and
  ``n_i`` is the number of distinct values of feature ``i`` seen in ``fit``.
  With ``alpha = 0`` this reduces to the plain relative frequency.

  Parameters
  ----------
  alpha : float, default 1
      Additive (Laplace/Lidstone) smoothing strength.
  """

  def __init__(self, alpha = 1):
    # P(c) for each class, in the same order as self.classes_
    self.class_priors = []
    # feature index -> {"<value>_<class>": P(value | class)}
    self.likelihoods = {}
    # feature index -> {"<class>": likelihood used for values unseen in fit}
    self.unseen_likelihoods = {}
    # sorted distinct class labels seen in fit
    self.classes_ = np.array([])

    self.train_size = 0
    self.num_feats = 0
    self.alpha = alpha

  @staticmethod
  def _as_array(X):
    """Return ``X`` as an ndarray (sparse matrices via ``toarray``, others via ``np.asarray``)."""
    if isinstance(X, np.ndarray):
      return X
    if hasattr(X, 'toarray'):
      return X.toarray()
    return np.asarray(X)

  def _calc_class_prior(self, y):

    """ P(c) - Prior Class Probability """

    self.classes_, class_counts = np.unique(y, return_counts=True)
    # Rebuilt from scratch so that a second fit() does not append duplicates.
    self.class_priors = [count / self.train_size for count in class_counts]

  def _calc_likelihood(self, X, y):

    """ P(x|c) - Likelihood, smoothed with ``self.alpha`` """

    for feat in range(self.num_feats):
      feature_col = X[:,feat]
      categories = np.unique(feature_col)
      table = {}
      fallback = {}
      for outcome in np.unique(y):
        in_class = feature_col[y == outcome]
        uniq, counts = np.unique(in_class, return_counts=True)
        observed = {str(val): cnt for val, cnt in zip(uniq, counts)}
        denominator = len(in_class) + self.alpha * len(categories)

        for feat_val in categories:
          count = observed.get(str(feat_val), 0)
          table[str(feat_val)+'_'+str(outcome)] = (count + self.alpha) / denominator
        # A value never seen in training is scored as a zero-count category.
        fallback[str(outcome)] = self.alpha / denominator
      self.likelihoods[feat] = table
      self.unseen_likelihoods[feat] = fallback

  def get_likelihood(self, X):
    """Return ``{class index: [P(x|c) for each row of X]}``.

    Keys are positions into ``self.classes_`` (0..K-1), not the labels
    themselves. Feature values that did not occur during ``fit`` use the
    smoothed zero-count likelihood instead of raising ``KeyError``.
    """
    likelihoods = {}
    for idx, outcome in enumerate(self.classes_):
      label = str(outcome)
      per_query = []
      for query in X:

        likelihood = 1

        for feat, feat_val in zip(range(self.num_feats), query):
          default = self.unseen_likelihoods[feat][label]
          likelihood *= self.likelihoods[feat].get(str(feat_val)+'_'+label, default)
        per_query.append(likelihood)
      likelihoods[idx] = per_query
    return likelihoods

  def fit(self, X, y):
    """Estimate class priors and smoothed likelihoods from ``X`` (n_samples, n_features) and ``y``."""
    X = self._as_array(X)
    y = np.asarray(y)

    self.train_size, self.num_feats = X.shape

    # Drop any state left over from a previous fit.
    self.likelihoods = {}
    self.unseen_likelihoods = {}

    self._calc_class_prior(y)
    self._calc_likelihood(X,y)

  def predict(self, X):

    """ Calculates Posterior probability P(c|x) and returns the most probable class label per row """

    X = self._as_array(X)

    likelihoods = self.get_likelihood(X)
    class_indices = range(len(self.class_priors))

    results = []
    for idx in range(len(X)):
      # Unnormalised posterior; ties resolve to the first (smallest) class.
      best = max(class_indices, key = lambda k: likelihoods[k][idx] * self.class_priors[k])
      results.append(self.classes_[best])

    return np.array(results)

  def score(self, X, y):
    """Return the mean accuracy of ``predict(X)`` against ``y``."""
    return np.mean(np.asarray(y) == self.predict(X))

In [None]:
# Load the lab-04 training CSV from the course repository and preview the first rows.
data=pd.read_csv('https://raw.githubusercontent.com/huynhthanh98/ML/master/lab-04/train.csv')
data.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


In [None]:
# Select the three features used in this exercise and the 'Survived' label.
X = data[["Sex", "Pclass", "Embarked"]]
y = data["Survived"]
print('>> Shape of X before preprocessing : ', X.shape)
# Drop rows with missing feature values and label-encode the object columns.
X, y = preprocessing(X,y)
print('>> Shape of X after preprocessing : ', X.shape)

# Hold out 20% of the rows for testing; the fixed random_state keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(X.values, y.values, test_size=0.2, random_state=42)

>> Shape of X before preprocessing :  (891, 3)
>> Shape of X after preprocessing :  (889, 3)


In [None]:
# Fit the hand-written categorical model on the training split and evaluate it
# on the held-out split.
cate_nb = Categorical_NB()
cate_nb.fit(X_train, y_train)
y_pred = cate_nb.predict(X_test)
print('>> Accuracy score : ', cate_nb.score(X_test, y_test))
# Reuse the predictions computed above instead of running predict() again.
print('>> Predicted label : \n', y_pred)

>> Accuracy score :  0.7808988764044944
>> Predicted label : 
 [0 1 1 0 1 0 0 0 1 1 0 1 0 0 0 0 1 0 0 0 1 1 0 1 0 1 0 1 0 0 0 1 0 1 0 0 1
 1 0 0 0 1 0 0 0 1 1 0 0 1 1 1 0 0 1 1 1 0 0 0 1 1 1 0 1 0 0 0 1 1 0 1 1 0
 1 1 0 0 1 0 1 1 1 1 0 0 0 0 0 0 0 1 0 0 0 1 0 0 0 0 0 1 1 0 0 0 1 1 0 0 0
 1 0 1 0 0 0 0 1 0 0 1 0 0 1 1 1 1 1 0 1 0 0 0 1 1 0 1 1 1 1 0 1 1 1 0 0 1
 0 1 0 1 0 0 0 1 0 0 0 1 0 1 0 0 1 1 1 0 0 1 1 1 1 0 1 1 1 1]


**Kết quả chạy bằng thư viện scikit-learn**

In [None]:
# Reference run with scikit-learn's CategoricalNB on the same split
# (CategoricalNB is already imported in the notebook's import cell).
catenb_skl = CategoricalNB().fit(X_train, y_train)

print('>> Accuracy score : ', catenb_skl.score(X_test,y_test))
print('>> Predicted label : \n', catenb_skl.predict(X_test))

>> Accuracy score :  0.7808988764044944
>> Predicted label : 
 [0 1 1 0 1 0 0 0 1 1 0 1 0 0 0 0 1 0 0 0 1 1 0 1 0 1 0 1 0 0 0 1 0 1 0 0 1
 1 0 0 0 1 0 0 0 1 1 0 0 1 1 1 0 0 1 1 1 0 0 0 1 1 1 0 1 0 0 0 1 1 0 1 1 0
 1 1 0 0 1 0 1 1 1 1 0 0 0 0 0 0 0 1 0 0 0 1 0 0 0 0 0 1 1 0 0 0 1 1 0 0 0
 1 0 1 0 0 0 0 1 0 0 1 0 0 1 1 1 1 1 0 1 0 0 0 1 1 0 1 1 1 1 0 1 1 1 0 0 1
 0 1 0 1 0 0 0 1 0 0 0 1 0 1 0 0 1 1 1 0 0 1 1 1 1 0 1 1 1 1]


### Bài tập 2. Dùng các features: *SibSp*, *Parch* và *Fare* để xây dựng thuật toán *Naive Bayes Gaussian* (Không cần normalize).

In [None]:
class Gaussian_NB:
  """Gaussian Naive Bayes classifier.

  Each feature is modelled, per class, as a normal distribution whose mean
  and variance are estimated from the training rows of that class.

  Parameters
  ----------
  alpha : float, default 1
      Not used by the Gaussian model; kept so existing callers keep working.
  var_smoothing : float, default 1e-9
      Fraction of the largest per-feature variance of the training data that
      is added to every variance when evaluating the density. This keeps the
      density finite when a feature is constant within a class.
  """

  def __init__(self, alpha = 1, var_smoothing = 1e-9):
    # P(c) for each class, in the same order as self.classes_
    self.class_priors = []
    # feature index -> {class label: {'mean': ..., 'variance': ...}}
    self.likelihoods = {}
    # sorted distinct class labels seen in fit
    self.classes_ = np.array([])
    # absolute amount added to every variance; set in fit
    self.epsilon_ = 0.0

    self.train_size = 0
    self.num_feats = 0
    self.alpha = alpha
    self.var_smoothing = var_smoothing

  @staticmethod
  def _as_array(X):
    """Return ``X`` as an ndarray (sparse matrices via ``toarray``, others via ``np.asarray``)."""
    if isinstance(X, np.ndarray):
      return X
    if hasattr(X, 'toarray'):
      return X.toarray()
    return np.asarray(X)

  def _calc_class_prior(self, y):

    """ P(c) - Prior Class Probability """

    self.classes_, class_counts = np.unique(y, return_counts=True)
    # Rebuilt from scratch so that a second fit() does not append duplicates.
    self.class_priors = [count / self.train_size for count in class_counts]

  def _calc_likelihood(self, X, y):

    """ P(x|c) - per-class mean and (unsmoothed) variance of every feature """

    for feat in range(self.num_feats):
      feature_col = X[:,feat]
      stats = {}
      for outcome in np.unique(y):
        in_class = feature_col[y == outcome]
        stats[outcome] = {'mean': in_class.mean(), 'variance': in_class.var()}
      self.likelihoods[feat] = stats

  def get_likelihood(self, X):
    """Return ``{class index: [P(x|c) for each row of X]}``.

    Keys are positions into ``self.classes_`` (0..K-1), not the labels
    themselves. ``self.epsilon_`` is added to each stored variance before
    the normal density is evaluated.
    """
    likelihoods = {}
    for idx, outcome in enumerate(self.classes_):
      # Look the parameters up once per class instead of once per row.
      params = [
        (self.likelihoods[feat][outcome]['mean'],
         self.likelihoods[feat][outcome]['variance'] + self.epsilon_)
        for feat in range(self.num_feats)
      ]
      per_query = []
      for query in X:
        likelihood = 1

        for (mean, var), feat_val in zip(params, query):
          likelihood *= (1/np.sqrt(2*np.pi*var)) * np.exp(-(feat_val - mean)**2 / (2*var))
        per_query.append(likelihood)
      likelihoods[idx] = per_query
    return likelihoods

  def fit(self, X, y):
    """Estimate class priors and per-class feature statistics from ``X`` (n_samples, n_features) and ``y``."""
    X = self._as_array(X)
    y = np.asarray(y)

    self.train_size, self.num_feats = X.shape

    # Drop any state left over from a previous fit.
    self.likelihoods = {}
    # Guard the empty case: max() of an empty variance vector would raise.
    self.epsilon_ = self.var_smoothing * X.var(axis=0).max() if X.size else 0.0

    self._calc_class_prior(y)
    self._calc_likelihood(X,y)

  def predict(self, X):

    """ Calculates Posterior probability P(c|x) and returns the most probable class label per row """

    X = self._as_array(X)

    likelihoods = self.get_likelihood(X)
    class_indices = range(len(self.class_priors))

    results = []
    for idx in range(len(X)):
      # Unnormalised posterior; ties resolve to the first (smallest) class.
      best = max(class_indices, key = lambda k: likelihoods[k][idx] * self.class_priors[k])
      results.append(self.classes_[best])

    return np.array(results)

  def score(self, X, y):
    """Return the mean accuracy of ``predict(X)`` against ``y``."""
    return np.mean(np.asarray(y) == self.predict(X))


In [None]:
# Select the three numerical features used in this exercise and the 'Survived' label.
X = data[["SibSp", "Parch", "Fare"]]
y = data["Survived"]
print('>> Shape of X before preprocessing : ', X.shape)
# Only drop rows with missing values; label encoding is switched off here.
X, y = preprocessing(X,y, encoder=False)
print('>> Shape of X after preprocessing : ', X.shape)

# Hold out 20% of the rows for testing; the fixed random_state keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(X.values, y.values, test_size=0.2, random_state=42)

>> Shape of X before preprocessing :  (891, 3)
>> Shape of X after preprocessing :  (891, 3)


In [None]:
# Fit the hand-written Gaussian model on the training split and evaluate it
# on the held-out split.
gauss_nb = Gaussian_NB()
gauss_nb.fit(X_train, y_train)
y_pred = gauss_nb.predict(X_test)
print('>> Accuracy score : ', gauss_nb.score(X_test, y_test))
# Reuse the predictions computed above instead of running predict() again.
print('>> Predicted label : \n', y_pred)

>> Accuracy score :  0.659217877094972
>> Predicted label : 
 [0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 1 0 1 0 1 0 1 0 1 0 0 0 0 0 0 0 0 1 0 0 1 0 1 0 1
 0 0 0 0 0 0 0 0 1 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0
 0 1 0 1 0 0 0 0 0 0 1 1 0 0 0 0 1 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0
 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 1 0 0 0 0 1 0 0 0 0 0 0 0 0]


**Kết quả chạy bằng thư viện scikit-learn**

In [None]:
# Reference run with scikit-learn's GaussianNB on the same split
# (GaussianNB is already imported in the notebook's import cell).
gaussnb_skl = GaussianNB().fit(X_train, y_train)

print('>> Accuracy score : ', gaussnb_skl.score(X_test,y_test))
print('>> Predicted label : \n', gaussnb_skl.predict(X_test))

>> Accuracy score :  0.659217877094972
>> Predicted label : 
 [0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 1 0 1 0 1 0 1 0 1 0 0 0 0 0 0 0 0 1 0 0 1 0 1 0 1
 0 0 0 0 0 0 0 0 1 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0
 0 1 0 1 0 0 0 0 0 0 1 1 0 0 0 0 1 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0
 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 1 0 0 0 0 1 0 0 0 0 0 0 0 0]


### Bài tập 3. Từ bộ dữ liệu liên minh hãy xây dựng thuật toán *Naive Bayes* bằng cách dùng tất cả features (mix cả *Categorical* và *Gaussian*). Lưu ý: Không cần chia tập train, test. Dự đoán thẳng trên tập train.

In [None]:
class Mixed_NB:
  """Naive Bayes classifier over a mix of categorical and numerical features.

  The categorical columns are handled by a ``Categorical_NB`` model and the
  numerical columns by a ``Gaussian_NB`` model; their per-class likelihoods
  are multiplied together with the class prior.

  Parameters
  ----------
  categorical_features : list of int
      Column positions in ``X`` to treat as categorical.
  numerical_features : list of int
      Column positions in ``X`` to treat as numerical.
  """

  def __init__(self, categorical_features, numerical_features):
    self.cate_features = categorical_features
    self.num_features = numerical_features
    # P(c) for each class, in the same order as self.classes_
    self.class_priors = []
    # sorted distinct class labels seen in fit
    self.classes_ = np.array([])
    self.train_size = 0
    self.cate_model = None
    self.num_model = None

  @staticmethod
  def _as_array(X):
    """Return ``X`` as an ndarray (sparse matrices via ``toarray``, others via ``np.asarray``)."""
    if isinstance(X, np.ndarray):
      return X
    if hasattr(X, 'toarray'):
      return X.toarray()
    return np.asarray(X)

  def _calc_class_prior(self, y):

    """ P(c) - Prior Class Probability """

    self.classes_, class_counts = np.unique(y, return_counts=True)
    # Rebuilt from scratch: appending on every fit() would leave more priors
    # than classes and make predict() index past the likelihood tables.
    self.class_priors = [count / self.train_size for count in class_counts]

  def fit(self, X, y):
    """Fit the class priors and both sub-models on ``X`` (n_samples, n_features) and ``y``."""
    X = self._as_array(X)
    y = np.asarray(y)

    self.train_size, _ = X.shape

    cate_X = X[:, self.cate_features]
    num_X = X[:, self.num_features]

    self._calc_class_prior(y)

    self.cate_model = Categorical_NB()
    self.cate_model.fit(cate_X, y)

    self.num_model = Gaussian_NB()
    self.num_model.fit(num_X, y)

  def predict(self, X):

    """ Calculates Posterior probability P(c|x) and returns the most probable class label per row """

    X = self._as_array(X)
    cate_X = X[:, self.cate_features]
    num_X = X[:, self.num_features]

    # Both tables are keyed by class position 0..K-1.
    likelihoods_num = self.num_model.get_likelihood(num_X)
    likelihoods_cate = self.cate_model.get_likelihood(cate_X)
    class_indices = range(len(self.class_priors))

    def posterior(k, idx):
      # Unnormalised posterior of class position k for row idx.
      return likelihoods_num[k][idx] * likelihoods_cate[k][idx] * self.class_priors[k]

    results = []
    for idx in range(len(X)):
      # Ties resolve to the first (smallest) class.
      best = max(class_indices, key = lambda k: posterior(k, idx))
      results.append(self.classes_[best])

    return np.array(results)

  def score(self, X, y):
    """Return the mean accuracy of ``predict(X)`` against ``y``."""
    return np.mean(np.asarray(y) == self.predict(X))


In [None]:
# Load the 'lienminh' CSV from the course repository and preview the first rows.
df = pd.read_csv("https://raw.githubusercontent.com/dinhvietcuong1996/Lab-MachineLearningCourse/master/Lab04/lienminh.csv")
df.head()

Unnamed: 0,killsDiff,minionsKilledDiff,wardPlacedDiff,firstBlood,heralds,dragons,teamWins
0,3,-2,13,blue,none,none,red
1,0,-66,0,red,red,red,red
2,-4,-17,0,red,none,blue,red
3,-1,-34,28,red,blue,none,red
4,0,-15,58,red,none,red,red


In [None]:
# Separate the feature columns from the 'teamWins' label
X = df.drop(['teamWins'], axis=1)
y = df['teamWins']
 
# Record the column positions of categorical vs. numerical features.
# This has to happen before preprocessing, which label-encodes the object columns.
cate_cols, num_cols = classify_feature(X)
 
# Drop rows with missing values and label-encode the object feature columns
print('>> Shape of X before preprocessing : ', X.shape)
X, y = preprocessing(X,y)
print('>> Shape of X after preprocessing : ', X.shape)
X = X.to_numpy()
# Encode the label values with a fresh LabelEncoder
y = LabelEncoder().fit_transform(y)

>> Shape of X before preprocessing :  (9879, 6)
>> Shape of X after preprocessing :  (9879, 6)


In [None]:
# Fit the mixed model and evaluate it on the same data it was trained on,
# as the exercise asks (no train/test split).
mixed_nb = Mixed_NB(categorical_features = cate_cols, numerical_features = num_cols)
mixed_nb.fit(X, y)
y_pred = mixed_nb.predict(X)
print('>> Accuracy score : ', mixed_nb.score(X, y))
# Reuse the predictions computed above instead of running predict() again.
print('>> Predicted label : ', y_pred)

>> Accuracy score :  0.7116104868913857
>> Predicted label :  [0 1 1 ... 1 1 0]
