# Bayesian Learning

## Importing libraries

In [1]:
import numpy as np
import pandas as pd
import random

from sklearn.naive_bayes import GaussianNB
from sklearn.metrics import accuracy_score
from sklearn.preprocessing import LabelEncoder
from statistics import mean 

import matplotlib.pyplot as plt
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler,MinMaxScaler
%matplotlib inline

## 1. Divide into training set and test set

In [2]:
df = pd.read_csv('Train_F.csv')

In [3]:
def train_test_split(df, test_size):
    
    if isinstance(test_size, float):
        test_size = round(test_size * len(df))

    indices = df.index.tolist()
    test_indices = random.sample(population=indices, k=test_size)

    test_df = df.loc[test_indices]
    train_df = df.drop(test_indices)
    
    return train_df, test_df

In [4]:
df.head()

Unnamed: 0,countyfips,countyname,statename,predicted_deaths_by_october_06,predicted_deaths_by_october_07,predicted_deaths_by_october_08,predicted_deaths_by_october_09,predicted_deaths_by_october_10,predicted_deaths_by_october_11,predicted_deaths_by_october_12,severity_county_5-day
0,36047,Kings,NY,7348.2,7356.8,7366.0,7375.6,7385.8,7396.5,7407.8,3
1,36081,Queens,NY,7274.1,7283.5,7293.3,7303.7,7314.6,7326.1,7338.2,3
2,6037,Los Angeles,CA,6667.5,6679.6,6691.9,6704.5,6717.4,6730.6,6744.3,3
3,17031,Cook,IL,5273.9,5287.0,5300.5,5314.3,5328.6,5343.3,5358.5,3
4,36005,Bronx,NY,4970.3,4982.1,4994.5,5007.4,5021.1,5035.3,5050.2,3


In [5]:
df['severity_county_5-day'] = df['severity_county_5-day'].replace([1],0)
df['severity_county_5-day'] = df['severity_county_5-day'].replace([2],1)
df['severity_county_5-day'] = df['severity_county_5-day'].replace([3],2)

In [6]:
df.head()

Unnamed: 0,countyfips,countyname,statename,predicted_deaths_by_october_06,predicted_deaths_by_october_07,predicted_deaths_by_october_08,predicted_deaths_by_october_09,predicted_deaths_by_october_10,predicted_deaths_by_october_11,predicted_deaths_by_october_12,severity_county_5-day
0,36047,Kings,NY,7348.2,7356.8,7366.0,7375.6,7385.8,7396.5,7407.8,2
1,36081,Queens,NY,7274.1,7283.5,7293.3,7303.7,7314.6,7326.1,7338.2,2
2,6037,Los Angeles,CA,6667.5,6679.6,6691.9,6704.5,6717.4,6730.6,6744.3,2
3,17031,Cook,IL,5273.9,5287.0,5300.5,5314.3,5328.6,5343.3,5358.5,2
4,36005,Bronx,NY,4970.3,4982.1,4994.5,5007.4,5021.1,5035.3,5050.2,2


In [7]:
train_df, test_df = train_test_split(df, 0.2)
X_train_df = train_df.iloc[:,:-1]
y_train_df = train_df.iloc[:, -1]
X_test_df = test_df.iloc[:,:-1]
y_test_df = test_df.iloc[:, -1]

### 1.1 Handle Missing Values

In [8]:
df = df.replace('?',np.nan)
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3142 entries, 0 to 3141
Data columns (total 11 columns):
 #   Column                          Non-Null Count  Dtype  
---  ------                          --------------  -----  
 0   countyfips                      3142 non-null   int64  
 1   countyname                      3142 non-null   object 
 2   statename                       3142 non-null   object 
 3   predicted_deaths_by_october_06  3142 non-null   float64
 4   predicted_deaths_by_october_07  3142 non-null   float64
 5   predicted_deaths_by_october_08  3142 non-null   float64
 6   predicted_deaths_by_october_09  3142 non-null   float64
 7   predicted_deaths_by_october_10  3142 non-null   float64
 8   predicted_deaths_by_october_11  3142 non-null   float64
 9   predicted_deaths_by_october_12  3142 non-null   float64
 10  severity_county_5-day           3142 non-null   int64  
dtypes: float64(7), int64(2), object(2)
memory usage: 270.1+ KB


Clearly, there are no missing values in any columns

### 1.2 Encoding Categorical Data

Adding new columns containing numerical labels instead of categorical data.

In [9]:
labelencoder = LabelEncoder()

X_train_df['countyname_cat'] = labelencoder.fit_transform(X_train_df['countyname'])
X_train_df['statename_cat'] = labelencoder.fit_transform(X_train_df['statename'])
X_test_df['countyname_cat'] = labelencoder.fit_transform(X_test_df['countyname'])
X_test_df['statename_cat'] = labelencoder.fit_transform(X_test_df['statename'])

X_train_df = X_train_df.drop(['countyname','statename'], axis = 1) 
X_test_df = X_test_df.drop(['countyname','statename'],axis = 1)

In [10]:
print(X_train_df.shape,X_test_df.shape)
new_train_df = pd.concat([X_train_df, y_train_df], axis=1)
new_test_df = pd.concat([X_test_df, y_test_df],axis = 1)
print(new_train_df.shape,new_test_df.shape)
new_test_df.head()

(2514, 10) (628, 10)
(2514, 11) (628, 11)


Unnamed: 0,countyfips,predicted_deaths_by_october_06,predicted_deaths_by_october_07,predicted_deaths_by_october_08,predicted_deaths_by_october_09,predicted_deaths_by_october_10,predicted_deaths_by_october_11,predicted_deaths_by_october_12,countyname_cat,statename_cat,severity_county_5-day
2027,20061,3.4,3.7,4.1,4.5,5.0,5.4,5.8,174,13,2
2361,19023,2.3,2.5,2.8,3.0,3.2,3.4,3.7,63,9,0
2470,25019,1.3,1.5,1.8,2.1,2.4,2.7,3.1,321,16,0
2596,46087,1.3,1.5,1.8,2.0,2.3,2.6,2.9,292,37,0
1933,27165,4.3,4.6,4.9,5.2,5.5,5.8,6.2,477,20,1


In [11]:
new_X_train_df = new_train_df.iloc[:,:-1]
new_y_train_df = new_train_df.iloc[:, -1]
new_X_test_df = new_test_df.iloc[:,:-1]
new_y_test_df = new_test_df.iloc[:, -1]

### 1.3 Naive Bayes Classifier 

Splitting the train dataframe into 5 equals parts randomly for 5 fold cross-validation

In [12]:
_,fold1_df = train_test_split(new_train_df, 0.2)
_,__ = train_test_split(_,0.5)
fold2_df,fold3_df = train_test_split(_,0.5)
fold4_df,fold5_df = train_test_split(__,0.5)

In [13]:
fold1_df.shape,fold2_df.shape,fold3_df.shape,fold4_df.shape,fold5_df.shape

((503, 11), (503, 11), (502, 11), (503, 11), (503, 11))

In [14]:
frames = [fold2_df,fold3_df,fold4_df,fold5_df]
train1_df = pd.concat(frames)

frames = [fold1_df,fold3_df,fold4_df,fold5_df]
train2_df = pd.concat(frames)

frames = [fold1_df,fold2_df,fold4_df,fold5_df]
train3_df = pd.concat(frames)

frames = [fold1_df,fold2_df,fold3_df,fold5_df]
train4_df = pd.concat(frames)

frames = [fold1_df,fold2_df,fold3_df,fold4_df]
train5_df = pd.concat(frames)

In [15]:
train1_df.shape,train2_df.shape,train3_df.shape,train4_df.shape,train5_df.shape,

((2011, 11), (2011, 11), (2012, 11), (2011, 11), (2011, 11))

In [16]:
X_train1_df = train1_df.iloc[:,:-1]
y_train1_df = train1_df.iloc[:, -1]
X_test1_df = fold1_df.iloc[:,:-1]
y_test1_df = fold1_df.iloc[:, -1]

X_train2_df = train2_df.iloc[:,:-1]
y_train2_df = train2_df.iloc[:, -1]
X_test2_df = fold2_df.iloc[:,:-1]
y_test2_df = fold2_df.iloc[:, -1]

X_train3_df = train3_df.iloc[:,:-1]
y_train3_df = train3_df.iloc[:, -1]
X_test3_df = fold3_df.iloc[:,:-1]
y_test3_df = fold3_df.iloc[:, -1]

X_train4_df = train4_df.iloc[:,:-1]
y_train4_df = train4_df.iloc[:, -1]
X_test4_df = fold4_df.iloc[:,:-1]
y_test4_df = fold4_df.iloc[:, -1]

X_train5_df = train5_df.iloc[:,:-1]
y_train5_df = train5_df.iloc[:, -1]
X_test5_df = fold5_df.iloc[:,:-1]
y_test5_df = fold5_df.iloc[:, -1]

In [17]:
X_train_dfs = [X_train1_df,X_train2_df,X_train3_df,X_train4_df,X_train5_df]

In [18]:
y_train_dfs = [y_train1_df,y_train2_df,y_train3_df,y_train4_df,y_train5_df]

In [19]:
X_test_dfs = [X_test1_df,X_test2_df,X_test3_df,X_test4_df,X_test5_df]

In [20]:
y_test_dfs = [y_test1_df,y_test2_df,y_test3_df,y_test4_df,y_test5_df]

In [21]:
def get_params(X_train, y_train): 
    """
    Function to get the unique classes, number of classes and number of features in training data
    """
    num_examples, num_features = X_train.shape
    num_classes = len(np.unique(y_train))
    return num_examples, num_features, num_classes

In [22]:
num_examples, num_features, num_classes = get_params(new_X_train_df, new_y_train_df)
print(num_examples, num_features, num_classes)

2514 10 3


In [23]:
def get_stats_by_class(X_train, y_train, num_examples=num_examples, num_classes=num_classes): 
    """
    Get stats of dataset by the class
    """
    # dictionaries to store stats
    class_mean = {}
    class_var = {} 
    class_prior = {} 
    
    # loop through each class and get mean, variance and prior by class
    for cls in range(num_classes): 
        X_cls = X_train[y_train == cls]
        class_mean[str(cls)] = np.mean(X_cls, axis=0)
        class_var[str(cls)] = np.var(X_cls, axis=0)
        class_prior[str(cls)] = X_cls.shape[0] / num_examples
    return class_mean, class_var, class_prior

In [24]:
cm, var, cp = get_stats_by_class(new_X_train_df, new_y_train_df)
print(f"mean: {cm}\n\nvariance: {var}\n\npriors: {cp}")

mean: {'0': countyfips                        29758.515535
predicted_deaths_by_october_06        5.999540
predicted_deaths_by_october_07        6.160299
predicted_deaths_by_october_08        6.313119
predicted_deaths_by_october_09        6.467779
predicted_deaths_by_october_10        6.629229
predicted_deaths_by_october_11        6.796778
predicted_deaths_by_october_12        6.970656
countyname_cat                      787.649022
statename_cat                        25.538550
dtype: float64, '1': countyfips                        31676.261671
predicted_deaths_by_october_06       36.498649
predicted_deaths_by_october_07       36.693857
predicted_deaths_by_october_08       36.894472
predicted_deaths_by_october_09       37.097420
predicted_deaths_by_october_10       37.301843
predicted_deaths_by_october_11       37.507125
predicted_deaths_by_october_12       37.720762
countyname_cat                      779.702703
statename_cat                        27.191646
dtype: float64, '2': county

In [25]:
def gaussian_density_function(X, mean, std, num_examples=num_examples, num_features=num_features, eps=1e-6): 
    num_examples, num_features = X.shape
    const = -num_features/2 * np.log(2*np.pi) - 0.5 * np.sum(np.log(std + eps))
    probs = 0.5 * np.sum(np.power(X - mean, 2)/(std + eps), 1)
    return const - probs

In [26]:
gaussian_density_function(new_X_train_df, cm[str(0)], var[str(0)])

0      -2.078093e+06
1      -2.037784e+06
2      -1.716774e+06
3      -1.078356e+06
4      -9.574539e+05
            ...     
3136   -4.814617e+01
3137   -4.519264e+01
3138   -4.584250e+01
3139   -4.633950e+01
3140   -4.675585e+01
Length: 2514, dtype: float64

In [27]:
def class_probabilities(X, class_mean, class_var, class_prior, num_classes=num_classes):
    """
    calculate the probability of each class given the data
    """
    num_examples = X.shape[0]
    probs = np.zeros((num_examples, num_classes))

    for cls in range(num_classes): 
        prior = class_prior[str(cls)]
        probs_cls = gaussian_density_function(X, class_mean[str(cls)], class_var[str(cls)])
        probs[:, cls] = probs_cls + np.log(prior)
    return probs

In [28]:
probs = class_probabilities(new_X_train_df, cm, var, cp)
probs

array([[-2.07809432e+06, -2.14476556e+04, -5.88187556e+02],
       [-2.03778478e+06, -2.10311539e+04, -5.78358531e+02],
       [-1.71677532e+06, -1.77157895e+04, -4.98842621e+02],
       ...,
       [-4.69047891e+01, -6.22859360e+01, -7.50791167e+01],
       [-4.74017839e+01, -6.28785556e+01, -7.54824091e+01],
       [-4.78181413e+01, -6.28173840e+01, -7.57549310e+01]])

In [29]:
def predict(X_test, X_train, y_train): 
    num_examples, num_features, num_classes = get_params(X_test, y_train)
    class_mean, class_std, class_prior = get_stats_by_class(X_train, y_train)
    probs = class_probabilities(X_test, class_mean, class_std, class_prior)
    return np.argmax(probs, 1)

In [30]:
my_preds = predict(new_X_test_df, new_X_train_df, new_y_train_df)

In [31]:
my_preds

array([0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 1, 0, 0, 0, 0, 2, 0, 0, 1, 0,
       1, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 2, 0, 0, 1, 0,
       0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 2, 1, 0, 0, 0, 0,
       0, 1, 1, 0, 1, 0, 2, 0, 0, 0, 1, 2, 1, 1, 1, 0, 1, 0, 0, 2, 0, 1,
       0, 2, 0, 0, 0, 0, 0, 1, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 2, 1,
       0, 0, 1, 1, 0, 0, 0, 0, 2, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 2, 0, 0,
       0, 0, 1, 0, 1, 0, 0, 1, 0, 2, 0, 0, 1, 0, 0, 0, 2, 1, 0, 0, 0, 1,
       0, 0, 1, 0, 0, 0, 1, 1, 1, 0, 2, 0, 1, 0, 0, 0, 0, 0, 0, 2, 0, 1,
       0, 0, 2, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 1, 1, 0, 0, 0,
       1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 2, 1, 0, 0, 1, 0, 0, 1, 1, 0, 0,
       0, 1, 1, 0, 0, 1, 0, 1, 0, 0, 1, 0, 1, 1, 0, 0, 1, 0, 0, 0, 0, 0,
       0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 1, 0, 0, 0, 2, 1,
       0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0,
       0, 0, 0, 1, 0, 1, 0, 0, 1, 1, 0, 1, 0, 2, 1,

In [32]:
print(f"my predictions accuracy:{accuracy_score(new_y_test_df, my_preds)}")

my predictions accuracy:0.47611464968152867


In [33]:
my_preds_acc = []
for i in range(5):
    my_preds = predict(X_test_dfs[i], X_train_dfs[i], y_train_dfs[i])
    my_preds_acc_temp = accuracy_score(y_test_dfs[i], my_preds)
    my_preds_acc.append(my_preds_acc_temp);

In [34]:
my_preds_acc

[0.4990059642147117,
 0.40159045725646125,
 0.4262948207171315,
 0.4671968190854871,
 0.4393638170974155]

In [35]:
mean(my_preds_acc)

0.4466903756742414

## 2. PCA

# Outlier Removal

In [36]:
from scipy import stats

In [37]:
import numpy as np

In [38]:
z=np.abs(stats.zscore(new_X_train_df))

In [42]:
threshold = 3
print(np.where(z > 3))

(array([ 0,  0,  0,  0,  0,  0,  0,  1,  1,  1,  1,  1,  1,  1,  2,  2,  2,
        2,  2,  2,  2,  3,  3,  3,  3,  3,  3,  3,  4,  4,  4,  4,  4,  4,
        4,  5,  5,  5,  5,  5,  5,  5,  6,  6,  6,  6,  6,  6,  6,  7,  7,
        7,  7,  7,  7,  7,  8,  8,  8,  8,  8,  8,  8,  9,  9,  9,  9,  9,
        9,  9, 10, 10, 10, 10, 10, 10, 10, 11, 11, 11, 11, 11, 11, 11, 12,
       12, 12, 12, 12, 12, 12, 13, 13, 13, 13, 13, 13, 13, 14, 14, 14, 14,
       14, 14, 14, 15, 15, 15, 15, 15, 15, 15, 16, 16, 16, 16, 16, 16, 16,
       17, 17, 17, 17, 17, 17, 17, 18, 18, 18, 18, 18, 18, 18, 19, 19, 19,
       19, 19, 19, 19, 20, 20, 20, 20, 20, 20, 20, 21, 21, 21, 21, 21, 21,
       21, 22, 22, 22, 22, 22, 22, 22, 23, 23, 23, 23, 23, 23, 23, 24, 24,
       24, 24, 24, 24, 24, 25, 25, 25, 25, 25, 25, 25, 26, 26]), array([1, 2, 3, 4, 5, 6, 7, 1, 2, 3, 4, 5, 6, 7, 1, 2, 3, 4, 5, 6, 7, 1,
       2, 3, 4, 5, 6, 7, 1, 2, 3, 4, 5, 6, 7, 1, 2, 3, 4, 5, 6, 7, 1, 2,
       3, 4, 5, 6, 7, 1, 2, 3, 4, 5, 6

In [43]:
new_X_train_df_o = new_X_train_df[(z < 3).all(axis=1)]

In [44]:
new_X_train_df.shape

(2514, 10)

In [45]:
new_X_train_df_o.shape

(2487, 10)

# Sequential Background Selection Method

In [48]:
def simple_crit_func(feat_sub):
    """ Returns sum of numerical values of an input list. """ 
    return sum(feat_sub)

# Example:
simple_crit_func([1,2,4])

7

In [49]:
from copy import deepcopy

def seq_backw_select(features, max_k, criterion_func, print_steps=False):
    """
    Implementation of a Sequential Backward Selection algorithm.
    
    Keyword Arguments:
        features (list): The feature space as a list of features.
        max_k: Termination criterion; the size of the returned feature subset.
        criterion_func (function): Function that is used to evaluate the
            performance of the feature subset.
        print_steps (bool): Prints the algorithm procedure if True.
        
    Returns the selected feature subset, a list of features of length max_k.

    """
    # Initialization
    feat_sub = deepcopy(features)
    k = len(feat_sub)
    i = 0

    while True:
        
        # Exclusion step
        if print_steps:
            print('\nExclusion from feature subset', feat_sub)
        worst_feat = len(feat_sub)-1
        worst_feat_val = feat_sub[worst_feat]
        crit_func_max = criterion_func(feat_sub[:-1]) 

        for i in reversed(range(0,len(feat_sub)-1)):
            crit_func_eval = criterion_func(feat_sub[:i] + feat_sub[i+1:])
            if crit_func_eval > crit_func_max:
                worst_feat, crit_func_max = i, crit_func_eval
                worst_feat_val = feat_sub[worst_feat]
        del feat_sub[worst_feat]
        if print_steps:
            print('exclude: {} -> feature subset: {}'.format(worst_feat_val, feat_sub))
        
        # Termination condition
        k = len(feat_sub)
        if k == max_k:
            break
                
    return feat_sub

In [50]:
def example_seq_backw_select():
    ex_features = [1,2,3,4,5,6,7,8,9,10]
    res_backw = seq_backw_select(features=ex_features, max_k=3,\
                                 criterion_func=simple_crit_func, print_steps=True)  
    return (res_backw)
    
# Run example
res_backw = example_seq_backw_select()
print('\nRESULT: [6, 3, 1, 6, 8, 2, 3, 7, 9, 1] ->', res_backw)


Exclusion from feature subset [1, 2, 3, 4, 5, 6, 7, 8, 9, 10]
exclude: 1 -> feature subset: [2, 3, 4, 5, 6, 7, 8, 9, 10]

Exclusion from feature subset [2, 3, 4, 5, 6, 7, 8, 9, 10]
exclude: 2 -> feature subset: [3, 4, 5, 6, 7, 8, 9, 10]

Exclusion from feature subset [3, 4, 5, 6, 7, 8, 9, 10]
exclude: 3 -> feature subset: [4, 5, 6, 7, 8, 9, 10]

Exclusion from feature subset [4, 5, 6, 7, 8, 9, 10]
exclude: 4 -> feature subset: [5, 6, 7, 8, 9, 10]

Exclusion from feature subset [5, 6, 7, 8, 9, 10]
exclude: 5 -> feature subset: [6, 7, 8, 9, 10]

Exclusion from feature subset [6, 7, 8, 9, 10]
exclude: 6 -> feature subset: [7, 8, 9, 10]

Exclusion from feature subset [7, 8, 9, 10]
exclude: 7 -> feature subset: [8, 9, 10]

RESULT: [6, 3, 1, 6, 8, 2, 3, 7, 9, 1] -> [8, 9, 10]


In [51]:
def predict1(X_test, X_train, y_train,feat_sub):
    feat_sub1 = deepcopy(feat_sub)
    X_test_copy=X_test.iloc[feat_sub1]
    X_train_copy=X_train.iloc[feat_sub1]
    num_examples, num_features, num_classes = get_params(X_test_copy, y_train)
    class_mean, class_std, class_prior = get_stats_by_class(X_train_copy, y_train)
    probs = class_probabilities(X_test_copy, class_mean, class_std, class_prior)
    return np.argmax(probs, 1)

In [None]:
my_preds = predict(new_X_test_df, new_X_train_df, new_y_train_df,)

In [100]:
new_X_train_df_copy=new_X_train_df

In [101]:
ex_features = [1,2,3,4,5,6,7,8,9,10]

In [109]:
new_X_train_df.iloc[:,ex_features]

IndexError: positional indexers are out-of-bounds

In [77]:
new_X_train_df_copy.iloc['countyname']

TypeError: Cannot index by location index with a non-integer key

In [54]:
new_X_train_df_copy.shape

(2514, 10)

In [56]:
new_X_train_df_copy.tail()

Unnamed: 0,countyfips,predicted_deaths_by_october_06,predicted_deaths_by_october_07,predicted_deaths_by_october_08,predicted_deaths_by_october_09,predicted_deaths_by_october_10,predicted_deaths_by_october_11,predicted_deaths_by_october_12,countyname_cat,statename_cat
3135,41069,0.1,0.3,0.5,0.7,1.0,1.3,1.7,1492,36
3136,2100,0.1,0.3,0.5,0.7,0.9,1.2,1.6,583,0
3137,20141,0.1,0.3,0.4,0.6,0.8,1.1,1.4,1031,15
3139,21237,0.1,0.3,0.4,0.6,0.8,1.1,1.4,1520,16
3140,49009,0.1,0.3,0.5,0.7,1.0,1.4,1.9,369,43
