#Pouya Shaeri
#400422105
This is **part 2** of the Data Mining Project

# **TASK 0** : Reading Data and Preprocess

In [1]:
import pandas as pd
import numpy as np

In [2]:
from google.colab import drive
drive.mount("/content/gdrive")

Mounted at /content/gdrive


In [3]:
df_original = pd.read_csv('/content/gdrive/MyDrive/Colab Notebooks/Data Mining/Project2/Part2/Dataset/heart.csv')
df = df_original.copy()
df

Unnamed: 0,age,sex,cp,trestbps,chol,fbs,restecg,thalach,exang,oldpeak,slope,ca,thal,target
0,63,1,3,145,233,1,0,150,0,2.3,0,0,1,1
1,37,1,2,130,250,0,1,187,0,3.5,0,0,2,1
2,41,0,1,130,204,0,0,172,0,1.4,2,0,2,1
3,56,1,1,120,236,0,1,178,0,0.8,2,0,2,1
4,57,0,0,120,354,0,1,163,1,0.6,2,0,2,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
298,57,0,0,140,241,0,1,123,1,0.2,1,0,3,0
299,45,1,3,110,264,0,1,132,0,1.2,1,0,3,0
300,68,1,0,144,193,1,1,141,0,3.4,1,2,3,0
301,57,1,0,130,131,0,1,115,1,1.2,1,1,3,0


In [4]:
df.isna().sum()

age         0
sex         0
cp          0
trestbps    0
chol        0
fbs         0
restecg     0
thalach     0
exang       0
oldpeak     0
slope       0
ca          0
thal        0
target      0
dtype: int64

**Removing Outlier Records by Z-Score**

In [5]:
# Remove outlier instances (considering the Numerical features)
# An instance is outlier if it's value is 3*std higher than mean (z-score = 3)
for col in df.columns:
  if df[col].dtype == 'int64' or df[col].dtype == 'float64':
    print('before', col, len(df), df[col].mean(), df[col].std())
    upper_range = df[col].mean() + 3 * df[col].std()
    lower_range = df[col].mean() - 3 * df[col].std()

    df = df[(df[col]>= lower_range) & (df[col]<= upper_range)]

    print('after', col, len(df), df[col].mean(), df[col].std())


before age 303 54.366336633663366 9.082100989837858
after age 303 54.366336633663366 9.082100989837858
before sex 303 0.6831683168316832 0.4660108233396251
after sex 303 0.6831683168316832 0.4660108233396251
before cp 303 0.966996699669967 1.0320524894832992
after cp 303 0.966996699669967 1.0320524894832992
before trestbps 303 131.62376237623764 17.53814281351709
after trestbps 301 131.19601328903656 16.784460592657993
before chol 301 246.0033222591362 51.90340376371416
after chol 297 243.26599326599327 45.881530263983336
before fbs 297 0.1447811447811448 0.352473934125454
after fbs 297 0.1447811447811448 0.352473934125454
before restecg 297 0.5387205387205387 0.52570645456117
after restecg 297 0.5387205387205387 0.52570645456117
before thalach 297 149.4747474747475 22.951186494535786
after thalach 296 149.73986486486487 22.52990250978288
before exang 296 0.3277027027027027 0.47017067884642416
after exang 296 0.3277027027027027 0.47017067884642416
before oldpeak 296 1.0192567567567568 

# **TASK 2 & 3**

In [6]:
import math

ATTR_NAMES = ["thalach", "trestbps", "chol"]
FIELD_NAMES = ["Num"] + ATTR_NAMES + ["target"]

class GNB_classifier(object):

    def __init__(self, training_set, test_set):
        self.__training_set = training_set
        self.__test_set = test_set
        self.__n = len(self.__training_set)
        self.__prior()
        self.__calculate_mean_variance()

    def __prior(self):
        counts = self.__training_set["target"].value_counts().to_dict()
        self.__priors = {(k, v / self.__n) for k, v in counts.items()}

    def __calculate_mean_variance(self):
        self.__mean_variance = {}
        for c in self.__training_set["target"].unique():
            filtered_set = self.__training_set[
                (self.__training_set['target'] == c)]
            m_v = {}
            for attr_name in ATTR_NAMES:
                m_v[attr_name] = []
                m_v[attr_name].append(filtered_set[attr_name].mean())
                m_v[attr_name].append(
                    math.pow(filtered_set[attr_name].std(), 2))
            self.__mean_variance[c] = m_v

    @staticmethod
    def __calculate_probability(x, mean, variance):
        exponent = math.exp(-(math.pow(x - mean, 2) / (2 * variance)))
        return (1 / (math.sqrt(2 * math.pi * variance))) * exponent

    def predict(self):
        predictions = {}
        for _, row in self.__test_set.iterrows():
            results = {}
            for k, v in self.__priors:
                p = 0
                for attr_name in ATTR_NAMES:
                    prob = self.__calculate_probability(row[attr_name], self.__mean_variance[
                        k][attr_name][0], self.__mean_variance[k][attr_name][1])
                    if prob > 0:
                        p += math.log(prob)
                results[k] = math.log(v) + p
            predictions[int(row._name)] = max([key for key in results.keys() if results[
                key] == results[max(results, key=results.get)]])
        return predictions

    def print_info(self):
        print("Priors for each class: ", self.__priors)
        print("Means and variance for each class: ", self.__mean_variance)

        
def calculate_accuracy(test_set, predictions):
    correct = 0
    for _, t in test_set.iterrows():
        if t["target"] == predictions[t._name]:
            correct += 1
    return (correct / len(test_set)) * 100.0

In [7]:
from sklearn.model_selection import train_test_split

y = df.target.values
x_train, x_test, y_train, y_test = train_test_split(df,y,test_size = 0.2,random_state=42)

In [8]:
classifier = GNB_classifier(x_train, x_test)
classifier.print_info()

Priors for each class:  {(1, 0.537117903930131), (0, 0.462882096069869)}
Means and variance for each class:  {1: {'thalach': [158.609756097561, 386.3054778088765], 'trestbps': [129.08130081300814, 251.48513927762207], 'chol': [244.7560975609756, 2221.317073170731]}, 0: {'thalach': [139.78301886792454, 479.3715184186884], 'trestbps': [134.06603773584905, 324.5384546271338], 'chol': [247.10377358490567, 2161.6557951482478]}}


In [9]:
predictions = classifier.predict()
print("Predictions in the form (number, predicted class): ", predictions)

Predictions in the form (number, predicted class):  {9: 1, 269: 0, 148: 1, 224: 0, 241: 0, 205: 1, 101: 0, 75: 1, 113: 1, 34: 0, 143: 1, 216: 0, 147: 1, 282: 0, 5: 1, 234: 0, 249: 1, 268: 0, 46: 1, 285: 0, 250: 0, 58: 1, 159: 1, 160: 1, 117: 1, 112: 0, 156: 1, 150: 0, 115: 1, 43: 1, 116: 1, 151: 0, 180: 0, 222: 1, 47: 1, 84: 0, 65: 1, 122: 1, 77: 1, 172: 1, 152: 0, 31: 1, 22: 1, 24: 1, 191: 0, 267: 0, 87: 1, 183: 1, 284: 0, 97: 1, 214: 1, 179: 0, 185: 1, 171: 1, 6: 1, 79: 1, 201: 1, 62: 1}


In [10]:
accuracy = calculate_accuracy(x_test, predictions)
print("Accuracy : ", accuracy)

Accuracy :  72.41379310344827


In [11]:
from sklearn.metrics import precision_score, recall_score, f1_score

y_pred = [ value for key,value in predictions.items()]

print("Precision:")
my_p_score = precision_score(y_test, y_pred)  
my_p_score * 100

Precision:


75.67567567567568

In [12]:
print("Recall:")
my_recall_score = recall_score(y_test, y_pred)
my_recall_score * 100

Recall:


80.0

In [13]:
print("F1 Score:")
my_f1_score = f1_score(y_test, y_pred)
my_f1_score * 100

F1 Score:


77.77777777777779

# **TASK 4 & 5**

In [14]:
from sklearn.naive_bayes import GaussianNB
from sklearn.metrics import accuracy_score

x = df[["thalach", "trestbps", "chol"]]
y = df.target.values
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size = 0.2,random_state=0)

model = GaussianNB()
model.fit(x_train, y_train)
sk_y_pred = model.predict(x_test)

In [15]:
accuracy = accuracy_score(y_test, sk_y_pred)
print("Accuracy : ", accuracy * 100)

Accuracy :  72.41379310344827


In [16]:
print("Precision:")
p_score = precision_score(y_test, sk_y_pred)  
print(p_score * 100)

print("\nMy Precision:")
print(my_p_score * 100)

Precision:
67.56756756756756

My Precision:
75.67567567567568


In [17]:
print("Recall:")
recall_score = recall_score(y_test, sk_y_pred)
print(recall_score * 100)

print("\nMy Recall:")
print(my_recall_score * 100)

Recall:
86.20689655172413

My Recall:
80.0


In [18]:
print("F1 Score:")
f1_score = f1_score(y_test, sk_y_pred)
print(f1_score * 100)

print("\nMy F1 Score:")
print(my_f1_score * 100)

F1 Score:
75.75757575757575

My F1 Score:
77.77777777777779
