In [1]:
import pandas as pd
import math
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

# Load the hepatitis data set.
data = pd.read_csv('hepatitis_data.csv')

# Columns whose missing values are replaced by that column's own mean.
_IMPUTED_COLUMNS = [
    "class", "age", "sex", "steroid", "antivirals", "fatigue", "malaise",
    "anorexia", "liver big", "liver firm", "spleen palpable", "spiders",
    "ascites", "varices", "bilirubin", "alk phosphate", "sgot", "albumin",
    "protime", "histology",
]

# Fill the missing values of every column with the mean of that column.
# NOTE(review): "class" is mean-filled like every other column; if it is the
# target label, a filled-in mean would generally not equal any real label --
# confirm that this column never contains missing values.
_column_means = {}
for _column in _IMPUTED_COLUMNS:
    _column_means[_column] = data[_column].mean()
    data[_column] = data[_column].fillna(_column_means[_column])

# Keep the individual mean variables available: they are printed further down.
(
    Class, Age, Sex, Steroid, Antivirals, Fatigue, Malaise, Anorexia,
    Liver_big, Liver_firm, Spleen_palpable, Spiders, Ascites, Varices,
    Bilirubin, Alk_phosphate, Sgot, Albumin, Protime, Histology,
) = (_column_means[_name] for _name in _IMPUTED_COLUMNS)

def train_test_split(dataset, split_ratio):
    """Split ``dataset`` into a leading training part and a trailing test part.

    The cut position is ``int(split_ratio * len(dataset))``; rows are taken in
    their existing order (no shuffling).

    Returns:
        A ``(train_data, test_data)`` tuple of slices of ``dataset``.
    """
    cut = int(split_ratio * len(dataset))
    return dataset[:cut], dataset[cut:]

def count_classes(dataset):
    """Count how many rows carry each class label.

    The label is read from the first element of every row.

    Returns:
        A dict mapping each label to its number of occurrences, with keys in
        first-seen order.
    """
    tally = {}
    for record in dataset:
        key = record[0]
        tally[key] = tally.get(key, 0) + 1
    return tally

def calculate_class_probabilities(dataset):
    """Return the relative frequency of every class label in ``dataset``.

    Labels are counted with ``count_classes`` and each count is divided by
    the total number of counted rows.

    Returns:
        A dict mapping each label to ``count / total``.
    """
    counts = count_classes(dataset)
    n_rows = sum(counts.values())
    return {label: occurrences / n_rows for label, occurrences in counts.items()}

def summarize_numeric_attributes(dataset):
    """Summarise every column of ``dataset`` except the first one.

    The rows are transposed into columns, a ``(mean, std_dev)`` pair is
    computed for each column, and the pair belonging to the first column
    (the class label) is discarded.

    Returns:
        A list of ``(mean, std_dev)`` tuples, one per remaining column.
    """
    per_column = []
    for column in zip(*dataset):
        per_column.append((mean(column), std_dev(column)))
    per_column.pop(0)  # drop the summary of the class-label column
    return per_column

def mean(numbers):
    """Return the arithmetic mean of ``numbers`` as a float.

    ``None`` entries are skipped and the remaining entries are converted with
    ``float``. If nothing remains, ``0.0`` is returned.
    """
    values = [float(item) for item in numbers if item is not None]
    if len(values) > 0:
        return sum(values) / float(len(values))
    return 0.0

def std_dev(numbers):
    """Return the sample standard deviation of ``numbers``.

    ``None`` entries are skipped and the remaining entries are converted with
    ``float``. The squared deviations from the mean are divided by ``n - 1``.
    With fewer than two usable values the result is ``0.0``.
    """
    values = [float(item) for item in numbers if item is not None]
    center = mean(values)
    if len(values) <= 1:
        return 0.0
    squared_error = sum([(value - center) ** 2 for value in values])
    return (squared_error / float(len(values) - 1)) ** 0.5

def separate_by_class(dataset):
    """Group the rows of ``dataset`` by their class label.

    The label is the first element of each row; rows keep their original
    order inside every group.

    Returns:
        A dict mapping each label to the list of rows carrying that label.
    """
    groups = {}
    for record in dataset:
        groups.setdefault(record[0], []).append(record)
    return groups

def summarize_by_class(dataset):
    """Compute the per-attribute ``(mean, std_dev)`` summaries for each class.

    Rows are grouped with ``separate_by_class`` and every group is summarised
    with ``summarize_numeric_attributes``.

    Returns:
        A dict mapping each class label to its list of attribute summaries.
    """
    return {
        label: summarize_numeric_attributes(rows)
        for label, rows in separate_by_class(dataset).items()
    }

def calculate_probability(x, mean, std_dev):
    """Return the Gaussian probability density of ``x``.

    Evaluates ``exp(-(x - mean)^2 / (2 * std_dev^2)) / (sqrt(2*pi) * std_dev)``.

    Args:
        x: The value at which the density is evaluated.
        mean: Mean of the normal distribution.
        std_dev: Standard deviation of the normal distribution.

    Returns:
        The density at ``x``, or ``0`` when ``std_dev`` is zero (the formula
        would otherwise divide by zero).
    """
    if std_dev == 0:
        return 0
    # Both the deviation and the standard deviation must be squared (``**``);
    # multiplying by 2 instead yields a value that is not symmetric around the
    # mean and is not a valid density.
    exponent = math.exp(-((x - mean) ** 2) / (2 * std_dev ** 2))
    return (1 / (math.sqrt(2 * math.pi) * std_dev)) * exponent

def predict(instance, class_probabilities, class_summaries):
    """Return the class label with the highest score for ``instance``.

    For every class in ``class_summaries`` the Gaussian densities of the
    attribute values (``instance[i]`` against the i-th ``(mean, std_dev)``
    pair) are multiplied together, and the product is multiplied into a copy
    of that class's entry in ``class_probabilities``.

    Args:
        instance: Attribute values, indexed in the same order as the
            summaries (i.e. without the class label).
        class_probabilities: Dict mapping label to prior probability; it is
            copied, not modified.
        class_summaries: Dict mapping label to a list of ``(mean, std_dev)``.

    Returns:
        The label whose resulting score is the largest.
    """
    scores = class_probabilities.copy()
    for label, attribute_stats in class_summaries.items():
        likelihood = 1
        for position, (mu, sigma) in enumerate(attribute_stats):
            value = float(instance[position])
            likelihood *= calculate_probability(value, mu, sigma)
        scores[label] *= likelihood
    return max(scores, key=scores.get)

def evaluate_model(test_data, class_probabilities, class_summaries):
    """Score the classifier on ``test_data``.

    Each row's first element is taken as the true label and the remaining
    elements are passed to ``predict``. The collected labels are then scored
    with the scikit-learn metric functions using weighted averaging.

    Returns:
        An ``(accuracy, precision, recall, f1)`` tuple.
    """
    # Walk the rows once, pairing every true label with its prediction.
    pairs = [
        (row[0], predict(row[1:], class_probabilities, class_summaries))
        for row in test_data
    ]
    actual_labels = [truth for truth, _ in pairs]
    predictions = [guess for _, guess in pairs]

    return (
        accuracy_score(actual_labels, predictions),
        precision_score(actual_labels, predictions, average='weighted', zero_division=1),
        recall_score(actual_labels, predictions, average='weighted'),
        f1_score(actual_labels, predictions, average='weighted'),
    )

# Convert the DataFrame into a list of row lists.
dataset = data.values.tolist()

# Split the rows into training data (leading 70%) and test data (the rest).
train_data, test_data = train_test_split(dataset, split_ratio=0.7)

# Compute the class probabilities from the training data.
class_probabilities = calculate_class_probabilities(train_data)

# Compute the mean and standard deviation of every attribute for each class.
class_summaries = summarize_by_class(train_data)

# Evaluate the model on the test data.
accuracy, precision, recall, f1 = evaluate_model(test_data, class_probabilities, class_summaries)

# Show, per column, the mean value that was used to fill its missing values.
print("HASIL MISSING VALUES PERKOLOM DARI DATA HEPATITIS")
for _label, _fill_value in (
    ("class", Class),
    ("age", Age),
    ("sex", Sex),
    ("steroid", Steroid),
    ("antivirals", Antivirals),
    ("fatigue", Fatigue),
    ("malaise", Malaise),
    ("anorexia", Anorexia),
    ("liver_big", Liver_big),
    ("liver_firm", Liver_firm),
    ("spleen_palpable", Spleen_palpable),
    ("spiders", Spiders),
    ("ascites", Ascites),
    ("varices", Varices),
    ("bilirubin", Bilirubin),
    ("alk_phosphate", Alk_phosphate),
    ("sgot", Sgot),
    ("albumin", Albumin),
    ("protime", Protime),
    ("histology", Histology),
):
    print(f"Missing values {_label} :", _fill_value)

# Show the columns reported as having had missing values.
# NOTE(review): this checks the Python type of each column's first value, not
# the presence of NaN (the data has already been imputed at this point) --
# confirm that it selects the intended columns.
columns_with_func_values = [
    col
    for col in data.columns
    if isinstance(data[col].iloc[0], (int, float))
]
print("\nKOLOM YANG ADA NaN")
print(columns_with_func_values)

# Show the probability of every class. The section header is printed once,
# before the loop, instead of being repeated for each class.
print("\nHASIL PROBABILITAS KELAS")
for class_label, probability in class_probabilities.items():
    print(f"Kelas {class_label} : {probability}")

# Show the per-class list of (mean, standard deviation) attribute summaries,
# again with a single header.
print("\nHASIL MEAN DAN STANDART DEVIASI")
for class_label, summaries in class_summaries.items():
    print(f"kelas {class_label} : {summaries}")
          
# Show the evaluation metrics of the model.
print("\nHASIL EVALUASI MODELNYA")
for _metric_name, _metric_value in (
    ("akurasi", accuracy),
    ("presisi", precision),
    ("recall", recall),
    ("f1-score", f1),
):
    print(f"Hasil {_metric_name} dataset hepatitis :", _metric_value)

HASIL MISSING VALUES PERKOLOM DARI DATA HEPATITIS
Missing values class : 1.793548387096774
Missing values age : 41.2
Missing values sex : 1.103225806451613
Missing values steroid : 1.5064935064935066
Missing values antivirals : 1.8451612903225807
Missing values fatigue : 1.3506493506493507
Missing values malaise : 1.603896103896104
Missing values anorexia : 1.7922077922077921
Missing values liver_big : 1.8275862068965518
Missing values liver_firm : 1.5833333333333333
Missing values spleen_palpable : 1.8
Missing values spiders : 1.66
Missing values ascites : 1.8666666666666667
Missing values varices : 1.88
Missing values bilirubin : 1.42751677852349
Missing values alk_phosphate : 105.32539682539682
Missing values sgot : 85.89403973509934
Missing values albumin : 3.8172661870503606
Missing values protime : 61.85227272727273
Missing values histology : 1.4516129032258065

KOLOM YANG ADA NaN
['steroid', 'fatigue', 'malaise', 'anorexia', 'liver big', 'liver firm', 'spleen palpable', 'spiders