In [78]:
import csv
import random
import math
from google.colab import files

# Upload the file
uploaded = files.upload()

# Load CSV function
def loadCsv(filename):
    with open(filename, newline='') as csvfile:
        lines = csv.reader(csvfile)
        dataset = list(lines)
        # Skip the header row
        dataset = dataset[1:]
        for i in range(len(dataset)):
            dataset[i] = [float(x) for x in dataset[i]]
    return dataset

# Assuming you uploaded the file `diabetes.csv`
filename = list(uploaded.keys())[0]  # Get the uploaded file name
dataset = loadCsv(filename)  # Load the dataset
print(f"Loaded dataset with {len(dataset)} rows")


Saving diabetes.csv to diabetes (5).csv
Loaded dataset with 768 rows


In [101]:
# Print the first 5 rows of the dataset
for row in dataset[:5]:
    print(row)


[6.0, 148.0, 72.0, 35.0, 0.0, 33.6, 0.627, 50.0, 1.0]
[1.0, 85.0, 66.0, 29.0, 0.0, 26.6, 0.351, 31.0, 0.0]
[8.0, 183.0, 64.0, 0.0, 0.0, 23.3, 0.672, 32.0, 1.0]
[1.0, 89.0, 66.0, 23.0, 94.0, 28.1, 0.167, 21.0, 0.0]
[0.0, 137.0, 40.0, 35.0, 168.0, 43.1, 2.288, 33.0, 1.0]


In [80]:
def splitDataset(dataset, splitRatio):
    trainSize = int(len(dataset) * splitRatio)
    trainSet = []
    copy = list(dataset)
    while len(trainSet) < trainSize:
        index = random.randrange(len(copy))
        trainSet.append(copy.pop(index))
    return [trainSet, copy]

In [81]:
def separateByClass(dataset):
    separated = {}
    for i in range(len(dataset)):
        vector = dataset[i]
        if vector[-1] not in separated:
            separated[vector[-1]] = []
        separated[vector[-1]].append(vector)
    return separated


In [82]:
def mean (numbers):
  return sum(numbers)/float(len(numbers))

In [92]:
# def stdev(numbers):
#   avg = mean (numbers)
#   variance=sum([pow(x-avg, 2) for x in numbers])/float(len(numbers)-1)
#   return math.sqrt(variance)


def stdev(numbers):
    if len(numbers) < 2:
        return 0  # Return 0 when the variance cannot be calculated
    avg = mean(numbers)
    variance = sum([pow(x - avg, 2) for x in numbers]) / float(len(numbers) - 1)
    return math.sqrt(variance)


In [93]:
def summarize(dataset):
  summaries = [(mean (attribute), stdev(attribute)) for attribute in zip(*dataset)]
  del summaries[-1]
  return summaries

In [94]:
def summarizeByClass(dataset):
  separated=separateByClass (dataset)
  summaries={}
  for classValue, instances in separated.items():
    summaries[classValue]=summarize(instances)
    return summaries


In [95]:
# def calculateProbability(x, mean, stdev):
#   exponent=math.exp(-(math.pow(x-mean,2)/(2 *math.pow(stdev,2))))
#   return(1/(math.sqrt(2 *math.pl)*stdev))*exponent

def calculateProbability(x, mean, stdev):
    if stdev == 0:
        return 1.0 if x == mean else 0.0  # Handle the zero variance case
    exponent = math.exp(-(math.pow(x - mean, 2) / (2 * math.pow(stdev, 2))))
    return (1 / (math.sqrt(2 * math.pi) * stdev)) * exponent


In [96]:
def calculateClassProbabilities(summaries, inputVector):
    probabilities = {}
    for classValue, classSummaries in summaries.items():
        probabilities[classValue] = 1
        for i in range(len(classSummaries)):
            mean, stdev = classSummaries[i]
            x = inputVector[i]
            probabilities[classValue] *= calculateProbability(x, mean, stdev)
    return probabilities


In [97]:

def predict(summaries, inputVector):
  probabilities=calculateClassProbabilities (summaries, inputVector)
  bestLabel, bestProb =None, -1
  for classValue, probability in probabilities.items():
    if bestLabel is None or probability > bestProb:
      bestProb =probability
      bestLabel =classValue
    return bestLabel


In [98]:
def getPredictions (summaries, testSet):
  predictions=[]
  for i in range(len(testSet)):
    result= predict (summaries, testSet[i])
    predictions.append(result)
  return predictions

In [99]:
def getAccuracy(testSet, predictions):
    correct = 0
    for x in range(len(testSet)):
        if testSet[x][-1] == predictions[x]:
            correct += 1
    return (correct / float(len(testSet))) * 100.0


In [100]:
def main():
  filename='diabetes.csv'
  splitRatio =0.67
  dataset =loadCsv(filename)
  trainingSet, testSet =splitDataset (dataset, splitRatio)
  print('Split {} rows into train {} and test {} rows'.format(len(dataset), len(trainingSet), len(testSet)))

  summaries =summarizeByClass (trainingSet)
  #test model
  predictions =getPredictions (summaries, testSet)
  accuracy= getAccuracy (testSet, predictions)
  print('Accuracy: {:.2f}%'.format(accuracy))
main()



Split 768 rows into train 514 and test 254 rows
Accuracy: 62.99%
