In [1]:
import math
import random
import time

In [7]:
def read_csv(file_name, num_rows_to_read=None):
    """Read a single-column numeric file into a list of floats.

    The first line is treated as a header and discarded. Lines that are
    empty or contain only whitespace (for example a trailing newline at the
    end of the file) are skipped instead of aborting the whole read.

    Args:
        file_name: Path of the text file to read.
        num_rows_to_read: Maximum number of values to return. ``None``
            (the default) reads every value; ``0`` or a negative number
            returns an empty list.

    Returns:
        A list of floats in file order.

    Raises:
        OSError: If the file cannot be opened.
        ValueError: If a non-blank line cannot be parsed as a float.
    """
    data = []
    with open(file_name, 'r') as f:
        next(f, None)  # drop the header line; tolerate a completely empty file
        # Iterate lazily so a small num_rows_to_read does not load the whole file.
        for line in f:
            if num_rows_to_read is not None and len(data) >= num_rows_to_read:
                break
            text = line.strip()
            if not text:
                continue
            data.append(float(text))

    return data

def euclidean_distance(x1, x2):
    """Return the distance between two scalar values.

    For scalars the distance is simply the magnitude of their difference.
    """
    difference = x1 - x2
    return abs(difference)

def knn(data, k, threshold=None):
    """Flag values whose k-th nearest neighbour is farther than ``threshold``.

    ``data`` is sorted IN PLACE (callers observe the sorted order afterwards,
    as they always have). A value is reported as an outlier when the distance
    to its k-th closest other value is strictly greater than ``threshold``.

    Args:
        data: Mutable list of numbers. Sorted in place.
        k: Rank of the neighbour to inspect (1 = nearest). Must be >= 1 and
            at most ``len(data) - 1``.
        threshold: Distance cut-off. When omitted, the module-level name
            ``threshold`` is used, which keeps existing two-argument calls
            working.

    Returns:
        The outlier values, in ascending order. Empty input yields ``[]``.

    Raises:
        ValueError: If ``k`` is smaller than 1 (a zero or negative ``k``
            would otherwise silently index from the far end of the list).
        IndexError: If there are fewer than ``k`` other values to compare to.
        NameError: If ``threshold`` is not passed and no module-level
            ``threshold`` exists.
    """
    data.sort()
    n = len(data)
    if n == 0:
        return []
    if k < 1:
        raise ValueError(f"k must be at least 1, got {k}")
    if k > n - 1:
        raise IndexError(
            f"k={k} neighbours requested but each point has only {n - 1} other point(s)"
        )
    if threshold is None:
        # Backward-compatible fallback for callers that set a global instead.
        try:
            threshold = globals()["threshold"]
        except KeyError:
            raise NameError("name 'threshold' is not defined") from None

    outliers = []
    for i, point in enumerate(data):
        # The data is sorted and one-dimensional, so the k nearest neighbours
        # of data[i] always lie within k positions on either side of i. Only
        # that window needs to be examined instead of every other point.
        # The absolute difference is written inline because this shortcut is
        # only valid for that specific distance.
        first = max(0, i - k)
        last = min(n, i + k + 1)
        window_distances = sorted(
            abs(point - data[j]) for j in range(first, last) if j != i
        )
        if window_distances[k - 1] > threshold:
            outliers.append(point)

    return outliers

def remove_outliers(dataset, outliers):
    """Blank out every occurrence of an outlier value in ``dataset``.

    ``dataset`` is modified in place: each element equal to one of the values
    in ``outliers`` is replaced by ``None``. Positions are kept, so the list
    length does not change. Nothing is returned.

    Args:
        dataset: Mutable list of (hashable) values to clean.
        outliers: Iterable of (hashable) values to blank out.
    """
    outlier_values = set(outliers)
    # Single pass with O(1) lookups. Entries that are already None are left
    # alone, so a None among the outliers can no longer cause an endless loop.
    for index, value in enumerate(dataset):
        if value is not None and value in outlier_values:
            dataset[index] = None

In [8]:
def introduce_missingness(data, missingness_percentage):
    """Replace a random share of the entries of ``data`` with ``None``.

    The list is modified in place and also returned. The number of blanked
    entries is ``int(len(data) * missingness_percentage / 100)`` (rounded
    towards zero); the positions are drawn without replacement from the
    module-level ``random`` generator.

    Args:
        data: Mutable list to punch holes into.
        missingness_percentage: Percentage of entries to blank, 0 to 100.
            A negative value blanks nothing.

    Returns:
        The same list object that was passed in.

    Raises:
        ValueError: If ``missingness_percentage`` exceeds 100. More distinct
            positions than the list holds cannot be chosen.
    """
    if missingness_percentage > 100:
        raise ValueError(
            f"missingness_percentage must not exceed 100, got {missingness_percentage}"
        )
    num_missing = max(0, int(len(data) * missingness_percentage / 100))
    # random.sample draws distinct indices directly, with no retry loop.
    for index in random.sample(range(len(data)), num_missing):
        data[index] = None
    return data

def calculate_mean_variance(data, weights):
    """Compute the weighted mean and weighted (population) variance.

    Both statistics are normalised by the sum of the weights, so the result
    is a true weighted average for any weights. With all weights equal to 1
    this is the ordinary mean and the variance with divisor ``len(data)``.

    Args:
        data: Sequence of numbers.
        weights: Sequence of weights, one per entry of ``data``.

    Returns:
        A ``(mean, variance)`` tuple.

    Raises:
        ValueError: If ``data`` and ``weights`` differ in length.
        ZeroDivisionError: If the weights sum to zero (including empty input).
    """
    if len(data) != len(weights):
        raise ValueError(
            f"data and weights must have the same length ({len(data)} != {len(weights)})"
        )
    total_weight = sum(weights)
    if total_weight == 0:
        raise ZeroDivisionError(
            "cannot compute weighted statistics: weights sum to zero (empty input?)"
        )
    mean = sum(w * x for x, w in zip(data, weights)) / total_weight
    variance = sum(w * (x - mean) ** 2 for x, w in zip(data, weights)) / total_weight
    return mean, variance

def em_imputation(data, num_iterations):
    """Fill the ``None`` entries of ``data`` with an iteratively refined mean.

    The mean is first estimated from the observed (non-``None``) entries.
    Each iteration then fills the gaps with the current mean and re-estimates
    the mean from the completed list. The variance returned by
    ``calculate_mean_variance`` is computed alongside but is not used when
    filling values.

    Args:
        data: List of numbers in which missing entries are ``None``. It is
            not modified.
        num_iterations: Number of fill / re-estimate rounds.

    Returns:
        A new list where every ``None`` has been replaced by the final mean.
    """
    def fill_gaps(current_mean):
        # Observed values pass through untouched; gaps take the current mean.
        return [current_mean if value is None else value for value in data]

    observed = [value for value in data if value is not None]
    mean, _variance = calculate_mean_variance(observed, [1] * len(observed))

    for _ in range(num_iterations):
        completed = fill_gaps(mean)
        mean, _variance = calculate_mean_variance(completed, [1] * len(completed))

    return fill_gaps(mean)

In [9]:
def calculate_rmse(original_data, imputed_data):
    """Return the root-mean-square error between two sequences.

    Values are paired positionally; the summed squared differences are
    divided by ``len(original_data)`` before taking the square root.
    """
    count = len(original_data)
    total_squared_error = sum(
        (truth - estimate) ** 2
        for truth, estimate in zip(original_data, imputed_data)
    )
    return math.sqrt(total_squared_error / count)

def calculate_mae(original_data, imputed_data):
    """Return the mean absolute error between two sequences.

    Values are paired positionally; the summed absolute differences are
    divided by ``len(original_data)``.
    """
    count = len(original_data)
    total_absolute_error = sum(
        abs(truth - estimate)
        for truth, estimate in zip(original_data, imputed_data)
    )
    return total_absolute_error / count

def calculate_mape(original_data, imputed_data):
    """Return the mean absolute percentage error between two sequences.

    Values are paired positionally. Pairs whose original value is zero are
    skipped (the relative error is undefined there), and the average is taken
    over the remaining pairs only. The result is expressed in percent.

    Args:
        original_data: Iterable of reference values.
        imputed_data: Iterable of estimated values.

    Returns:
        The MAPE as a percentage.

    Raises:
        ZeroDivisionError: If no pair has a non-zero original value, so no
            percentage error can be computed.
    """
    percentage_errors = [
        abs((original - imputed) / original)
        for original, imputed in zip(original_data, imputed_data)
        if original != 0
    ]
    if not percentage_errors:
        raise ZeroDivisionError(
            "MAPE is undefined: no non-zero original values to compare against"
        )
    return sum(percentage_errors) / len(percentage_errors) * 100

In [10]:
batch_size=20
original_data = read_csv('dataset1.csv',batch_size)
missingness_percentage=20
num_iterations = 50

raw_data=original_data
start_time = time.monotonic()
k = 3
threshold = 2.0  
outliers = knn(raw_data, k)
remove_outliers(raw_data,outliers)

data_with_missingness = introduce_missingness(raw_data[:], missingness_percentage)
imputed_data = em_imputation(data_with_missingness, num_iterations)

end_time = time.monotonic()
elapsed_time_seconds = end_time - start_time
elapsed_time_ms = elapsed_time_seconds * 1000
rmse = calculate_rmse(original_data, imputed_data)
mae = calculate_mae(original_data, imputed_data)
time=elapsed_time_ms

In [11]:
print(f'Execution time: {time}')
print(f'RMSE: {rmse}')
print(f'MAE: {mae}')

Execution time: 0.0
RMSE: 0.0061015440244278655
MAE: 0.0025265437499999875
