In [1]:
import math
import random
import time

In [2]:
def read_csv(file_name, num_rows_to_read=None):
    data = []
    with open(file_name, 'r') as f:
        lines = f.readlines()[1:]  
        for i, line in enumerate(lines):
            if num_rows_to_read is not None and i >= num_rows_to_read:
                break  

            row = float(line.strip())
            data.append(row)

    return data

def calculate_rmse(original_data, imputed_data):
    n = len(original_data)
    squared_errors = [(original - imputed)**2 for original, imputed in zip(original_data, imputed_data)]
    mean_squared_error = sum(squared_errors) / n
    rmse = math.sqrt(mean_squared_error)
    return rmse

def calculate_mae(original_data, imputed_data):
    n = len(original_data)
    absolute_errors = [abs(original - imputed) for original, imputed in zip(original_data, imputed_data)]
    mae = sum(absolute_errors) / n
    return mae

In [3]:
def introduce_missingness(data, missingness_percentage):
    num_missing = int(len(data) * missingness_percentage / 100)
    missing_indices = []
    while len(missing_indices) < num_missing:
        r = random.randint(0, len(data)-1)
        if r not in missing_indices:
            missing_indices.append(r)
    for i in missing_indices:
        data[i] = 0
    return data

In [4]:
def moving_average(data, window_size):
    return [sum([x for x in data[i:i+window_size] if x != 0])/window_size for i in range(len(data) - window_size + 1)]

def standard_deviation(data, window_size):
    avg = moving_average(data, window_size)
    variance = [sum([(x - avg[i])**2 for x in data[i:i+window_size] if x != 0])/window_size for i in range(len(data) - window_size + 1)]
    return [var**0.5 for var in variance]

def detect_outliers(data, window_size, z_thresh):
    outliers = []
    avg = moving_average(data, window_size)
    std_dev = standard_deviation(data, window_size)
    
    for i in range(len(data) - window_size + 1):
        if data[i + window_size - 1] != 0 and abs(data[i + window_size - 1] - avg[i]) > z_thresh * std_dev[i]:
            outliers.append(i + window_size - 1)
            data[i + window_size - 1] = 0
    return outliers

In [5]:
def SLR_impute(data):
    known_data = [(i, d) for i, d in enumerate(data) if d != 0]
    missing_indices = [i for i, d in enumerate(data) if d == 0]

    if not known_data or not missing_indices:
        return data

    x_known, y_known = zip(*known_data)

    # Compute coefficients for linear regression
    n = len(x_known)
    m_x, m_y = sum(x_known) / n, sum(y_known) / n
    ss_xy = sum(y_known[i] * x_known[i] for i in range(n)) - n * m_y * m_x
    ss_xx = sum(x_known[i] * x_known[i] for i in range(n)) - n * m_x * m_x
    b_1 = ss_xy / ss_xx
    b_0 = m_y - b_1 * m_x

    for i in missing_indices:
        data[i] = b_0 + b_1 * i

    return data

In [6]:
batch_size=20
original_data = read_csv('dataset1.csv',batch_size)
missingness_percentage=20
raw_data = introduce_missingness(original_data[:], missingness_percentage)
start_time = time.monotonic()
data = [float(item) for item in raw_data]
window_size = 5
z_thresh = 2
outliers = detect_outliers(data, window_size, z_thresh)
imputed_data = SLR_impute(data)
end_time = time.monotonic()
elapsed_time_seconds = end_time - start_time
elapsed_time_ms = elapsed_time_seconds * 1000
rmse = calculate_rmse(original_data, imputed_data)
mae = calculate_mae(original_data, imputed_data)
time=elapsed_time_ms

In [7]:
print(f'Execution time: {time}')
print(f'RMSE: {rmse}')
print(f'MAE: {mae}')

Execution time: 0.0
RMSE: 0.11582408921292103
MAE: 0.04079530111386163
