In [108]:
import argparse
import numpy as np
import pandas as pd
from scipy import signal
from sklearn.model_selection import KFold
from sklearn.utils import resample

def model_to_string(coeffs, m, n):
    """Render fitted coefficients as a human-readable trigonometric model.

    Parameters
    ----------
    coeffs : sequence of float
        Flat coefficients ordered as ``[intercept, m cosine terms, n sine terms]``.
    m : int
        Number of cosine terms.
    n : int
        Number of sine terms.

    Returns
    -------
    str
        For example ``"1.0 +2.0 cos(2π * 1 * x) -3.0 sin(2π * 1 * x)"``.
    """
    terms = [f"{coeffs[0]}"]
    # Each term names the variable x; without it the term reads as a constant.
    terms += [f"{coeffs[j]:+} cos(2π * {j} * x)" for j in range(1, m + 1)]
    # Sine coefficients are stored after the cosine ones, so index j holds
    # the coefficient of harmonic j - m.
    terms += [f"{coeffs[j]:+} sin(2π * {j - m} * x)" for j in range(m + 1, m + n + 1)]
    # Joining a single flat list avoids the doubled / trailing spaces that a
    # fixed " " glue produces when m or n is 0.
    return " ".join(terms)

In [109]:
def fit_model(data, m, n):
    """Least-squares fit of a truncated Fourier series to the samples in ``data``.

    The fitted model is
    ``y ≈ c0 + Σ_{j=1..m} a_j cos(2π j x) + Σ_{j=1..n} b_j sin(2π j x)``.

    Parameters
    ----------
    data : pandas.DataFrame
        Must provide the columns ``'x'`` and ``'y'``.
    m : int
        Number of cosine harmonics (0 is allowed).
    n : int
        Number of sine harmonics (0 is allowed).

    Returns
    -------
    numpy.ndarray
        Column vector of shape ``(1 + m + n, 1)`` ordered as intercept,
        cosine coefficients, sine coefficients.
    """
    x_sample = data['x'].values.reshape(-1, 1)
    y_sample = data['y'].values.reshape(-1, 1)

    # Broadcasting the (N, 1) inputs against a (j,) harmonic vector yields the
    # (N, j) block directly.  Stacking a list of per-harmonic arrays and then
    # indexing with [:, :, 0] fails with IndexError when the list is empty,
    # i.e. whenever m or n is 0.
    cos_harmonics = np.arange(1, m + 1)
    sin_harmonics = np.arange(1, n + 1)
    dmatrix = np.concatenate(
        [
            np.ones_like(x_sample),
            np.cos(2 * np.pi * cos_harmonics * x_sample),
            np.sin(2 * np.pi * sin_harmonics * x_sample),
        ],
        axis=1,
    )

    coeffs = np.linalg.lstsq(dmatrix, y_sample, rcond=None)[0]
    return coeffs

In [110]:
def calculate_rmse(data, coeffs, m, n):
    """Root-mean-square error of a fitted Fourier model on ``data``.

    Parameters
    ----------
    data : pandas.DataFrame
        Must provide the columns ``'x'`` and ``'y'``.
    coeffs : array-like
        ``1 + m + n`` coefficients ordered as intercept, cosine, sine.  Either
        a flat array or a column vector is accepted.
    m : int
        Number of cosine harmonics (0 is allowed).
    n : int
        Number of sine harmonics (0 is allowed).

    Returns
    -------
    float
        ``sqrt(mean((y - prediction) ** 2))`` over all rows of ``data``.
    """
    x_sample = data['x'].values.reshape(-1, 1)
    y_sample = data['y'].values.reshape(-1, 1)

    # Broadcast (N, 1) inputs against (j,) harmonics to get (N, j) blocks; this
    # stays valid for m == 0 or n == 0, where stacking an empty list and
    # indexing it with [:, :, 0] would raise IndexError.
    cos_harmonics = np.arange(1, m + 1)
    sin_harmonics = np.arange(1, n + 1)
    dmatrix = np.concatenate(
        [
            np.ones_like(x_sample),
            np.cos(2 * np.pi * cos_harmonics * x_sample),
            np.sin(2 * np.pi * sin_harmonics * x_sample),
        ],
        axis=1,
    )

    # Force a column vector: with a flat coefficient array the prediction would
    # be 1-D and `y_sample - outputs` would silently broadcast to (N, N).
    weights = np.asarray(coeffs).reshape(-1, 1)
    outputs = np.dot(dmatrix, weights)
    resids = y_sample - outputs
    rmse = np.sqrt(np.mean(np.square(resids.reshape(-1))))
    return rmse

In [111]:
def aic(num_samples, rmse, m, n):
    """Akaike information criterion of a least-squares fit.

    Evaluates ``2 * k + N * ln(RSS / N)`` where ``k = m + n + 1`` is the number
    of fitted coefficients, ``N`` is ``num_samples`` and the residual sum of
    squares is recovered from the RMSE as ``N * rmse ** 2``.

    Parameters
    ----------
    num_samples : int
        Number of observations the RMSE was computed over.
    rmse : float
        Root-mean-square error of the fit.
    m : int
        Number of cosine terms.
    n : int
        Number of sine terms.

    Returns
    -------
    float
        The AIC value.
    """
    n_params = m + n + 1
    residual_sum_sq = num_samples * (rmse ** 2)
    mean_sq_error = residual_sum_sq / num_samples
    penalty = 2 * n_params
    return penalty + num_samples * np.log(mean_sq_error)


In [112]:
# def k_fold_cross_validation(data, m, n, k):
#     kf = KFold(n_splits=k)
#     rmses = []
#     for train_index, test_index in kf.split(data):
#         train_data = data.iloc[train_index]
#         test_data = data.iloc[test_index]
#         coeffs = fit_model(train_data, m, n)
#         rmse = calculate_rmse(test_data, coeffs, m, n)
#         rmses.append(rmse)
#     return np.mean(rmses)

In [113]:
def k_fold_cross_validation(data, m, n, k):
    """Mean held-out RMSE of the Fourier model under k-fold cross-validation.

    ``data`` is split with ``KFold(n_splits=k)`` using its default settings.
    For every split the model is fitted on the training rows and scored on the
    remaining rows.

    Parameters
    ----------
    data : pandas.DataFrame
        Samples with ``'x'`` and ``'y'`` columns.
    m : int
        Number of cosine terms.
    n : int
        Number of sine terms.
    k : int
        Number of folds.

    Returns
    -------
    float
        Average of the per-fold RMSE values.
    """
    splitter = KFold(n_splits=k)
    fold_errors = [
        calculate_rmse(
            data.iloc[held_out_idx],
            fit_model(data.iloc[fit_idx], m, n),
            m,
            n,
        )
        for fit_idx, held_out_idx in splitter.split(data)
    ]
    return np.mean(fold_errors)

In [114]:
def bootstrap(data, m, n, num_bootstraps, random_state=None):
    """Mean RMSE of models fitted on bootstrap resamples of ``data``.

    Each iteration draws a resample of ``data`` with replacement, fits the
    model on it, and scores that model on the full original ``data`` (not on
    the rows left out of the resample).

    Parameters
    ----------
    data : pandas.DataFrame
        Samples with ``'x'`` and ``'y'`` columns.
    m : int
        Number of cosine terms.
    n : int
        Number of sine terms.
    num_bootstraps : int
        Number of resamples to draw.
    random_state : int or numpy.random.RandomState, optional
        Seed (or generator) making the resampling reproducible.  ``None``
        (the default) keeps the previous unseeded behaviour.

    Returns
    -------
    float
        Average RMSE over all bootstrap fits.
    """
    # One shared generator is advanced across iterations; passing the same
    # integer seed to every resample() call would draw the identical sample
    # num_bootstraps times.
    if random_state is None or isinstance(random_state, np.random.RandomState):
        rng = random_state
    else:
        rng = np.random.RandomState(random_state)

    rmses = []
    for _ in range(num_bootstraps):
        boot_data = resample(data, replace=True, random_state=rng)
        coeffs = fit_model(boot_data, m, n)
        rmse = calculate_rmse(data, coeffs, m, n)
        rmses.append(rmse)
    return np.mean(rmses)

In [115]:
def generate_data(size, noise, output_file, seed=None):
    """Write a noisy sawtooth sample to ``output_file`` as CSV.

    ``x`` is drawn uniformly from [-10, 10) and
    ``y = sawtooth(2π x + offset) + N(0, noise)``, where ``offset`` is a single
    random phase shift drawn from [0, 1).

    Parameters
    ----------
    size : int
        Number of rows to generate.
    noise : float
        Standard deviation of the additive Gaussian noise.
    output_file : str or path-like
        Destination CSV; written with columns ``x`` and ``y`` and no index.
        An existing file at this path is overwritten.
    seed : int, optional
        Seed for ``numpy.random.default_rng``; ``None`` (the default) gives a
        different sample on every call.
    """
    rng = np.random.default_rng(seed)
    x_sample = rng.uniform(-10, 10, size)
    # Kept under its own name so the `noise` parameter is not overwritten.
    noise_sample = rng.normal(0, noise, size)
    # Both bounds are spelled out: a single positional argument is taken as
    # `low`, so uniform(1) samples from [1, 1.0) and always returns 1.0.
    offset = rng.uniform(0.0, 1.0)
    y_sample = signal.sawtooth(2 * np.pi * x_sample + offset) + noise_sample

    df = pd.DataFrame({'x': x_sample, 'y': y_sample})
    df.to_csv(output_file, index=False)

In [116]:
def main(argv=None):
    """Generate a sample, fit the Fourier model and print its error metrics.

    Prints the fitted model, its in-sample RMSE, its AIC, the 5-fold and
    10-fold cross-validation RMSE, and the bootstrap RMSE.

    Parameters
    ----------
    argv : list of str, optional
        Command-line arguments to parse instead of ``sys.argv[1:]``.  This lets
        a notebook cell call ``main(["3", "4"])``; with the default ``None``
        argparse reads the process arguments as before.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument("m", help="Number of cosine terms", type=int)
    parser.add_argument("n", help="Number of sine terms", type=int)
    parser.add_argument("-s", "--size", help="Sample size", type=int, default=10000)
    parser.add_argument("-N", "--noise", help="Magnitude of the noise", type=float, default=0.1)
    parser.add_argument("-f", "--input_file", help="Name of input data file", default="sample.csv")
    args = parser.parse_args(argv)

    # Generate new data.  Note that this writes to args.input_file, so any
    # existing file at that path is replaced before it is read back.
    generate_data(args.size, args.noise, args.input_file)

    data = pd.read_csv(args.input_file)

    # Fit model
    coeffs = fit_model(data, args.m, args.n)
    model_stringified = model_to_string(coeffs.reshape(-1), args.m, args.n)
    print("Model:", model_stringified)

    # Calculate RMSE
    rmse = calculate_rmse(data, coeffs, args.m, args.n)
    print("RMSE:", rmse)

    # Calculate AIC.  The helper in this file is aic(num_samples, rmse, m, n);
    # the result gets its own name so the local does not shadow that function.
    aic_value = aic(len(data), rmse, args.m, args.n)
    print("AIC:", aic_value)

    # K-fold cross-validation (k = 5 and k = 10)
    for folds in (5, 10):
        k_fold_rmse = k_fold_cross_validation(data, args.m, args.n, folds)
        print(f"{folds}-Fold Cross Validation RMSE:", k_fold_rmse)

    # Bootstrapping
    num_bootstraps = 100
    bootstrap_rmse = bootstrap(data, args.m, args.n, num_bootstraps)
    print("Bootstrap RMSE:", bootstrap_rmse)

# Script entry point.  main() parses the process command line, so running this
# cell inside a Jupyter kernel makes argparse see the kernel's own arguments
# and exit with status 2 because the required m and n are missing (see the
# captured output below).  Run the file from a shell or with `%run <file> m n`.
if __name__ == "__main__":
    main()

usage: ipykernel_launcher.py [-h] [-s SIZE] [-N NOISE] [-f INPUT_FILE] m n
ipykernel_launcher.py: error: the following arguments are required: m, n


SystemExit: 2

In [1]:
%run ML_assignment2_test.py 3 4



Model: 9.137915627610373e-05 -0.5343737727809562 cos(2π * 1) -0.28657047067960384 cos(2π * 2) -0.02779117920390658 cos(2π * 3) -0.3436972035176224 sin(2π * 1) +0.1362547880415891 sin(2π * 2) +0.20835835723116083 sin(2π * 3) +0.10143087088253239 sin(2π * 4)
RMSE: 0.2529675849412099
AIC: -9095.107760407647
5-Fold Cross Validation RMSE: 0.25311532279515425
10-Fold Cross Validation RMSE: 0.2531815604374595
Bootstrap RMSE: 0.2530624579458039


In [2]:
%run hamzaa_code.py 3 4
