In [None]:
import pandas as pd
import numpy as np
import os
import seaborn as sns
import matplotlib.pyplot as plt
from pandas.api.types import CategoricalDtype
from sklearn import neighbors
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, r2_score
import joblib

## Constants

In [None]:
# Directory that must contain the source data as CSV files.
# Switch to the commented-out value to run on the full dataset instead of the
# smaller "mini" subset.
#DATA_PATH = '../data'
DATA_PATH = "../data/mini"

## Function definitions

In [None]:
def load_source_data(path):
    """Read every ``*.csv`` file directly inside ``path`` into one DataFrame.

    Files are read in alphabetical order of their names and stacked row-wise.
    The per-file row index is kept as-is, so index labels repeat across files.

    Parameters
    ----------
    path : str
        Directory to scan (not recursive).

    Returns
    -------
    pandas.DataFrame
        Concatenation of all CSV files found.
    """
    csv_names = [name for name in os.listdir(path) if name.endswith(".csv")]
    csv_names.sort()
    frames = (pd.read_csv(f'{path}/{name}') for name in csv_names)
    return pd.concat(frames)

In [None]:
def preprocess(df):
    """Turn the raw frame into a feature matrix and a weight target.

    Steps, in order:
      1. drop the ``label`` and ``unspsc_code`` columns (empty in dataset v1.0.0),
      2. drop ``co2_total`` (target),
      3. cast ``size`` to an ordered categorical XS < S < M < L < XL < XXL,
      4. keep only the rows whose ``weight`` is present,
      5. one-hot encode the categorical columns,
      6. replace every remaining NaN in the features with 0.

    The input frame is not modified.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain ``label``, ``unspsc_code``, ``co2_total``, ``size`` and
        ``weight`` columns.

    Returns
    -------
    (pandas.DataFrame, pandas.Series)
        ``X`` (encoded features) and ``y`` (the ``weight`` column) for the rows
        that have a weight.
    """
    # Drop empty features (dataset v. 1.0.0), then the co2_total target.
    features = df.drop(columns=['label', 'unspsc_code'])
    features = features.drop(columns=['co2_total'])

    # Sizes have a natural order, so encode them as an ordered categorical.
    ordered_sizes = CategoricalDtype(
        categories=["XS", "S", "M", "L", "XL", "XXL"],
        ordered=True,
    )
    features = features.assign(size=features["size"].astype(ordered_sizes))

    # Only rows with a known weight can serve as training examples.
    has_weight = features["weight"].notna()
    labelled = features[has_weight]
    y = labelled["weight"]

    # One-hot encode the categoricals, then fill every remaining NaN with 0.
    X = pd.get_dummies(labelled.drop(columns=["weight"]))
    X = X.fillna(0)
    return X, y

In [None]:
def searchK(k_min=8, k_max=12):
    """Plot the hold-out RMSE of a KNN regressor for every k in [k_min, k_max].

    Reads the notebook-level ``X_train``, ``y_train``, ``X_test`` and ``y_test``,
    so the train/test split cell has to be run before calling this.

    Parameters
    ----------
    k_min, k_max : int
        Inclusive range of ``n_neighbors`` values to evaluate. The defaults
        reproduce the range that was previously hard-coded (8..12).

    Raises
    ------
    ValueError
        If the range is empty or starts below 1.
    """
    if k_min < 1 or k_max < k_min:
        raise ValueError(f"invalid neighbor range: k_min={k_min}, k_max={k_max}")

    candidate_ks = list(range(k_min, k_max + 1))
    rmse_by_k = []
    for k in candidate_ks:
        knn = neighbors.KNeighborsRegressor(k, algorithm='ball_tree', weights='uniform')
        knn.fit(X_train, y_train)
        residuals = np.asarray(y_test) - knn.predict(X_test)
        # True RMSE: sqrt(square(e)) element-wise is just |e|, so the previous
        # "RMSE" was actually the sum of absolute errors.
        rmse_by_k.append(float(np.sqrt(np.mean(np.square(residuals)))))

    # Plot only the k values that were evaluated; a zero-initialised array
    # indexed by k would draw the untried values as a loss of 0.
    ax = sns.lineplot(x=candidate_ks, y=rmse_by_k)
    ax.set_title("KNN, number of neighbors vs loss")
    ax.set_xlabel("n_neighbors")
    ax.set_ylabel("RMSE")

In [None]:
def train_weight_predictor(X_train, y_train, n_neighbors=9):
    """Fit a uniform-weight, ball-tree KNN regressor and return it.

    Parameters
    ----------
    X_train : array-like
        Training features.
    y_train : array-like
        Training targets.
    n_neighbors : int, default 9
        Number of neighbours the regressor averages over.

    Returns
    -------
    The fitted ``KNeighborsRegressor``.
    """
    regressor = neighbors.KNeighborsRegressor(
        n_neighbors=n_neighbors,
        algorithm='ball_tree',
        weights='uniform',
    )
    regressor.fit(X_train, y_train)
    return regressor

In [None]:
def save_model(model, filename):
    """Dump ``model`` with joblib to ``DATA_PATH/filename`` and print the location.

    Parameters
    ----------
    model : object
        Object to serialize.
    filename : str
        File name, relative to ``DATA_PATH``.
    """
    destination = f"{DATA_PATH}/{filename}"
    joblib.dump(model, destination)
    print(f"Saved model to disk at {destination}")

def load_model(filename):
    """Load a joblib-serialized model from ``DATA_PATH/filename`` and return it.

    joblib files are pickle-based, so only load files from a trusted source.

    Parameters
    ----------
    filename : str
        File name, relative to ``DATA_PATH``.

    Returns
    -------
    The deserialized object.
    """
    source = f"{DATA_PATH}/{filename}"
    model = joblib.load(source)
    print(f"Loaded model from disk at {source}")
    return model

In [None]:
def evaluate(model, X_test, y_test):
    """Score a fitted regressor on held-out data and print RMSE and R^2.

    Parameters
    ----------
    model : object
        Fitted estimator exposing ``predict(X)``.
    X_test : array-like
        Held-out features.
    y_test : array-like
        True targets for ``X_test``.

    Returns
    -------
    float
        Root mean squared error of the predictions.
    """
    # Use the arguments, not the notebook-level `knn` / `y`: the global `y` is
    # the full target and does not match the length of the predictions.
    y_pred = model.predict(X_test)

    # sqrt(MSE) works on every scikit-learn version; the `squared=False`
    # keyword was deprecated in 1.4 and removed in 1.6.
    rmse = float(np.sqrt(mean_squared_error(y_test, y_pred)))
    r2 = r2_score(y_test, y_pred)

    print("RMSE Score:", rmse)
    print("R^2 Score:", r2)

    return rmse

## Pipeline for training weight estimator

In [None]:
# Load every CSV file found in DATA_PATH into a single frame.
df = load_source_data(path=DATA_PATH)
# NOTE: this cap is always active -- only the first 100 000 rows are used.
# Remove or comment out the next line to train on everything that was loaded.
df = df[:100000]

# Encoded features and weight target; rows without a weight are dropped here.
X, y = preprocess(df)

# Hold out 20 % of the labelled rows for evaluation (fixed seed, so the split
# is the same on every run).
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Fit the KNN regressor with its default n_neighbors and score it on the hold-out set.
knn = train_weight_predictor(X_train, y_train)
rmse = evaluate(knn, X_test, y_test)

In [None]:
# Take the neighbour count from the fitted estimator: no notebook-level
# `n_neighbors` exists, so the bare name raised NameError, and this way the
# file name always matches the model that is saved.
filename = f"nearestneighbor_weight_regression_{knn.n_neighbors}.sav"
# save_model(knn, filename)
# load_model(filename)

## Predict weight values

In [None]:
# Reload the complete dataset (the training frame was cut to its first rows)
# and keep only the rows whose weight is missing: those are the ones to predict.
df2 = load_source_data(path=DATA_PATH)
samples = df2.loc[df2["weight"].isna()]

In [None]:
# The model was trained on the encoded matrix produced by preprocess(), so the
# raw rows must go through the same steps before predicting.
# preprocess() discards rows without a weight, hence the placeholder weight;
# the target it returns for these rows is meaningless and is thrown away.
samples_X, _placeholder_target = preprocess(samples.assign(weight=0.0))
# Align the one-hot columns with the training matrix: categories unseen in
# this subset become all-zero columns, columns unknown to the model are dropped.
samples_X = samples_X.reindex(columns=X_train.columns, fill_value=0)
samples_pred_weight = knn.predict(samples_X)

In [None]:
# Show the first ten predicted weights.
samples_pred_weight[:10]