In [1]:
import pandas as pd
import numpy as np
import os
import seaborn as sns
import matplotlib.pyplot as plt
from pandas.api.types import CategoricalDtype
from sklearn import neighbors
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, r2_score
import joblib

## Constants

In [2]:
# Directory that must contain the source data as CSV files.
DATA_PATH = "../data"
# DATA_PATH = "../data/mini"  # disabled alternative data directory (also CSV files)

# CSV file with the rows for which weights are predicted.
TEST_FILE = "../data/test/test.csv"

## Function definitions

In [3]:
def load_source_data(path):
    """Read every ``.csv`` file found directly in ``path`` into one DataFrame.

    File names are processed in sorted order and the per-file frames are
    concatenated as they are, so every file keeps its own row index in the
    result (index labels can therefore repeat across files).
    """
    csv_names = sorted(name for name in os.listdir(path) if name.endswith(".csv"))
    frames = [pd.read_csv(f'{path}/{name}') for name in csv_names]
    return pd.concat(frames)

In [11]:
# Fixed category vocabularies for the categorical feature columns. The values
# are listed explicitly so that the one-hot layout is identical for every batch
# of source data, even when some values are absent from that batch. A value
# that is not listed becomes NaN and is encoded as an all-zero group.
_CATEGORICAL_DTYPES = {
    # Brand codes "b0" .. "b149" in string-sorted order ("b0", "b1", "b10", "b100", ...).
    "brand": CategoricalDtype(categories=sorted(f"b{i}" for i in range(150)), ordered=False),
    "category-1": CategoricalDtype(categories=["baby", "clothing", "home", "kidswear", "menswear", "womenswear"], ordered=False),
    "category-2": CategoricalDtype(categories=["home", "footwear", "nightwear", "thermals", "outerwear", "accessory", "uniform", "suit", "swimwear", "headgear", "sportswear", "costume", "clothing", "undergarments", "baby", "dress", "beachwear", "men-undergarments", "hosiery", "women-beachwear", "women-undergarments", "women-sportswear"], ordered=False),
    # NOTE(review): "bikin" looks like a truncated "bikini" -- confirm against the data before changing it.
    "category-3": CategoricalDtype(categories=["backpack", "bikin", "body", "boxer-brief", "bra", "brief", "briefs", "cap", "coats", "costume", "curtain", "dress", "evening-dress", "fancy-dress", "flat-cap", "gloves", "hat", "hoodie", "jacket", "jean-shorts", "jeans", "jersey", "knit-cap", "knitwear", "long-sleeved-top", "mat", "overalls", "panties", "pants", "pillow", "pyjama", "scarf", "sheets", "shorts", "skirts", "snow-suit", "socks", "sport-bra", "stockings", "swimsuit", "T-shirt", "tie", "tights", "top", "towel", "trousers", "underpants", "wedding-dress"], ordered=False),
    "colour": CategoricalDtype(categories=["Ivory", "amber", "aquamarine", "black", "blue", "blue gray", "bondi blue", "brown", "colourful", "dark green", "dark grey", "gold", "golden", "gray", "green", "grey", "indigo", "light brown", "light grey", "lime", "maroon", "metal", "mosaic", "mustard", "natural", "navy", "neon", "orange", "peach", "pink", "purple", "red", "silver", "teal", "turquoise", "unbleached", "unknown", "violet", "wheat", "white", "yellow"], ordered=False),
    "fabric_type": CategoricalDtype(categories=["K", "W"], ordered=False),
    "gender": CategoricalDtype(categories=["B", "G", "K", "M", "U", "Y", "W"], ordered=False),
    "made_in": CategoricalDtype(categories=["AU", "BD", "BE", "BG", "BR", "CN", "CO", "CY", "DE", "DK", "EG", "ES", "FI", "FR", "GB", "GE", "GR", "HK", "IE", "IN", "IT", "JP", "KR", "LT", "LV", "ML", "MX", "PK", "RO", "SE", "TH", "TR", "TW", "US", "VE", "VN"], ordered=False),
    "season": CategoricalDtype(categories=["AYR", "MID", "SUM", "WIN"], ordered=False),
    # Size is the only ordered categorical.
    "size": CategoricalDtype(categories=["XS", "S", "M", "L", "XL", "XXL"], ordered=True),
}


def preprocess(df, drop_weight_na=True):
    """Turn a raw source frame into a one-hot feature matrix and a weight target.

    Parameters
    ----------
    df : pandas.DataFrame
        Raw rows. Must contain ``weight`` and every column named in
        ``_CATEGORICAL_DTYPES``; a missing one raises ``KeyError``. The frame
        passed in is not modified.
    drop_weight_na : bool, default True
        When true, rows whose ``weight`` is NaN are removed (training use).
        When false, all rows are kept and ``y`` may contain NaN (prediction use).

    Returns
    -------
    X : pandas.DataFrame
        All remaining columns with the categoricals one-hot encoded and every
        remaining NaN replaced by 0.
    y : pandas.Series
        The ``weight`` column, row-aligned with ``X``.
    """
    # Drop the empty features of dataset v. 1.0.0 (label, unspsc_code) and the
    # co2_total target. errors="ignore" lets frames that do not carry these
    # columns pass through instead of failing with a KeyError.
    df = df.drop(columns=["label", "unspsc_code", "co2_total"], errors="ignore")

    if drop_weight_na:
        df = df[df["weight"].notna()]

    # One astype call returns a new frame, so nothing is written into the
    # filtered slice above (assigning columns on it one by one is what
    # triggers pandas' SettingWithCopyWarning).
    df = df.astype(_CATEGORICAL_DTYPES)

    y = df["weight"]

    # Convert the categoricals into a one-hot vector of binary variables.
    X = pd.get_dummies(df.drop(columns=["weight"]))

    # Replace the remaining NaNs with 0 (the original note attributes these to
    # the ftp_ columns).
    return X.fillna(0), y

In [5]:
def searchK(X_train=None, y_train=None, X_test=None, y_test=None, k_min=8, k_max=12):
    """Plot the test-set RMSE of a KNN regressor for every k in ``k_min..k_max``.

    Parameters
    ----------
    X_train, y_train, X_test, y_test : optional
        Train/test split to search on. Any argument left as ``None`` is looked
        up under the same name in the notebook's global namespace, which keeps
        the historical zero-argument call ``searchK()`` working. A
        ``NameError`` is raised when a value is neither passed nor defined.
    k_min, k_max : int, default 8 and 12
        Inclusive range of neighbour counts to evaluate.

    The loss is the root mean squared error, computed the same way as in
    ``evaluate``. Only the k values that were actually evaluated are plotted.
    """
    def _resolve(name, value):
        # Prefer the explicit argument; otherwise fall back to a notebook global.
        if value is not None:
            return value
        try:
            return globals()[name]
        except KeyError:
            raise NameError(
                f"{name} was not passed to searchK and is not defined in the notebook"
            ) from None

    X_train = _resolve("X_train", X_train)
    y_train = _resolve("y_train", y_train)
    X_test = _resolve("X_test", X_test)
    y_test = _resolve("y_test", y_test)

    candidate_ks = list(range(k_min, k_max + 1))
    losses = []
    for k in candidate_ks:
        knn = neighbors.KNeighborsRegressor(k, algorithm='ball_tree', weights='uniform')
        knn.fit(X_train, y_train)
        y_pred = knn.predict(X_test)
        # True RMSE: sqrt of the MEAN squared error (sqrt(square(e)) is just |e|).
        losses.append(np.sqrt(mean_squared_error(y_test, y_pred)))

    ax = sns.lineplot(x=candidate_ks, y=losses)
    ax.set_title("KNN, number of neighbors vs loss")
    ax.set(xlabel="number of neighbors", ylabel="RMSE")

In [6]:
def train_weight_predictor(X_train, y_train, n_neighbors=9):
    """Fit and return a uniform-weight, ball-tree KNN regressor on the given data."""
    regressor = neighbors.KNeighborsRegressor(
        n_neighbors=n_neighbors,
        weights='uniform',
        algorithm='ball_tree',
    )
    regressor.fit(X_train, y_train)
    return regressor

In [7]:
def save_model(model, filename):
    """Dump ``model`` with joblib to ``filename`` inside ``DATA_PATH`` and print the location."""
    target = f"{DATA_PATH}/{filename}"
    joblib.dump(model, target)
    print(f"Saved model to disk at {target}")

def load_model(filename):
    """Load and return the joblib-serialised model stored as ``filename`` inside ``DATA_PATH``.

    joblib files are pickle-based, so only load files from a trusted source.
    """
    source = f"{DATA_PATH}/{filename}"
    model = joblib.load(source)
    print(f"Loaded model from disk at {source}")
    return model

In [8]:
def evaluate(model, X_test, y_test):
    """Score ``model`` on a held-out set.

    Returns a ``(rmse, r2)`` tuple: the root mean squared error and the R^2
    score of the model's predictions for ``X_test`` against ``y_test``.
    """
    predictions = model.predict(X_test)
    return (
        np.sqrt(mean_squared_error(y_test, predictions)),
        r2_score(y_test, predictions),
    )

## Pipeline for training weight estimator

In [9]:
def train_model(data_path, samples_to_use=200000, n_neighbors=9, test_size=0.2, random_state=42):
    """Load the source data, train a KNN weight regressor and report its test scores.

    Parameters
    ----------
    data_path : str
        Directory with the source CSV files (read through ``load_source_data``).
    samples_to_use : int, default 200000
        Upper bound on the number of leading rows used. The cut is applied
        before preprocessing, so the number of rows actually used for training
        can be smaller once rows without a weight have been dropped.
    n_neighbors : int, default 9
        K of the nearest-neighbour regressor.
    test_size : float, default 0.2
        Fraction of the preprocessed rows held out for evaluation.
    random_state : int, default 42
        Seed of the train/test split, kept fixed for reproducible scores.

    Returns
    -------
    The fitted regressor. RMSE and R2 on the held-out rows are printed.
    """
    df = load_source_data(path=data_path)
    available = df.shape[0]
    if samples_to_use < available:
        print(f"Using only the first {samples_to_use} samples of {available} available")
        # Explicit positional slice: the concatenated frame may repeat index labels.
        df = df.iloc[:samples_to_use]

    X, y = preprocess(df)
    print(f"X.shape: {X.shape}")

    # Train-test split
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=test_size, random_state=random_state
    )

    knn = train_weight_predictor(X_train, y_train, n_neighbors=n_neighbors)
    rmse, r2 = evaluate(knn, X_test, y_test)
    print(f"Trained K-nearest neighbor regressor K={n_neighbors}, training samples={len(y_train)} for estimating weight.")
    # Label corrected from the misspelled "RSME".
    print(f"RMSE={rmse}, R2={r2}")
    return knn

## Train and save a weight estimator model

In [12]:
# K is defined once and used for both the file name and the model, so the
# saved file can never be labelled with a K that differs from the trained one.
n_neighbors = 9
filename = f"nearestneighbor_weight_regression_{n_neighbors}.sav"

model = train_model(data_path=DATA_PATH, samples_to_use=100000, n_neighbors=n_neighbors)
save_model(model, filename)

Using only the first 100000 samples of 15000000 available
X.shape: (40163, 333)
Trained K-nearest neighbor regressor K=9, training samples=32130 for estimating weight.
RSME=0.7147649993971269, R2=-0.027976713602003045
Saved model to disk at ../data/nearestneighbor_weight_regression_9.sav


## Predict weight values

In [20]:
def predict_weight(model_filename=filename, test_file=TEST_FILE):
    """Predict weights for the rows of ``test_file`` with a previously saved model.

    The model is loaded through ``load_model``, the CSV is preprocessed without
    dropping rows that lack a weight, and both the known weights (NaN where
    absent) and the predictions are printed. Returns the predictions.
    """
    knn = load_model(model_filename)
    X_req, weight_req_true = preprocess(pd.read_csv(test_file), drop_weight_na=False)
    print(f"X_req.shape: {X_req.shape}")
    predictions = knn.predict(X_req)

    print("True weights: \n", weight_req_true)
    print("Predicted weights: ", predictions)
    return predictions

In [21]:
# Predict weights for the rows in TEST_FILE using the model saved under `filename`.
weight_req_pred = predict_weight(model_filename=filename, test_file=TEST_FILE)

Loaded model from disk at ../data/nearestneighbor_weight_regression_9.sav
X_req.shape: (10, 333)
True weights: 
 0      NaN
1    0.093
2    0.182
3      NaN
4      NaN
5      NaN
6      NaN
7    0.140
8    0.017
9      NaN
Name: weight, dtype: float64
Predicted weights:  [0.54033333 1.058      0.74744444 0.66722222 0.46711111 0.36311111
 0.51733333 0.44377778 0.02166667 0.86422222]


In [15]:
# Display the array of predicted weights returned by predict_weight.
weight_req_pred

array([0.54033333, 1.058     , 0.74744444, 0.66722222, 0.46711111,
       0.36311111, 0.51733333, 0.44377778, 0.02166667, 0.86422222])