In [6]:
import pandas as pd
import numpy as np

from pandas.api.types import CategoricalDtype
from sklearn import neighbors
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.preprocessing import MinMaxScaler
import joblib
import os

In [2]:
#base_dir = '../data' # Need to have the data as CSV files in this path
base_dir = '../data/mini' # Need to have the data as CSV files in this path

In [3]:
def load_source_data(path):
    content = sorted(filter(lambda x: x.endswith(".csv"), os.listdir(path)))
    X = pd.concat((pd.read_csv(f'{path}/{f}') for f in content))
    X = X[~X['co2_total'].isna()]

    y = X['co2_total'].copy()
    X = X.drop('co2_total', axis=1)

    return X, y

In [27]:
class KNearestNeighborsLarge:
    """
    K-nearest neighbors clustering regression model. 
    
    The class has two important variables: 
    
    n_neighbors: the number of nearest neigbors to use (default 9) for the algorithm. Set using set_n_neighbors(k).
    
    training_samples: the number of samples (from beginning of dataframe) to use for training (default 100 000). Set using set_training_samples(samples).
    """
    def __init__(self):
        self.n_neighbors = 5
        self.training_samples = 200000
        self.__set_filename()
        self.model = None
    
    def __set_filename(self):
        self.filename = f"k-nearestneighbors-large_k_{self.n_neighbors}_training_samples_{self.training_samples}.model"

    #def preprocess(self, X):
    def __preprocess(self, X):
        # Drop empty features (dataset v. 1.0.0): unspsc_code, label 
        X = X.drop(["label", "unspsc_code"], axis=1)
        
        # Drop features that will not be used for this model
        X = X.drop(["brand", "colour"], axis=1)

        #ftp_acrylic,ftp_cotton,ftp_elastane,ftp_linen,ftp_other,ftp_polyamide,ftp_polyester,
        #ftp_polypropylene,ftp_silk,ftp_viscose,ftp_wool,weight
         
        # Use unordered caterogies for several columns. List category values to support use cases when some
        # values are absent from a batch of source data.
        #brand_types = CategoricalDtype(categories=["b0", "b1", "b10", "b100", "b101", "b102", "b103", "b104", "b105", "b106", "b107", "b108", "b109", "b11", "b110", "b111", "b112", "b113", "b114", "b115", "b116", "b117", "b118", "b119", "b12", "b120", "b121", "b122", "b123", "b124", "b125", "b126", "b127", "b128", "b129", "b13", "b130", "b131", "b132", "b133", "b134", "b135", "b136", "b137", "b138", "b139", "b14", "b140", "b141", "b142", "b143", "b144", "b145", "b146", "b147", "b148", "b149", "b15", "b16", "b17", "b18", "b19", "b2", "b20", "b21", "b22", "b23", "b24", "b25", "b26", "b27", "b28", "b29", "b3", "b30", "b31", "b32", "b33", "b34", "b35", "b36", "b37", "b38", "b39", "b4", "b40", "b41", "b42", "b43", "b44", "b45", "b46", "b47", "b48", "b49", "b5", "b50", "b51", "b52", "b53", "b54", "b55", "b56", "b57", "b58", "b59", "b6", "b60", "b61", "b62", "b63", "b64", "b65", "b66", "b67", "b68", "b69", "b7", "b70", "b71", "b72", "b73", "b74", "b75", "b76", "b77", "b78", "b79", "b8", "b80", "b81", "b82", "b83", "b84", "b85", "b86", "b87", "b88", "b89", "b9", "b90", "b91", "b92", "b93", "b94", "b95", "b96", "b97", "b98", "b99"], ordered=False)
        #X["brand"] = X["brand"].astype(brand_types)
        cat1_types =  CategoricalDtype(categories=["baby", "clothing", "home", "kidswear", "menswear", "womenswear"], ordered=False)
        X["category-1"] = X["category-1"].astype(cat1_types)
        cat2_types = CategoricalDtype(categories=["home", "footwear", "nightwear", "thermals", "outerwear", "accessory", "uniform", "suit", "swimwear", "headgear", "sportswear", "costume", "clothing", "undergarments", "baby", "dress", "beachwear", "men-undergarments", "hosiery", "women-beachwear", "women-undergarments", "women-sportswear"], ordered=False)
        X["category-2"] = X["category-2"].astype(cat2_types)
        cat3_types = CategoricalDtype(categories=["backpack", "bikin", "body", "boxer-brief", "bra", "brief", "briefs", "cap", "coats", "costume", "curtain", "dress", "evening-dress", "fancy-dress", "flat-cap", "gloves", "hat", "hoodie", "jacket", "jean-shorts", "jeans", "jersey", "knit-cap", "knitwear", "long-sleeved-top", "mat", "overalls", "panties", "pants", "pillow", "pyjama", "scarf", "sheets", "shorts", "skirts", "snow-suit", "socks", "sport-bra", "stockings", "swimsuit", "T-shirt", "tie", "tights", "top", "towel", "trousers", "underpants", "wedding-dress"], ordered=False)
        X["category-3"] = X["category-3"].astype(cat3_types)
        #colour_types = CategoricalDtype(categories=["Ivory", "amber", "aquamarine", "black", "blue", "blue gray", "bondi blue", "brown", "colourful", "dark green", "dark grey", "gold", "golden", "gray", "green", "grey", "indigo", "light brown", "light grey", "lime", "maroon", "metal", "mosaic", "mustard", "natural", "navy", "neon", "orange", "peach", "pink", "purple", "red", "silver", "teal", "turquoise", "unbleached", "unknown", "violet", "wheat", "white", "yellow"], ordered=False)
        #X["colour"] = X["colour"].astype(colour_types)
        fabric_type_types = CategoricalDtype(categories=["K", "W"], ordered=False)
        X["fabric_type"] = X["fabric_type"].astype(fabric_type_types)
        gender_types = CategoricalDtype(categories=["B", "G", "K", "M", "U", "Y", "W"], ordered=False)
        X["gender"] = X["gender"].astype(gender_types)
        made_in_types = CategoricalDtype(categories=["AU", "BD", "BE", "BG", "BR", "CN", "CO", "CY", "DE", "DK", "EG", "ES", "FI", "FR", "GB", "GE", "GR", "HK", "IE", "IN", "IT", "JP", "KR", "LT", "LV", "ML", "MX", "PK", "RO", "SE", "TH", "TR", "TW", "US", "VE", "VN"], ordered=False)
        X["made_in"] = X["made_in"].astype(made_in_types)
        season_types = CategoricalDtype(categories=["AYR", "MID", "SUM", "WIN"], ordered=False)
        X["season"] = X["season"].astype(season_types)

        # Use ordered categories for size
        size_type = CategoricalDtype(categories=["XS", "S", "M", "L", "XL", "XXL"], ordered=True)
        X["size"] = X["size"].astype(size_type)

        # Convert the categoricals into a one-hot vector of binary variables
        X = pd.get_dummies(X)
        print(X)
        
        # Fill in 0 for NA in ftp_ columns
        X = X.fillna(0)
        print(X)
        
        scaler = MinMaxScaler()
        X_scaled = pd.DataFrame(scaler.fit_transform(X), columns=X.columns)
        print(X_scaled)
        
        return X_scaled

    def __save_model(self, base_dir):
        print(f"Saving K-nearest neighbors model to disk at {base_dir}/{self.filename}")
        joblib.dump(self.model, f"{base_dir}/{self.filename}")

    def __train(self, X, y):
        # Only use the set number of samples for training
        if (len(X.index) > self.training_samples):
            X = X[:self.training_samples]
            y = y[:self.training_samples]
        
        X = self.__preprocess(X)

        X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

        model = neighbors.KNeighborsRegressor(self.n_neighbors, weights='uniform')
        model.fit(X_train, y_train)

        preds = model.predict(X_test)

        s_rmse = np.sqrt(mean_squared_error(y_test, preds))
        s_r2 = r2_score(y_test, preds)
        print(f"K-nearest neighbors trained with stats RMSE = {s_rmse}, R2 = {s_r2}")

        return model, s_r2

    def set_n_neighbors(self, k):
        """
        Set the k to use in k-nearest neigbors (default k = 9)

        @param k (int): Number of neighbors to use in the k-nearest neigbors algorithm
        """
        self.n_neighbors = n
        self.__set_filename()
    
    def set_training_samples(self, samples):
        """
        Set the number of samples to use in training the k-nearest neigbors model (default samples = 500 000).
        Note that a training is very slow with very large samples sizes.

        @param samples (int): Number of samples to use in training the k-nearest neigbors algorithm
        """
        self._training_samples = samples
        self.__set_filename()

    def load(self, base_dir):
        self.model = joblib.load(f"{base_dir}/{self.filename}")

    def train(self, X, y, base_dir=None):
        print(f"Training K-nearest neighbors model with k = {self.n_neighbors} using {self.training_samples} samples")
        model, _ = self.__train(X, y)
        self.model = model
        self.__save_model(base_dir)

    def eval(self, X, y):
        print(f"Evaluating K-nearest neighbors model with k = {self.n_neighbors} using training with {self.training_samples} samples")
        _, s_r2 = self.__train(X, y)
        return s_r2

    def predict(self, X):
        X = self.__preprocess(X)
        return self.model.predict(X)

In [28]:
model = KNearestNeighborsLarge()
X, y = load_source_data(base_dir)
model.train(X, y, base_dir)

Training K-nearest neighbors model with k = 5 using 200000 samples
         ftp_acrylic  ftp_cotton  ftp_elastane  ftp_linen  ftp_other  \
23               NaN        12.0           6.0        1.0        6.0   
34               NaN         9.0           6.0        NaN        NaN   
51               NaN         NaN          10.0        NaN        NaN   
56               NaN         NaN           NaN       31.0        NaN   
74               8.0         2.0          14.0        NaN       14.0   
...              ...         ...           ...        ...        ...   
1768437          NaN         3.0           4.0        1.0        2.0   
1768443          9.0         1.0           6.0        NaN       10.0   
1768467          NaN         7.0           7.0       59.0        8.0   
1768473          NaN        59.0          17.0        NaN        NaN   
1768479          4.0         NaN           NaN        NaN        8.0   

         ftp_polyamide  ftp_polyester  ftp_polypropylene  ftp_silk  

K-nearest neighbors trained with stats RMSE = 15.546848385508877, R2 = 0.6822949615580403
Saving K-nearest neighbors model to disk at ../data/mini/k-nearestneighbors-large_k_5_training_samples_200000.model


In [16]:
model

<__main__.KNearestNeighborsLarge at 0x2c3d2f88bc8>

In [18]:
object_methods = [method_name for method_name in dir(model)
                  if callable(getattr(model, method_name))]

In [19]:
object_methods

['_KNearestNeighborsLarge__preprocess',
 '_KNearestNeighborsLarge__save_model',
 '_KNearestNeighborsLarge__set_filename',
 '_KNearestNeighborsLarge__train',
 '__class__',
 '__delattr__',
 '__dir__',
 '__eq__',
 '__format__',
 '__ge__',
 '__getattribute__',
 '__gt__',
 '__hash__',
 '__init__',
 '__init_subclass__',
 '__le__',
 '__lt__',
 '__ne__',
 '__new__',
 '__reduce__',
 '__reduce_ex__',
 '__repr__',
 '__setattr__',
 '__sizeof__',
 '__str__',
 '__subclasshook__',
 'eval',
 'load',
 'predict',
 'set_n_neighbors',
 'set_training_samples',
 'train']