In [1]:
import pandas as pd
import numpy as np
import os
import seaborn as sns
import matplotlib.pyplot as plt
from pandas.api.types import CategoricalDtype
from sklearn import neighbors
from sklearn.model_selection import train_test_split
import joblib

## Constants

In [2]:
# Directory that must contain the dataset as CSV files; every *.csv file in it is loaded below.
# Fitted models are also saved to / loaded from this directory.
DATA_PATH = '../data'
# Alternative data directory (presumably a reduced dataset for quick experiments -- confirm)
#DATA_PATH = '../data/mini'

## Scripts for loading data

In [3]:
# This is for if you need to clone the data repo and extract data csv files

# git clone https://github.com/Compensate-Operations/emission-sample-data.git

# !for i in /emission-sample-data/datasets/textile-v1.0.0/*.tgz; do tar -zxvf "$i" ;done
# !ls -lah
# !rm ._textile-v1.0.0-5.csv

In [4]:
%pwd

'C:\\Users\\mikko.kotola\\Documents\\Compensate\\carbonpredict\\notebooks'

In [5]:
# Collect every CSV file in DATA_PATH, sorted by name so the row order is deterministic
content = sorted(f for f in os.listdir(DATA_PATH) if f.endswith(".csv"))

# Read all files into a single frame. ignore_index=True gives the combined frame
# a unique 0..n-1 index; without it every file's own 0-based row labels would be
# repeated, making label-based lookups ambiguous.
df = pd.concat(
    (pd.read_csv(os.path.join(DATA_PATH, f)) for f in content),
    ignore_index=True,
)

In [6]:
# Optional: limit the data to its first 1 000 000 rows to speed up experimentation
df = df.iloc[:1_000_000]
df

Unnamed: 0,brand,category-1,category-2,category-3,co2_total,colour,fabric_type,ftp_acrylic,ftp_cotton,ftp_elastane,...,ftp_silk,ftp_viscose,ftp_wool,gender,label,made_in,season,size,unspsc_code,weight
0,b111,womenswear,uniform,jacket,,blue gray,K,,,,...,,,,W,,TR,,XS,,1.062
1,b82,home,home,curtain,,teal,W,,11.0,5.0,...,,,68.0,,,PK,,XXL,,
2,b107,menswear,headgear,knit-cap,,metal,K,3.0,,4.0,...,89.0,2.0,,M,,PK,,XL,,0.160
3,b111,home,home,curtain,,light grey,K,,,23.0,...,1.0,5.0,,,,TR,,M,,
4,b83,womenswear,footwear,socks,,bondi blue,K,21.0,,,...,,,11.0,W,,VN,,M,,0.029
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
999995,b111,baby,footwear,socks,,orange,K,,,,...,,5.0,83.0,,,CN,,XL,,0.033
999996,b85,baby,baby,body,,yellow,K,5.0,4.0,6.0,...,2.0,4.0,66.0,,,CN,,XXL,,
999997,b13,home,home,curtain,,gray,W,5.0,2.0,3.0,...,1.0,2.0,2.0,,,CN,,XL,,0.278
999998,b50,clothing,home,mat,,Ivory,W,20.0,,16.0,...,,,41.0,B,,CN,,L,,


## Preprocessing

In [7]:
# Drop empty features (dataset v. 1.0.0): unspsc_code, label
df = df.drop(['label', 'unspsc_code'], axis=1)

# Use ordered categories for size.
# astype() returns a new Series, so the result has to be assigned back to the
# frame -- otherwise the conversion is silently thrown away.
size_type = CategoricalDtype(categories=["XS", "S", "M", "L", "XL", "XXL"], ordered=True)
df["size"] = df["size"].astype(size_type)

# Display the converted column to verify the dtype and category order
df["size"]

0          XS
1         XXL
2          XL
3           M
4           M
         ... 
999995     XL
999996    XXL
999997     XL
999998      L
999999      L
Name: size, Length: 1000000, dtype: category
Categories (6, object): [XS < S < M < L < XL < XXL]

In [8]:
df

Unnamed: 0,brand,category-1,category-2,category-3,co2_total,colour,fabric_type,ftp_acrylic,ftp_cotton,ftp_elastane,...,ftp_polyester,ftp_polypropylene,ftp_silk,ftp_viscose,ftp_wool,gender,made_in,season,size,weight
0,b111,womenswear,uniform,jacket,,blue gray,K,,,,...,,,,,,W,TR,,XS,1.062
1,b82,home,home,curtain,,teal,W,,11.0,5.0,...,7.0,4.0,,,68.0,,PK,,XXL,
2,b107,menswear,headgear,knit-cap,,metal,K,3.0,,4.0,...,,,89.0,2.0,,M,PK,,XL,0.160
3,b111,home,home,curtain,,light grey,K,,,23.0,...,,8.0,1.0,5.0,,,TR,,M,
4,b83,womenswear,footwear,socks,,bondi blue,K,21.0,,,...,,24.0,,,11.0,W,VN,,M,0.029
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
999995,b111,baby,footwear,socks,,orange,K,,,,...,4.0,1.0,,5.0,83.0,,CN,,XL,0.033
999996,b85,baby,baby,body,,yellow,K,5.0,4.0,6.0,...,3.0,4.0,2.0,4.0,66.0,,CN,,XXL,
999997,b13,home,home,curtain,,gray,W,5.0,2.0,3.0,...,80.0,1.0,1.0,2.0,2.0,,CN,,XL,0.278
999998,b50,clothing,home,mat,,Ivory,W,20.0,,16.0,...,,,,,41.0,B,CN,,L,


In [9]:
# Keep only the rows for which a CO2 target value is present
df_co2 = df.loc[df["co2_total"].notna()]

## Train-test-split

Try first using just the subset of source data with CO2 targets present.

In [10]:
# Separate the regression target from the feature columns
y = df_co2["co2_total"]
df_co2 = df_co2.drop(columns=["co2_total"])

# Categorical features used for the baseline model.
# Wider alternative:
#   ["brand", "category-1", "category-2", "category-3", "colour", "fabric_type", "made_in", "size"]
columns_to_include = [
    "category-1",
    "category-2",
    "category-3",
    "fabric_type",
    "size",
]

# Rebuild the feature frame from the raw values; this gives it a fresh 0..n-1 index
df_co2_small = pd.DataFrame(df_co2[columns_to_include].values, columns=columns_to_include)

In [11]:
df_co2

Unnamed: 0,brand,category-1,category-2,category-3,colour,fabric_type,ftp_acrylic,ftp_cotton,ftp_elastane,ftp_linen,...,ftp_polyester,ftp_polypropylene,ftp_silk,ftp_viscose,ftp_wool,gender,made_in,season,size,weight
23,b101,womenswear,headgear,flat-cap,light brown,K,,12.0,6.0,1.0,...,7.0,,61.0,,,W,TR,,S,0.122
34,b131,clothing,accessory,backpack,unbleached,K,,9.0,6.0,,...,6.0,,,,69.0,W,CN,,M,
51,b76,baby,thermals,trousers,purple,K,,,10.0,,...,6.0,3.0,69.0,,6.0,,US,AYR,M,
56,b86,menswear,swimwear,swimsuit,natural,K,,,,31.0,...,45.0,8.0,,,,M,HK,,XS,
74,b131,kidswear,thermals,trousers,green,K,8.0,2.0,14.0,,...,36.0,14.0,3.0,6.0,,B,ES,MID,XXL,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
999965,b112,womenswear,clothing,jacket,light brown,W,,,,1.0,...,,,,1.0,,W,DE,,XS,1.088
999966,b103,kidswear,clothing,hoodie,unbleached,K,,12.0,,6.0,...,16.0,,4.0,,43.0,K,CN,,XS,
999970,b66,womenswear,women-undergarments,panties,navy,K,2.0,,,4.0,...,75.0,2.0,5.0,4.0,1.0,W,VN,,XS,0.017
999971,b124,kidswear,clothing,overalls,lime,K,1.0,2.0,4.0,7.0,...,2.0,6.0,7.0,,,B,VN,MID,S,


In [12]:
df_co2_small

Unnamed: 0,category-1,category-2,category-3,fabric_type,size
0,womenswear,headgear,flat-cap,K,S
1,clothing,accessory,backpack,K,M
2,baby,thermals,trousers,K,M
3,menswear,swimwear,swimsuit,K,XS
4,kidswear,thermals,trousers,K,XXL
...,...,...,...,...,...
112731,womenswear,clothing,jacket,W,XS
112732,kidswear,clothing,hoodie,K,XS
112733,womenswear,women-undergarments,panties,K,XS
112734,kidswear,clothing,overalls,K,S


In [13]:
# Convert the categoricals into a one-hot vector of binary variables:
# each column is expanded into one indicator column per distinct value, named "<column>_<value>"
df_co2_small_bin = pd.get_dummies(df_co2_small)

In [14]:
df_co2_small_bin

Unnamed: 0,category-1_baby,category-1_clothing,category-1_home,category-1_kidswear,category-1_menswear,category-1_womenswear,category-2_accessory,category-2_baby,category-2_beachwear,category-2_clothing,...,category-3_underpants,category-3_wedding-dress,fabric_type_K,fabric_type_W,size_L,size_M,size_S,size_XL,size_XS,size_XXL
0,0,0,0,0,0,1,0,0,0,0,...,0,0,1,0,0,0,1,0,0,0
1,0,1,0,0,0,0,1,0,0,0,...,0,0,1,0,0,1,0,0,0,0
2,1,0,0,0,0,0,0,0,0,0,...,0,0,1,0,0,1,0,0,0,0
3,0,0,0,0,1,0,0,0,0,0,...,0,0,1,0,0,0,0,0,1,0
4,0,0,0,1,0,0,0,0,0,0,...,0,0,1,0,0,0,0,0,0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
112731,0,0,0,0,0,1,0,0,0,1,...,0,0,0,1,0,0,0,0,1,0
112732,0,0,0,1,0,0,0,0,0,1,...,0,0,1,0,0,0,0,0,1,0
112733,0,0,0,0,0,1,0,0,0,0,...,0,0,1,0,0,0,0,0,1,0
112734,0,0,0,1,0,0,0,0,0,1,...,0,0,1,0,0,0,1,0,0,0


In [15]:
# Hold out 20 % of the labelled rows for testing; the fixed seed keeps the split reproducible
X_train, X_test, y_train, y_test = train_test_split(
    df_co2_small_bin,
    y,
    test_size=0.2,
    random_state=42,
)

In [16]:
# Sanity check: list the training rows that contain a null in any feature column.
# An empty result means no nulls are present.
X_train[X_train.isnull().any(axis=1)]

Unnamed: 0,category-1_baby,category-1_clothing,category-1_home,category-1_kidswear,category-1_menswear,category-1_womenswear,category-2_accessory,category-2_baby,category-2_beachwear,category-2_clothing,...,category-3_underpants,category-3_wedding-dress,fabric_type_K,fabric_type_W,size_L,size_M,size_S,size_XL,size_XS,size_XXL


## Clustering using k-nearest neighbors

In [17]:
# Number of neighbours to retrieve per sample in the nearest-neighbour search below
n_neighbors = 10

In [None]:
# Build the unsupervised nearest-neighbour index on the training features
nbrs = neighbors.NearestNeighbors(n_neighbors=n_neighbors, algorithm='auto')
nbrs.fit(X_train)

# Query the index with the training rows themselves: for every row, get the
# distances to and positions of its n_neighbors closest training rows
distances, indices = nbrs.kneighbors(X_train)

In [None]:
# Optionally persist the fitted neighbour search to disk, or restore it from disk.
# Flip the two switches below to choose what happens when the cell is run.
savemodel = False
loadmodel = False
filename = f"nearestneighbor_{n_neighbors}.sav"

if savemodel:
    joblib.dump(nbrs, f"{DATA_PATH}/{filename}")
if loadmodel:
    # joblib files are pickle-based: only load files from a trusted source
    nbrs = joblib.load(f"{DATA_PATH}/{filename}")

In [None]:
distances[0:5,]

In [None]:
indices[0:5]

## Baseline predictions using k-nearest neighbors regression

### Search for a good k using small subset

In [None]:
# Fit a k-NN regressor for every k in 2..number_to_try and record its test-set loss.
number_to_try = 15

# losses[k] holds the loss for k neighbours. The array is NaN-initialised so the
# slots for k = 0 and k = 1, which are never evaluated, stay empty instead of
# looking like a (best possible) loss of zero when the array is plotted.
losses = np.full(number_to_try + 1, np.nan, dtype=np.float64)

for n in range(2, number_to_try + 1):
    knn = neighbors.KNeighborsRegressor(n, algorithm='ball_tree', weights='uniform')
    knn.fit(X_train, y_train)
    y_pred = knn.predict(X_test)
    # Per-sample absolute error. (The former sqrt(square(e)) is simply |e|,
    # so this loss is a summed absolute error, not an RMSE.)
    elementwise_loss = np.abs(y_test - y_pred)
    losses[n] = np.sum(elementwise_loss)

In [None]:
# Plot only the k values that were actually evaluated (k >= 2); slots 0 and 1 of
# `losses` are never filled by the search loop and would distort the curve.
# Keep the Axes returned by lineplot in `ax` (set_title returns a Text object).
ax = sns.lineplot(x=np.arange(2, number_to_try + 1), y=losses[2:])
ax.set_title("KNN, number of neighbors vs loss")
ax.set_xlabel("number of neighbors (k)")
ax.set_ylabel("loss");

K = 9 would seem to work best. Use it for baseline.

In [18]:
# Baseline regressor: uniform-weighted k-NN with the k chosen from the search above
n_neighbors = 9
knn = neighbors.KNeighborsRegressor(n_neighbors=n_neighbors, weights='uniform')
knn.fit(X_train, y_train)

KNeighborsRegressor(algorithm='auto', leaf_size=30, metric='minkowski',
                    metric_params=None, n_jobs=None, n_neighbors=9, p=2,
                    weights='uniform')

In [19]:
# Persist the fitted k-NN regressor to disk and/or restore it from disk.
# Flip the two switches below to choose what happens when the cell is run.
savemodel = True
loadmodel = False
filename = f"nearestneighbor_regression_{n_neighbors}.sav"

if savemodel:
    joblib.dump(knn, f"{DATA_PATH}/{filename}")
if loadmodel:
    # joblib files are pickle-based: only load files from a trusted source
    knn = joblib.load(f"{DATA_PATH}/{filename}")

In [21]:
# Predict the target for the held-out test features with the fitted k-NN regressor
y_pred = knn.predict(X_test)

In [22]:
y_pred[0:5]

array([ 0.83333333,  4.35666667,  4.64      ,  5.68444444, 19.05333333])

In [23]:
y_test[0:5]

997231     0.91
547878     4.09
636189    26.16
340993     3.04
518073    42.58
Name: co2_total, dtype: float64

In [24]:
# Per-sample absolute error |y_test - y_pred|.
# NOTE: the former sqrt(square(e)) is simply |e|, so `elementwise_loss` and
# `loss` are absolute-error quantities (loss / n is the MAE), not an RMSE.
elementwise_loss = np.abs(y_test - y_pred)
loss = np.sum(elementwise_loss)

# True root-mean-squared error of the predictions, for reference
rmse = np.sqrt(np.mean(np.square(y_test - y_pred)))

In [27]:
# Average loss per test sample (total loss divided by the number of samples).
# Because elementwise_loss is sqrt(square(error)), i.e. |error|, this is the mean absolute error.
loss_average = loss/len(elementwise_loss)

In [28]:
loss

206044.83777777778

In [29]:
elementwise_loss

997231     0.076667
547878     0.266667
636189    21.520000
340993     2.644444
518073    23.526667
            ...    
211533     4.676667
459275     3.850000
733234     0.796667
250578     0.462222
538812     0.392222
Name: co2_total, Length: 22548, dtype: float64

In [30]:
loss_average

9.138053830839887

## Predict CO2e values using baseline k-nearest neighbors regression (k=9)

### Prepare a set of samples to predict for

In [None]:
# Rows without a CO2 target: these are the samples to predict for
samples_no_co2 = df[df["co2_total"].isna()]
samples_small = pd.DataFrame(columns=columns_to_include, data=samples_no_co2[columns_to_include].values)

# One-hot encode, then align the columns with the features the model was fitted on.
# Encoding this subset on its own can yield a different set or order of indicator
# columns than the training data (a category missing here, or one never seen in
# training), which would make knn.predict fail or silently misalign features.
# reindex adds missing indicators as 0 and drops indicators unknown to the model.
samples_small_bin = pd.get_dummies(samples_small).reindex(columns=X_train.columns, fill_value=0)

In [None]:
samples_small_bin

In [None]:
# Predict CO2 values for the samples without a target using the fitted k-NN regressor
samples_pred = knn.predict(samples_small_bin)

In [None]:
samples_pred[0:10]