## Import Python libraries

In [1]:
# import basic libraries
import numpy as np
import pandas as pd
import os

In [2]:
# import libraries for modelling
import sklearn
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, r2_score

## Load data

### Load and prepare original source data

In [3]:
# Local data directory
path = './tdata/'
!ls  tdata

textile-v1.0.0-1.csv textile-v1.0.0-3.csv textile-v1.0.0-5.csv
textile-v1.0.0-2.csv textile-v1.0.0-4.csv


In [4]:
def load_source_data(path, weight = 'd'):
    """
    Read and concatenate the source data from the csv files to a pandas dataframe in local folder.
    The weight parameter has the following values: 
        - 'd' (default), drops the weight feature (column) and does not use it in training,
        - 'm', drops the samples (rows) from training where the weight feature is missing
        - 'a', uses also the samples (rows) where weigth feature is missing
    """
    print('Starting to open data from csv-files')
    content = sorted(filter(lambda x: x.endswith(".csv"), os.listdir(path)))
    print('Data in content, starting to concatenate data')
    X = pd.concat((pd.read_csv(f) for f in content))
    print('Data loaded to pandas dataframe')
    
    # Drop empty columns
    X = X.drop(['label','unspsc_code'],axis=1)
    print('Empty columns dropped')

    X = X[~X["co2_total"].isna()]
    print('Rows with no c02_total value dropped')
    
    if weight == 'd':
        X = X.drop(['weight'], axis = 1)
        print('Weight column dropped from training data')
        print('Shape of X =', X.shape)
    elif weight == 'm':
        X = X[~X["weight"].isna()]
        print('\n Rows with weight value but samples without weight dropped')
        print('Shape of X =', X.shape)
    elif weight == 'a':
        print('\n Rows with weight and all rows but missing weght values set to zero')
        print('Shape of X =', X.shape)
    else:
        print("Error: Wrong weight value given. Possible values 'd', 'e' and 'a'")
    
    print('')
    
    print('dataframe ready')
    y = X['co2_total'].copy()
    X = X.drop('co2_total', axis=1)

    
    return X, y

In [5]:
def preprocess(X):
    # Drop empty features (dataset v. 1.0.0): unspsc_code, label 
    print('Start preprocessing data')
    
    # Set missing fiber type percentages to zero
    values ={'ftp_acrylic': 0, 'ftp_cotton': 0, 'ftp_elastane': 0, 'ftp_linen': 0, 'ftp_other': 0, 'ftp_polyamide': 0, 'ftp_polyester': 0, 'ftp_polypropylene': 0, 'ftp_silk': 0, 'ftp_viscose': 0, 'ftp_wool': 0}
    X = X.fillna(value=values)
    print('Null fiber percentages changed to zero')
    
    # Fill categorical nan values for gender and season features with mode values. May need to be updated with new training data
    X['gender'] = X.fillna(X['gender'].value_counts().index[0])
    X['season'] = X.fillna(X['season'].value_counts().index[0])
    print('Categorial values with null replaced with mode values')
    
    # Convert the categoricals into a one-hot vector of dummy binary variables
    X = pd.get_dummies(X,columns=['category-1', 'category-2', 'category-3', 'brand', 'colour', 'fabric_type', 'gender', 'season','made_in','size'], prefix = ['category-1', 'category-2', 'category-3', 'brand', 'colour', 'fabric_type',  'gender', 'season','made_in','size'])
    print('Categorial values changed to dummy one-hot vectors')
    
    # If still some null values, change them to zero. At least the weight feature (column) has many null values. 
    X = X.fillna(0)
    print('Rest of the null values set to zero')
    
    return X

In [6]:
def train_linear(path, test_size=0.2, weight = 'd'):
    print('Start training linear')
    X, y = load_source_data(path, weight=weight)
    print('Data loaded')
    X = preprocess(X)
    print('Data preprocessed')
    X = X.to_numpy(dtype='float32')
    y = y.to_numpy(dtype='float32')
    print('Formatted to numpy')
    
    # Split training data
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=test_size, random_state=42)
    print('Split to testing data')

    # Initialize and train linear model
    model_lr = LinearRegression(fit_intercept=True)
    print('Model initialized')
    model_lr.fit(X_train, y_train)
    print('Model trained')
    
    # Make predictions based on the model
    y_fit = model_lr.predict(X_test)
    print('Make predictions')
    
    # Evaluate model
    rmse_score = mean_squared_error(y_test, y_fit, squared=False)
    R2_score = r2_score(y_test, y_fit)
    print('Model evaluated')


    return model_lr, rmse_score, R2_score

### Define constants

In [7]:
# Local data directory
path = './tdata/'
!ls  tdata

textile-v1.0.0-1.csv textile-v1.0.0-3.csv textile-v1.0.0-5.csv
textile-v1.0.0-2.csv textile-v1.0.0-4.csv


### Try functions

In [8]:
#X, y = load_source_data(path)

In [9]:
#Xp = preprocess(X)

In [10]:
#list(X.columns)

### Train linear model

#### Train model without weight feature

In [11]:
model_lr, rmse_score, r2_score = train_linear(path, weight='d')

Start training linear
Starting to open data from csv-files
Data in content, starting to concatenate data
Data loaded to pandas dataframe
Empty columns dropped
Rows with no c02_total value dropped
Weight column dropped from training data
Shape of X = (1699515, 22)

dataframe ready
Data loaded
Start preprocessing data
Null fiber percentages changed to zero
Categorial values with null replaced with mode values
Categorial values changed to dummy one-hot vectors
Rest of the null values set to zero
Data preprocessed
Formatted to numpy
Split to testing data
Model initialized
Model trained
Make predictions
Model evaluated


In [12]:
print('Linear model stats without weight feature:')
print('RMSE Score:', rmse_score)
print('R2 Score:', r2_score)

Linear model stats without weight feature:
RMSE Score: 16.816357
R2 Score: 0.6298162778863827


#### Train with weight feature but samples (rows) with empty weight value droppe

In [13]:
from sklearn.metrics import mean_squared_error
from sklearn.metrics import r2_score
model_lr, rmse_score_nw, r2_score_nw = train_linear(path, weight='m')

Start training linear
Starting to open data from csv-files
Data in content, starting to concatenate data
Data loaded to pandas dataframe
Empty columns dropped
Rows with no c02_total value dropped

 Rows with weight value but samples without weight dropped
Shape of X = (680256, 23)

dataframe ready
Data loaded
Start preprocessing data
Null fiber percentages changed to zero
Categorial values with null replaced with mode values
Categorial values changed to dummy one-hot vectors
Rest of the null values set to zero
Data preprocessed
Formatted to numpy
Split to testing data
Model initialized
Model trained
Make predictions
Model evaluated


In [14]:
print('Linear model stats with weight feature and samples (rows) missing weight feature dropped:')
print('RMSE Score:', rmse_score_nw)
print('R2 Score:', r2_score_nw)

Linear model stats with weight feature and samples (rows) missing weight feature dropped:
RMSE Score: 9.889889
R2 Score: 0.873564499590352


#### Train with weight feature and samples (rows) with empty weight value set to zero

In [15]:
from sklearn.metrics import mean_squared_error
from sklearn.metrics import r2_score
model_lr, rmse_score_a, r2_score_a = train_linear(path, weight='a')

Start training linear
Starting to open data from csv-files
Data in content, starting to concatenate data
Data loaded to pandas dataframe
Empty columns dropped
Rows with no c02_total value dropped

 Rows with weight and all rows but missing weght values set to zero
Shape of X = (1699515, 23)

dataframe ready
Data loaded
Start preprocessing data
Null fiber percentages changed to zero
Categorial values with null replaced with mode values
Categorial values changed to dummy one-hot vectors
Rest of the null values set to zero
Data preprocessed
Formatted to numpy
Split to testing data
Model initialized
Model trained
Make predictions
Model evaluated


In [16]:
print('Linear model stats with weight feature and missing weight values set to zero')
print('RMSE Score:', rmse_score_a)
print('R2 Score:', r2_score_a)

Linear model stats with weight feature and missing weight values set to zero
RMSE Score: 16.178186
R2 Score: 0.6573795656653727


***
***