# Weight imputation

## Load and prepare data using pandas groupings for weight imputation

In [32]:
# import basic libraries
import numpy as np
import pandas as pd
import os
from tqdm import tqdm  # for following progress
from pandas.api.types import CategoricalDtype

In [33]:
# Local data directory
path = './tdata/'
!ls  tdata

textile-v1.0.0-1.csv textile-v1.0.0-3.csv textile-v1.0.0-5.csv
textile-v1.0.0-2.csv textile-v1.0.0-4.csv


In [34]:
def load_source_data(path):
    """
    Read and concatenate the source data from the csv files to a pandas dataframe in local folder
    """
    print('Starting to open data from csv-files')
    content = sorted(filter(lambda x: x.endswith(".csv"), os.listdir(path)))
    print('Data in content, starting to concatenate data')
    df = pd.concat((pd.read_csv(f) for f in content))
    print('Data loaded to pandas dataframe')

    df = df[~df['weight'].isna()]
    print('Rows with no weight value dropped')

    
    return df

In [48]:
def prepare_mean_groups(df):
    """
    Create different kind of dataframe groupings for estimating weights with the mean values of the groups
    """
    
    #****** Groups for all categorical features size, fabric type or both missing ****************#
    # Mean for groups with all categorical features
    togroup = ['category-1', 'category-2', 'category-3', 'fabric_type', 'gender', 'season', 'size']
    w_groups = df.groupby(togroup, dropna = False)["weight"].mean().reset_index()
    #print(w_groups.sample(2))

    # .. without size feature
    w_groups_nsi = df.groupby(['category-1', 'category-2', 'category-3', 'fabric_type', 'gender', 'season'], dropna = False)["weight"].mean().reset_index()
    w_groups_nsi.insert(6,'size',np.nan)
    #print(w_groups_nsi.sample(2))
    
    # .. without fabric type feature
    w_groups_nft = df.groupby(['category-1', 'category-2', 'category-3', 'gender', 'season', 'size'], dropna = False)["weight"].mean().reset_index()
    w_groups_nft.insert(3,'fabric_type',np.nan)
    #print(w_groups_nft.sample())
    
    # ...without fabric type and size
    w_groups_nftsi = df.groupby(['category-1', 'category-2', 'category-3', 'gender', 'season'], dropna = False)["weight"].mean().reset_index()
    w_groups_nftsi.insert(3,'fabric_type',np.nan)
    w_groups_nftsi.insert(6,'size',np.nan)
    #print(w_groups_nftsi.sample(2))
    
    # .. without season feature
    w_groups_nse = df.groupby(['category-1', 'category-2', 'category-3', 'fabric_type', 'gender', 'size'], dropna = False)["weight"].mean().reset_index()
    w_groups_nse.insert(5,'season',np.nan)
    #print(w_groups_nse.sample(2))
    
    # .. without season and fabric type
    w_groups_nseft = df.groupby(['category-1', 'category-2', 'category-3', 'gender', 'size'], dropna = False)["weight"].mean().reset_index()
    w_groups_nseft.insert(3,'fabric_type',np.nan)
    w_groups_nseft.insert(5,'season',np.nan)
    #print(w_groups_nseft.sample(2))
    
    
    # Concatenate dataframes with all categories and with certain attribute missing 
    w_groups_a = pd.concat([w_groups, w_groups_nsi, w_groups_nft, w_groups_nftsi, w_groups_nse, w_groups_nseft])
    print(w_groups_a.sample(5))
    #*********************************************************************************************#
    
    
    
    
    #*****************    Groups for category-1, -2, -3 and size      ****************************#
    # With all the features present
    w_groups_cat123si = df.groupby(['category-1','category-2','category-3','size'], dropna = False)["weight"].mean().reset_index()
    print(w_groups_cat123si.sample(2))
    
    # With size missing
    w_groups_cat123 = df.groupby(['category-1','category-2','category-3'], dropna = False)["weight"].mean().reset_index()
    w_groups_cat123.insert(3,'size',np.nan)
    print(w_groups_cat123.sample(2))
    
    # With category-3 missing
    w_groups_cat12si = df.groupby(['category-2','category-3','size'], dropna = False)["weight"].mean().reset_index()
    w_groups_cat12si.insert(0,'category-1',np.nan)
    print(w_groups_cat12si.sample(2))
    
    # With category-2 missing
    w_groups_cat13si = df.groupby(['category-1','category-3','size'], dropna = False)["weight"].mean().reset_index()
    w_groups_cat13si.insert(1,'category-2',np.nan)
    print(w_groups_cat13si.sample(2))
    
    # With category-1 missing
    w_groups_cat23si = df.groupby(['category-2','category-3','size'], dropna = False)["weight"].mean().reset_index()
    w_groups_cat23si.insert(0,'category-1',np.nan)
    print(w_groups_cat23si.sample(2))
    
    # With category-1 and -2 missing
    w_groups_cat3si = df.groupby(['category-3','size'], dropna = False)["weight"].mean().reset_index()
    w_groups_cat3si.insert(0,'category-1',np.nan)
    w_groups_cat3si.insert(1,'category-2',np.nan)
    print(w_groups_cat3si.sample(2))
    
    # With category-1 and -3 missing
    w_groups_cat2si = df.groupby(['category-2','size'], dropna = False)["weight"].mean().reset_index()
    w_groups_cat2si.insert(0,'category-1',np.nan)
    w_groups_cat2si.insert(2,'category-3',np.nan)
    print(w_groups_cat2si.sample(2))
    
    # With category-2 and -3 missing
    w_groups_cat1si = df.groupby(['category-1','size'], dropna = False)["weight"].mean().reset_index()
    w_groups_cat1si.insert(1,'category-2',np.nan)
    w_groups_cat1si.insert(2,'category-3',np.nan)
    print(w_groups_cat1si.sample(2))
    
    # With category-2, -3 and size missing
    w_groups_cat1 = df.groupby(['category-1'], dropna = False)["weight"].mean().reset_index()
    w_groups_cat1.insert(1,'category-2',np.nan)
    w_groups_cat1.insert(2,'category-3',np.nan)
    w_groups_cat1.insert(3,'size',np.nan)
    print(w_groups_cat1.sample(2))
    
    # With category-1, -3 and size missing
    w_groups_cat2 = df.groupby(['category-2'], dropna = False)["weight"].mean().reset_index()
    w_groups_cat2.insert(0,'category-1',np.nan)
    w_groups_cat2.insert(2,'category-3',np.nan)
    w_groups_cat2.insert(3,'size',np.nan)
    print(w_groups_cat2.sample(2))
    
    # With category-1, -2 and size missing
    w_groups_cat3 = df.groupby(['category-3'], dropna = False)["weight"].mean().reset_index()
    w_groups_cat3.insert(0,'category-1',np.nan)
    w_groups_cat3.insert(1,'category-2',np.nan)
    w_groups_cat3.insert(3,'size',np.nan)
    print(w_groups_cat3.sample(2))
    
    # With category-1, -2 and -3 missing
    w_groups_si = df.groupby(['size'], dropna = False)["weight"].mean().reset_index()
    w_groups_si.insert(0,'category-1',np.nan)
    w_groups_si.insert(1,'category-2',np.nan)
    w_groups_si.insert(2,'category-3',np.nan)
    print(w_groups_si.sample(2))
    
    # Concatenate dataframes with category1, -2, -3 with a variety of missing values
    l_wgroups = [w_groups_cat123si,w_groups_cat123,w_groups_cat12si,w_groups_cat13si,w_groups_cat23si,w_groups_cat1si,
             w_groups_cat2si, w_groups_cat3si, w_groups_cat1,w_groups_cat2,w_groups_cat3,w_groups_si]
    wg_cat123sia = pd.concat(l_wgroups)
    print(wg_cat123sia.sample(5))
    
    return w_groups_a, wg_cat123sia

In [49]:
def find_mean_weight(w_groups_a, wg_cat123sia, cat1= None, cat2 = None, cat3 = None, ft = None, g = None, se = None, si = None):
    """
     Find based on categorical values mean weight from the given dataframes with respective mean values
    """
    value = -5
    
    value = w_groups_a[((w_groups_a['category-1']==cat1) | (w_groups_a['category-1'].isnull())) &
                     ((w_groups_a['category-2']==cat2) | (w_groups_a['category-2'].isnull())) &
                     ((w_groups_a['category-3']==cat3) | (w_groups_a['category-3'].isnull())) &
                     ((w_groups_a['fabric_type']==ft) | (w_groups_a['fabric_type'].isnull())) & 
                     ((w_groups_a['gender'] == g) | (w_groups_a['gender'].isnull()))  & 
                     ((w_groups_a['season'] == se) | (w_groups_a['season'].isnull())) & 
                     ((w_groups_a['size']==si) | (w_groups_a['size'].isnull()))]['weight']
    if len(value) == 0:
        value = wg_cat123sia[((wg_cat123sia['category-1']==cat1) | (wg_cat123sia['category-1'].isnull())) &
                     ((wg_cat123sia['category-2']==cat2) | (wg_cat123sia['category-2'].isnull())) &
                     ((wg_cat123sia['category-3']==cat3) | (wg_cat123sia['category-3'].isnull())) &
                     ((wg_cat123sia['size']==si) | (wg_cat123sia['size'].isnull()))]['weight']
    if len(value) == 0:
        value = -1
    else:
        value= value.to_numpy()[0]
    return float(value)

In [50]:
# Local data directory
path = './tdata/'
!ls  tdata

textile-v1.0.0-1.csv textile-v1.0.0-3.csv textile-v1.0.0-5.csv
textile-v1.0.0-2.csv textile-v1.0.0-4.csv


In [51]:
df = load_source_data(path)

Starting to open data from csv-files
Data in content, starting to concatenate data
Data loaded to pandas dataframe
Rows with no weight value dropped


In [52]:
df1, df2 = prepare_mean_groups(df)

       category-1 category-2 category-3 fabric_type gender season size  \
9449   womenswear   clothing    T-shirt         NaN      W    NaN  XXL   
4439     clothing       suit   trousers         NaN      K    WIN  XXL   
11494    kidswear  outerwear      coats           K      Y    SUM   XS   
1701     kidswear   clothing   overalls           K      B    MID  NaN   
8108     kidswear   thermals   knitwear         NaN      B    WIN    S   

         weight  
9449   0.271415  
4439   1.612563  
11494  0.928373  
1701   0.338857  
8108   0.804363  
     category-1 category-2 category-3 size    weight
1020   menswear    costume     jacket    L  1.113863
303    clothing  outerwear     gloves   XL  0.180292
   category-1  category-2 category-3  size    weight
28   clothing     costume      coats   NaN  2.077326
55   clothing  sportswear      coats   NaN  2.068762
     category-1 category-2 category-3 size    weight
57          NaN   clothing     jacket   XL  1.047681
217         NaN       h

In [53]:
cat1 = 'clothing'
cat2 = 'costume'
cat3 = 'trousers'
si = 'L'

In [54]:
find_mean_weight(df1, df2, cat2=cat2, cat3 = cat3, si = si)

0.9066842735656965

In [55]:
find_mean_weight(df1, df2, cat3 = cat3)

0.8753975013627198

In [56]:
find_mean_weight(df1, df2, si = si)

0.5394330362988069

In [None]:
w_groups_si = df.groupby(['size'], dropna = False)["weight"].mean().reset_index()
w_groups_si.insert(0,'category-1',np.nan)
w_groups_si.insert(1,'category-2',np.nan)
w_groups_si.insert(2,'category-3',np.nan)
w_groups_si.sample(2)

In [None]:
w_groups_cat3si = df.groupby(['category-3','size'], dropna = False)["weight"].mean().reset_index()
w_groups_cat3si.insert(0,'category-1',np.nan)
w_groups_cat3si.insert(1,'category-2',np.nan)
w_groups_cat3si.sample(2)

In [None]:
l_wgroups = [w_groups_cat123si,w_groups_cat123,w_groups_cat12si,w_groups_cat13si,w_groups_cat23si,w_groups_cat1si,
             w_groups_cat2si, w_groups_cat3si, w_groups_cat1,w_groups_cat2,w_groups_cat3,w_groups_si]

In [None]:
wg_cat123sia = pd.concat(l_wgroups)