# Predicting attributes of clothes using tree classification methods

In [1]:
import pandas as pd
import os
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score
from sklearn.preprocessing import OrdinalEncoder, LabelEncoder
import pickle

In [2]:
# Load data
data_path = "data/unique_products_with_attributes.csv"
data = pd.read_csv(data_path)
data

Unnamed: 0,cod_modelo_color,cod_color,des_sex,des_age,des_line,des_fabric,des_product_category,des_product_aggregated_family,des_product_family,des_product_type,...,neck_lapel_type,woven_structure,knit_structure,heel_shape_type,length_type,sleeve_length_type,toecap_type,waist_type,closure_placement,cane_height_type
0,83_1124642,82,Female,Kids,KIDS,TRICOT,Tops,Sweaters and Cardigans,Sweater,Sweater,...,Redondo,INVALID,Punto Fino,INVALID,Standard,Larga,INVALID,INVALID,Cuello,INVALID
1,86_1215223,01,Female,Kids,KIDS,WOVEN,"Dresses, jumpsuits and Complete set",Dresses and jumpsuits,Dresses,Dress,...,Caja,Ligero,INVALID,INVALID,Largo,Corta,INVALID,INVALID,Cierre Trasero,INVALID
2,84_1167695,70,Female,Kids,KIDS,CIRCULAR,Tops,T-shirts,T-shirt,T-Shirt,...,Redondo,Ligero,INVALID,INVALID,Standard,Corta,INVALID,INVALID,INVALID,INVALID
3,82_1108473,01,Female,Teen,KIDS,JEANS,Bottoms,Jeans,Jeans,Jeans,...,INVALID,Medio,INVALID,INVALID,Largo,INVALID,INVALID,High Waist,INVALID,INVALID
4,83_1137778,37,Male,Adult,MAN,CIRCULAR,Tops,Sweaters and Cardigans,Sweatshirts,Sweatshirt,...,Redondo,Ligero,INVALID,INVALID,Standard,Larga,INVALID,INVALID,Sin cierre,INVALID
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
33438,83_1126644,99,Female,Baby,KIDS,CIRCULAR,Tops,T-shirts,T-shirt,T-Shirt,...,Redondo,Ligero,INVALID,INVALID,Standard,Larga,INVALID,INVALID,INVALID,INVALID
33439,84_1160958,40,Female,Baby,KIDS,TRICOT,Bottoms,Trousers & leggings,Trousers,Trousers,...,INVALID,INVALID,Punto Fino,INVALID,Standard,INVALID,INVALID,Ajustable/Goma,INVALID,INVALID
33440,82_1103726,02,Female,Newborn,KIDS,CIRCULAR,Tops,T-shirts,T-shirt,T-Shirt,...,Panadero,Ligero,INVALID,INVALID,Standard,Larga,INVALID,INVALID,INVALID,INVALID
33441,82_1087881,94,Female,Kids,KIDS,TRICOT,"Dresses, jumpsuits and Complete set",Dresses and jumpsuits,Dresses,Dress,...,Redondo,INVALID,Punto Medio,INVALID,Corto,Larga,INVALID,INVALID,Sin cierre,INVALID


In [3]:
# How a row looks like
data.iloc[0]

cod_modelo_color                                   83_1124642
cod_color                                                  82
des_sex                                                Female
des_age                                                  Kids
des_line                                                 KIDS
des_fabric                                             TRICOT
des_product_category                                     Tops
des_product_aggregated_family          Sweaters and Cardigans
des_product_family                                    Sweater
des_product_type                                      Sweater
des_filename                     83_1124642_17074019-82_B.jpg
des_color                                          ROSA LIGHT
silhouette_type                                         Recto
neck_lapel_type                                       Redondo
woven_structure                                       INVALID
knit_structure                                     Punto Fino
heel_sha

In [4]:
# Histogram of an attribute
data["silhouette_type"].value_counts()

silhouette_type
Recto                 12261
Slim                   5122
INVALID                4959
Regular                3414
Oversize               2045
Evase                  1995
Relaxed                 477
Jogger                  451
Skinny                  428
Culotte                 371
Acampanado/Flare        257
Palazzo                 212
5 Bolsillos             202
Wide leg                193
Slouchy                 167
Cargo                   146
Mom                     142
Tapered                 108
Acampanado/Bootcut      102
Paperbag                 91
Lápiz                    77
Ancho                    67
Chino                    45
Push Up                  25
Modern slim              20
Fino                     17
Halter                   13
Superslim                13
Loose                     9
Boyfriend                 6
Parachute                 5
Bandeau                   1
Sarouel                   1
Carrot                    1
Name: count, dtype: int64

In [5]:
# Features
features = ["des_sex", "des_age", "des_line", "des_fabric", "des_product_family", "des_product_type"]
# Variables to be predicted
unique_attributes = ["silhouette_type", "neck_lapel_type", "woven_structure",
                     "knit_structure", "heel_shape_type", "length_type",
                     "sleeve_length_type", "toecap_type", "waist_type",
                     "closure_placement", "cane_height_type"]

**To treat the feratures as categorical variables, the labels have to be mapped to numbers**

In [6]:
mappings = {}

for col in features + unique_attributes:
    original_data = data[col]
    encoded_data = data[col].astype("category").cat.codes

    # Rewrite the values
    data[col] = encoded_data

    # Save the label-to-number correspondance
    df_i = pd.DataFrame({
        'original': original_data.value_counts().index,
        'encoding': encoded_data.value_counts().index
        })
    print("\n",col,"\n",df_i)
    mappings[col] = df_i
mappings

# Save the mappings
with open(f"pretrained_models/mappings.pkl","wb") as f:
        pickle.dump(mappings,f)


 des_sex 
   original  encoding
0   Female         0
1     Male         1
2   Unisex         2

 des_age 
   original  encoding
0    Adult         0
1     Kids         2
2     Baby         1
3     Teen         4
4  Newborn         3

 des_line 
   original  encoding
0    WOMAN         2
1     KIDS         0
2      MAN         1

 des_fabric 
             original  encoding
0              WOVEN         6
1           CIRCULAR         1
2             TRICOT         5
3              JEANS         2
4        ACCESSORIES         0
5  SYNTHETIC LEATHER         4
6            LEATHER         3

 des_product_family 
                     original  encoding
0                    T-shirt        21
1                    Dresses         4
2                      Shirt        16
3                   Trousers        24
4                    Sweater        19
5                      Jeans         8
6                Sweatshirts        20
7                     Shorts        17
8                   Footwear    

In [7]:
# Split test and train
train_data, test_data = train_test_split(data, test_size=0.2, random_state=42)

**Train 11 classification models for the 11 attibutes to be predicted**

In [8]:
models = {} # Models will be saved here
total_accuracy = 0

# Train a model for each attribute
for attribute in unique_attributes:
    print(f"Training model for: {attribute}")

    # Separate X and y
    X_train = train_data[features]
    y_train = train_data[attribute]
    X_test = test_data[features]
    y_test = test_data[attribute]
    
    # Train
    rf = RandomForestClassifier(n_estimators=100, random_state=42)
    rf.fit(X_train, y_train)

    # Predict
    y_pred = rf.predict(X_test)
   
    # Test accuracy
    accuracy = accuracy_score(y_test, y_pred)
    print(f"...accuracy score for {attribute}: {accuracy:.4f}")
    total_accuracy += accuracy/11 # each attribute accounts for 1/11 th of the predicted values

    # Save model
    models[attribute] = rf
    # Export model
    with open(f"pretrained_models/model_{attribute}.pkl","wb") as f:
        pickle.dump(rf,f)
    
    # Map back to labels: y_pred has to be mapped back to the original labels
    mapping_df = mappings[attribute]
    encoding_to_label = dict(zip(mapping_df["encoding"], mapping_df["original"]))
    y_pred_labeled = [encoding_to_label[val] for val in y_pred]
    test_data[f"{attribute}_predicted"] = y_pred_labeled

# Total accuracy considering all attributes
print(f"\nTotal accuracy: {total_accuracy:.4f}")

Training model for: silhouette_type
...accuracy score for silhouette_type: 0.5460
Training model for: neck_lapel_type
...accuracy score for neck_lapel_type: 0.6336
Training model for: woven_structure
...accuracy score for woven_structure: 0.7164
Training model for: knit_structure
...accuracy score for knit_structure: 0.9058
Training model for: heel_shape_type
...accuracy score for heel_shape_type: 0.9843
Training model for: length_type
...accuracy score for length_type: 0.6503
Training model for: sleeve_length_type
...accuracy score for sleeve_length_type: 0.7799
Training model for: toecap_type
...accuracy score for toecap_type: 0.9842
Training model for: waist_type
...accuracy score for waist_type: 0.8855
Training model for: closure_placement
...accuracy score for closure_placement: 0.7964
Training model for: cane_height_type
...accuracy score for cane_height_type: 0.9930

Total accuracy: 0.8068


It is clear that `y_pred` and `y_test` don't completely match:

In [9]:
mappings["woven_structure"]

Unnamed: 0,original,encoding
0,Ligero,2
1,INVALID,1
2,Medio,3
3,Pesado,4
4,Elástico,0


In [10]:
test_data["woven_structure"].value_counts()

woven_structure
2    2615
1    2360
3    1477
4     229
0       8
Name: count, dtype: int64

In [11]:
test_data["woven_structure_predicted"].value_counts()

woven_structure_predicted
Ligero     3643
Medio      1541
INVALID    1365
Pesado      140
Name: count, dtype: int64

# Inference

The Kaggle score corresponds to the prediction accuracy of the clothes of `data/test_data.csv`

In [12]:
data_path = "data/test_data.csv"
real_test_df = pd.read_csv(data_path)
real_test_df.head()

Unnamed: 0,cod_modelo_color,des_filename,cod_color,des_color,des_sex,des_age,des_line,des_fabric,des_product_category,des_product_aggregated_family,des_product_family,des_product_type,attribute_name,test_id
0,88_49711373,88_49711373_67080432-99_.jpg,99,NEGRO,Female,Adult,WOMAN,ACCESSORIES,"Accesories, Swim and Intimate",Accessories,Footwear,Boots,cane_height_type,88_49711373_cane_height_type
1,88_49718802,88_49718802_67030656-99_.jpg,99,NEGRO,Male,Adult,MAN,ACCESSORIES,"Accesories, Swim and Intimate",Accessories,Footwear,Ankle Boots,cane_height_type,88_49718802_cane_height_type
2,88_49709572,88_49709572_67030418-01_B.jpg,1,BLANCO,Female,Kids,KIDS,CIRCULAR,Tops,T-shirts,T-shirt,T-Shirt,cane_height_type,88_49709572_cane_height_type
3,88_49722701,88_49722701_67066002-02_.jpg,2,OFFWHITE,Female,Baby,KIDS,CIRCULAR,Tops,T-shirts,T-shirt,T-Shirt,cane_height_type,88_49722701_cane_height_type
4,88_49724926,88_49724926_67056330-02_B.jpg,2,OFFWHITE,Male,Newborn,KIDS,WOVEN,Tops,Shirts,Shirt,Shirt,cane_height_type,88_49724926_cane_height_type


**First, the categorical variables have to be mapped to their corresponding values**

In [13]:
def map_to_encoding(mappings,value):
    if value in mappings:
        return mappings[value]
    else:
        print(f"{value} not found")
        return 999

In [14]:
# Encode each of the features using mappings
for feature in features:
    mapping_df = mappings[feature]
    encoding_to_label = dict(zip(mapping_df["original"],mapping_df["encoding"])) 
    # Rewrite the label with the number
    real_test_df[feature] = real_test_df[feature].apply(lambda x: map_to_encoding(encoding_to_label,x))

Young not found
Young not found
Young not found
Young not found
Young not found
Young not found
Young not found
Young not found
Young not found
Young not found
Young not found
Young not found
Young not found
Young not found
Young not found
Young not found
Young not found
Young not found
Young not found
Young not found
Young not found
Young not found
Young not found
Young not found
Young not found
Young not found
Young not found
Young not found
Young not found
Young not found
Young not found
Young not found
Young not found
Young not found
Young not found
Young not found
Young not found
Young not found
Young not found
Young not found
Young not found
Young not found
Young not found
Young not found
Young not found
Young not found
Young not found
Young not found
Young not found
Young not found
Young not found
Young not found
Young not found
Young not found
Young not found
Young not found
Young not found
Young not found
Young not found
Young not found
Young not found
Young not found
Young no

Note that `test_data.csv` contains some features that didn't appear in the training

In [15]:
# Create an empty prediction column
real_test_df["des_value"] = ["" for _ in range(len(real_test_df))]

**Use the 11 mdoels to predict each of the attributes**

In [16]:
for attribute in unique_attributes:
    
    # Features
    x = real_test_df[features]
    
    # Rows corresponding the current attribute 
    indexes = real_test_df["attribute_name"]==attribute
    x = x[indexes]
    
    # Predict
    rf = models[attribute]
    y = rf.predict(x)

    # Map back to labels
    mapping_df = mappings[attribute]
    encoding_to_label = dict(zip(mapping_df["encoding"], mapping_df["original"]))
    y_pred_labeled = [encoding_to_label[val] for val in y]

    # Fill the corresponding rows with the label
    real_test_df.loc[indexes,"des_value"] = y_pred_labeled

In [17]:
real_test_df

Unnamed: 0,cod_modelo_color,des_filename,cod_color,des_color,des_sex,des_age,des_line,des_fabric,des_product_category,des_product_aggregated_family,des_product_family,des_product_type,attribute_name,test_id,des_value
0,88_49711373,88_49711373_67080432-99_.jpg,99,NEGRO,0,0,2,0,"Accesories, Swim and Intimate",Accessories,5,7,cane_height_type,88_49711373_cane_height_type,Bloque
1,88_49718802,88_49718802_67030656-99_.jpg,99,NEGRO,1,0,1,0,"Accesories, Swim and Intimate",Accessories,5,0,cane_height_type,88_49718802_cane_height_type,Cuña
2,88_49709572,88_49709572_67030418-01_B.jpg,01,BLANCO,0,2,0,1,Tops,T-shirts,21,41,cane_height_type,88_49709572_cane_height_type,INVALID
3,88_49722701,88_49722701_67066002-02_.jpg,02,OFFWHITE,0,1,0,1,Tops,T-shirts,21,41,cane_height_type,88_49722701_cane_height_type,INVALID
4,88_49724926,88_49724926_67056330-02_B.jpg,02,OFFWHITE,1,3,0,6,Tops,Shirts,16,33,cane_height_type,88_49724926_cane_height_type,INVALID
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
71814,88_49727540,88_49727540_67069223-56_.jpg,56,NAVY,1,0,1,6,Tops,Shirts,16,33,knit_structure,88_49727540_knit_structure,INVALID
71815,88_49733648,88_49733648_67017145-56_.jpg,56,NAVY,0,0,2,1,Tops,T-shirts,14,28,knit_structure,88_49733648_knit_structure,INVALID
71816,88_49735572,88_49735572_67076755-81_.jpg,81,ROSA PASTEL,0,0,2,1,Tops,T-shirts,21,41,knit_structure,88_49735572_knit_structure,INVALID
71817,88_49713624,88_49713624_67092528-70_.jpg,70,ROJO,0,0,2,6,"Dresses, jumpsuits and Complete set",Dresses and jumpsuits,4,13,knit_structure,88_49713624_knit_structure,INVALID


In [18]:
# Histogram of the predictions
real_test_df["des_value"].value_counts()

des_value
INVALID                             41018
Recto                                3347
Standard                             3331
Ligero                               3295
Larga                                2564
                                    ...  
Cierre Hombro                           4
Plataforma en la parte delantera        4
Kimono                                  4
Peak Lapel                              3
Tobillero                               2
Name: count, Length: 67, dtype: int64

In [19]:
# Export the csv
real_test_df[["test_id","des_value"]].to_csv("data/predictions.csv",index=False)