In [3]:
import numpy as np
import pandas as pd
import cv2
import os
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.metrics import accuracy_score
from sklearn.model_selection import cross_val_score
from tqdm import tqdm

In [6]:
# Assuming train_data is a pandas DataFrame with columns 'des_filename' and 'label'
img_dir = './images/images/'
train_data = pd.read_csv('./data/train_data.csv')

In [7]:
train_data.head()

Unnamed: 0,cod_modelo_color,silhouette_type,closure_placement,heel_shape_type,knit_structure,length_type,neck_lapel_type,cane_height_type,sleeve_length_type,toecap_type,waist_type,woven_structure,type,des_filename
0,85_1202950,Slim,-1,-1,Punto medio,Crop,Redondo,-1,Tirante Fino,-1,-1,-1,Top,85_1202950_37036315-99_B.jpg
1,85_1202950,Slim,-1,-1,Punto medio,Crop,Redondo,-1,Tirante Fino,-1,-1,-1,Top,85_1202950_37036315-99_.jpg
2,86_1217677,Oversize,-1,-1,-1,Standard,Redondo,-1,Corta,-1,-1,Ligero,Top,86_1217677_47024408-95_.jpg
3,86_1217677,Oversize,-1,-1,-1,Standard,Redondo,-1,Corta,-1,-1,Ligero,Top,86_1217677_47024408-95_B.jpg
4,84_1168477,Slim,Sin cierre,-1,Punto Fino,Standard,Barca,-1,Larga,-1,-1,-1,Top,84_1168477_27075766-99_B.jpg


In [8]:
train_data_new = train_data[train_data['type']=='Hats']
train_data = train_data_new[['des_filename','silhouette_type']]

def load_images_and_labels(images_path, train_data):
    images = []
    labels = []

    image_paths = train_data['des_filename'].map(lambda x: os.path.join(images_path, x)) 
    
    for image_path, label in tqdm(zip(image_paths, train_data['silhouette_type']), total=len(train_data), desc="Loading Images"):
        if os.path.exists(image_path):
            img = cv2.imread(image_path, cv2.IMREAD_GRAYSCALE)
            img = cv2.resize(img, (64, 64))
            img_flat = img.flatten()
            images.append(img_flat)
            labels.append(label)

    return np.array(images), np.array(labels)

In [9]:
X, y = load_images_and_labels(img_dir, train_data)

X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=42)

rf = RandomForestClassifier(random_state=42)


param_grid = {
    'n_estimators': [100, 150, 200, 250],  # Number of trees in the forest
    'max_depth': [None, 10, 20, 30, 40],   # Maximum depth of the trees
    'min_samples_split': [2, 5, 10],       # Minimum number of samples required to split an internal node
    'min_samples_leaf': [1, 2, 4],         # Minimum number of samples required to be at a leaf node
    'bootstrap': [True, False],            # Whether bootstrap samples are used when building trees
    'max_features': ['auto', 'sqrt', 'log2'],  # Number of features to consider when looking for the best split
    'criterion': ['gini', 'entropy'],      # Function to measure the quality of a split
    'class_weight': [None, 'balanced'],     # Weights associated with classes
    'max_depth': [None, 10, 20, 30],   # Profundidad máxima de los árboles
    'min_samples_split': [2, 5, 10],   # Mínimo número de muestras para dividir un nodo
    'min_samples_leaf': [1, 2, 4],     # Mínimo número de muestras por nodo hoja
    'bootstrap': [True, False],        # Usar muestreo con reemplazo o no
    'max_features': ['auto', 'sqrt']  # Número máximo de características a considerar
}

grid_search = GridSearchCV(estimator=rf, param_grid=param_grid, 
                           cv=5,
                           n_jobs=-1,
                           verbose=2,
                           scoring='accuracy')


print("Fitting model with GridSearchCV...")
grid_search.fit(X_train, y_train)


best_params = grid_search.best_params_
best_model = grid_search.best_estimator_

print(f"Best Hyperparameters: {best_params}")

y_pred = best_model.predict(X_val)
accuracy = accuracy_score(y_val, y_pred)

print(f"Test Set Accuracy with Best Model: {accuracy * 100:.2f}%")


Loading Images: 100%|██████████| 162/162 [00:00<00:00, 196.75it/s]


Fitting model with GridSearchCV...
Fitting 5 folds for each of 2304 candidates, totalling 11520 fits


  warn(


Best Hyperparameters: {'bootstrap': True, 'class_weight': None, 'criterion': 'entropy', 'max_depth': None, 'max_features': 'auto', 'min_samples_leaf': 1, 'min_samples_split': 2, 'n_estimators': 250}
Test Set Accuracy with Best Model: 84.85%


In [12]:
import joblib

# Save the model to a file
model_filename = './models/hats.joblib'
joblib.dump(best_model, model_filename)

print(f"Model saved to {model_filename}")

Model saved to ./models/hats.joblib
