In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input mount and print the full path of every file found.
input_paths = (
    os.path.join(folder, file_name)
    for folder, _subdirs, file_names in os.walk('/kaggle/input')
    for file_name in file_names
)
for input_path in input_paths:
    print(input_path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

/kaggle/input/vinted2/Vinted_Scraper_modified.csv
/kaggle/input/vestiaire-filtrado-definitivo/Filtered_Vestiaire (4).csv
/kaggle/input/vestiaire/vestiaire.csv
/kaggle/input/filteredvestisize/Filtered_Vestiaire_with_size.csv
/kaggle/input/vinted-adaptao-a-vestiaire/Vinted_Scraper_adapted.csv


In [2]:


# 1. Importar librerías
import random
import time

import catboost as cb
import lightgbm as lgbm
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import xgboost as xgb
from sklearn.compose import ColumnTransformer
from sklearn.ensemble import RandomForestRegressor
from sklearn.impute import SimpleImputer
from sklearn.linear_model import Ridge, ElasticNet
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score, make_scorer
from sklearn.model_selection import KFold, cross_val_score, cross_validate
from sklearn.neighbors import KNeighborsRegressor
from sklearn.neural_network import MLPRegressor
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.svm import SVR



# 2. Load and prepare the data
data0 = pd.read_csv("/kaggle/input/filteredvestisize/Filtered_Vestiaire_with_size.csv") 
# (Disabled) Keep only sold items and sample up to 20k rows (or every available row if fewer)
#data0 = data0[data0['sold']]


# (Disabled) Keep only the brands of interest
#marcas = ['Balenciaga', 'Gucci', 'Jean Paul Gaultier', 'Dolce Gabbana','Dries Van Noten', 'Acne Studios', 'Givenchy', 'Prada', 'Alyx', 'Marc Jacobs','Valentino', 'Alexande McQueen', 'Yohji Yamamoto', 'Dior', 'Mihara Yasuhiro','Louboutin', 'Miu Miu', 'Saint Laurent', 'Supreme', 'Dsquared2', 'Rick Owens','Thierry Mugler', 'Issey Miyake', 'Vivienne Westwood', 'Margiela']

#data0 = data0[data0['brand_name'].isin(marcas)]
# (Disabled) Save the filtered dataset to a CSV file
#data0.to_csv('/kaggle/working/Filtered_Vestiaire.csv', index=False)




In [None]:
# Encode categorical columns: every object-dtype column of data0 gets its
# missing values replaced by the placeholder 'N' and is then mapped to integer
# codes with a fresh LabelEncoder. data0 is modified in place.
# NOTE(review): the encoders are fitted on the full dataset, before the
# train/test split below — confirm this is acceptable for the evaluation.
object_columns = data0.select_dtypes(include=['object']).columns
for column in object_columns:
    filled = data0[column].fillna('N')
    data0[column] = LabelEncoder().fit_transform(filled)

# Define X (every column except the target) and y (target values as an array).
# NOTE(review): confirm that no remaining column in X is derived from the
# target price.
target = 'price_usd'
y = data0[target].values
X = data0.drop(columns=[target])

# 3. Train/test split (80/20) taken from a seeded shuffle of the row positions.
np.random.seed(2021)
idx = np.arange(len(X))
np.random.shuffle(idx)
cut = int(0.8 * len(idx))
train_idx, test_idx = idx[:cut], idx[cut:]
X_train, X_test = X.iloc[train_idx], X.iloc[test_idx]
y_train, y_test = y[train_idx], y[test_idx]

# 4. Preprocessing: every column of X_train is routed through the same numeric
# pipeline — median imputation followed by standardisation.
numeric_features = list(X_train.columns)
numeric_pipe = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='median')),
    ('scaler', StandardScaler()),
])
preprocessor = ColumnTransformer(transformers=[
    ('num', numeric_pipe, numeric_features),
])

# 5. Model definitions
def init_models():
    """Build a fresh, unfitted instance of every candidate regressor.

    Returns:
        dict: display name -> newly constructed estimator, inserted in the
        order LightGBM, XGBoost, CatBoost, RandomForest, Ridge, ElasticNet,
        SVR, KNN, MLP. Every estimator that is given a seed uses 71.
    """
    seed = 71
    models = {}

    # NOTE(review): LightGBM is given 10000 trees while XGBoost and CatBoost
    # get 1000, and num_leaves=62 is combined with max_depth=5 — confirm both
    # settings are intentional.
    models['LightGBM'] = lgbm.LGBMRegressor(
        objective='rmse', learning_rate=0.1, n_estimators=10000,
        max_depth=5, num_leaves=62,
        subsample=0.9, subsample_freq=3, colsample_bytree=0.5,
        reg_alpha=0.1, reg_lambda=1.0, min_child_samples=10,
        random_state=seed,
    )
    models['XGBoost'] = xgb.XGBRegressor(
        objective='reg:squarederror', learning_rate=0.1,
        n_estimators=1000, max_depth=5,
        subsample=0.9, colsample_bytree=0.5,
        random_state=seed,
    )
    models['CatBoost'] = cb.CatBoostRegressor(
        iterations=1000, learning_rate=0.1, depth=5, verbose=0, random_seed=seed,
    )
    models['RandomForest'] = RandomForestRegressor(
        n_estimators=200, max_depth=10, random_state=seed, n_jobs=-1,
    )
    models['Ridge'] = Ridge(alpha=1.0)
    models['ElasticNet'] = ElasticNet(alpha=1.0, l1_ratio=0.5, random_state=seed)
    models['SVR'] = SVR(kernel='rbf', C=1.0, gamma='scale')
    models['KNN'] = KNeighborsRegressor(n_neighbors=5)
    models['MLP'] = MLPRegressor(hidden_layer_sizes=(100, 50), max_iter=200, random_state=seed)
    return models

# 6. Metrics and cross-validation
def _rmse(y_true, y_pred):
    """Root mean squared error.

    Computed as sqrt(MSE) instead of `mean_squared_error(..., squared=False)`
    because the `squared` keyword was deprecated in scikit-learn 1.4 and
    removed in 1.6; this form works on every version.
    """
    return np.sqrt(mean_squared_error(y_true, y_pred))


cv = KFold(n_splits=5, shuffle=True, random_state=71)
# The scorers return the raw (positive) error values, so lower RMSE/MAE is better.
scorers = {
    'RMSE': make_scorer(_rmse),
    'MAE': make_scorer(mean_absolute_error),
    'R2': make_scorer(r2_score)
}
results = []
for name, model in init_models().items():
    pipe = Pipeline([('preproc', preprocessor), ('model', model)])
    print(f"Evaluando {name}…")
    # One multi-metric cross-validation run: each fold is fitted once and scored
    # with every metric, instead of refitting all folds separately per metric.
    cv_output = cross_validate(pipe, X_train, y_train, cv=cv,
                               scoring=scorers, n_jobs=-1, error_score='raise')
    scores = {}
    for metric in scorers:
        fold_scores = cv_output[f'test_{metric}']
        scores[f'{metric} Mean'] = np.mean(fold_scores)
        scores[f'{metric} Std']  = np.std(fold_scores)
    # Timings: one full fit on the training split and one prediction pass on the test split
    t0 = time.perf_counter()
    pipe.fit(X_train, y_train)
    train_time = time.perf_counter() - t0
    t1 = time.perf_counter()
    pipe.predict(X_test)
    pred_time = time.perf_counter() - t1
    # Add this model's row to the results
    scores.update({'Training Time (s)': train_time,
                   'Prediction Time (s)': pred_time,
                   'Model': name})
    results.append(scores)

# 7. Collect the per-model results into a DataFrame,
# ordered so that the lowest mean RMSE comes first.
results_df = (
    pd.DataFrame(results)
    .sort_values('RMSE Mean')
    .reset_index(drop=True)
)
display(results_df)


Evaluando LightGBM…
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000547 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 3600
[LightGBM] [Info] Number of data points in the train set: 1740, number of used features: 32
[LightGBM] [Info] Start training from score 280.904592
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 3587
[LightGBM] [Info] Number of data points in the train set: 1740, number of used features: 32
[LightGBM] [Info] Start training from score 286.464581
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 3585
[LightGBM] [Info] Number of data points in the train set: 1740, number of used features: 32
[LightGBM] [Info] Start training from score 287.221776
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 3596
[LightGBM] [Info] Number of data points in the train set: 1740, num

In [None]:
# 8. Visualisations
# One figure per metric group, each with one horizontal bar chart per column:
#   8.1 comparative errors, 8.2 timings.
metrics = ['RMSE Mean', 'MAE Mean', 'R2 Mean']
time_metrics = ['Training Time (s)', 'Prediction Time (s)']

for panel_columns, panel_size in ((metrics, (18, 5)), (time_metrics, (12, 5))):
    fig, axes = plt.subplots(1, len(panel_columns), figsize=panel_size)
    for ax, column in zip(axes, panel_columns):
        ax.barh(results_df['Model'], results_df[column])
        ax.set_title(column)
        # Flip the axis so the first row of results_df is drawn at the top.
        ax.invert_yaxis()
    plt.tight_layout()
    plt.show()

# 9. Final training of the best model (the first row of results_df)
top = results_df.at[0, 'Model']
best_model = init_models()[top]
final_pipe = Pipeline(steps=[
    ('preproc', preprocessor),
    ('model', best_model),
])
final_pipe.fit(X_train, y_train)
print(f"Mejor modelo: {top}")
test_r2 = final_pipe.score(X_test, y_test)
print(f"R² en test: {test_r2:.4f}")

In [None]:
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
import lightgbm as lgb
import xgboost as xgb
import catboost as cb
from sklearn.neural_network import MLPRegressor
from sklearn.model_selection import train_test_split
from scipy.sparse import hstack
from sklearn.metrics import mean_absolute_error
from sklearn.metrics.pairwise import cosine_similarity

# 1. List of the known brands we have; the names are joined into a
# case-insensitive pattern to select Vestiaire rows by product keywords.
# NOTE(review): 'Alexande McQueen' looks like a misspelling of
# 'Alexander McQueen' — confirm against the spelling used in the data before
# changing it, since the string is matched literally.
brands_vinted = [
    'Balenciaga', 'Gucci', 'Jean Paul Gaultier', 'Dolce Gabbana',
    'Dries Van Noten', 'Acne Studios', 'Givenchy', 'Prada', 'Alyx', 'Marc Jacobs',
    'Valentino', 'Alexande McQueen', 'Yohji Yamamoto', 'Dior', 'Mihara Yasuhiro',
    'Louboutin', 'Miu Miu', 'Saint Laurent', 'Supreme', 'Dsquared2', 'Rick Owens',
    'Thierry Mugler', 'Issey Miyake', 'Vivienne Westwood', 'Margiela'
]

# 2. Load the datasets
vinted_data   = pd.read_csv('/kaggle/input/vinted-adaptao-a-vestiaire/Vinted_Scraper_adapted.csv')
vestiaire_data = pd.read_csv('/kaggle/input/vestiaire-filtrado-definitivo/Filtered_Vestiaire (4).csv')

# 3. Cleaning and NaN filling
# NaN is truthy in Python, so a plain `x if x else 'Unknown'` never replaces
# missing values. Test for missing values explicitly, while still mapping other
# falsy values (e.g. an empty string) to 'Unknown'.
vinted_data['product_condition'] = vinted_data['product_condition'].apply(
    lambda x: 'Unknown' if pd.isna(x) or not x else x
)
# Remember which rows carry a real price BEFORE filling: a price filled with 0
# is not a genuine label and must not be used as a regression target.
has_real_price = vestiaire_data['price_usd'].notna()
vestiaire_data['price_usd']      = vestiaire_data['price_usd'].fillna(0)
# Fall back to the product name when no keywords are available.
vinted_data['product_keywords']  = vinted_data['product_keywords'].fillna(vinted_data['product_name'])
vestiaire_data['product_keywords']= vestiaire_data['product_keywords'].fillna(vestiaire_data['product_name'])

# 4. Filter Vestiaire by known brands (case-insensitive match on the keywords),
# keeping only rows with a real price and at most the first 10000 matches.
mentions_known_brand = (
    vestiaire_data['product_keywords']
        .str.contains('|'.join(brands_vinted), case=False, na=False)
)
filtered_vestiaire_data = vestiaire_data[has_real_price & mentions_known_brand].head(10000)

print(f'Filtradas {filtered_vestiaire_data.shape[0]} filas de Vestiaire para entrenamiento')

# 5. TF-IDF vectorisation of the text columns
# One vectorizer per text column, fitted on the filtered Vestiaire rows and then
# reused (transform only) on the Vinted rows so both share the same vocabulary.
vectorizer_name     = TfidfVectorizer(stop_words='english')
vectorizer_keywords = TfidfVectorizer(stop_words='english')

# TfidfVectorizer rejects NaN documents, and 'product_name' is never filled
# upstream (keywords fall back to the name, which may itself be missing), so
# missing text is replaced by an empty string before vectorising.
X_name_tfidf       = vectorizer_name.fit_transform(filtered_vestiaire_data['product_name'].fillna(''))
X_keywords_tfidf   = vectorizer_keywords.fit_transform(filtered_vestiaire_data['product_keywords'].fillna(''))

X_vinted_name      = vectorizer_name.transform(vinted_data['product_name'].fillna(''))
X_vinted_keywords  = vectorizer_keywords.transform(vinted_data['product_keywords'].fillna(''))

# Stack name and keyword features side by side. CSR is requested explicitly
# (hstack otherwise returns COO) so the row-oriented consumers downstream get a
# row-sliceable matrix without each converting it again.
X_train = hstack([X_name_tfidf, X_keywords_tfidf], format='csr')
X_vinted= hstack([X_vinted_name, X_vinted_keywords], format='csr')
y_train = filtered_vestiaire_data['price_usd']

# 6. Define and train the models
# 6.1 LightGBM
lightgbm_model = lgb.LGBMRegressor(objective='regression', metric='mae')
# 6.2 XGBoost
xgboost_model = xgb.XGBRegressor(objective='reg:squarederror')
# 6.3 CatBoost
catboost_model = cb.CatBoostRegressor(iterations=1000, learning_rate=0.1, depth=5, verbose=0)
# 6.4 MLP (best according to validation, per the original author's note)
mlp_model = MLPRegressor(hidden_layer_sizes=(100, 50), max_iter=200, random_state=71)

# Fit every model on the same TF-IDF training matrix, in the order defined above.
for regressor in (lightgbm_model, xgboost_model, catboost_model, mlp_model):
    regressor.fit(X_train, y_train)

# 7. Predict prices for the Vinted items, one output column per model
prediction_sources = {
    'predicted_price_lightgbm': lightgbm_model,
    'predicted_price_xgboost': xgboost_model,
    'predicted_price_catboost': catboost_model,
    'predicted_price_mlp': mlp_model,
}
for prediction_column, fitted_model in prediction_sources.items():
    vinted_data[prediction_column] = fitted_model.predict(X_vinted)

# 8. Cosine-similarity prediction
# Pairwise cosine similarity between every Vinted item (rows) and every
# training row (columns).
# NOTE(review): the result has len(vinted_data) x len(training rows) entries —
# confirm it fits in memory for the full datasets.
similarities = cosine_similarity(X_vinted, X_train)

def estimate_price_from_similarity(idx, sims, vest_data, top_k=5):
    """Estimate a price as the mean price of the most similar reference items.

    Args:
        idx: Row of `sims` to use (the item being priced).
        sims: 2-D array of similarity scores; `sims[idx]` must support
            `argsort()`.
        vest_data: DataFrame with a 'price_usd' column whose row positions
            correspond to the columns of `sims`.
        top_k: Number of highest-scoring rows to average (default 5).

    Returns:
        The mean 'price_usd' over the `top_k` rows of `vest_data` with the
        highest similarity to item `idx`.
    """
    ascending_order = sims[idx].argsort()
    # Reverse to get the highest scores first, then keep the leading top_k.
    nearest_positions = ascending_order[::-1][:top_k]
    neighbour_prices = vest_data['price_usd'].iloc[nearest_positions]
    return neighbour_prices.mean()

# Similarity-based price estimate for every Vinted row, in row order.
similarity_estimates = []
for row_position in range(len(vinted_data)):
    similarity_estimates.append(
        estimate_price_from_similarity(row_position, similarities, filtered_vestiaire_data)
    )
vinted_data['predicted_price_similarity'] = similarity_estimates

# 9. Assemble the final results: descriptive columns, the listed price and
# every predicted price.
export_columns = [
    'product_name', 'product_type', 'product_description', 'product_keywords',
    'product_condition', 'URL', 'price_usd',
]
export_columns += [
    'predicted_price_lightgbm', 'predicted_price_xgboost',
    'predicted_price_catboost', 'predicted_price_mlp',
    'predicted_price_similarity',
]
final_results = vinted_data[export_columns]

# 10. Export to the working directory and preview the first rows
final_results.to_csv('/kaggle/working/Vinted_Price_Estimates_with_MLP2.csv', index=False)
final_results.head()




In [None]:
import pandas as pd
from sklearn.cluster import KMeans
from sklearn.preprocessing import StandardScaler

# Load the dataset with the prices and predictions
# NOTE(review): this reads an uploaded input dataset
# (.../estimated-with-mlp/Vinted_Price_Estimates_with_MLP.csv), while the
# estimation cell writes /kaggle/working/Vinted_Price_Estimates_with_MLP2.csv —
# confirm this is the intended (up-to-date) file.
vinted_price_estimates = pd.read_csv('/kaggle/input/estimated-with-mlp/Vinted_Price_Estimates_with_MLP.csv')

# Compute the row-wise average of all predictions
prediction_columns = [
    'predicted_price_lightgbm',
    'predicted_price_xgboost',
    'predicted_price_catboost',
    'predicted_price_mlp',
    'predicted_price_similarity',
]
row_average = vinted_price_estimates[prediction_columns].mean(axis=1)
vinted_price_estimates['predicted_price_avg'] = row_average

# Categorise according to the average of the predictions
def categorize_investment(price_usd, predicted_price_avg):
    """Label an item by how its predicted value compares with its listed price.

    Args:
        price_usd: Listed price of the item.
        predicted_price_avg: Average of the model price predictions.

    Returns:
        'High Investment' when the prediction exceeds 1.5x the listed price,
        'Moderate Investment' when it lies between 1x and 1.5x the listed
        price (inclusive), and 'Low Investment' otherwise (including when
        either value is NaN, since every comparison is then False).
    """
    if predicted_price_avg > 1.5 * price_usd:
        return 'High Investment'
    # The lower bound is the listed price itself (1x), not the literal 1:
    # with a literal 1, predictions far below the asking price were labelled
    # 'Moderate Investment'.
    elif price_usd <= predicted_price_avg <= 1.5 * price_usd:
        return 'Moderate Investment'
    else:
        return 'Low Investment'

# Rule-based category for every row, from its listed price and average prediction.
vinted_price_estimates['investment_category'] = vinted_price_estimates.apply(
    lambda record: categorize_investment(record['price_usd'], record['predicted_price_avg']),
    axis=1,
)

# Clustering using all predictions plus the actual price
cluster_features = [
    'price_usd',
    'predicted_price_lightgbm',
    'predicted_price_xgboost',
    'predicted_price_catboost',
    'predicted_price_mlp',
    'predicted_price_similarity',
]
X = vinted_price_estimates[cluster_features]

# Standardise the features before clustering into three groups.
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

kmeans = KMeans(n_clusters=3, random_state=42, n_init=10)
cluster_ids = kmeans.fit_predict(X_scaled)
vinted_price_estimates['investment_cluster'] = cluster_ids

# Assign labels by each cluster's mean predicted value: the cluster with the
# highest mean gets 'High Investment', the lowest gets 'Low Investment'.
cluster_order = (
    vinted_price_estimates
    .groupby('investment_cluster')['predicted_price_avg']
    .mean()
    .sort_values(ascending=False)
)
label_map = dict(zip(cluster_order.index, ['High Investment', 'Moderate Investment', 'Low Investment']))
vinted_price_estimates['investment_category_from_cluster'] = vinted_price_estimates['investment_cluster'].map(label_map)

# Reorder the columns: descriptive fields, listed price, individual
# predictions, their average, and finally the two category columns.
descriptive_fields = [
    'product_name', 'product_type', 'product_description', 'product_keywords', 'product_condition',
    'URL', 'price_usd',
]
prediction_fields = [
    'predicted_price_lightgbm', 'predicted_price_xgboost', 'predicted_price_catboost',
    'predicted_price_mlp', 'predicted_price_similarity', 'predicted_price_avg',
]
category_fields = ['investment_category', 'investment_category_from_cluster']
final_column_order = descriptive_fields + prediction_fields + category_fields

vinted_price_estimates = vinted_price_estimates[final_column_order]

# Save the results
vinted_price_estimates.to_csv('/kaggle/working/Vinted_Investment_Categorization_ALL.csv', index=False)

