In [4]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_absolute_error, mean_squared_error

# Data locations (relative to the notebook).
url = '../Data/train.csv'
url2 = '../Data/test.csv'

# Columns the analysis treats as irrelevant; they are removed from train only.
# The test frame loses them later, when it is re-indexed to the training features.
columns_to_drop = ["Product", "ScreenResolution", "Cpu", "Gpu", "Memory"]
CATEGORICAL_COLUMNS = ["Company", "TypeName", "OpSys"]
TARGET_COLUMN = "Price_euros"
ID_COLUMN = "laptop_ID"


def parse_numeric_column(series, dtype):
    """Pull the leading number out of strings such as "8GB" or "1.37kg".

    Parameters
    ----------
    series : pandas.Series of str
        Values made of a number followed by a unit suffix.
    dtype : type
        Target dtype for the result (e.g. ``int`` or ``float``).

    Returns
    -------
    pandas.Series
        The numeric part of every value, cast to ``dtype``.

    Raises
    ------
    ValueError
        If a non-missing value contains no number at all.
    """
    # Extracting the number (instead of stripping one exact suffix) also copes
    # with suffix variants like "4kgs", which would otherwise break the cast.
    number = series.str.extract(r"(\d+(?:\.\d+)?)", expand=False)
    unparsed = series.notna() & number.isna()
    if unparsed.any():
        raise ValueError(
            f"Cannot parse numeric value from: {series[unparsed].unique().tolist()}"
        )
    return number.astype(dtype)


def encode_and_align(train, test, categorical_columns, non_feature_columns):
    """One-hot encode train and test so that both share the same feature columns.

    Train is encoded with ``drop_first=True``. Test is encoded *without*
    dropping a level and then re-indexed to the training feature columns.
    Encoding each frame independently with ``drop_first=True`` would drop a
    different level whenever the test set lacks train's first category, and
    those rows would silently be encoded as the baseline category.

    Parameters
    ----------
    train, test : pandas.DataFrame
        Frames that both contain ``categorical_columns``.
    categorical_columns : list of str
        Columns to expand into dummy variables.
    non_feature_columns : collection of str
        Columns of the encoded train frame that are not model inputs
        (target, identifiers).

    Returns
    -------
    (pandas.DataFrame, pandas.DataFrame)
        The encoded train frame (all columns kept) and the test frame
        restricted to, and ordered like, the training feature columns.
    """
    train_encoded = pd.get_dummies(train, columns=categorical_columns, drop_first=True)
    test_encoded = pd.get_dummies(test, columns=categorical_columns)
    feature_columns = [
        col for col in train_encoded.columns if col not in non_feature_columns
    ]
    # Levels unseen in train are discarded; levels missing from test become 0.
    test_aligned = test_encoded.reindex(columns=feature_columns, fill_value=0)
    return train_encoded, test_aligned


# In a notebook kernel __name__ is "__main__", so this runs exactly like a plain
# cell; the guard only lets the helpers above be imported without the data files.
if __name__ == "__main__":
    train_df = pd.read_csv(url)
    test_df = pd.read_csv(url2)

    # Initial data exploration.
    print("\nInformación de train.csv:")
    # info() prints its report itself and returns None, so it must not be
    # wrapped in print() (that emitted a stray "None").
    train_df.info()
    print("\nPrimeras filas de train.csv:")
    print(train_df.head())

    # Preprocessing: drop unused columns from train only.
    train_df = train_df.drop(columns=columns_to_drop)

    # Turn 'Ram' and 'Weight' into numeric features in both frames.
    test_df_processed = test_df.copy()
    for frame in (train_df, test_df_processed):
        frame["Ram"] = parse_numeric_column(frame["Ram"], int)
        frame["Weight"] = parse_numeric_column(frame["Weight"], float)

    # Dummy-encode the categoricals and align test to the training features.
    # The row identifier is excluded from the features: it carries no
    # information about the laptop itself.
    train_df, test_df_processed = encode_and_align(
        train_df,
        test_df_processed,
        CATEGORICAL_COLUMNS,
        non_feature_columns=(TARGET_COLUMN, ID_COLUMN),
    )
    X = train_df[list(test_df_processed.columns)]
    y = train_df[TARGET_COLUMN]

    # Train / validation split.
    X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=42)

    # Train the model.
    model = RandomForestRegressor(n_estimators=200, max_depth=10, min_samples_split=5, random_state=42)
    model.fit(X_train, y_train)

    # Evaluate on the validation split.
    y_pred = model.predict(X_val)
    mae = mean_absolute_error(y_val, y_pred)
    rmse = np.sqrt(mean_squared_error(y_val, y_pred))
    print(f"\nMAE: {mae:.2f}")
    print(f"RMSE: {rmse:.2f}")

    # Predict on test.csv and write the submission file.
    test_predictions = model.predict(test_df_processed)
    submission = pd.DataFrame({"laptop_ID": test_df["laptop_ID"], "Price_euros": test_predictions})
    submission.to_csv("submission.csv", index=False)
    print("\nArchivo 'submission.csv' generado exitosamente!")



Información de train.csv:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 912 entries, 0 to 911
Data columns (total 13 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   laptop_ID         912 non-null    int64  
 1   Company           912 non-null    object 
 2   Product           912 non-null    object 
 3   TypeName          912 non-null    object 
 4   Inches            912 non-null    float64
 5   ScreenResolution  912 non-null    object 
 6   Cpu               912 non-null    object 
 7   Ram               912 non-null    object 
 8   Memory            912 non-null    object 
 9   Gpu               912 non-null    object 
 10  OpSys             912 non-null    object 
 11  Weight            912 non-null    object 
 12  Price_euros       912 non-null    float64
dtypes: float64(2), int64(1), object(10)
memory usage: 92.8+ KB
None

Primeras filas de train.csv:
   laptop_ID Company             Product            TypeName  In