In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_absolute_error, mean_squared_error
import urllib.request

# Data

In [2]:
# Relative locations of the training and test CSV files.
url, url2 = '../Data/train.csv', '../Data/test.csv'

# Read both files: `train_df` comes from `url`, `test` from `url2`.
train_df, test = (pd.read_csv(csv_path) for csv_path in (url, url2))

In [3]:
# Preview the first five rows of the training frame.
train_df.head(n=5)

Unnamed: 0,laptop_ID,Company,Product,TypeName,Inches,ScreenResolution,Cpu,Ram,Memory,Gpu,OpSys,Weight,Price_euros
0,1223,Dell,Inspiron 5567,Notebook,15.6,Full HD 1920x1080,Intel Core i5 7200U 2.5GHz,8GB,256GB SSD,AMD Radeon R7 M445,Windows 10,2.36kg,889.0
1,78,Lenovo,IdeaPad 320-15IKBN,Notebook,15.6,Full HD 1920x1080,Intel Core i5 7200U 2.5GHz,8GB,2TB HDD,Intel HD Graphics 620,No OS,2.2kg,519.0
2,1267,Dell,XPS 13,2 in 1 Convertible,13.3,Quad HD+ / Touchscreen 3200x1800,Intel Core i5 7Y54 1.2GHz,8GB,256GB SSD,Intel HD Graphics 615,Windows 10,1.24kg,1813.0
3,161,Dell,Inspiron 5579,2 in 1 Convertible,15.6,Full HD / Touchscreen 1920x1080,Intel Core i7 8550U 1.8GHz,8GB,256GB SSD,Intel UHD Graphics 620,Windows 10,1.56kg,1049.0
4,922,LG,Gram 14Z970,Ultrabook,14.0,IPS Panel Full HD / Touchscreen 1920x1080,Intel Core i7 7500U 2.7GHz,8GB,512GB SSD,Intel HD Graphics 620,Windows 10,0.98kg,1899.0


In [4]:
# Summary of the training frame: column dtypes / non-null counts, then a preview.
print("\nInformación de train.csv:")
# DataFrame.info() writes its report to stdout and returns None, so it is
# called directly; wrapping it in print() emits an extra "None" line.
train_df.info()
print("\nPrimeras filas de train.csv:")
print(train_df.head())


Información de train.csv:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 912 entries, 0 to 911
Data columns (total 13 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   laptop_ID         912 non-null    int64  
 1   Company           912 non-null    object 
 2   Product           912 non-null    object 
 3   TypeName          912 non-null    object 
 4   Inches            912 non-null    float64
 5   ScreenResolution  912 non-null    object 
 6   Cpu               912 non-null    object 
 7   Ram               912 non-null    object 
 8   Memory            912 non-null    object 
 9   Gpu               912 non-null    object 
 10  OpSys             912 non-null    object 
 11  Weight            912 non-null    object 
 12  Price_euros       912 non-null    float64
dtypes: float64(2), int64(1), object(10)
memory usage: 92.8+ KB
None

Primeras filas de train.csv:
   laptop_ID Company             Product            TypeName  In

## Preprocesado

In [5]:
# Columns removed from both frames before any feature engineering.
columns_to_drop = ["Product", "ScreenResolution", "Cpu", "Gpu", "Memory"]

# errors='ignore' skips any listed column that is absent from a frame.
train_df = train_df.drop(columns=columns_to_drop, errors='ignore')
test = test.drop(columns=columns_to_drop, errors='ignore')

# Extraer valores numéricos (Ram y Weight)

In [6]:
def _strip_unit(column, unit, dtype):
    """Remove the literal text ``unit`` from each value and cast to ``dtype``.

    Values are converted to ``str`` first so the helper also accepts a column
    that has already been converted to a number; this keeps the cell
    re-runnable after ``train_df`` has been processed once.

    Parameters
    ----------
    column : pandas.Series
        Column whose values carry a unit suffix (e.g. "8GB", "2.36kg").
    unit : str
        Text to remove; matched literally, not as a regular expression.
    dtype : type
        Target type passed to ``Series.astype``.

    Returns
    -------
    pandas.Series
        New series of ``dtype``; the input series is not modified.
    """
    return column.astype(str).str.replace(unit, "", regex=False).astype(dtype)


# Training frame: "Ram" -> int (unit "GB" removed), "Weight" -> float (unit "kg" removed).
train_df["Ram"] = _strip_unit(train_df["Ram"], "GB", int)
train_df["Weight"] = _strip_unit(train_df["Weight"], "kg", float)

# Work on a copy so the frame loaded as `test` keeps its original text columns.
test_df = test.copy()
test_df["Ram"] = _strip_unit(test_df["Ram"], "GB", int)
test_df["Weight"] = _strip_unit(test_df["Weight"], "kg", float)

## Variables categóricas

In [7]:
# One-hot encode the categorical columns using the TRAINING categories for
# both frames. Encoding each frame independently with drop_first=True lets
# the dropped (baseline) level differ between train and test whenever the
# first level present in one frame is missing from the other, which would
# silently mis-encode test rows.
categorical_cols = ["Company", "TypeName", "OpSys"]

for col in categorical_cols:
    # Sorted unique training values: the same level order get_dummies derives
    # for a plain string column, so the training dummy columns are unchanged.
    train_levels = pd.CategoricalDtype(sorted(train_df[col].dropna().unique()))
    train_df[col] = train_df[col].astype(train_levels)
    # Test values not seen in training become missing and therefore encode
    # as all-zero dummies for that column group.
    test_df[col] = test_df[col].astype(train_levels)

# With a categorical dtype, get_dummies emits one column per declared level
# (observed or not), so drop_first removes the same level in both frames.
train_df = pd.get_dummies(train_df, columns=categorical_cols, drop_first=True)
test_df = pd.get_dummies(test_df, columns=categorical_cols, drop_first=True)

In [8]:
# Training columns (other than the target "Price_euros") that the test frame lacks.
absent_in_test = [
    name
    for name in train_df.columns
    if name != "Price_euros" and name not in test_df.columns
]

# Add each of them to the test frame as a constant-zero column.
for name in absent_in_test:
    test_df[name] = 0

# Definir X e y

In [9]:
# Target vector: the "Price_euros" column of the training frame.
y = train_df["Price_euros"]

# Feature matrix: every other training column.
# NOTE(review): this keeps laptop_ID as a feature; the submission cell reads
# test_df["laptop_ID"] after the reindexing below, so it cannot simply be
# dropped here — confirm whether an identifier should be used for training.
X = train_df.drop(columns="Price_euros")

# Restrict the test frame to the training feature columns, in the same order.
test_df = test_df.loc[:, X.columns]

In [10]:
# Hold out 20% of the rows for validation; the fixed seed makes the split repeatable.
split_options = {"test_size": 0.2, "random_state": 42}
X_train, X_val, y_train, y_val = train_test_split(X, y, **split_options)

In [11]:
# Convert every column of both feature frames with pd.to_numeric;
# errors='coerce' turns values that cannot be parsed into NaN.
X_train, X_val = (
    features.apply(pd.to_numeric, errors='coerce')
    for features in (X_train, X_val)
)

In [12]:
# Replace missing values with 0. Rebinding the result (rather than
# inplace=True) keeps the cell free of in-place mutation.
X_train = X_train.fillna(0)
X_val = X_val.fillna(0)

# Apply the same imputation to the frame later passed to model.predict, so
# test rows are treated consistently with the train/validation rows.
test_df = test_df.fillna(0)

# Entrenar modelo

In [13]:
# Random-forest hyperparameters; random_state fixes the seed for repeatable fits.
rf_params = dict(
    n_estimators=200,
    max_depth=10,
    min_samples_split=5,
    random_state=42,
)

model = RandomForestRegressor(**rf_params)
# Fit on the training split; as the last expression, the fitted estimator is displayed.
model.fit(X_train, y_train)

## Evaluar Modelo

In [17]:
# Predict on the validation split and report the root-mean-squared error.
y_pred = model.predict(X_val)
mse = mean_squared_error(y_val, y_pred)
rmse = np.sqrt(mse)
print(f"RMSE: {rmse:.2f}")

RMSE: 404.55


In [18]:
# Predict prices for the test frame.
test_predictions = model.predict(test_df)

# Two-column frame: the laptop_ID values followed by the predicted prices.
submission = test_df[["laptop_ID"]].assign(Price_euros=test_predictions)

# Write without the index so the file holds only the two columns above.
submission.to_csv("submission.csv", index=False)
print("\nArchivo 'submission.csv' generado exitosamente!")


Archivo 'submission.csv' generado exitosamente!
