In [46]:
from sklearn.linear_model import LinearRegression
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor
from xgboost import XGBRegressor
from lightgbm import LGBMRegressor
#from catboost import CatBoostRegressor
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import re
from sklearn.metrics import mean_absolute_error, mean_squared_error, mean_absolute_percentage_error
from sklearn.model_selection import train_test_split, cross_val_score, GridSearchCV, RandomizedSearchCV
from sklearn.preprocessing import StandardScaler

In [47]:
df = pd.read_csv("./compra-de-compus/train.csv")
df.index.name = None

df.head()

Unnamed: 0,laptop_ID,Company,Product,TypeName,Inches,ScreenResolution,Cpu,Ram,Memory,Gpu,OpSys,Weight,Price_in_euros
0,755,HP,250 G6,Notebook,15.6,Full HD 1920x1080,Intel Core i3 6006U 2GHz,8GB,256GB SSD,Intel HD Graphics 520,Windows 10,1.86kg,539.0
1,618,Dell,Inspiron 7559,Gaming,15.6,Full HD 1920x1080,Intel Core i7 6700HQ 2.6GHz,16GB,1TB HDD,Nvidia GeForce GTX 960<U+039C>,Windows 10,2.59kg,879.01
2,909,HP,ProBook 450,Notebook,15.6,Full HD 1920x1080,Intel Core i7 7500U 2.7GHz,8GB,1TB HDD,Nvidia GeForce 930MX,Windows 10,2.04kg,900.0
3,2,Apple,Macbook Air,Ultrabook,13.3,1440x900,Intel Core i5 1.8GHz,8GB,128GB Flash Storage,Intel HD Graphics 6000,macOS,1.34kg,898.94
4,286,Dell,Inspiron 3567,Notebook,15.6,Full HD 1920x1080,Intel Core i3 6006U 2.0GHz,4GB,1TB HDD,AMD Radeon R5 M430,Linux,2.25kg,428.0


In [48]:
df["Product"].value_counts()

Product
XPS 13                                        23
Inspiron 3567                                 22
Legion Y520-15IKBN                            15
Vostro 3568                                   14
250 G6                                        13
                                              ..
15-AY023na (N3710/8GB/2TB/W10)                 1
R558UA-DM966T (i5-7200U/8GB/128GB/FHD/W10)     1
Precision 3510                                 1
Pavilion X360                                  1
GL62M 7RD                                      1
Name: count, Length: 480, dtype: int64

In [49]:
df["ScreenResolution"].value_counts()

ScreenResolution
Full HD 1920x1080                                349
1366x768                                         211
IPS Panel Full HD 1920x1080                      163
IPS Panel Full HD / Touchscreen 1920x1080         32
Full HD / Touchscreen 1920x1080                   30
1600x900                                          14
Quad HD+ / Touchscreen 3200x1800                  11
Touchscreen 1366x768                              11
IPS Panel 4K Ultra HD / Touchscreen 3840x2160     10
4K Ultra HD / Touchscreen 3840x2160                7
IPS Panel Quad HD+ / Touchscreen 3200x1800         6
Touchscreen 2560x1440                              6
IPS Panel 4K Ultra HD 3840x2160                    5
Touchscreen 2256x1504                              5
IPS Panel Retina Display 2560x1600                 5
1440x900                                           4
IPS Panel 1366x768                                 4
IPS Panel Retina Display 2304x1440                 4
IPS Panel Touchscreen 2560x14

In [50]:
# Dividir el DataFrame en conjunto de entrenamiento y conjunto de prueba
X = df.drop(columns=['Price_in_euros'])  # Features
y = df['Price_in_euros']  # Target variable

In [51]:
# Split train-test set
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [52]:
X_train['ram_gb'] = X_train['Ram'].str.replace("GB","").astype(int)
X_train['peso'] = X_train['Weight'].str.replace("kg","").astype(float)

In [53]:
mapeo_marcas = {
    'Lenovo': 1,
    'Dell': 2,
    'HP': 3,
    'Asus': 4,
    'Acer': 5,
    'MSI': 6,
    'Toshiba': 7,
    'Apple': 8,
    'Razer': 9,
    'Mediacom': 10,
    'Microsoft': 11,
    'Samsung': 12,
    'Xiaomi': 13,
    'Vero': 14,
    'Huawei': 15,
    'Google': 16,
    'Chuwi': 17,
    'Fujitsu': 18,
    'LG': 19
}
X_train['Company_numerico'] = X_train['Company'].map(mapeo_marcas)

In [54]:
mapeo_tipos = {
    'Notebook': 1,
    'Gaming': 2,
    'Ultrabook': 3,
    '2 in 1 Convertible': 4,
    'Workstation': 5,
    'Netbook': 6,
    
}
X_train['TypeName_numerico'] = X_train['TypeName'].map(mapeo_tipos)

In [55]:
mapeo_opsys = {
    'Windows 10': 1,
    'Linux': 2,
    'No OS': 3,
    'Windows 7': 4,
    'Chrome OS': 5,
    'macOS': 6,
    'Windows 10 S': 7,
    'Mac OS X': 8,
    'Android': 9
}
X_train['OpSys_numerico'] = X_train['OpSys'].map(mapeo_opsys)

In [56]:
df['Touchscreen'] = df['ScreenResolution'].apply(lambda x: 1 if 'Touchscreen' in x else 0)

In [57]:
X_train['is_FullHD'] = X_train['ScreenResolution'].str.contains('1920x1080') | X_train['ScreenResolution'].str.contains('1920x1200')
X_train['is_4K'] = X_train['ScreenResolution'].str.contains('3840x2160')

# Convertir los valores booleanos a 0 y 1
X_train['is_FullHD'] = X_train['is_FullHD'].astype(int)
X_train['is_4K'] = X_train['is_4K'].astype(int)

In [58]:
X_train["is_intel"] = X_train["Cpu"].str.contains("intel")|X_train["Cpu"].str.contains("Intel")
X_train["is_AMD"] = X_train["Cpu"].str.contains("AMD")

X_train["is_AMD"] = X_train["is_AMD"].astype(int)
X_train["is_intel"] = X_train ["is_intel"].astype(int)

X_train["is_i9"] = X_train["Cpu"].str.contains("i9")
X_train["is_i7"] = X_train["Cpu"].str.contains("i7")
X_train["is_i5"] = X_train["Cpu"].str.contains("i5")
X_train["is_i3"] = X_train["Cpu"].str.contains("i3")
X_train["is_celeron"] = X_train["Cpu"].str.contains("Celeron")
X_train["is_Pentium"] = X_train["Cpu"].str.contains("Pentium")
X_train["is_Ryzen"] = X_train["Cpu"].str.contains("Ryzen")
X_train["is_A10"] = X_train["Cpu"].str.contains("A10")
X_train["is_Athlon"] = X_train["Cpu"].str.contains("Athlon")
X_train["is_FX"] = X_train["Cpu"].str.contains("FX")
X_train["is_A6"] = X_train["Cpu"].str.contains("A6")
columns_to_convert = ["is_i9", "is_i7", "is_i5", "is_i3", "is_celeron", "is_Pentium", "is_Ryzen", "is_A10", "is_Athlon", "is_FX", "is_A6"]
X_train[columns_to_convert] = X_train[columns_to_convert].astype(int)

In [59]:
X_train['GHz'] = X_train['Cpu'].str.extract(r'(\d+\.\d+)GHz')

# Convertir los valores extraídos a tipo numérico (float)
X_train['GHz'] = X_train['GHz'].astype(float)

In [60]:
mean_GHz = X_train['GHz'].mean()

# Llenar los valores nulos con la media calculada
X_train['GHz'] = X_train['GHz'].fillna(mean_GHz)

In [61]:
# Crear columna para identificar la marca (AMD o Intel)
X_train['is_AMD_gpu'] = X_train['Gpu'].str.contains("AMD").astype(int)
X_train['is_Nvidia_gpu'] = X_train['Gpu'].str.contains("Nvidia").astype(int)

# Crear columnas para identificar modelos específicos
X_train['is_HD_Graphics'] = X_train['Gpu'].str.contains("HD Graphics").astype(int)
X_train['is_GeForce'] = X_train['Gpu'].str.contains("GeForce").astype(int)
X_train['is_Radeon'] = X_train['Gpu'].str.contains("Radeon").astype(int)
X_train['is_Iris'] = X_train['Gpu'].str.contains("Iris").astype(int)
X_train['is_FirePro'] = X_train['Gpu'].str.contains("FirePro").astype(int)
X_train['is_Quadro'] = X_train['Gpu'].str.contains("Quadro").astype(int)

# Convertir todas las columnas creadas a tipo int
columns_to_convert = ['is_AMD_gpu', 'is_Nvidia_gpu', 'is_HD_Graphics', 'is_GeForce', 'is_Radeon', 'is_Iris', 'is_FirePro', 'is_Quadro']
X_train[columns_to_convert] = X_train[columns_to_convert].astype(int)

In [62]:
X_train.head()

Unnamed: 0,laptop_ID,Company,Product,TypeName,Inches,ScreenResolution,Cpu,Ram,Memory,Gpu,...,is_A6,GHz,is_AMD_gpu,is_Nvidia_gpu,is_HD_Graphics,is_GeForce,is_Radeon,is_Iris,is_FirePro,is_Quadro
25,1118,HP,ZBook 17,Workstation,17.3,IPS Panel Full HD 1920x1080,Intel Core i7 6700HQ 2.6GHz,8GB,1TB HDD,AMD FirePro W6150M,...,0,2.6,1,0,0,0,0,0,1,0
84,153,Dell,Inspiron 5577,Gaming,15.6,Full HD 1920x1080,Intel Core i7 7700HQ 2.8GHz,16GB,512GB SSD,Nvidia GeForce GTX 1050,...,0,2.8,0,1,0,1,0,0,0,0
10,275,Apple,MacBook Pro,Ultrabook,13.3,IPS Panel Retina Display 2560x1600,Intel Core i5 2.9GHz,8GB,512GB SSD,Intel Iris Graphics 550,...,0,2.9,0,0,0,0,0,1,0,0
342,1100,HP,EliteBook 840,Notebook,14.0,Full HD 1920x1080,Intel Core i5 6200U 2.3GHz,4GB,500GB HDD,Intel HD Graphics 520,...,0,2.3,0,0,1,0,0,0,0,0
890,131,Dell,Inspiron 5770,Notebook,17.3,Full HD 1920x1080,Intel Core i7 8550U 1.8GHz,16GB,256GB SSD + 2TB HDD,AMD Radeon 530,...,0,1.8,1,0,0,0,1,0,0,0


In [63]:
X_train['HDD'] = X_train['Memory'].str.extract(r'(\d+\.?\d*)TB HDD').fillna(0).astype(float) * 1024 + X_train['Memory'].str.extract(r'(\d+\.?\d*)GB HDD').fillna(0).astype(float)
X_train['SSD'] = X_train['Memory'].str.extract(r'(\d+\.?\d*)TB SSD').fillna(0).astype(float) * 1024 + X_train['Memory'].str.extract(r'(\d+\.?\d*)GB SSD').fillna(0).astype(float)
X_train['Flash_Storage'] = X_train['Memory'].str.extract(r'(\d+\.?\d*)GB Flash Storage').fillna(0).astype(float)
X_train['Hybrid'] = X_train['Memory'].str.extract(r'(\d+\.?\d*)TB Hybrid').fillna(0).astype(float) * 1024 + X_train['Memory'].str.extract(r'(\d+\.?\d*)GB Hybrid').fillna(0).astype(float)

In [64]:
X_train.columns 

Index(['laptop_ID', 'Company', 'Product', 'TypeName', 'Inches',
       'ScreenResolution', 'Cpu', 'Ram', 'Memory', 'Gpu', 'OpSys', 'Weight',
       'ram_gb', 'peso', 'Company_numerico', 'TypeName_numerico',
       'OpSys_numerico', 'is_FullHD', 'is_4K', 'is_intel', 'is_AMD', 'is_i9',
       'is_i7', 'is_i5', 'is_i3', 'is_celeron', 'is_Pentium', 'is_Ryzen',
       'is_A10', 'is_Athlon', 'is_FX', 'is_A6', 'GHz', 'is_AMD_gpu',
       'is_Nvidia_gpu', 'is_HD_Graphics', 'is_GeForce', 'is_Radeon', 'is_Iris',
       'is_FirePro', 'is_Quadro', 'HDD', 'SSD', 'Flash_Storage', 'Hybrid'],
      dtype='object')

In [65]:
X_train.drop(columns=['Company', 'Product', 'TypeName','ScreenResolution', 'Cpu','Ram', 'Memory', 'Gpu', 'OpSys', 'Weight'],inplace = True)

In [66]:
X_train.head()

Unnamed: 0,laptop_ID,Inches,ram_gb,peso,Company_numerico,TypeName_numerico,OpSys_numerico,is_FullHD,is_4K,is_intel,...,is_HD_Graphics,is_GeForce,is_Radeon,is_Iris,is_FirePro,is_Quadro,HDD,SSD,Flash_Storage,Hybrid
25,1118,17.3,8,3.0,3,5,4,1,0,1,...,0,0,0,0,1,0,1024.0,0.0,0.0,0.0
84,153,15.6,16,2.56,2,2,1,1,0,1,...,0,1,0,0,0,0,0.0,512.0,0.0,0.0
10,275,13.3,8,1.37,8,3,6,0,0,1,...,0,0,0,1,0,0,0.0,512.0,0.0,0.0
342,1100,14.0,4,1.54,3,1,4,1,0,1,...,1,0,0,0,0,0,500.0,0.0,0.0,0.0
890,131,17.3,16,2.8,2,1,1,1,0,1,...,0,0,1,0,0,0,2048.0,256.0,0.0,0.0


In [67]:
X_train.info()

<class 'pandas.core.frame.DataFrame'>
Index: 729 entries, 25 to 102
Data columns (total 35 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   laptop_ID          729 non-null    int64  
 1   Inches             729 non-null    float64
 2   ram_gb             729 non-null    int32  
 3   peso               729 non-null    float64
 4   Company_numerico   729 non-null    int64  
 5   TypeName_numerico  729 non-null    int64  
 6   OpSys_numerico     729 non-null    int64  
 7   is_FullHD          729 non-null    int32  
 8   is_4K              729 non-null    int32  
 9   is_intel           729 non-null    int32  
 10  is_AMD             729 non-null    int32  
 11  is_i9              729 non-null    int32  
 12  is_i7              729 non-null    int32  
 13  is_i5              729 non-null    int32  
 14  is_i3              729 non-null    int32  
 15  is_celeron         729 non-null    int32  
 16  is_Pentium         729 non-nul

In [68]:
X_train["GHz"].value_counts()

GHz
2.500000    156
2.700000     95
2.800000     94
1.600000     92
2.300000     49
1.800000     44
2.285335     43
2.600000     36
1.100000     30
2.400000     27
2.900000     12
2.000000     11
1.200000      7
1.500000      6
2.200000      6
1.440000      6
1.300000      4
3.600000      4
0.900000      3
2.100000      2
3.100000      1
1.900000      1
Name: count, dtype: int64

In [69]:
rf_reg = RandomForestRegressor()
rf_reg.fit(X_train, y_train)

In [70]:
from sklearn.ensemble import GradientBoostingRegressor
# Inicializar el modelo de Gradient Boosting Regressor
gb_regressor = GradientBoostingRegressor(n_estimators=100, learning_rate=0.1, random_state=42)

# Entrenar el modelo
gb_regressor.fit(X_train, y_train)

# Predecir en el conjunto de prueba
y_pred = gb_regressor.predict(X_train)

# Calcular el error cuadrático medio
mse = mean_squared_error(y_train, y_pred)
rmse = np.sqrt(mse)
print("Error cuadrático medio:", rmse)

Error cuadrático medio: 184.6553567728056


In [71]:
from sklearn.linear_model import LinearRegression
# Inicializar el modelo de regresión lineal
linear_regressor = LinearRegression()

# Entrenar el modelo
linear_regressor.fit(X_train, y_train)

# Predecir en el conjunto de prueba
y_pred = linear_regressor.predict(X_train)

# Calcular el error cuadrático medio
mse = mean_squared_error(y_train, y_pred)
rmse = np.sqrt(mse)
print("Error cuadrático medio:", rmse)

Error cuadrático medio: 332.26281013234916


In [72]:
y_pred = rf_reg.predict(X_train)

# Calcular el error cuadrático medio
mse = mean_squared_error(y_train, y_pred)
rmse = np.sqrt(mse)
print(f"RandomForest con el rmse es igual {rmse}")

RandomForest con el rmse es igual 106.27044320895163


In [73]:
'''from scipy.stats import randint


param_grid = {
    'n_estimators': [10, 50, 100],
    'max_depth': [None, 10, 20],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4]
}

# Realizar la búsqueda en cuadrícula
grid_search = GridSearchCV(estimator=rf_reg, param_grid=param_grid, cv=5)
grid_search.fit(X_train, y_train)

# Mostrar los mejores hiperparámetros encontrados
print("Mejores hiperparámetros encontrados:")
print(grid_search.best_params_)
'''

'from scipy.stats import randint\n\n\nparam_grid = {\n    \'n_estimators\': [10, 50, 100],\n    \'max_depth\': [None, 10, 20],\n    \'min_samples_split\': [2, 5, 10],\n    \'min_samples_leaf\': [1, 2, 4]\n}\n\n# Realizar la búsqueda en cuadrícula\ngrid_search = GridSearchCV(estimator=rf_reg, param_grid=param_grid, cv=5)\ngrid_search.fit(X_train, y_train)\n\n# Mostrar los mejores hiperparámetros encontrados\nprint("Mejores hiperparámetros encontrados:")\nprint(grid_search.best_params_)\n'

In [75]:
# Define el modelo RandomForestClassifier y los parámetros a ajustar

param_grid = {
    'n_estimators': [10, 50, 100],
    'max_depth': [None, 10, 20],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4],
    'bootstrap': [True, False],
    'max_features': ['auto', 'sqrt', 'log2']
}

# Realiza la búsqueda en cuadrícula
grid_search = GridSearchCV(estimator=rf_reg, param_grid=param_grid, cv=5)
grid_search.fit(X_train, y_train)

# Muestra los mejores hiperparámetros encontrados
print("Mejores hiperparámetros encontrados:")
print(grid_search.best_params_)

810 fits failed out of a total of 2430.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
810 fits failed with the following error:
Traceback (most recent call last):
  File "c:\Users\adrif\AppData\Local\Programs\Python\Python310\lib\site-packages\sklearn\model_selection\_validation.py", line 729, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "c:\Users\adrif\AppData\Local\Programs\Python\Python310\lib\site-packages\sklearn\base.py", line 1145, in wrapper
    estimator._validate_params()
  File "c:\Users\adrif\AppData\Local\Programs\Python\Python310\lib\site-packages\sklearn\base.py", line 638, in _validate_params
    validate_parameter_constraints(
  File "c:\Users\adrif\AppData\Local\Programs\Python\Python310\

Mejores hiperparámetros encontrados:
{'bootstrap': False, 'max_depth': None, 'max_features': 'sqrt', 'min_samples_leaf': 1, 'min_samples_split': 5, 'n_estimators': 50}


In [None]:
rf_reg_optimized = RandomForestRegressor(
    max_depth=20,
    min_samples_leaf=1,
    min_samples_split=2,
    n_estimators=100
)

# Entrenar el modelo con los datos de entrenamiento
rf_reg_optimized.fit(X_train, y_train)

# Calcular el error cuadrático medio
mse = mean_squared_error(y_train, y_pred)
rmse = np.sqrt(mse)
print(f"RandomForest con el rmse es igual {rmse}")

RandomForest con el rmse es igual 108.54437974864206


In [None]:
X_test['ram_gb'] = X_test['Ram'].str.replace("GB","").astype(int)
X_test['peso'] = X_test['Weight'].str.replace("kg","").astype(float)
X_test['ram_gb'] = X_test['Ram'].str.replace("GB","").astype(int)
X_test['peso'] = X_test['Weight'].str.replace("kg","").astype(float)
mapeo_marcas = {
    'Lenovo': 1,
    'Dell': 2,
    'HP': 3,
    'Asus': 4,
    'Acer': 5,
    'MSI': 6,
    'Toshiba': 7,
    'Apple': 8,
    'Razer': 9,
    'Mediacom': 10,
    'Microsoft': 11,
    'Samsung': 12,
    'Xiaomi': 13,
    'Vero': 14,
    'Huawei': 15,
    'Google': 16,
    'Chuwi': 17,
    'Fujitsu': 18,
    'LG': 19
}
X_test['Company_numerico'] = X_test['Company'].map(mapeo_marcas)
mapeo_tipos = {
    'Notebook': 1,
    'Gaming': 2,
    'Ultrabook': 3,
    '2 in 1 Convertible': 4,
    'Workstation': 5,
    'Netbook': 6,
    
}
X_test['TypeName_numerico'] = X_test['TypeName'].map(mapeo_tipos)
mapeo_opsys = {
    'Windows 10': 1,
    'Linux': 2,
    'No OS': 3,
    'Windows 7': 4,
    'Chrome OS': 5,
    'macOS': 6,
    'Windows 10 S': 7,
    'Mac OS X': 8,
    'Android': 9
}
X_test['OpSys_numerico'] = X_test['OpSys'].map(mapeo_opsys)
df['Touchscreen'] = df['ScreenResolution'].apply(lambda x: 1 if 'Touchscreen' in x else 0)
df['Touchscreen'] = df['ScreenResolution'].apply(lambda x: 1 if 'Touchscreen' in x else 0)
X_test['is_FullHD'] = X_test['ScreenResolution'].str.contains('1920x1080') | X_test['ScreenResolution'].str.contains('1920x1200')
X_test['is_4K'] = X_test['ScreenResolution'].str.contains('3840x2160')

# Convertir los valores booleanos a 0 y 1
X_test['is_FullHD'] = X_test['is_FullHD'].astype(int)
X_test['is_4K'] = X_test['is_4K'].astype(int)
X_test['is_FullHD'] = X_test['ScreenResolution'].str.contains('1920x1080') | X_test['ScreenResolution'].str.contains('1920x1200')
X_test['is_4K'] = X_test['ScreenResolution'].str.contains('3840x2160')

# Convertir los valores booleanos a 0 y 1
X_test['is_FullHD'] = X_test['is_FullHD'].astype(int)
X_test['is_4K'] = X_test['is_4K'].astype(int)
X_test["is_intel"] = X_test["Cpu"].str.contains("intel")|X_test["Cpu"].str.contains("Intel")
X_test["is_AMD"] = X_test["Cpu"].str.contains("AMD")

X_test["is_AMD"] = X_test["is_AMD"].astype(int)
X_test["is_intel"] = X_test ["is_intel"].astype(int)

X_test["is_i9"] = X_test["Cpu"].str.contains("i9")
X_test["is_i7"] = X_test["Cpu"].str.contains("i7")
X_test["is_i5"] = X_test["Cpu"].str.contains("i5")
X_test["is_i3"] = X_test["Cpu"].str.contains("i3")
X_test["is_celeron"] = X_test["Cpu"].str.contains("Celeron")
X_test["is_Pentium"] = X_test["Cpu"].str.contains("Pentium")
X_test["is_Ryzen"] = X_test["Cpu"].str.contains("Ryzen")
X_test["is_A10"] = X_test["Cpu"].str.contains("A10")
X_test["is_Athlon"] = X_test["Cpu"].str.contains("Athlon")
X_test["is_FX"] = X_test["Cpu"].str.contains("FX")
X_test["is_A6"] = X_test["Cpu"].str.contains("A6")
columns_to_convert = ["is_i9", "is_i7", "is_i5", "is_i3", "is_celeron", "is_Pentium", "is_Ryzen", "is_A10", "is_Athlon", "is_FX", "is_A6"]
X_test[columns_to_convert] = X_test[columns_to_convert].astype(int)
X_test['GHz'] = X_test['Cpu'].str.extract(r'(\d+\.\d+)GHz')

# Convertir los valores extraídos a tipo numérico (float)
X_test['GHz'] = X_test['GHz'].astype(float)
mean_GHz = X_test['GHz'].mean()

# Llenar los valores nulos con la media calculada
X_test['GHz'] = X_test['GHz'].fillna(mean_GHz)
# Crear columna para identificar la marca (AMD o Intel)
X_test['is_AMD_gpu'] = X_test['Gpu'].str.contains("AMD").astype(int)
X_test['is_Nvidia_gpu'] = X_test['Gpu'].str.contains("Nvidia").astype(int)

# Crear columnas para identificar modelos específicos
X_test['is_HD_Graphics'] = X_test['Gpu'].str.contains("HD Graphics").astype(int)
X_test['is_GeForce'] = X_test['Gpu'].str.contains("GeForce").astype(int)
X_test['is_Radeon'] = X_test['Gpu'].str.contains("Radeon").astype(int)
X_test['is_Iris'] = X_test['Gpu'].str.contains("Iris").astype(int)
X_test['is_FirePro'] = X_test['Gpu'].str.contains("FirePro").astype(int)
X_test['is_Quadro'] = X_test['Gpu'].str.contains("Quadro").astype(int)

# Convertir todas las columnas creadas a tipo int
columns_to_convert = ['is_AMD_gpu', 'is_Nvidia_gpu', 'is_HD_Graphics', 'is_GeForce', 'is_Radeon', 'is_Iris', 'is_FirePro', 'is_Quadro']
X_test[columns_to_convert] = X_test[columns_to_convert].astype(int)
X_test['HDD'] = X_test['Memory'].str.extract(r'(\d+\.?\d*)TB HDD').fillna(0).astype(float) * 1024 + X_test['Memory'].str.extract(r'(\d+\.?\d*)GB HDD').fillna(0).astype(float)
X_test['SSD'] = X_test['Memory'].str.extract(r'(\d+\.?\d*)TB SSD').fillna(0).astype(float) * 1024 + X_test['Memory'].str.extract(r'(\d+\.?\d*)GB SSD').fillna(0).astype(float)
X_test['Flash_Storage'] = X_test['Memory'].str.extract(r'(\d+\.?\d*)GB Flash Storage').fillna(0).astype(float)
X_test['Hybrid'] = X_test['Memory'].str.extract(r'(\d+\.?\d*)TB Hybrid').fillna(0).astype(float) * 1024 + X_test['Memory'].str.extract(r'(\d+\.?\d*)GB Hybrid').fillna(0).astype(float)
X_test.drop(columns=['Company', 'Product', 'TypeName','ScreenResolution', 'Cpu','Ram', 'Memory', 'Gpu', 'OpSys', 'Weight'],inplace = True)

In [None]:
X_test.columns

Index(['laptop_ID', 'Inches', 'ram_gb', 'peso', 'Company_numerico',
       'TypeName_numerico', 'OpSys_numerico', 'is_FullHD', 'is_4K', 'is_intel',
       'is_AMD', 'is_i9', 'is_i7', 'is_i5', 'is_i3', 'is_celeron',
       'is_Pentium', 'is_Ryzen', 'is_A10', 'is_Athlon', 'is_FX', 'is_A6',
       'GHz', 'is_AMD_gpu', 'is_Nvidia_gpu', 'is_HD_Graphics', 'is_GeForce',
       'is_Radeon', 'is_Iris', 'is_FirePro', 'is_Quadro', 'HDD', 'SSD',
       'Flash_Storage', 'Hybrid'],
      dtype='object')

In [None]:
# Obtener las columnas ordenadas de X_train
#ordered_columns = X_train.columns

# Reordenar las columnas de X_test
#X_test = X_test.reindex(columns=ordered_columns)


In [None]:
y_pred = rf_reg_optimized.predict(X_test)

# Calcular el error cuadrático medio
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)
print(f"RandomForest con el rmse es igual {rmse}")

RandomForest con el rmse es igual 329.76587022498245


In [None]:
y_pred = rf_reg.predict(X_test)

# Calcular el error cuadrático medio
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)
print(f"RandomForest con el rmse es igual {rmse}")

RandomForest con el rmse es igual 337.22431807469843


In [None]:
df_test = pd.read_csv("./compra-de-compus/test.csv")
df_test.index.name = None
df_test.head()

Unnamed: 0,laptop_ID,Company,Product,TypeName,Inches,ScreenResolution,Cpu,Ram,Memory,Gpu,OpSys,Weight
0,209,Lenovo,Legion Y520-15IKBN,Gaming,15.6,Full HD 1920x1080,Intel Core i7 7700HQ 2.8GHz,16GB,512GB SSD,Nvidia GeForce GTX 1060,No OS,2.4kg
1,1281,Acer,Aspire ES1-531,Notebook,15.6,1366x768,Intel Celeron Dual Core N3060 1.6GHz,4GB,500GB HDD,Intel HD Graphics 400,Linux,2.4kg
2,1168,Lenovo,V110-15ISK (i3-6006U/4GB/1TB/No,Notebook,15.6,1366x768,Intel Core i3 6006U 2.0GHz,4GB,1TB HDD,Intel HD Graphics 520,No OS,1.9kg
3,1231,Dell,Inspiron 7579,2 in 1 Convertible,15.6,IPS Panel Full HD / Touchscreen 1920x1080,Intel Core i5 7200U 2.5GHz,8GB,256GB SSD,Intel HD Graphics 620,Windows 10,2.191kg
4,1020,HP,ProBook 640,Notebook,14.0,Full HD 1920x1080,Intel Core i5 7200U 2.5GHz,4GB,256GB SSD,Intel HD Graphics 620,Windows 10,1.95kg


In [None]:
df_test['ram_gb'] = df_test['Ram'].str.replace("GB","").astype(int)
df_test['peso'] = df_test['Weight'].str.replace("kg","").astype(float)
df_test['ram_gb'] = df_test['Ram'].str.replace("GB","").astype(int)
df_test['peso'] = df_test['Weight'].str.replace("kg","").astype(float)
mapeo_marcas = {
    'Lenovo': 1,
    'Dell': 2,
    'HP': 3,
    'Asus': 4,
    'Acer': 5,
    'MSI': 6,
    'Toshiba': 7,
    'Apple': 8,
    'Razer': 9,
    'Mediacom': 10,
    'Microsoft': 11,
    'Samsung': 12,
    'Xiaomi': 13,
    'Vero': 14,
    'Huawei': 15,
    'Google': 16,
    'Chuwi': 17,
    'Fujitsu': 18,
    'LG': 19
}
df_test['Company_numerico'] = df_test['Company'].map(mapeo_marcas)
mapeo_tipos = {
    'Notebook': 1,
    'Gaming': 2,
    'Ultrabook': 3,
    '2 in 1 Convertible': 4,
    'Workstation': 5,
    'Netbook': 6,
    
}
df_test['TypeName_numerico'] = df_test['TypeName'].map(mapeo_tipos)
mapeo_opsys = {
    'Windows 10': 1,
    'Linux': 2,
    'No OS': 3,
    'Windows 7': 4,
    'Chrome OS': 5,
    'macOS': 6,
    'Windows 10 S': 7,
    'Mac OS X': 8,
    'Android': 9
}
df_test['OpSys_numerico'] = df_test['OpSys'].map(mapeo_opsys)
df['Touchscreen'] = df['ScreenResolution'].apply(lambda x: 1 if 'Touchscreen' in x else 0)
df['Touchscreen'] = df['ScreenResolution'].apply(lambda x: 1 if 'Touchscreen' in x else 0)
df_test['is_FullHD'] = df_test['ScreenResolution'].str.contains('1920x1080') | df_test['ScreenResolution'].str.contains('1920x1200')
df_test['is_4K'] = df_test['ScreenResolution'].str.contains('3840x2160')

# Convertir los valores booleanos a 0 y 1
df_test['is_FullHD'] = df_test['is_FullHD'].astype(int)
df_test['is_4K'] = df_test['is_4K'].astype(int)
df_test['is_FullHD'] = df_test['ScreenResolution'].str.contains('1920x1080') | df_test['ScreenResolution'].str.contains('1920x1200')
df_test['is_4K'] = df_test['ScreenResolution'].str.contains('3840x2160')

# Convertir los valores booleanos a 0 y 1
df_test['is_FullHD'] = df_test['is_FullHD'].astype(int)
df_test['is_4K'] = df_test['is_4K'].astype(int)
df_test["is_intel"] = df_test["Cpu"].str.contains("intel")|df_test["Cpu"].str.contains("Intel")
df_test["is_AMD"] = df_test["Cpu"].str.contains("AMD")

df_test["is_AMD"] = df_test["is_AMD"].astype(int)
df_test["is_intel"] = df_test ["is_intel"].astype(int)

df_test["is_i9"] = df_test["Cpu"].str.contains("i9")
df_test["is_i7"] = df_test["Cpu"].str.contains("i7")
df_test["is_i5"] = df_test["Cpu"].str.contains("i5")
df_test["is_i3"] = df_test["Cpu"].str.contains("i3")
df_test["is_celeron"] = df_test["Cpu"].str.contains("Celeron")
df_test["is_Pentium"] = df_test["Cpu"].str.contains("Pentium")
df_test["is_Ryzen"] = df_test["Cpu"].str.contains("Ryzen")
df_test["is_A10"] = df_test["Cpu"].str.contains("A10")
df_test["is_Athlon"] = df_test["Cpu"].str.contains("Athlon")
df_test["is_FX"] = df_test["Cpu"].str.contains("FX")
df_test["is_A6"] = df_test["Cpu"].str.contains("A6")
columns_to_convert = ["is_i9", "is_i7", "is_i5", "is_i3", "is_celeron", "is_Pentium", "is_Ryzen", "is_A10", "is_Athlon", "is_FX", "is_A6"]
df_test[columns_to_convert] = df_test[columns_to_convert].astype(int)
df_test['GHz'] = df_test['Cpu'].str.extract(r'(\d+\.\d+)GHz')

# Convertir los valores extraídos a tipo numérico (float)
df_test['GHz'] = df_test['GHz'].astype(float)
mean_GHz = df_test['GHz'].mean()

# Llenar los valores nulos con la media calculada
df_test['GHz'] = df_test['GHz'].fillna(mean_GHz)
# Crear columna para identificar la marca (AMD o Intel)
df_test['is_AMD_gpu'] = df_test['Gpu'].str.contains("AMD").astype(int)
df_test['is_Nvidia_gpu'] = df_test['Gpu'].str.contains("Nvidia").astype(int)

# Crear columnas para identificar modelos específicos
df_test['is_HD_Graphics'] = df_test['Gpu'].str.contains("HD Graphics").astype(int)
df_test['is_GeForce'] = df_test['Gpu'].str.contains("GeForce").astype(int)
df_test['is_Radeon'] = df_test['Gpu'].str.contains("Radeon").astype(int)
df_test['is_Iris'] = df_test['Gpu'].str.contains("Iris").astype(int)
df_test['is_FirePro'] = df_test['Gpu'].str.contains("FirePro").astype(int)
df_test['is_Quadro'] = df_test['Gpu'].str.contains("Quadro").astype(int)

# Convertir todas las columnas creadas a tipo int
columns_to_convert = ['is_AMD_gpu', 'is_Nvidia_gpu', 'is_HD_Graphics', 'is_GeForce', 'is_Radeon', 'is_Iris', 'is_FirePro', 'is_Quadro']
df_test[columns_to_convert] = df_test[columns_to_convert].astype(int)
df_test['HDD'] = df_test['Memory'].str.extract(r'(\d+\.?\d*)TB HDD').fillna(0).astype(float) * 1024 + df_test['Memory'].str.extract(r'(\d+\.?\d*)GB HDD').fillna(0).astype(float)
df_test['SSD'] = df_test['Memory'].str.extract(r'(\d+\.?\d*)TB SSD').fillna(0).astype(float) * 1024 + df_test['Memory'].str.extract(r'(\d+\.?\d*)GB SSD').fillna(0).astype(float)
df_test['Flash_Storage'] = df_test['Memory'].str.extract(r'(\d+\.?\d*)GB Flash Storage').fillna(0).astype(float)
df_test['Hybrid'] = df_test['Memory'].str.extract(r'(\d+\.?\d*)TB Hybrid').fillna(0).astype(float) * 1024 + df_test['Memory'].str.extract(r'(\d+\.?\d*)GB Hybrid').fillna(0).astype(float)
df_test.drop(columns=['Company', 'Product', 'TypeName','ScreenResolution', 'Cpu','Ram', 'Memory', 'Gpu', 'OpSys', 'Weight'],inplace = True)

In [None]:
df_test.head()

Unnamed: 0,laptop_ID,Inches,ram_gb,peso,Company_numerico,TypeName_numerico,OpSys_numerico,is_FullHD,is_4K,is_intel,...,is_HD_Graphics,is_GeForce,is_Radeon,is_Iris,is_FirePro,is_Quadro,HDD,SSD,Flash_Storage,Hybrid
0,209,15.6,16,2.4,1,2,3,1,0,1,...,0,1,0,0,0,0,0.0,512.0,0.0,0.0
1,1281,15.6,4,2.4,5,1,2,0,0,1,...,1,0,0,0,0,0,500.0,0.0,0.0,0.0
2,1168,15.6,4,1.9,1,1,3,0,0,1,...,1,0,0,0,0,0,1024.0,0.0,0.0,0.0
3,1231,15.6,8,2.191,2,4,1,1,0,1,...,1,0,0,0,0,0,0.0,256.0,0.0,0.0
4,1020,14.0,4,1.95,3,1,1,1,0,1,...,1,0,0,0,0,0,0.0,256.0,0.0,0.0


In [None]:
prediccion= rf_reg.predict(df_test)

# Calcular el error cuadrático medio


In [None]:
submission = pd.DataFrame({"laptop_ID":df_test.laptop_ID,"Price_in_euros":prediccion})
submission.shape


(391, 2)

In [None]:
sample = pd.read_csv("./compra-de-compus/sample_submission.csv")

In [None]:
submission.head()

Unnamed: 0,laptop_ID,Price_in_euros
0,209,1401.8009
1,1281,297.5339
2,1168,395.9284
3,1231,1133.4955
4,1020,1038.3083


In [None]:
import urllib.request


In [None]:
def chequeador(df_to_submit):
    """
    Esta función se asegura de que tu submission tenga la forma requerida por Kaggle.
    
    Si es así, se guardará el dataframe en un `csv` y estará listo para subir a Kaggle.
    
    Si no, LEE EL MENSAJE Y HAZLE CASO.
    
    Si aún no:
    - apaga tu ordenador, 
    - date una vuelta, 
    - enciendelo otra vez, 
    - abre este notebook y 
    - leelo todo de nuevo. 
    Todos nos merecemos una segunda oportunidad. También tú.
    """
    if df_to_submit.shape == sample.shape:
        if df_to_submit.columns.all() == sample.columns.all():
            if df_to_submit.laptop_ID.all() == sample.laptop_ID.all():
                print("You're ready to submit!")
                submission.to_csv("submission.csv", index = False) #muy importante el index = False
                urllib.request.urlretrieve("https://www.mihaileric.com/static/evaluation-meme-e0a350f278a36346e6d46b139b1d0da0-ed51e.jpg", "gfg.png")     
                img = Image.open("gfg.png")
                img.show()   
            else:
                print("Check the ids and try again")
        else:
            print("Check the names of the columns and try again")
    else:
        print("Check the number of rows and/or columns and try again")
        print("\nMensaje secreto de Iván y Manuel: No me puedo creer que después de todo este notebook hayas hecho algún cambio en las filas de `laptops_test.csv`. Lloramos.")

In [None]:
ruta_archivo = "./compra-de-compus/submission.csv"

# Guardar el DataFrame en un archivo CSV
submission.to_csv(ruta_archivo, index=False)

In [None]:
chequeador(submission)

You're ready to submit!


NameError: name 'Image' is not defined