In [11]:
# 1. Librerías
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.linear_model import LinearRegression
from sklearn.metrics import root_mean_squared_error
import warnings
warnings.filterwarnings('ignore')

# 2. Data: read the training and test sets from CSV files
# (both paths are relative, so they resolve against the working directory)
url = '../Data/train.csv'
url2 = '../Data/test.csv'
train = pd.read_csv(url)
test = pd.read_csv(url2)

# 3. Data exploration
def explore_data(df):
    """Print a quick overview of a DataFrame to standard output.

    Prints, in order: the shape, the ``DataFrame.info()`` report, the
    ``describe()`` statistics and the per-column count of null values.

    Args:
        df: pandas DataFrame to summarise.

    Returns:
        None.
    """
    print("Dimensiones:", df.shape)
    print("\nInformación del DataFrame:")
    # info() writes its report itself and returns None, so wrapping it in
    # print() would append a stray "None" line after the report.
    df.info()
    print("\nEstadísticas descriptivas:")
    print(df.describe())
    print("\nValores nulos:")
    print(df.isnull().sum())

# Explore the training data (the overview is printed to standard output)
explore_data(train)

# 4. Data processing
def process_features(df):
    """Turn raw laptop listing columns into a numeric feature table.

    NOTE: this file defines ``process_features`` a second time further down;
    that later definition rebinds the name, so module-level calls made after
    it do not run this version.

    Args:
        df: DataFrame holding the columns ``laptop_ID``, ``Ram``, ``Weight``,
            ``Memory``, ``ScreenResolution``, ``Cpu`` and ``Inches``. It is
            copied first, so the caller's frame is left untouched.

    Returns:
        A DataFrame with the columns ``laptop_ID``, ``Ram``, ``Weight``,
        ``Memory_GB``, ``Has_SSD``, ``Has_HDD``, ``TouchScreen``, ``IPS``,
        ``Cpu_Speed``, ``Screen_Area`` and ``PPI``. Values that cannot be
        parsed become NaN; the 0/1 flags are 0 for missing text.
    """
    df = df.copy()

    # First run of digits in Ram / first run of digits and dots in Weight.
    # Raw strings keep "\d" from being treated as a string escape.
    df['Ram'] = df['Ram'].str.extract(r'(\d+)', expand=False).astype(float)
    df['Weight'] = df['Weight'].str.extract(r'([\d.]+)', expand=False).astype(float)

    # Capacity of the first drive listed. When the number is followed by a
    # "TB" unit it is scaled by 1000 so the column stays in GB (a plain
    # "(\d+)" pattern would turn "1TB" into 1.0).
    capacity = df['Memory'].str.extract(r'(\d+(?:\.\d+)?)\s*([Tt][Bb])?')
    tb_to_gb = np.where(capacity[1].notna(), 1000.0, 1.0)
    df['Memory_GB'] = capacity[0].astype(float) * tb_to_gb
    # na=False maps missing text to 0 instead of failing in astype(int).
    df['Has_SSD'] = df['Memory'].str.contains('SSD', na=False).astype(int)
    df['Has_HDD'] = df['Memory'].str.contains('HDD', na=False).astype(int)

    # Screen flags taken from the free-text resolution description.
    screen = df['ScreenResolution']
    df['TouchScreen'] = screen.str.contains('Touch', na=False).astype(int)
    df['IPS'] = screen.str.contains('IPS', na=False).astype(int)

    # Number immediately preceding "GHz" in the CPU description.
    df['Cpu_Speed'] = df['Cpu'].str.extract(r'([\d.]+)GHz', expand=False).astype(float)

    # Derived screen features. PPI is the pixel diagonal divided by the
    # diagonal in inches, computed row by row (each row uses its own Inches).
    df['Screen_Area'] = df['Inches'] * df['Inches']
    pixels = screen.str.extract(r'(\d+)x(\d+)').astype(float)
    df['PPI'] = np.sqrt(pixels[0] ** 2 + pixels[1] ** 2) / df['Inches']

    features = ['laptop_ID', 'Ram', 'Weight', 'Memory_GB', 'Has_SSD', 'Has_HDD',
                'TouchScreen', 'IPS', 'Cpu_Speed', 'Screen_Area', 'PPI']

    return df[features]

def calculate_ppi(resolution, inch):
    """Compute pixels per inch from a resolution string and a diagonal size.

    Args:
        resolution: text of the form ``"<width>x<height>"`` (for example
            ``"1920x1080"``), or a missing value.
        inch: screen diagonal; the pixel diagonal is divided by it.

    Returns:
        ``sqrt(width**2 + height**2) / inch``, or ``None`` when
        ``resolution`` is missing or malformed, or when ``inch`` cannot be
        used as a divisor.
    """
    try:
        if pd.isna(resolution):
            return None
        width, height = map(float, resolution.split('x'))
        diagonal_pixels = np.sqrt(width**2 + height**2)
        return diagonal_pixels / inch
    # Only swallow the errors bad input can cause (non-string value, wrong
    # number of parts, non-numeric text, unusable divisor). A bare "except"
    # would also hide KeyboardInterrupt and genuine programming errors.
    except (AttributeError, TypeError, ValueError, ZeroDivisionError):
        return None

def process_features(df):
    """Turn raw laptop listing columns into the numeric model features.

    Args:
        df: DataFrame holding the columns ``laptop_ID``, ``Ram``, ``Weight``,
            ``Memory``, ``ScreenResolution``, ``Cpu`` and ``Inches``. It is
            copied first, so the caller's frame is left untouched.

    Returns:
        A DataFrame with the columns ``laptop_ID``, ``Ram``, ``Weight``,
        ``Memory_GB``, ``Has_SSD``, ``TouchScreen``, ``Cpu_Speed``,
        ``Screen_Area``, ``Inches`` and ``PPI``. Values that cannot be
        parsed become NaN; the 0/1 flags are 0 for missing text.
    """
    df = df.copy()

    # First run of digits in Ram / first run of digits and dots in Weight.
    # Raw strings keep "\d" from being treated as a string escape.
    df['Ram'] = df['Ram'].str.extract(r'(\d+)', expand=False).astype(float)
    df['Weight'] = df['Weight'].str.extract(r'([\d.]+)', expand=False).astype(float)

    # Capacity of the first drive listed. When the number is followed by a
    # "TB" unit it is scaled by 1000 so the column stays in GB (a plain
    # "(\d+)" pattern would turn "1TB" into 1.0).
    capacity = df['Memory'].str.extract(r'(\d+(?:\.\d+)?)\s*([Tt][Bb])?')
    tb_to_gb = np.where(capacity[1].notna(), 1000.0, 1.0)
    df['Memory_GB'] = capacity[0].astype(float) * tb_to_gb
    # na=False maps missing text to 0 instead of failing in astype(int).
    df['Has_SSD'] = df['Memory'].str.contains('SSD', na=False).astype(int)

    # Touch flag taken from the free-text resolution description.
    screen = df['ScreenResolution']
    df['TouchScreen'] = screen.str.contains('Touch', na=False).astype(int)

    # PPI = pixel diagonal / diagonal in inches, computed column-wise.
    # Rows without a "<width>x<height>" pattern end up as NaN.
    pixels = screen.str.extract(r'(\d+)x(\d+)').astype(float)
    df['PPI'] = np.sqrt(pixels[0] ** 2 + pixels[1] ** 2) / df['Inches']

    # Number immediately preceding "GHz" in the CPU description.
    df['Cpu_Speed'] = df['Cpu'].str.extract(r'([\d.]+)GHz', expand=False).astype(float)

    # Square of the diagonal, used as a size feature.
    df['Screen_Area'] = df['Inches'] * df['Inches']

    features = ['laptop_ID', 'Ram', 'Weight', 'Memory_GB', 'Has_SSD',
                'TouchScreen', 'Cpu_Speed', 'Screen_Area', 'Inches', 'PPI']

    return df[features]

# Build the feature tables for both datasets (train first, then test)
train_processed, test_processed = (
    process_features(frame) for frame in (train, test)
)

print("Datos procesados (train):", train_processed.head(), sep="\n")
print("\nDimensiones:", train_processed.shape)

# Show how many nulls remain in each column after processing
print(
    "\nValores nulos después del procesamiento:",
    train_processed.isna().sum(),
    sep="\n",
)

# 5. Feature selection and final preparation: the id is not a predictor
X = train_processed.drop(columns=['laptop_ID'])
y = train['Price_euros']

# 6. Train/test split (80/20, fixed seed)
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

# 7. Feature scaling: statistics are fitted on the training part only
scaler = StandardScaler()
X_train_scaled, X_test_scaled = (
    scaler.fit_transform(X_train),
    scaler.transform(X_test),
)

# 8. Model and training (fit() returns the fitted estimator itself)
model = LinearRegression().fit(X_train_scaled, y_train)

# 9. Evaluation on both parts of the split
train_predictions = model.predict(X_train_scaled)
test_predictions = model.predict(X_test_scaled)

print("\nMétricas de entrenamiento:")
print("RMSE:", root_mean_squared_error(y_true=y_train, y_pred=train_predictions))
print("\nMétricas de prueba:")
print("RMSE:", root_mean_squared_error(y_true=y_test, y_pred=test_predictions))

# 10. Predictions for the submission, reusing the scaler fitted above
X_submit = test_processed.drop(columns=['laptop_ID'])
X_submit_scaled = scaler.transform(X_submit)
final_predictions = model.predict(X_submit_scaled)

# 11. Assemble the submission table: one predicted price per laptop id
submission = test_processed[['laptop_ID']].assign(Price_euros=final_predictions)

# 12. Write the submission file without the index column
submission.to_csv(path_or_buf='submission.csv', index=False)



Dimensiones: (912, 13)

Información del DataFrame:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 912 entries, 0 to 911
Data columns (total 13 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   laptop_ID         912 non-null    int64  
 1   Company           912 non-null    object 
 2   Product           912 non-null    object 
 3   TypeName          912 non-null    object 
 4   Inches            912 non-null    float64
 5   ScreenResolution  912 non-null    object 
 6   Cpu               912 non-null    object 
 7   Ram               912 non-null    object 
 8   Memory            912 non-null    object 
 9   Gpu               912 non-null    object 
 10  OpSys             912 non-null    object 
 11  Weight            912 non-null    object 
 12  Price_euros       912 non-null    float64
dtypes: float64(2), int64(1), object(10)
memory usage: 92.8+ KB
None

Estadísticas descriptivas:
         laptop_ID      Inches  Price_euro