In [3]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_absolute_error, mean_squared_error
import urllib.request

In [None]:
# Carga de datos

In [4]:
# Relative locations of the training and test CSV files.
url, url2 = '../Data/train.csv', '../Data/test.csv'

# Read each file into its own DataFrame, in the same order as the paths above.
train_df, test = (pd.read_csv(csv_path) for csv_path in (url, url2))

In [5]:
def analisis_data(df):
    """Print a quick exploratory summary of a DataFrame.

    The report contains, in order: the shape, the ``DataFrame.info()``
    listing, the ``describe()`` statistics and the number of null values
    per column.

    Args:
        df: The pandas DataFrame to summarise.

    Returns:
        None. Everything is written to standard output.
    """
    print("Dimensiones del DataFrame:", df.shape)
    print("\nInformación del DataFrame:")
    # info() writes its own report and returns None; wrapping it in print()
    # would append a stray "None" line to the output.
    df.info()
    print("\nEstadísticas descriptivas:")
    print(df.describe())
    print("\nValores nulos por columna:")
    print(df.isnull().sum())

In [8]:
def preprocessado(df):
    """Return a cleaned copy of the raw laptop DataFrame.

    The input frame is not modified. Columns written on the copy:

    - ``Ram``: first run of digits in the original string, as ``int``.
    - ``Weight``: first run of digits/dots in the original string, as ``float``.
    - ``Memory_Primary``: first number found in ``Memory``, expressed in GB
      (a value followed by ``TB`` is multiplied by 1024).
    - ``Memory_Type``: ``'SSD'`` or ``'HDD'`` (first one found in ``Memory``),
      otherwise ``'Other'``.
    - ``OpSys``: collapsed to ``'Windows'``, ``'Mac'``, ``'Linux'`` or ``'Other'``.
    - ``Resolution``: first ``<width>x<height>`` token of ``ScreenResolution``,
      or ``'unknown'``.
    - ``TouchScreen``: 1 if ``ScreenResolution`` contains ``'Touch'``, else 0.

    Args:
        df: DataFrame with string columns ``Ram``, ``Weight``, ``Memory``,
            ``OpSys`` and ``ScreenResolution``.

    Returns:
        A new DataFrame with the columns above replaced/added.

    Note:
        ``Ram`` and ``Weight`` must contain a number in every row; the
        ``astype`` conversion fails otherwise.
    """
    df = df.copy()

    def _os_family(value):
        # Case-insensitive so that "macOS" is recognised as well as "Mac OS X".
        name = str(value).lower()
        for family in ('Windows', 'Mac', 'Linux'):
            if family.lower() in name:
                return family
        return 'Other'

    # Raw strings avoid invalid "\d" escapes; expand=False yields a Series
    # instead of a one-column DataFrame.
    df['Ram'] = df['Ram'].str.extract(r'(\d+)', expand=False).astype(int)
    df['Weight'] = df['Weight'].str.extract(r'([\d.]+)', expand=False).astype(float)

    # Capture the first capacity together with its unit so that terabyte
    # drives are not recorded as "1" next to gigabyte drives recorded as "128".
    # NOTE(review): assumes capacities are written like "128GB" / "1TB" — confirm
    # against the data. Values without a recognised unit are kept as-is.
    memory = df['Memory'].str.extract(r'(?i)(\d+(?:\.\d+)?)\s*(TB|GB)?')
    gb_factor = memory[1].str.upper().map({'TB': 1024.0}).fillna(1.0).astype(float)
    df['Memory_Primary'] = memory[0].astype(float) * gb_factor

    df['Memory_Type'] = df['Memory'].str.extract(r'(SSD|HDD)', expand=False).fillna('Other')
    df['OpSys'] = df['OpSys'].apply(_os_family)
    df['Resolution'] = (
        df['ScreenResolution'].str.extract(r'(\d+x\d+)', expand=False).fillna('unknown')
    )
    # na=False: a missing resolution counts as "not a touchscreen" instead of
    # making astype(int) fail on NaN.
    df['TouchScreen'] = df['ScreenResolution'].str.contains('Touch', na=False).astype(int)
    return df

In [9]:
def creacion(df):
    """Return a copy of ``df`` with three engineered feature columns added.

    - ``Company_Tier``: the ``Company`` value when it is one of Dell, HP,
      Lenovo, Asus or Acer; ``'Other'`` for anything else.
    - ``Screen_Weight_Ratio``: ``Inches`` divided by ``Weight``.
    - ``Cpu_Brand``: ``'Intel'`` or ``'AMD'`` when that word occurs in ``Cpu``,
      otherwise ``'Other'`` (also used for missing values).

    Args:
        df: DataFrame with ``Company``, ``Inches``, ``Weight`` and ``Cpu``
            columns; ``Inches`` and ``Weight`` must already be numeric.

    Returns:
        A new DataFrame; the input is not modified.
    """
    df = df.copy()

    def _cpu_brand(value):
        # str() keeps a missing (NaN) Cpu from raising TypeError on the
        # substring test; it simply ends up as 'Other'.
        text = str(value)
        if 'Intel' in text:
            return 'Intel'
        if 'AMD' in text:
            return 'AMD'
        return 'Other'

    top_companies = ['Dell', 'HP', 'Lenovo', 'Asus', 'Acer']
    # Keep the brand when it is in the list, replace everything else.
    df['Company_Tier'] = df['Company'].where(df['Company'].isin(top_companies), 'Other')
    df['Screen_Weight_Ratio'] = df['Inches'] / df['Weight']
    df['Cpu_Brand'] = df['Cpu'].apply(_cpu_brand)

    return df

In [10]:
def sample(df, sample_size=0.7, random_state=42):
    """Split ``df`` into two parts, stratified by price quintile.

    ``Price_euros`` is cut into five quantile bins and those bin labels are
    used as the stratification key for ``train_test_split``.

    Args:
        df: DataFrame containing a numeric ``Price_euros`` column. It is not
            modified.
        sample_size: Forwarded as ``train_size`` to ``train_test_split``.
        random_state: Seed forwarded to ``train_test_split``.

    Returns:
        ``(sample_df, remaining_df)``: the selected sample and the rest of
        the rows, with the same columns as ``df``.
    """
    # Keep the bin labels in a standalone Series instead of writing a
    # temporary 'Price_Bin' column into the caller's DataFrame, which would
    # otherwise stay there after the call.
    price_bins = pd.qcut(df['Price_euros'], q=5, labels=False)

    sample_df, remaining_df = train_test_split(
        df,
        train_size=sample_size,
        stratify=price_bins,
        random_state=random_state,
    )

    return sample_df, remaining_df

In [11]:
def main():
    """Run the analysis, feature-building and stratified-split steps on ``train_df``.

    Prints a summary of the module-level ``train_df``, applies
    ``preprocessado`` and then ``creacion`` to it, splits the result with
    ``sample`` and writes the two parts to ``train_sample.csv`` and
    ``train_validation.csv`` in the current working directory.

    Returns:
        ``(sample, remainder)``: the two DataFrames produced by ``sample``.
    """
    # Exploratory summary of the raw training data
    print("=== Análisis del conjunto de entrenamiento ===")
    analisis_data(train_df)

    # Clean the raw columns, then derive the engineered features
    features = creacion(preprocessado(train_df))

    # Stratified split of the processed data
    muestra, resto = sample(features)

    # Report the size of each part
    print("\n=== Información de la muestra estratificada ===")
    print("Tamaño de la muestra:", len(muestra))
    print("Tamaño del conjunto restante:", len(resto))

    # Persist both parts, sample first
    destinos = ((muestra, 'train_sample.csv'), (resto, 'train_validation.csv'))
    for frame, ruta in destinos:
        frame.to_csv(ruta, index=False)

    return muestra, resto

# Run the pipeline when this code executes as the main module and bind the
# two DataFrames returned by main() to module-level names.
if __name__ == "__main__":
    sample_df, remaining_df = main()

=== Análisis del conjunto de entrenamiento ===
Dimensiones del DataFrame: (912, 13)

Información del DataFrame:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 912 entries, 0 to 911
Data columns (total 13 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   laptop_ID         912 non-null    int64  
 1   Company           912 non-null    object 
 2   Product           912 non-null    object 
 3   TypeName          912 non-null    object 
 4   Inches            912 non-null    float64
 5   ScreenResolution  912 non-null    object 
 6   Cpu               912 non-null    object 
 7   Ram               912 non-null    object 
 8   Memory            912 non-null    object 
 9   Gpu               912 non-null    object 
 10  OpSys             912 non-null    object 
 11  Weight            912 non-null    object 
 12  Price_euros       912 non-null    float64
dtypes: float64(2), int64(1), object(10)
memory usage: 92.8+ KB
None

Estadíst