In [None]:
import numpy as np
import pandas as pd
import seaborn as sns
from sklearn.datasets import load_wine, load_digits
import warnings
warnings.filterwarnings('ignore')
print("Entorno configurado para trabajo con datasets originales")

Entorno configurado para trabajo con datasets originales


# 1.3 Trabajo con Datasets en Colab (Originales)
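Cargamos y exploramos los datasets obligatorios usando sus fuentes originales. Para Boston, leemos el archivo crudo desde su URL original, ya que `load_boston` fue deprecado (scikit-learn 1.0) y posteriormente eliminado (scikit-learn 1.2) por problemas éticos.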

In [None]:
# Option 1: datasets bundled with Seaborn
titanic = sns.load_dataset('titanic')
iris = sns.load_dataset('iris')

# Show the dimensions and the first rows of each frame; the second header
# starts with a newline so the two previews are separated by a blank line.
for header, frame in (("Titanic shape:", titanic), ("\nIris shape:", iris)):
    print(header, frame.shape)
    print(frame.head())

Titanic shape: (891, 15)
   survived  pclass     sex   age  sibsp  parch     fare embarked  class  \
0         0       3    male  22.0      1      0   7.2500        S  Third   
1         1       1  female  38.0      1      0  71.2833        C  First   
2         1       3  female  26.0      0      0   7.9250        S  Third   
3         1       1  female  35.0      1      0  53.1000        S  First   
4         0       3    male  35.0      0      0   8.0500        S  Third   

     who  adult_male deck  embark_town alive  alone  
0    man        True  NaN  Southampton    no  False  
1  woman       False    C    Cherbourg   yes  False  
2  woman       False  NaN  Southampton   yes   True  
3  woman       False    C  Southampton   yes  False  
4    man        True  NaN  Southampton    no   True  

Iris shape: (150, 5)
   sepal_length  sepal_width  petal_length  petal_width species
0           5.1          3.5           1.4          0.2  setosa
1           4.9          3.0           1.4  

## Titanic: Análisis de supervivencia (Original)
- Shape: (891, 15) – Pasajeros con features como edad, clase, género.
- Uso: Predicción de 'survived' (clasificación binaria).
- Fuente: Seaborn (basado en datos históricos del Titanic).

In [None]:
# Descriptive statistics for the numeric Titanic columns.
print("Estadísticas Titanic:\n", titanic.describe())

# Mean of the 0/1 'survived' flag per ticket class, i.e. the survival rate.
# `observed` is passed explicitly: when the grouping key is categorical
# (seaborn appears to load 'class' that way -- confirm with titanic.dtypes),
# recent pandas versions deprecate relying on the implicit default.
# observed=False keeps the current behaviour and has no effect on
# non-categorical keys, so the printed result is unchanged.
survival_by_class = titanic.groupby('class', observed=False)['survived'].mean()
print("\nSupervivencia por clase:\n", survival_by_class)

Estadísticas Titanic:
          survived      pclass         age       sibsp       parch        fare
count  891.000000  891.000000  714.000000  891.000000  891.000000  891.000000
mean     0.383838    2.308642   29.699118    0.523008    0.381594   32.204208
std      0.486592    0.836071   14.526497    1.102743    0.806057   49.693429
min      0.000000    1.000000    0.420000    0.000000    0.000000    0.000000
25%      0.000000    2.000000   20.125000    0.000000    0.000000    7.910400
50%      0.000000    3.000000   28.000000    0.000000    0.000000   14.454200
75%      1.000000    3.000000   38.000000    1.000000    0.000000   31.000000
max      1.000000    3.000000   80.000000    8.000000    6.000000  512.329200

Supervivencia por clase:
 class
First     0.629630
Second    0.472826
Third     0.242363
Name: survived, dtype: float64


## Iris: Clasificación multiclase (Original)
- Shape: (150, 5) – Flores con medidas de sépalos/pétalos y especies.
- Uso: Predicción de 'species' (3 clases).
- Fuente: Seaborn (dataset clásico de Fisher, 1936).

In [None]:
# Descriptive statistics for the numeric Iris measurements.
iris_summary = iris.describe()
print("Estadísticas Iris:\n", iris_summary)

# Number of rows per species label.
species_counts = iris['species'].value_counts()
print("\nDistribución de especies:\n", species_counts)

Estadísticas Iris:
        sepal_length  sepal_width  petal_length  petal_width
count    150.000000   150.000000    150.000000   150.000000
mean       5.843333     3.057333      3.758000     1.199333
std        0.828066     0.435866      1.765298     0.762238
min        4.300000     2.000000      1.000000     0.100000
25%        5.100000     2.800000      1.600000     0.300000
50%        5.800000     3.000000      4.350000     1.300000
75%        6.400000     3.300000      5.100000     1.800000
max        7.900000     4.400000      6.900000     2.500000

Distribución de especies:
 species
setosa        50
versicolor    50
virginica     50
Name: count, dtype: int64


In [None]:
# Load the original Boston Housing data (deprecated in sklearn because of the
# ethical bias in variable 'B').
# Source: http://lib.stat.cmu.edu/datasets/boston (Harrison & Rubinfeld, 1978)
data_url = "http://lib.stat.cmu.edu/datasets/boston"
# The separator is a regex, so it must be a raw string: "\s" inside a plain
# literal is an invalid escape sequence that newer Python versions warn about.
# skiprows=22 drops the leading lines of the file before the numeric rows
# (presumably the dataset description -- confirm against the file).
raw_df = pd.read_csv(data_url, sep=r"\s+", skiprows=22, header=None)

# Rebuild one record per observation: even-indexed rows hold the first group of
# feature values, odd-indexed rows hold the remaining two feature values
# followed by the target in the third position.
data = np.hstack([raw_df.values[::2, :], raw_df.values[1::2, :2]])
target = raw_df.values[1::2, 2]

# Original feature names
feature_names = ['CRIM', 'ZN', 'INDUS', 'CHAS', 'NOX', 'RM', 'AGE', 'DIS', 'RAD', 'TAX', 'PTRATIO', 'B', 'LSTAT']
boston_df = pd.DataFrame(data, columns=feature_names)
boston_df['PRICE'] = target  # Target: median house price (in thousands)

print("Boston original shape:", boston_df.shape)
print(boston_df.head())

Boston original shape: (506, 14)
      CRIM    ZN  INDUS  CHAS    NOX     RM   AGE     DIS  RAD    TAX  \
0  0.00632  18.0   2.31   0.0  0.538  6.575  65.2  4.0900  1.0  296.0   
1  0.02731   0.0   7.07   0.0  0.469  6.421  78.9  4.9671  2.0  242.0   
2  0.02729   0.0   7.07   0.0  0.469  7.185  61.1  4.9671  2.0  242.0   
3  0.03237   0.0   2.18   0.0  0.458  6.998  45.8  6.0622  3.0  222.0   
4  0.06905   0.0   2.18   0.0  0.458  7.147  54.2  6.0622  3.0  222.0   

   PTRATIO       B  LSTAT  PRICE  
0     15.3  396.90   4.98   24.0  
1     17.8  396.90   9.14   21.6  
2     17.8  392.83   4.03   34.7  
3     18.7  394.63   2.94   33.4  
4     18.7  396.90   5.33   36.2  


## Boston Housing (Original): Regresión
- Shape: (506, 14) – 13 features socioeconómicas (incluida 'B', que codifica un sesgo racial) + 'PRICE'.
- Uso: Regresión para predecir precios de casas en Boston (1970s).
- Fuente: Carnegie Mellon University StatLib.

In [None]:
# Descriptive statistics for every Boston column.
boston_summary = boston_df.describe()
print("Estadísticas Boston:\n", boston_summary)

# Correlation of each column with PRICE, strongest positive first
# (PRICE itself is included in the listing).
price_correlations = boston_df.corr()['PRICE'].sort_values(ascending=False)
print("\nCorrelación con precio:\n", price_correlations)


Estadísticas Boston:
              CRIM          ZN       INDUS        CHAS         NOX          RM  \
count  506.000000  506.000000  506.000000  506.000000  506.000000  506.000000   
mean     3.613524   11.363636   11.136779    0.069170    0.554695    6.284634   
std      8.601545   23.322453    6.860353    0.253994    0.115878    0.702617   
min      0.006320    0.000000    0.460000    0.000000    0.385000    3.561000   
25%      0.082045    0.000000    5.190000    0.000000    0.449000    5.885500   
50%      0.256510    0.000000    9.690000    0.000000    0.538000    6.208500   
75%      3.677083   12.500000   18.100000    0.000000    0.624000    6.623500   
max     88.976200  100.000000   27.740000    1.000000    0.871000    8.780000   

              AGE         DIS         RAD         TAX     PTRATIO           B  \
count  506.000000  506.000000  506.000000  506.000000  506.000000  506.000000   
mean    68.574901    3.795043    9.549407  408.237154   18.455534  356.674032   
std  

In [None]:
# scikit-learn dataset (originally from the UCI ML Repository).
# as_frame=True makes the returned bunch expose its data as pandas objects.
wine = load_wine(as_frame=True)
wine_df = wine["frame"]  # same object as the bunch's `.frame` attribute

for header, payload in (("Wine shape:", wine_df.shape),):
    print(header, payload)
print(wine_df.head())

Wine shape: (178, 14)
   alcohol  malic_acid   ash  alcalinity_of_ash  magnesium  total_phenols  \
0    14.23        1.71  2.43               15.6      127.0           2.80   
1    13.20        1.78  2.14               11.2      100.0           2.65   
2    13.16        2.36  2.67               18.6      101.0           2.80   
3    14.37        1.95  2.50               16.8      113.0           3.85   
4    13.24        2.59  2.87               21.0      118.0           2.80   

   flavanoids  nonflavanoid_phenols  proanthocyanins  color_intensity   hue  \
0        3.06                  0.28             2.29             5.64  1.04   
1        2.76                  0.26             1.28             4.38  1.05   
2        3.24                  0.30             2.81             5.68  1.03   
3        3.49                  0.24             2.18             7.80  0.86   
4        2.69                  0.39             1.82             4.32  1.04   

   od280/od315_of_diluted_wines  proline

## Wine: Clasificación de vinos (Original)
- Shape: (178, 14) – 13 features químicas (alcohol, acidez, etc.) + target (3 clases de cultivares italianos).
- Uso: Clasificación multiclase.
- Fuente: UCI ML Repository (Forina, 1991).

In [None]:
# Descriptive statistics for every Wine column.
wine_summary = wine_df.describe()
print("Estadísticas Wine:\n", wine_summary)

# Number of samples per class label.
class_counts = wine_df['target'].value_counts()
print("\nDistribución de clases:\n", class_counts)

Estadísticas Wine:
           alcohol  malic_acid         ash  alcalinity_of_ash   magnesium  \
count  178.000000  178.000000  178.000000         178.000000  178.000000   
mean    13.000618    2.336348    2.366517          19.494944   99.741573   
std      0.811827    1.117146    0.274344           3.339564   14.282484   
min     11.030000    0.740000    1.360000          10.600000   70.000000   
25%     12.362500    1.602500    2.210000          17.200000   88.000000   
50%     13.050000    1.865000    2.360000          19.500000   98.000000   
75%     13.677500    3.082500    2.557500          21.500000  107.000000   
max     14.830000    5.800000    3.230000          30.000000  162.000000   

       total_phenols  flavanoids  nonflavanoid_phenols  proanthocyanins  \
count     178.000000  178.000000            178.000000       178.000000   
mean        2.295112    2.029270              0.361854         1.590899   
std         0.625851    0.998859              0.124453         0.57235

In [None]:
# scikit-learn dataset (8x8 images stored as an array; wrapped in a DataFrame here)
digits = load_digits()
pixel_matrix, digit_labels = digits.data, digits.target

# One column per value in the data array, plus the label as a 'target' column.
digits_df = pd.DataFrame(pixel_matrix)
digits_df['target'] = digit_labels

print("Digits shape:", digits_df.shape)
print(digits_df.head())

Digits shape: (1797, 65)
     0    1    2     3     4     5    6    7    8    9  ...   55   56   57  \
0  0.0  0.0  5.0  13.0   9.0   1.0  0.0  0.0  0.0  0.0  ...  0.0  0.0  0.0   
1  0.0  0.0  0.0  12.0  13.0   5.0  0.0  0.0  0.0  0.0  ...  0.0  0.0  0.0   
2  0.0  0.0  0.0   4.0  15.0  12.0  0.0  0.0  0.0  0.0  ...  0.0  0.0  0.0   
3  0.0  0.0  7.0  15.0  13.0   1.0  0.0  0.0  0.0  8.0  ...  0.0  0.0  0.0   
4  0.0  0.0  0.0   1.0  11.0   0.0  0.0  0.0  0.0  0.0  ...  0.0  0.0  0.0   

    58    59    60    61   62   63  target  
0  6.0  13.0  10.0   0.0  0.0  0.0       0  
1  0.0  11.0  16.0  10.0  0.0  0.0       1  
2  0.0   3.0  11.0  16.0  9.0  0.0       2  
3  7.0  13.0  13.0   9.0  0.0  0.0       3  
4  0.0   2.0  16.0   4.0  0.0  0.0       4  

[5 rows x 65 columns]


## Digits: Reconocimiento de dígitos (Original)
- Shape: (1797, 65) – 64 features (pixels de imágenes 8x8) + target (dígitos 0-9).
- Uso: Clasificación de imágenes (una alternativa pequeña, similar a MNIST).
- Fuente: UCI ML Repository (Alpaydin & Kaynak, 1998).

In [None]:
# Descriptive statistics for the first eight columns only, to keep the output short.
leading_columns = digits_df.iloc[:, :8]
print("Estadísticas Digits (primeras features):\n", leading_columns.describe())

# Number of samples per digit label, ordered by label.
digit_counts = digits_df['target'].value_counts().sort_index()
print("\nDistribución de dígitos:\n", digit_counts)

Estadísticas Digits (primeras features):
             0            1            2            3            4  \
count  1797.0  1797.000000  1797.000000  1797.000000  1797.000000   
mean      0.0     0.303840     5.204786    11.835838    11.848080   
std       0.0     0.907192     4.754826     4.248842     4.287388   
min       0.0     0.000000     0.000000     0.000000     0.000000   
25%       0.0     0.000000     1.000000    10.000000    10.000000   
50%       0.0     0.000000     4.000000    13.000000    13.000000   
75%       0.0     0.000000     9.000000    15.000000    15.000000   
max       0.0     8.000000    16.000000    16.000000    16.000000   

                 5            6            7  
count  1797.000000  1797.000000  1797.000000  
mean      5.781859     1.362270     0.129661  
std       5.666418     3.325775     1.037383  
min       0.000000     0.000000     0.000000  
25%       0.000000     0.000000     0.000000  
50%       4.000000     0.000000     0.000000  
75%    

In [None]:
# Summary table: one (name, task, frame) entry per dataset loaded above.
dataset_catalog = [
    ('Titanic', 'Clasificación Binaria', titanic),
    ('Iris', 'Clasificación Multiclase', iris),
    ('Boston (Original)', 'Regresión', boston_df),
    ('Wine', 'Clasificación Multiclase', wine_df),
    ('Digits', 'Clasificación Imágenes', digits_df),
]

# 'Features' is the column count minus one, i.e. one column per frame is
# counted as the target.
resumen = pd.DataFrame({
    'Dataset': [name for name, _, _ in dataset_catalog],
    'Tarea': [task for _, task, _ in dataset_catalog],
    'Shape': [frame.shape for _, _, frame in dataset_catalog],
    'Features': [frame.shape[1] - 1 for _, _, frame in dataset_catalog],
})
print(resumen)

             Dataset                     Tarea       Shape  Features
0            Titanic     Clasificación Binaria   (891, 15)        14
1               Iris  Clasificación Multiclase    (150, 5)         4
2  Boston (Original)                 Regresión   (506, 14)        13
3               Wine  Clasificación Multiclase   (178, 14)        13
4             Digits    Clasificación Imágenes  (1797, 65)        64
