# **Laboratorio 8**

- Derek Arreaga - 22537
- Mónica Salvatierra - 22249

Link del repo: https://github.com/alee2602/LAB8-DS

#### **Importación de librerías**

In [None]:
import warnings, numpy as np, pandas as pd
warnings.filterwarnings("ignore")
import matplotlib.pyplot as plt

from sklearn.datasets import fetch_covtype
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.compose import ColumnTransformer
from sklearn.metrics import (
    roc_auc_score, average_precision_score, f1_score,
    precision_score, recall_score, confusion_matrix,
    roc_curve, precision_recall_curve
)

from sklearn.ensemble import IsolationForest
from sklearn.neighbors import LocalOutlierFactor

import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers


#### **Cargar CoverType**

In [2]:
cov = fetch_covtype(as_frame=True)
X_full: pd.DataFrame = cov.data.copy()
y_full: pd.Series = cov.target.copy()

#### **Exploración breve de features**

In [None]:

print("Número total de columnas:", X_full.shape[1])
print("\nPrimeras 10 columnas:")
print(X_full.columns[:10].tolist())

print("\nÚltimas 10 columnas:")
print(X_full.columns[-10:].tolist())

X_full.info()

X_full.head()


Número total de columnas: 54

Primeras 10 columnas:
['Elevation', 'Aspect', 'Slope', 'Horizontal_Distance_To_Hydrology', 'Vertical_Distance_To_Hydrology', 'Horizontal_Distance_To_Roadways', 'Hillshade_9am', 'Hillshade_Noon', 'Hillshade_3pm', 'Horizontal_Distance_To_Fire_Points']

Últimas 10 columnas:
['Soil_Type_30', 'Soil_Type_31', 'Soil_Type_32', 'Soil_Type_33', 'Soil_Type_34', 'Soil_Type_35', 'Soil_Type_36', 'Soil_Type_37', 'Soil_Type_38', 'Soil_Type_39']
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 581012 entries, 0 to 581011
Data columns (total 54 columns):
 #   Column                              Non-Null Count   Dtype  
---  ------                              --------------   -----  
 0   Elevation                           581012 non-null  float64
 1   Aspect                              581012 non-null  float64
 2   Slope                               581012 non-null  float64
 3   Horizontal_Distance_To_Hydrology    581012 non-null  float64
 4   Vertical_Distance_To_Hydr

Unnamed: 0,Elevation,Aspect,Slope,Horizontal_Distance_To_Hydrology,Vertical_Distance_To_Hydrology,Horizontal_Distance_To_Roadways,Hillshade_9am,Hillshade_Noon,Hillshade_3pm,Horizontal_Distance_To_Fire_Points,...,Soil_Type_30,Soil_Type_31,Soil_Type_32,Soil_Type_33,Soil_Type_34,Soil_Type_35,Soil_Type_36,Soil_Type_37,Soil_Type_38,Soil_Type_39
0,2596.0,51.0,3.0,258.0,0.0,510.0,221.0,232.0,148.0,6279.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,2590.0,56.0,2.0,212.0,-6.0,390.0,220.0,235.0,151.0,6225.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,2804.0,139.0,9.0,268.0,65.0,3180.0,234.0,238.0,135.0,6121.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,2785.0,155.0,18.0,242.0,118.0,3090.0,238.0,238.0,122.0,6211.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,2595.0,45.0,2.0,153.0,-1.0,391.0,220.0,234.0,150.0,6172.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


#### **Distribución de la variable objetivo**

In [6]:

y_full.value_counts().sort_index()


Cover_Type
1    211840
2    283301
3     35754
4      2747
5      9493
6     17367
7     20510
Name: count, dtype: int64

#### **Descripción de las variables del conjunto de datos**

#### **Etiquetar datos normales vs anómalos**

In [7]:
y = (y_full == 2).astype(int).values


#### **Definir columnas numéricas y binarias**

In [8]:
numeric_cols = [
    "Elevation", "Aspect", "Slope",
    "Horizontal_Distance_To_Hydrology", "Vertical_Distance_To_Hydrology",
    "Horizontal_Distance_To_Roadways",
    "Hillshade_9am", "Hillshade_Noon", "Hillshade_3pm",
    "Horizontal_Distance_To_Fire_Points",
]
binary_cols = [c for c in X_full.columns if c not in numeric_cols]

print("Numéricas:", len(numeric_cols))
print("Binarias:", len(binary_cols))

Numéricas: 10
Binarias: 44


#### **Escalar variables numéricas**

In [9]:
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler

ct = ColumnTransformer(
    transformers=[
        ("num", StandardScaler(), numeric_cols),
        ("bin", "passthrough", binary_cols),
    ],
    remainder="drop"
)

X = ct.fit_transform(X_full).astype("float32")

print("X shape:", X.shape)
print("Proporción de normales (y==1):", y.mean())

X shape: (581012, 54)
Proporción de normales (y==1): 0.48759922342395684


#### **Separación de conjuntos de entrenamiento, prueba y validción**

In [11]:

import numpy as np

SEED = 16  

# Índices por clase
idx_norm = np.where(y == 1)[0]
idx_anom = np.where(y == 0)[0]

# Train / Val / Hold (70% / 15% / 15% aprox.)
idx_train_norm, idx_tmp_norm = train_test_split(
    idx_norm, test_size=0.30, random_state=SEED, shuffle=True
)
idx_val_norm, idx_hold_norm = train_test_split(
    idx_tmp_norm, test_size=0.50, random_state=SEED, shuffle=True
)  

idx_anom_tune, idx_anom_test = train_test_split(
    idx_anom, test_size=0.30, random_state=SEED, shuffle=True
)

# Construcción de conjuntos
X_train = X[idx_train_norm]
y_train = y[idx_train_norm]           
X_val   = X[idx_val_norm]
y_val   = y[idx_val_norm]             


X_tune = np.vstack([X[idx_hold_norm], X[idx_anom_tune]])
y_tune = np.concatenate([y[idx_hold_norm], y[idx_anom_tune]])

idx_norm_rest = np.setdiff1d(idx_norm, np.union1d(idx_train_norm, np.union1d(idx_val_norm, idx_hold_norm)))
X_test = np.vstack([X[idx_norm_rest], X[idx_anom_test]])
y_test = np.concatenate([y[idx_norm_rest], y[idx_anom_test]])

print("Shapes:")
print("  X_train:", X_train.shape, " y_train sum:", y_train.sum(), )
print("  X_val  :", X_val.shape,   " y_val sum:",   y_val.sum(),  )
print("  X_tune :", X_tune.shape,  " proporción normales:", y_tune.mean())
print("  X_test :", X_test.shape,  " proporción normales:", y_test.mean())


Shapes:
  X_train: (198310, 54)  y_train sum: 198310
  X_val  : (42495, 54)  y_val sum: 42495
  X_tune : (250893, 54)  proporción normales: 0.16937897828954973
  X_test : (89314, 54)  proporción normales: 0.0


Ojo! Para los Autocodificadores es recomendable que los conjuntos de
entrenamiento y validación se realicen sólo con observaciones normales. Para el
conjunto de prueba sí se debe incluir de todo. Investiguen e indiquen el porqué
de esto.