# Exploratory Data Analysis

In [21]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import json
from sklearn.model_selection import train_test_split
import pickle
import os

In [22]:
# Show every column when a wide DataFrame is rendered (no column truncation).
pd.options.display.max_columns = None
# Working directory of the kernel; relative data paths are built from it.
current_dir = os.getcwd()

### Exploración y limpieza de datos

In [23]:
# Location of the raw CSV, resolved against the kernel's working directory.
url_data = os.path.join(current_dir, "../data/raw/Dataframe_Final_Data.csv")
# A comma is already the default separator of read_csv.
data = pd.read_csv(url_data)
data.head()

Unnamed: 0,CategoryID,CategoryName,ProductName,Price,Class,ModifyDate,Resistant,IsAllergic,VitalityDays,SalesID,CustomerID,Quantity,ProductID,Discount,Date,TransactionNumber,FirstName,MiddleInitial,LastName,CityID,Address,CityName,Zipcode,CountryID,CountryName,CountryCode,State,pib,rpc,wti,Unemployment_Rate,Population_2018,personal_income,Crecimiento_poblacional,Total_price
0,1,Confections,"Spoon - Soup, Plastic",32.442,Low,2017-03-03 09:47:09.310,Weak,True,0.0,12630,24490,7,15,0.0,2018-01-01,5BSK7H5X44DGRUWEKJEA,Jill,P,Soto,14,31 New Parkway,Indianapolis,81678,32,United States,AR,Indiana,122538900000000.0,15349670000.0,4.628095,3.8,6695497.0,3403557.0,0.56,227.09
1,1,Confections,"Spoon - Soup, Plastic",32.442,Low,2017-03-03 09:47:09.310,Weak,True,0.0,115884,95026,25,15,0.0,2018-01-01,3Q0RRIMLEEIMZ4U2G347,Pamela,Z,Estrada,4,949 Milton Drive,Fremont,20641,32,United States,AR,California,936472600000000.0,19911670000.0,4.628095,4.8,39461588.0,41569920.0,0.26,811.05
2,1,Confections,"Spoon - Soup, Plastic",32.442,Low,2017-03-03 09:47:09.310,Weak,True,0.0,217388,27676,8,15,0.0,2018-01-01,T2ZK8X0HU1KWKPRQ5MUQ,Anita,B,Sanchez,10,30 West Milton Way,Toledo,52048,32,United States,AR,Ohio,217512800000000.0,15704000000.0,4.628095,5.2,11676341.0,6392306.0,0.14,259.54
3,1,Confections,"Spoon - Soup, Plastic",32.442,Low,2017-03-03 09:47:09.310,Weak,True,0.0,364759,11630,3,15,0.2,2018-01-01,ILPQKU2EBTVNMTN7FQNL,Dustin,B,Coffey,40,904 Oak Parkway,Cleveland,51352,32,United States,AR,Ohio,217512800000000.0,15704000000.0,4.628095,5.2,11676341.0,6392306.0,0.14,77.86
4,1,Confections,"Spoon - Soup, Plastic",32.442,Low,2017-03-03 09:47:09.310,Weak,True,0.0,447481,83733,22,15,0.2,2018-01-01,TI5RNV3CZM39NW16FG9M,Bridgette,X,Valenzuela,44,52 Rocky Second Drive,Memphis,73999,32,United States,AR,Tennessee,120389900000000.0,15244000000.0,4.628095,3.8,6771631.0,3328174.0,0.94,570.98


In [24]:
# Get the dimensions of the DataFrame as (rows, columns).
data.shape

(6690599, 35)

In [25]:
# Print a summary of the columns: dtypes and non-null counts.
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 6690599 entries, 0 to 6690598
Data columns (total 35 columns):
 #   Column                   Dtype  
---  ------                   -----  
 0   CategoryID               int64  
 1   CategoryName             object 
 2   ProductName              object 
 3   Price                    float64
 4   Class                    object 
 5   ModifyDate               object 
 6   Resistant                object 
 7   IsAllergic               object 
 8   VitalityDays             float64
 9   SalesID                  int64  
 10  CustomerID               int64  
 11  Quantity                 int64  
 12  ProductID                int64  
 13  Discount                 float64
 14  Date                     object 
 15  TransactionNumber        object 
 16  FirstName                object 
 17  MiddleInitial            object 
 18  LastName                 object 
 19  CityID                   int64  
 20  Address                  object 
 21  CityName

#### Eliminar duplicados

In [26]:
# Count fully duplicated rows. This cell only reports the count; it removes nothing.
data.duplicated().sum()

np.int64(0)

En este caso no encontramos duplicados en el conjunto de datos.

#### Eliminar columnas sin relevancia

In [27]:
# Identifier, personal-data and raw-address columns that are removed before modelling.
columnas_descartadas = [
    'TransactionNumber', 'CategoryName', 'SalesID', 'CountryCode', 'CustomerID',
    'ProductID', 'CityID', 'CountryID', 'FirstName', 'MiddleInitial', 'LastName',
    'Address', 'Zipcode',
]

# Reassign instead of mutating in place and tolerate labels that are already gone,
# so running this cell a second time does not raise KeyError.
data = data.drop(columns = columnas_descartadas, errors = 'ignore')

print(data.columns)

data.head()

Index(['CategoryID', 'ProductName', 'Price', 'Class', 'ModifyDate',
       'Resistant', 'IsAllergic', 'VitalityDays', 'Quantity', 'Discount',
       'Date', 'CityName', 'CountryName', 'State', 'pib', 'rpc', 'wti',
       'Unemployment_Rate', 'Population_2018', 'personal_income',
       'Crecimiento_poblacional', 'Total_price'],
      dtype='object')


Unnamed: 0,CategoryID,ProductName,Price,Class,ModifyDate,Resistant,IsAllergic,VitalityDays,Quantity,Discount,Date,CityName,CountryName,State,pib,rpc,wti,Unemployment_Rate,Population_2018,personal_income,Crecimiento_poblacional,Total_price
0,1,"Spoon - Soup, Plastic",32.442,Low,2017-03-03 09:47:09.310,Weak,True,0.0,7,0.0,2018-01-01,Indianapolis,United States,Indiana,122538900000000.0,15349670000.0,4.628095,3.8,6695497.0,3403557.0,0.56,227.09
1,1,"Spoon - Soup, Plastic",32.442,Low,2017-03-03 09:47:09.310,Weak,True,0.0,25,0.0,2018-01-01,Fremont,United States,California,936472600000000.0,19911670000.0,4.628095,4.8,39461588.0,41569920.0,0.26,811.05
2,1,"Spoon - Soup, Plastic",32.442,Low,2017-03-03 09:47:09.310,Weak,True,0.0,8,0.0,2018-01-01,Toledo,United States,Ohio,217512800000000.0,15704000000.0,4.628095,5.2,11676341.0,6392306.0,0.14,259.54
3,1,"Spoon - Soup, Plastic",32.442,Low,2017-03-03 09:47:09.310,Weak,True,0.0,3,0.2,2018-01-01,Cleveland,United States,Ohio,217512800000000.0,15704000000.0,4.628095,5.2,11676341.0,6392306.0,0.14,77.86
4,1,"Spoon - Soup, Plastic",32.442,Low,2017-03-03 09:47:09.310,Weak,True,0.0,22,0.2,2018-01-01,Memphis,United States,Tennessee,120389900000000.0,15244000000.0,4.628095,3.8,6771631.0,3328174.0,0.94,570.98


#### Columnas a DESCARTAR y por qué:

`TransactionNumber`: 
Es simplemente un identificador único de cada transacción, no aporta información útil para predecir demanda.


`SalesID`, `CustomerID`, `ProductID`, `CityID`, `CountryID`:
Son IDs arbitrarios que no tienen significado en sí mismos y pueden confundir al modelo.

`CategoryName`, `CountryCode`: Ya contamos con las columnas `CategoryID` y `CountryName`, que representan esta misma información, por lo que resultan redundantes.


`FirstName`, `MiddleInitial`, `LastName`:
Son nombres personales; no tienen valor predictivo y pueden introducir sesgo o problemas de privacidad.


`Address`, `Zipcode`:
Pueden contener información útil si se convierten en variables regionales o socioeconómicas, pero en bruto son de muy alta cardinalidad y difíciles de usar directamente. Mejor descartarlas por ahora, a menos que tengamos una forma clara de agruparlas.

#### Columnas que pueden ser de utilidad pero que hay que modificar:

`ModifyDate`, `Date`:

En su forma cruda no son útiles, pero podemos extraer de ellas variables como:

- Día de la semana
- Mes
- ¿Es fin de semana o no?
- ¿Es feriado o no?

Entonces: descartarlas como strings, pero extraer antes las variables derivadas.

`ProductName`, `CountryName`, `CityName`, `State`:

Considerar agruparlas o usar solo las más frecuentes. (Factorizarlas)

#### Columnas ÚTILES para predecir la demanda:

`CategoryID`, `Class` → tipo de producto.

`Resistant`, `IsAllergic`, `VitalityDays` → características del producto.

`Price`, `Discount`, `Total_price` → precio y descuentos.

`pib`, `rpc` (renta per cápita), `wti` (petróleo), `Unemployment_Rate`, `Population_2018`, `personal_income`, `Crecimiento_poblacional` → variables macroeconómicas que podrían correlacionar con la demanda.

Variables derivadas del tiempo (`Date`, `ModifyDate`).

### Paso 3: Análisis de variables univariante

#### Análisis sobre variables categóricas

#### Análisis sobre variables numéricas

In [None]:
# Names of the numeric columns to plot, with the target left out.
# Only the column labels are needed downstream, so no correlation matrix is computed here;
# errors='ignore' keeps the cell from raising when the target column is absent.
# NOTE(review): data_limpia is not created in the cells visible in this notebook — confirm it exists.
numericals_variables = data_limpia.select_dtypes(include = ['number']).drop(columns=['Outcome'], errors='ignore').columns

def plot_numericas(data_set, variables_numericas, x_limits = None):
    """Plot a histogram (with KDE) and a boxplot for every numeric variable.

    The figure has one column and two stacked rows per variable: the histogram
    first, the boxplot right below it.

    Parameters
    ----------
    data_set : pandas.DataFrame
        Frame that holds the columns to plot.
    variables_numericas : iterable of str
        Column names to plot. Any iterable that yields column names works
        (list, Index, or a DataFrame, whose iteration yields its column labels).
    x_limits : dict, optional
        Maps a column name to an ``(xmin, xmax)`` tuple applied to both plots of
        that column. Columns without an entry keep the automatic limits.
        The limits used to be hard-coded for columns of a different dataset
        ('age', 'duration', 'campaign', 'pdays'); pass them explicitly if needed.

    Returns
    -------
    None
        The figure is shown with ``plt.show()``.
    """
    columnas = list(variables_numericas)
    if not columnas:
        # Nothing to draw; a subplot grid with zero rows cannot be created.
        return

    limites = x_limits if x_limits is not None else {}

    # One column, two rows per variable.
    fig, axis = plt.subplots(len(columnas) * 2, 1, figsize=(8, len(columnas) * 7))

    for i, col in enumerate(columnas):
        ax_hist = axis[i * 2]
        ax_box = axis[i * 2 + 1]

        # Histogram on the first row of the pair.
        sns.histplot(data = data_set, x = col, kde = True, ax = ax_hist)
        ax_hist.set_title(f'Histogram of {col}')

        # Boxplot on the second row of the pair.
        sns.boxplot(data = data_set, x = col, ax = ax_box)
        ax_box.set_title(f'Boxplot of {col}')

        # Share the same custom x range between both plots of this column.
        if col in limites:
            ax_hist.set_xlim(limites[col])
            ax_box.set_xlim(limites[col])

    plt.tight_layout()
    plt.show()

# Draw the histogram/boxplot pair for each numeric column selected at the top of this cell.
plot_numericas(data_limpia, numericals_variables)

### Paso 4: Análisis de variables multivariante

#### Análisis numérico-numérico

In [None]:

# Names of the numeric columns to compare against the target (target itself left out).
# Only the column labels are needed downstream, so no correlation matrix is computed here;
# errors='ignore' keeps the cell from raising when the target column is absent.
# NOTE(review): data_limpia is not created in the cells visible in this notebook — confirm it exists.
numericals_variables = data_limpia.select_dtypes(include = ['number']).drop(columns=['Outcome'], errors='ignore').columns

  
def plot_numerico_numerico(data_set, variables_numericas, target = 'Outcome'):
    """Plot each numeric variable against the target column.

    For every variable two stacked axes are drawn: a regression plot of the
    variable versus ``target`` and, below it, an annotated heatmap of the
    2x2 correlation matrix of that pair.

    Parameters
    ----------
    data_set : pandas.DataFrame
        Frame that holds both the predictors and the target column.
    variables_numericas : iterable of str
        Column names to compare with the target. Any iterable that yields
        column names works (list, Index, or a DataFrame).
    target : str, default 'Outcome'
        Name of the target column. The default keeps the previous hard-coded
        value, so existing calls behave the same.

    Returns
    -------
    None
        The figure is shown with ``plt.show()``.
    """
    columnas = list(variables_numericas)
    if not columnas:
        # Nothing to draw; a subplot grid with zero rows cannot be created.
        return

    # One column, two rows per variable.
    fig, axis = plt.subplots(len(columnas) * 2, 1, figsize=(8, (len(columnas) * 5)))

    for i, col in enumerate(columnas):
        ax_reg = axis[i * 2]
        ax_corr = axis[i * 2 + 1]

        # Regression plot on the first row of the pair.
        sns.regplot(ax = ax_reg, data = data_set, x = col, y = target)
        ax_reg.set_title(f'Regplot of {col} vs {target}')

        # Correlation heatmap of the (variable, target) pair on the second row.
        sns.heatmap(data_set[[col, target]].corr(), annot = True, fmt = ".2f", ax = ax_corr, cbar = True)
        ax_corr.set_title(f'Correlation Heatmap of {col} vs {target}')

    plt.tight_layout()
    plt.show()

# Draw the regplot and the pairwise correlation heatmap of each selected numeric column against the target.
plot_numerico_numerico(data_limpia, numericals_variables)

# Preview the first rows of the frame that was just plotted.
data_limpia.head()

**Análisis de posibles relaciones entre variables numéricas**

#### Análisis categórico-categórico

#### Combinaciones de la clase con varias predictoras

In [None]:
# Two side-by-side bar plots: the target on x, one predictor on y, a second predictor as hue.
# NOTE(review): 'Outcome', 'Glucose', 'BMI' and 'BloodPressure' are not among the columns
# printed for the loaded dataset — confirm that data_limpia provides them.
fig, axis = plt.subplots(nrows = 1, ncols = 2, figsize = (10, 5))

# Left panel keeps its default y label.
sns.barplot(data = data_limpia, x = "Outcome", y = 'Glucose', hue = 'BMI', ax = axis[0])

# Right panel: y label removed, x tick labels rotated to vertical.
sns.barplot(data = data_limpia, x = "Outcome", y = 'BMI', hue = 'BloodPressure', ax = axis[1])
axis[1].set(ylabel = None)
plt.setp(axis[1].get_xticklabels(), rotation = 90)

plt.tight_layout()
plt.show()

##### Análisis de correlaciones. 

In [None]:
def plot_matriz_correlacion(data_set):
    """Show an annotated heatmap of the correlations between numeric columns.

    Only the columns selected by ``select_dtypes(include=['number'])`` take part.
    The figure is displayed with ``plt.show()`` and nothing is returned.
    """
    # Keep the numeric columns only, then compute their pairwise correlations.
    solo_numericas = data_set.select_dtypes(include = ['number'])
    matriz = solo_numericas.corr()

    fig, ax = plt.subplots(figsize=(12, 8))
    sns.heatmap(matriz, ax = ax, annot = True, fmt = ".2f", linewidths = 0.5, cmap = "coolwarm")

    plt.tight_layout()
    plt.show()

# Render the correlation heatmap for the numeric columns of data_limpia.
plot_matriz_correlacion(data_limpia)


#### Una vez analizada la correlación, analicemos los dos casos vistos para corroborar la teoría:

In [None]:
# After reviewing the correlations, plot the two highlighted pairs to check them visually.
fig, axis = plt.subplots(ncols = 2, figsize = (10, 5))

# One (x, y) pair per panel, left to right.
pares = [("Outcome", "Glucose"), ("Age", "Pregnancies")]
for panel, (col_x, col_y) in zip(axis, pares):
    sns.regplot(ax = panel, data = data_limpia, x = col_x, y = col_y, scatter_kws={'edgecolor': 'k', 'alpha': 0.6})
    panel.grid(linestyle='--', alpha=0.7)

plt.tight_layout()
plt.show()

#### Pairplot

In [None]:
# Draw the pairplot: one panel per pair of columns of data_limpia.
sns.pairplot(data_limpia)

plt.tight_layout()
plt.show()

### Paso 5: Ingeniería de características

### Valores faltantes

In [None]:
# Impute missing values and assign the result back to the column.
# Calling fillna(..., inplace=True) on a column selection is chained assignment:
# pandas warns about it and, under Copy-on-Write, the parent frame is left unchanged.
# NOTE(review): total_data and the 'Embarked' / 'Fare' columns are not defined in the
# cells visible in this notebook — confirm them before running.

# Categorical variable: fill with the most frequent value (mode).
total_data["Embarked"] = total_data["Embarked"].fillna(total_data["Embarked"].mode()[0])

# Numeric variable: fill with the mean.
total_data["Fare"] = total_data["Fare"].fillna(total_data["Fare"].mean())

# Remaining null count per column.
total_data.isnull().sum()

#### Inferencia de nuevas características

#### Escalado de valores

In [None]:
# Value scaling, step 1: split the numeric predictors (with and without outliers)
# and the target into train and test, then export the six resulting tables to Excel.
# NOTE(review): data_limpia, data_limpia_con_outliers and data_limpia_sin_outliers are
# not created in the cells visible in this notebook — confirm they exist.

# Numeric predictor columns; add names to the drop list to leave more columns out.
num_variables = data_limpia.select_dtypes(include = 'number').drop(columns = ['Outcome'], errors='ignore').columns

# Predictors and target.
X_con_outliers = data_limpia_con_outliers.drop("Outcome", axis = 1)[num_variables]
X_sin_outliers = data_limpia_sin_outliers.drop("Outcome", axis = 1)[num_variables]
y = data_limpia_con_outliers["Outcome"]

# Both splits use the same test_size and random_state.
# NOTE(review): the "sin outliers" split only lines up with y_train / y_test if both
# frames have the same number of rows — confirm how the outliers were handled.
X_train_con_outliers, X_test_con_outliers, y_train, y_test = train_test_split(X_con_outliers, y, test_size = 0.2, random_state = 42)
X_train_sin_outliers, X_test_sin_outliers = train_test_split(X_sin_outliers, test_size = 0.2, random_state = 42)

# Export folder resolved from the working directory (same convention as the raw-data
# path) instead of an absolute path that only exists on one machine.
excels_dir = os.path.join(current_dir, "../data/Excels")
os.makedirs(excels_dir, exist_ok = True)

# SAVE THE DATASETS
# NOTE(review): to_excel fails for frames with more rows than an xlsx sheet allows
# (1,048,576); the raw dataset loaded above has 6,690,599 rows — check the split sizes.
tablas = {
    "X_train_con_outliers": X_train_con_outliers,
    "X_train_sin_outliers": X_train_sin_outliers,
    "X_test_con_outliers": X_test_con_outliers,
    "X_test_sin_outliers": X_test_sin_outliers,
    "y_train": y_train,
    "y_test": y_test,
}
for nombre, tabla in tablas.items():
    tabla.to_excel(os.path.join(excels_dir, f"{nombre}.xlsx"), index = False)

X_train_con_outliers.head()

##### Normalización

In [None]:
# Normalisation: fit a StandardScaler on each training split, persist it, and export
# the transformed train/test tables.

from sklearn.preprocessing import StandardScaler

# Numeric predictor columns; add names to the drop list to leave more columns out.
num_variables = data_limpia.select_dtypes(include = 'number').drop(columns = ['Outcome'], errors='ignore').columns

# Output folders resolved from the working directory instead of an absolute path
# that only exists on one machine.
excels_dir = os.path.join(current_dir, "../data/Excels")
modelos_dir = os.path.join(current_dir, "../data/Models(Norm_scal)")
os.makedirs(excels_dir, exist_ok = True)
os.makedirs(modelos_dir, exist_ok = True)


def _normalizar_y_guardar(normalizador, X_train, X_test, etiqueta):
    """Fit ``normalizador`` on ``X_train``, pickle it and export the transformed splits.

    The scaler is fitted on the training split only and then applied to both
    splits. Column labels and index are taken from the frames that are actually
    transformed, so the labels always match the transformed values.

    Parameters
    ----------
    normalizador : scaler exposing ``fit`` / ``transform``
    X_train, X_test : pandas.DataFrame
    etiqueta : str
        Suffix used in the output file names (e.g. ``"con_outliers"``).

    Returns
    -------
    tuple of pandas.DataFrame
        ``(X_train_norm, X_test_norm)``.
    """
    normalizador.fit(X_train)

    # Save the fitted scaler.
    with open(os.path.join(modelos_dir, f"normalizador_{etiqueta}.pkl"), "wb") as file:
        pickle.dump(normalizador, file)

    X_train_norm = pd.DataFrame(normalizador.transform(X_train), index = X_train.index, columns = X_train.columns)
    X_test_norm = pd.DataFrame(normalizador.transform(X_test), index = X_test.index, columns = X_test.columns)

    # Save the datasets.
    X_train_norm.to_excel(os.path.join(excels_dir, f"X_train_{etiqueta}_norm.xlsx"), index = False)
    X_test_norm.to_excel(os.path.join(excels_dir, f"X_test_{etiqueta}_norm.xlsx"), index = False)

    return X_train_norm, X_test_norm


### NORMALISE THE DATAFRAME WITH OUTLIERS AND SAVE IT
normalizador_con_outliers = StandardScaler()
X_train_con_outliers_norm, X_test_con_outliers_norm = _normalizar_y_guardar(
    normalizador_con_outliers, X_train_con_outliers, X_test_con_outliers, "con_outliers"
)

### NORMALISE THE DATAFRAME WITHOUT OUTLIERS AND SAVE IT
normalizador_sin_outliers = StandardScaler()
X_train_sin_outliers_norm, X_test_sin_outliers_norm = _normalizar_y_guardar(
    normalizador_sin_outliers, X_train_sin_outliers, X_test_sin_outliers, "sin_outliers"
)

X_train_con_outliers_norm.head()

##### Escalado Mínimo-Máximo

In [None]:
# Min-max scaling: fit a MinMaxScaler on each training split, persist it, and export
# the transformed train/test tables.

from sklearn.preprocessing import MinMaxScaler

# Numeric predictor columns; add names to the drop list to leave more columns out.
num_variables = data_limpia.select_dtypes(include = 'number').drop(columns = ['Outcome'], errors='ignore').columns

# Output folders resolved from the working directory instead of an absolute path
# that only exists on one machine.
excels_dir = os.path.join(current_dir, "../data/Excels")
modelos_dir = os.path.join(current_dir, "../data/Models(Norm_scal)")
os.makedirs(excels_dir, exist_ok = True)
os.makedirs(modelos_dir, exist_ok = True)


def _escalar_y_guardar(scaler, X_train, X_test, etiqueta):
    """Fit ``scaler`` on ``X_train``, pickle it and export the transformed splits.

    The scaler is fitted on the training split only and then applied to both
    splits. Column labels and index are taken from the frames that are actually
    transformed, so the labels always match the transformed values.

    Parameters
    ----------
    scaler : scaler exposing ``fit`` / ``transform``
    X_train, X_test : pandas.DataFrame
    etiqueta : str
        Suffix used in the output file names (e.g. ``"con_outliers"``).

    Returns
    -------
    tuple of pandas.DataFrame
        ``(X_train_scal, X_test_scal)``.
    """
    scaler.fit(X_train)

    # Save the fitted scaler.
    with open(os.path.join(modelos_dir, f"scaler_{etiqueta}.pkl"), "wb") as file:
        pickle.dump(scaler, file)

    X_train_scal = pd.DataFrame(scaler.transform(X_train), index = X_train.index, columns = X_train.columns)
    X_test_scal = pd.DataFrame(scaler.transform(X_test), index = X_test.index, columns = X_test.columns)

    # Save the datasets.
    X_train_scal.to_excel(os.path.join(excels_dir, f"X_train_{etiqueta}_scal.xlsx"), index = False)
    X_test_scal.to_excel(os.path.join(excels_dir, f"X_test_{etiqueta}_scal.xlsx"), index = False)

    return X_train_scal, X_test_scal


### SCALE THE DATAFRAME WITH OUTLIERS AND SAVE IT
scaler_con_outliers = MinMaxScaler()
X_train_con_outliers_scal, X_test_con_outliers_scal = _escalar_y_guardar(
    scaler_con_outliers, X_train_con_outliers, X_test_con_outliers, "con_outliers"
)

### SCALE THE DATAFRAME WITHOUT OUTLIERS AND SAVE IT
scaler_sin_outliers = MinMaxScaler()
X_train_sin_outliers_scal, X_test_sin_outliers_scal = _escalar_y_guardar(
    scaler_sin_outliers, X_train_sin_outliers, X_test_sin_outliers, "sin_outliers"
)

X_train_con_outliers_scal.head()

### Feature Selection

In [None]:
from sklearn.feature_selection import f_classif, SelectKBest

# Keep the 7 features with the best f_classif score; every other feature is dropped.
# NOTE(review): f_classif is a classification score function — if the target is a
# continuous quantity, a regression score function may be the right choice; confirm.
selection_model = SelectKBest(f_classif, k = 7)
selection_model.fit(X_train_con_outliers, y_train)

# Boolean mask of the selected columns.
ix = selection_model.get_support()

# Keep the original row index so the selected features stay aligned with y_train / y_test.
X_train_sel = pd.DataFrame(selection_model.transform(X_train_con_outliers), index = X_train_con_outliers.index, columns = X_train_con_outliers.columns.values[ix])
X_test_sel = pd.DataFrame(selection_model.transform(X_test_con_outliers), index = X_test_con_outliers.index, columns = X_test_con_outliers.columns.values[ix])

# Save the names of the selected columns (Index converted to a plain list for JSON).
columns_list = X_train_sel.columns.tolist()

with open("feature_selection_k_7.json", "w") as f:
  json.dump(columns_list, f)

# SAVE THE DATASETS
# Export folder resolved from the working directory instead of an absolute path
# that only exists on one machine.
excels_dir = os.path.join(current_dir, "../data/Excels")
os.makedirs(excels_dir, exist_ok = True)

X_train_sel.to_excel(os.path.join(excels_dir, "X_train_sel_k7.xlsx"), index = False)
X_test_sel.to_excel(os.path.join(excels_dir, "X_test_sel_k7.xlsx"), index = False)

# Last expression of the cell so the preview is actually displayed.
X_train_sel.head()