In [None]:
# Librerías
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from google.colab import files

# Apply the seaborn "whitegrid" style to every figure in the notebook.
# set_theme() is the preferred interface; sns.set() is only a legacy alias for it.
sns.set_theme(style='whitegrid')

In [None]:
# Upload a file from the local machine through the Colab file picker
uploaded = files.upload()

# An empty result (e.g. the dialog was dismissed without choosing a file) would
# otherwise surface as an opaque IndexError; fail with an actionable message instead.
if not uploaded:
    raise ValueError('No file was uploaded; re-run this cell and choose a CSV file.')

# Load the first uploaded file into a DataFrame
# NOTE(review): with pandas >= 2.0, read_csv parses the literal string "None" as
# NaN by default. If 'Sleep Disorder' uses "None" to mean "no disorder", those rows
# will show up as nulls — check the null counts and confirm this is intended.
csv_name = next(iter(uploaded))
df = pd.read_csv(csv_name)
df.head()

In [None]:
# Structural overview: dimensions, dtype of each column, null count per column.
# Labels and values are printed pairwise so the output text is unchanged.
overview = (
    ('Filas, Columnas:', df.shape),
    ('\nTipos de variables:\n', df.dtypes),
    ('\nValores nulos:\n', df.isna().sum()),
)
for label, value in overview:
    print(label, value)

In [None]:
# Columns holding labels rather than measurements; stored as pandas 'category' dtype
categorical_cols = ['Gender', 'Occupation', 'BMI Category', 'Sleep Disorder']

# Cast all of them in a single astype call using a column -> dtype mapping
df = df.astype({name: 'category' for name in categorical_cols})
df.dtypes

In [None]:
# Descriptive statistics for every column (include='all' covers non-numeric ones too)
display(df.describe(include='all'))

# Histograms of the numeric variables; the trailing ';' hides the axes-array repr
num_cols = list(df.select_dtypes(include=np.number).columns)
df[num_cols].hist(bins=15, figsize=(12, 8));

In [None]:
def remove_outliers_iqr(df, col_list, k=1.5):
    """Drop rows whose value in any listed column lies outside the IQR fences.

    Columns are processed in the order given, and each column's quartiles are
    computed on the rows that survived the previous columns, so the result can
    depend on the order of ``col_list``.

    Parameters
    ----------
    df : pandas.DataFrame
        Input frame; it is not modified.
    col_list : iterable of str
        Names of the numeric columns to filter on.
    k : float, default 1.5
        Fence multiplier. A row is kept when its value lies within
        ``[Q1 - k*IQR, Q3 + k*IQR]`` (both bounds inclusive).

    Returns
    -------
    pandas.DataFrame
        An independent copy holding only the surviving rows, with their
        original index labels. Rows with a missing value in a listed column
        are dropped, because NaN never satisfies the range check.
    """
    filtered = df
    for col in col_list:
        q1 = filtered[col].quantile(0.25)
        q3 = filtered[col].quantile(0.75)
        iqr = q3 - q1
        lower = q1 - k * iqr
        upper = q3 + k * iqr
        # between() is inclusive on both ends, matching (>= lower) & (<= upper)
        filtered = filtered[filtered[col].between(lower, upper)]
    # Hand back a real copy so callers can add columns to the result without
    # triggering pandas' SettingWithCopyWarning.
    return filtered.copy()

# Numeric columns screened for outliers with the IQR rule
numeric_cols_for_outliers = [
    'Heart Rate',
    'Sleep Duration',
    'Physical Activity Level',
    'Daily Steps',
]

# Replace df with the filtered frame and show its new dimensions
df = remove_outliers_iqr(df=df, col_list=numeric_cols_for_outliers)
df.shape

In [None]:
# Build the binary target in a single chain that yields a fresh DataFrame, so no
# column is ever assigned onto a filtered slice (avoids SettingWithCopyWarning):
#   1. keep only rows with Stress Level >= 3 (rows with a missing level are
#      dropped as well, since NaN >= 3 evaluates to False)
#   2. stress_binary = 1 when Stress Level >= 7, otherwise 0 (vectorized comparison)
#   3. drop the original Stress Level column
df = (
    df.loc[df['Stress Level'] >= 3]
    .assign(stress_binary=lambda frame: (frame['Stress Level'] >= 7).astype('int64'))
    .drop(columns=['Stress Level'])
)

# Final class distribution
df['stress_binary'].value_counts()

In [None]:
# Box plots: one numeric feature per figure, split by the binary stress label
boxplot_specs = [
    ('Sleep Duration', 'Sleep Duration vs Stress Binary'),
    ('Physical Activity Level', 'Physical Activity Level vs Stress Binary'),
]
for feature, title in boxplot_specs:
    fig, ax = plt.subplots(figsize=(8, 5))
    sns.boxplot(data=df, x='stress_binary', y=feature, ax=ax)
    ax.set_title(title)
    plt.show()

# Count plot: rows per Sleep Disorder category, coloured by the stress label
fig, ax = plt.subplots(figsize=(8, 5))
sns.countplot(data=df, x='Sleep Disorder', hue='stress_binary', ax=ax)
ax.set_title('Sleep Disorder vs Stress Binary')
plt.show()

In [None]:
# Correlation is computed on numeric columns only
numeric_df = df.select_dtypes(include=np.number)
corr = numeric_df.corr()

# Annotated heatmap of the correlation matrix, drawn on an explicit Axes
fig, ax = plt.subplots(figsize=(10, 8))
sns.heatmap(corr, annot=True, cmap='coolwarm', ax=ax)
ax.set_title('Matriz de Correlación')
plt.show()

In [None]:
# 80/20 split, stratified on the binary target, with a fixed seed
train, test = train_test_split(
    df,
    test_size=0.2,
    random_state=42,
    stratify=df['stress_binary'],
)

# Write each split to its own CSV file, without the index column
for split_frame, csv_path in ((train, 'train.csv'), (test, 'test.csv')):
    split_frame.to_csv(csv_path, index=False)

print('Train:', train.shape, 'Test:', test.shape)

In [None]:
# Final columns of df
df.columns