# Initialization

In [53]:
import pandas as pd
from sklearn.preprocessing import MinMaxScaler, StandardScaler
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split

# Read the raw stress dataset from disk (path is relative to this notebook)
data_path = '../Dataset/Human Stress Dataset.csv'
dataset = pd.read_csv(filepath_or_buffer=data_path)
# `df` is the working frame used by the rest of the notebook
df = pd.DataFrame(data=dataset)

# Numerical and Categorical Features

In [47]:
# Partition the column labels by dtype: 64-bit ints/floats count as numerical,
# object columns count as categorical.
numerical_cols = df.select_dtypes(include=['int64', 'float64']).columns
categorical_cols = df.select_dtypes(include=['object']).columns

print("Numerical Columns:", numerical_cols, sep="\n")
print("\nCategorical Columns:", categorical_cols, sep="\n")

# List the distinct values found in each categorical column
for col in categorical_cols:
    print("\nUnique values in column", col)
    print(df[col].unique())

Numerical Columns:
Index(['Snoring Rate', 'Respiratory Rate', 'Body Temperature', 'Limb Movement',
       'Blood Oxygen', 'Eye Movement', 'Sleep Hours', 'Heart Rate'],
      dtype='object')

Categorical Columns:
Index(['Stress Levels'], dtype='object')

Unique values in column Stress Levels
['stressed' 'not stressed']


# Handling Missing Values

In [48]:
# Per-column count of missing values (Series indexed by column name)
missing_values = df.isnull().sum()

# Impute only when at least one column actually contains a missing value
if missing_values.any():
    print("Missing values found. Imputing missing values...")

    # Numerical columns: fill gaps with the column mean.
    # Skip when there are no such columns, because SimpleImputer rejects
    # input with zero features.
    if len(numerical_cols) > 0:
        imputer = SimpleImputer(strategy='mean')
        df[numerical_cols] = imputer.fit_transform(df[numerical_cols])

    # Categorical columns: fill gaps with the most frequent value.
    # Same guard as above for datasets without categorical columns.
    if len(categorical_cols) > 0:
        imputer_cat = SimpleImputer(strategy='most_frequent')
        df[categorical_cols] = imputer_cat.fit_transform(df[categorical_cols])

    print("Missing values have been imputed.")
else:
    print("No missing values found in the dataset.")


No missing values found in the dataset.


`df.isnull().sum()` returns a Series whose index is the column names and whose values are the number of missing values in each column.

# Encoding Categorical Variables:

In [49]:
# Preview the first rows before any encoding is applied
print("Original DataFrame:", df.head(), sep="\n")

Original DataFrame:
   Snoring Rate  Respiratory Rate  Body Temperature  Limb Movement  \
0         93.80            25.680            91.840         16.600   
1         91.64            25.104            91.552         15.880   
2         60.00            20.000            96.000         10.000   
3         85.76            23.536            90.768         13.920   
4         48.12            17.248            97.872          6.496   

   Blood Oxygen  Eye Movement  Sleep Hours  Heart Rate Stress Levels  
0        89.840         99.60        1.840       74.20      stressed  
1        89.552         98.88        1.552       72.76      stressed  
2        95.000         85.00        7.000       60.00  not stressed  
3        88.768         96.92        0.768       68.84      stressed  
4        96.248         72.48        8.248       53.12  not stressed  


**Label Encoding:** This method assigns each unique category a numeric label. It's useful when the categorical feature has an inherent order (ordinal data), like "Low", "Medium", "High".

Here, `Stress Levels` is nominal rather than ordinal, but label encoding still works because the column has only two classes; the encoder assigns the codes in alphabetical order: \
not stressed -> 0 \
stressed -> 1

In [24]:
# Label-encode the target on a copy so that `df` itself stays untouched
df_label_encoded = df.copy()

# Learn the class -> integer mapping first, then apply it to the same column
label_encoder = LabelEncoder()
label_encoder.fit(df_label_encoded['Stress Levels'])
df_label_encoded['Stress Levels'] = label_encoder.transform(df_label_encoded['Stress Levels'])

print("\nDataFrame with Label Encoding :", df_label_encoded.head(), sep="\n")


DataFrame with Label Encoding :
   Snoring Rate  Respiratory Rate  Body Temperature  Limb Movement  \
0         93.80            25.680            91.840         16.600   
1         91.64            25.104            91.552         15.880   
2         60.00            20.000            96.000         10.000   
3         85.76            23.536            90.768         13.920   
4         48.12            17.248            97.872          6.496   

   Blood Oxygen  Eye Movement  Sleep Hours  Heart Rate  Stress Levels  
0        89.840         99.60        1.840       74.20              1  
1        89.552         98.88        1.552       72.76              1  
2        95.000         85.00        7.000       60.00              0  
3        88.768         96.92        0.768       68.84              1  
4        96.248         72.48        8.248       53.12              0  



**One-Hot Encoding:** This method creates a binary column for each unique category and assigns a 1 or 0 based on whether a row contains that category. It's generally used for nominal (non-ordinal) categorical variables, like "stressed" / "not stressed".

In [33]:
# One-hot encode the target column. The empty prefix/separator keeps the bare
# category name as the dummy column name, and drop_first=True discards the
# first dummy; the remaining 'stressed' column is renamed to 'Stress Levels'.
df_onehot = (
    pd.get_dummies(
        df.copy(),
        columns=['Stress Levels'],
        drop_first=True,
        prefix='',
        prefix_sep='',
        dtype='int',
    )
    .rename(columns={'stressed': 'Stress Levels'})
)

print("\nDataFrame with One-Hot Encoding (Nominal):", df_onehot.head(), sep="\n")


DataFrame with One-Hot Encoding (Nominal):
   Snoring Rate  Respiratory Rate  Body Temperature  Limb Movement  \
0         93.80            25.680            91.840         16.600   
1         91.64            25.104            91.552         15.880   
2         60.00            20.000            96.000         10.000   
3         85.76            23.536            90.768         13.920   
4         48.12            17.248            97.872          6.496   

   Blood Oxygen  Eye Movement  Sleep Hours  Heart Rate  Stress Levels  
0        89.840         99.60        1.840       74.20              1  
1        89.552         98.88        1.552       72.76              1  
2        95.000         85.00        7.000       60.00              0  
3        88.768         96.92        0.768       68.84              1  
4        96.248         72.48        8.248       53.12              0  


Without `drop_first=True` there would have been 2 columns, `not stressed` and `stressed`. \
With `drop_first=True` we drop the first column, as it can be inferred from the other one, \
and `stressed` is then renamed to `Stress Levels`.

# Handling Outliers

In [50]:
# Define a function to tame outliers based on the IQR method
def remove_outliers(df, column, factor=1.5):
    """Clip the values of ``df[column]`` to the IQR fences, in place.

    Despite the name, no rows are removed: values below
    ``Q1 - factor * IQR`` are raised to that bound and values above
    ``Q3 + factor * IQR`` are lowered to that bound, so the number of rows
    is unchanged.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to modify; ``df[column]`` is overwritten with the clipped values.
    column : hashable
        Label of the column to clip.
    factor : float, default 1.5
        Multiple of the inter-quartile range that defines the acceptable
        range around the quartiles.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object that was passed in (already modified), so the
        call can also be used in an expression. Callers that ignore the
        return value keep working as before.
    """
    q1, q3 = df[column].quantile([0.25, 0.75])
    iqr = q3 - q1
    # Acceptable range around the quartiles
    lower_bound = q1 - factor * iqr
    upper_bound = q3 + factor * iqr
    # Replace out-of-range values with the nearest bound
    df[column] = df[column].clip(lower_bound, upper_bound)
    return df


# Work on a copy so the one-hot encoded frame is left as it was
df_cleaned = df_onehot.copy()
# Clip every feature column. The target is excluded by name rather than by
# position, so this does not depend on 'Stress Levels' being the last column.
for col in df_cleaned.columns.drop('Stress Levels'):
    remove_outliers(df_cleaned, col)

**What it does:**

It limits (or "clips") the values in df[column] so that any value below lower_bound is set to lower_bound, and any value above upper_bound is set to upper_bound.\
This keeps all values within the specified range, replacing extreme outliers with the closest boundary value instead of removing them.

# Standardize and Normalize Features

In [51]:
# Standardize with StandardScaler, then rescale to [0, 1] with MinMaxScaler.
# NOTE: min-max scaling is unaffected by the preceding (affine) standardization,
# so the final values equal MinMaxScaler applied directly to the features.
scaler = StandardScaler()
normalizer = MinMaxScaler()

# Select the features by name so the result does not rely on the target
# being the last column.
feature_cols = df_cleaned.columns.drop('Stress Levels')

data_standardized = scaler.fit_transform(df_cleaned[feature_cols])
data_normalized = normalizer.fit_transform(data_standardized)

# Convert back to a DataFrame and add the target column. Reusing
# df_cleaned's index keeps the target aligned row-for-row with the features.
df_scaled_normalized = pd.DataFrame(
    data_normalized, columns=feature_cols, index=df_cleaned.index
)
df_scaled_normalized["Stress Levels"] = df_cleaned["Stress Levels"]

print(df_scaled_normalized.head())

   Snoring Rate  Respiratory Rate  Body Temperature  Limb Movement  \
0      0.887273          0.691429          0.488571       0.840000   
1      0.848000          0.650286          0.468000       0.792000   
2      0.272727          0.285714          0.785714       0.400000   
3      0.741091          0.538286          0.412000       0.661333   
4      0.056727          0.089143          0.919429       0.166400   

   Blood Oxygen  Eye Movement  Sleep Hours  Heart Rate  Stress Levels  
0      0.522667      0.880000     0.204444    0.691429              1  
1      0.503467      0.864000     0.172444    0.650286              1  
2      0.866667      0.555556     0.777778    0.285714              0  
3      0.451200      0.820444     0.085333    0.538286              1  
4      0.949867      0.277333     0.916444    0.089143              0  


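This first standardizes the features to have a mean of 0 and a standard deviation of 1, \
and then min-max normalizes the result so that every feature lies between 0 and 1. \
Note that min-max scaling is unaffected by the preceding standardization (it is an affine transformation), so the final values are the same as applying `MinMaxScaler` directly to the cleaned features.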

## **What Does `fit_transform` Do?**

1. **Fit**:  
   - Computes the parameters needed for transformation based on the data provided.
     - For **StandardScaler**: Calculates mean ($\mu$) and standard deviation ($\sigma$).
     - For **MinMaxScaler**: Calculates the minimum and maximum values.
   - These parameters are saved internally to be used later for consistent scaling.

2. **Transform**:  
   - Applies the transformation using the parameters calculated during the `fit` step.
   - Modifies the data accordingly:
     - For **StandardScaler**: 
      $ z = \frac{(x - \mu)}{\sigma} $

     - For **MinMaxScaler**: 
      $ x' = \frac{(x - \text{min})}{\text{max} - \text{min}} $

# Save the processed data

In [52]:
# Write the preprocessed frame to CSV next to the raw data, without the row index
df_scaled_normalized.to_csv(
    path_or_buf='../Dataset/Human Stress Dataset Preprocessed.csv',
    index=False,
)
print("Preprocessed dataset saved as 'Human Stress Dataset Preprocessed.csv'")

Preprocessed dataset saved as 'Human Stress Dataset Preprocessed.csv'


# Split Data into train and test

In [54]:
# Separate the predictors (X) from the target column (y).
# NOTE(review): this splits `df`, not the preprocessed `df_scaled_normalized`
# built earlier in the notebook — confirm that this is intended.
X = df.drop('Stress Levels', axis=1)
y = df.loc[:, 'Stress Levels']

# Hold out 20% of the rows for testing; the fixed seed makes the split repeatable
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

print(f"Training Data Shape: {X_train.shape}")
print(f"Testing Data Shape: {X_test.shape}")


Training Data Shape: (504, 8)
Testing Data Shape: (126, 8)


The `random_state` parameter is used to ensure reproducibility of the results. By setting a specific value for `random_state`, we ensure that the data is split in the same way every time the code is run.

```python
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
```

By using `random_state=42`, we ensure that the split of the data into training and testing sets is consistent across different executions of the notebook.

## Save the train test data

In [55]:
# Write each part of the split to its own CSV file, without the row index
split_files = (
    (X_train, '../Dataset/X_train.csv'),
    (X_test, '../Dataset/X_test.csv'),
    (y_train, '../Dataset/y_train.csv'),
    (y_test, '../Dataset/y_test.csv'),
)
for part, path in split_files:
    part.to_csv(path, index=False)
print("Train and Test data saved as 'X_train.csv', 'X_test.csv', 'y_train.csv', 'y_test.csv'")

Train and Test data saved as 'X_train.csv', 'X_test.csv', 'y_train.csv', 'y_test.csv'
