In [2]:
# Inspect the initial structure of the dataset
print("Initial DataFrame:")
print(df.info())

print("\nFirst few rows:")
print(df.head())

# Print actual column names to find mismatches
print("Columns in DataFrame:")
print(df.columns)

# Standardize column names (strip whitespace)
df.columns = df.columns.str.strip()

# Select relevant columns (Ensure correct names)
relevant_columns = ['DAILY_STRESS', 'AGE', 'GENDER', 'INCOME', 'SLEEP_HOURS']  # Define expected column names
df = df[[col for col in relevant_columns if col in df.columns]]

# Standardize the age category
df['AGE'] = df['AGE'].replace({'Less than 20': '20 or less'})

# Convert 'DAILY_STRESS' and 'SLEEP_HOURS' to numeric, coercing errors
df['DAILY_STRESS'] = pd.to_numeric(df['DAILY_STRESS'], errors='coerce')
df['SLEEP_HOURS'] = pd.to_numeric(df['SLEEP_HOURS'], errors='coerce')

# Remove duplicates if any
df.drop_duplicates(inplace=True)

# Check for missing values
missing_values = df.isnull().sum()
print("\nMissing values in each column:")
print(missing_values)

# Handle missing values - drop rows with key variables missing
df.dropna(subset=['DAILY_STRESS', 'AGE', 'GENDER', 'INCOME', 'SLEEP_HOURS'], inplace=True)

# Generate descriptive statistics
descriptive_summary = df.describe(include='all')
print("\nDescriptive summary of the cleaned data:")
print(descriptive_summary)

# Save the cleaned dataset
df.to_csv('cleaned_work_life_balance.csv', index=False)
print("\nData cleaning complete. Cleaned data saved to 'cleaned_work_life_balance.csv'.")


NameError: name 'pd' is not defined

In [None]:
import pandas as pd
from sklearn.impute import KNNImputer
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler
from imblearn.over_sampling import SMOTE

# Load the dataset
url = "healthcare-dataset-stroke-data.csv"  # Ensure this file is in your project directory
df = pd.read_csv(url)

# Display initial dataset information
print("Initial Dataset Information:")
print(df.info())
print("\nMissing Values Before Cleaning:")
print(df.isnull().sum())

# Select relevant features (correcting the 'marriage' column name to 'ever_married')
features = [
    'age', 'gender', 'hypertension', 'heart_disease', 
    'ever_married', 'work_type', 'bmi', 'smoking_status', 'stroke'
]
df = df[features]

# Handle missing values in 'bmi' using KNN imputation
# First, encode categorical variables
label_encoders = {}
categorical_cols = ['gender', 'ever_married', 'work_type', 'smoking_status']

for col in categorical_cols:
    le = LabelEncoder()
    df[col] = le.fit_transform(df[col].astype(str))  # Convert to string to avoid errors
    label_encoders[col] = le

# Initialize KNN Imputer to fill missing values
imputer = KNNImputer(n_neighbors=5)
df[['bmi']] = imputer.fit_transform(df[['bmi']])

# Convert 'stroke' column to binary (0 and 1)
df['stroke'] = df['stroke'].astype(int)

# Display missing values after cleaning
print("\nMissing Values After Cleaning:")
print(df.isnull().sum())

# Correlation analysis
correlation_matrix = df.corr()
print("\nCorrelation Matrix:")
print(correlation_matrix)

# Split features and target variable
X = df.drop('stroke', axis=1)  # Features
y = df['stroke']  # Target variable

# Split the dataset into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)

# Scale the features with Min-Max Scaling
scaler = MinMaxScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Handle class imbalance using SMOTE
smote = SMOTE(random_state=42)
X_train_balanced, y_train_balanced = smote.fit_resample(X_train_scaled, y_train)

# Check the new class distribution
print("\nClass distribution after SMOTE:")
print(pd.Series(y_train_balanced).value_counts())

# Output final cleaned dataset (optional)
final_cleaned_df = pd.DataFrame(X_train_balanced, columns=X.columns)
final_cleaned_df['stroke'] = y_train_balanced

# Save cleaned dataset if needed
final_cleaned_df.to_csv('cleaned_stroke_prediction_dataset.csv', index=False)

print("Data cleaning and preprocessing completed successfully.")
