In [None]:
import pandas as pd
import numpy as np
import re
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler, LabelEncoder, OneHotEncoder
from sklearn.feature_extraction.text import TfidfVectorizer

# Load Dataset
file_path = '../data/Lifestyle & Health Risk Prediction in Young Adults dataset.xlsx'
try:
    df = pd.read_excel(file_path)
except FileNotFoundError:
    df = pd.read_excel('data/Lifestyle & Health Risk Prediction in Young Adults dataset.xlsx')

df.columns = df.columns.str.strip()

# Dropping unnecessary columns
drop_cols = ['Timestamp', 'Q0. Consent for Participation']
df = df.drop(columns=[c for c in drop_cols if c in df.columns], errors='ignore')
print(f"Columns after drop: {df.columns.tolist()}")

Columns after drop: ['Age', 'Gender', 'meals_intake_per_day', 'skipped_meal', 'diet_quality', 'water_intake_daily', 'sleep_duration_per_night', 'sleep_time', 'wake_up_time', 'plays_sport_regularly', 'gym_or_workout', 'daily_walk_30min', 'physical_activity_level_weekly', 'screen_time_hours', 'study_work_hours', 'stress_level', 'smoking_alcohol', 'diagnosed_health_issue', 'health_issue_type', 'current_health_rating', 'health_risk_level']


In [None]:
# Preprocessing the Age Column
def process_age(val):
    if pd.isna(val): return np.nan
    val = str(val).strip()
    if '-' in val:
        try:
            low, high = map(int, val.split('-'))
            return (low + high) / 2
        except:
            pass
    if 'Above' in val or '>' in val: return 35.0
    if 'Below' in val or '<' in val: return 17.0
    try: return float(val)
    except: return np.nan

if 'Age' in df.columns:
    df['Age'] = df['Age'].apply(process_age)
    print("Age processing done.")

print(df['Age'])

Age processing done.
0      19.0
1      22.0
2      19.0
3      19.0
4      19.0
       ... 
209    22.0
210    19.0
211    19.0
212    19.0
213    19.0
Name: Age, Length: 214, dtype: float64


In [15]:
# Text Mining - Extracting the words from health_issue_type column
if 'health_issue_type' in df.columns:
    df['health_issue_type'] = df['health_issue_type'].fillna('None')
    tfidf = TfidfVectorizer(max_features=10, stop_words='english')
    try:
        tfidf_matrix = tfidf.fit_transform(df['health_issue_type'].astype(str))
        tfidf_cols = [f'issue_tfidf_{i}' for i in range(tfidf_matrix.shape[1])]
        tfidf_df = pd.DataFrame(tfidf_matrix.toarray(), columns=tfidf_cols, index=df.index)
        df = pd.concat([df, tfidf_df], axis=1)
        df = df.drop(columns=['health_issue_type'])
        print("TF-IDF done.")
    except ValueError:
        print("TF-IDF skipped (empty vocab or other error)")

TF-IDF done.


In [16]:
# Clean numeric cols
num_cols = df.select_dtypes(include=[np.number]).columns.tolist()
cat_cols = df.select_dtypes(exclude=[np.number]).columns.tolist()

if num_cols:
    imputer_num = SimpleImputer(strategy='median')
    df[num_cols] = imputer_num.fit_transform(df[num_cols])

if cat_cols:
    imputer_cat = SimpleImputer(strategy='most_frequent')
    df[cat_cols] = imputer_cat.fit_transform(df[cat_cols])

In [17]:
# Target Engineering
le_target = LabelEncoder()
if 'health_risk_level' in df.columns:
    df['health_risk_level_encoded'] = le_target.fit_transform(df['health_risk_level'].astype(str))

# stress_level column Mapping
stress_map = {'Low': 1, 'Moderate': 2, 'High': 3}
if 'stress_level' in df.columns:
    if df['stress_level'].dtype == object:
         df['stress_level'] = df['stress_level'].map(stress_map).fillna(0)

def clean_num(x):
    try: return float(re.findall(r"[\d\.]+", str(x))[0])
    except: return 0

for c in ['screen_time_hours', 'sleep_duration_per_night', 'water_intake_daily']:
    if c in df.columns:
        df[c] = df[c].apply(clean_num)

df['Health_Risk_Score'] = (
    df.get('stress_level', 0) * 10 
    + df.get('screen_time_hours', 0) * 2 
    - df.get('sleep_duration_per_night', 7) * 3 
    - df.get('water_intake_daily', 2) * 5
) + 50 # Base score


In [18]:
# Categorical Encoding
df = pd.get_dummies(df, drop_first=True)

# Scaling
scaler = StandardScaler()
feature_cols = [c for c in df.columns if c not in ['health_risk_level', 'health_risk_level_encoded', 'Health_Risk_Score']]

print(f"Features to Scale: {len(feature_cols)}")

if len(feature_cols) > 0:
    df[feature_cols] = scaler.fit_transform(df[feature_cols])
else:
    print("CRITICAL WARNING: No features to scale!")

df.to_csv('../data/processed_data.csv', index=False)
print("SAVED ../data/processed_data.csv")

Features to Scale: 55
SAVED ../data/processed_data.csv
