<a href="https://colab.research.google.com/github/YashBhardwaj21/codezilla/blob/main/codezilla2.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
!pip install pandas scikit-learn




In [None]:
from google.colab import files
uploaded = files.upload()

In [None]:
import pandas as pd

# Load the CSV file (update the path if using Google Drive)
data = pd.read_csv('health_lifestyle_classification.csv')

# Display the first few rows and data info
print(data.head())
print(data.info())

   survey_code  age  gender      height     weight        bmi  bmi_estimated  \
0            1   56    Male  173.416872  56.886640  18.915925      18.915925   
1            2   69  Female  163.207380  97.799859  36.716278      36.716278   
2            3   46    Male  177.281966  80.687562  25.673050      25.673050   
3            4   32  Female  172.101255  63.142868  21.318480      21.318480   
4            5   60  Female  163.608816  40.000000  14.943302      14.943302   

   bmi_scaled  bmi_corrected  waist_size  ...  sunlight_exposure  \
0   56.747776      18.989117   72.165130  ...               High   
1  110.148833      36.511417   85.598889  ...               High   
2   77.019151      25.587429   90.295030  ...               High   
3   63.955440      21.177109  100.504211  ...               High   
4   44.829907      14.844299   69.021150  ...               High   

   meals_per_day  caffeine_intake  family_history  pet_owner  \
0              5         Moderate             

In [None]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer

# Separate features and target
X = data.drop('target', axis=1)  # All columns except 'target'
y = data['target']

# Identify numerical and categorical columns
numerical_cols = X.select_dtypes(include=['float64', 'int64']).columns
categorical_cols = X.select_dtypes(include=['object']).columns

# Create preprocessing pipelines
numerical_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='median')),  # Impute missing values with median
    ('scaler', StandardScaler())  # Scale numerical features
])

categorical_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='most_frequent')),  # Impute missing values with mode
    ('onehot', OneHotEncoder(handle_unknown='ignore'))  # One-hot encode categorical variables
])

# Combine transformers using ColumnTransformer
preprocessor = ColumnTransformer(
    transformers=[
        ('num', numerical_transformer, numerical_cols),
        ('cat', categorical_transformer, categorical_cols)
    ])

# Split data into training and testing sets (80% train, 20% test)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [None]:
from sklearn.ensemble import RandomForestClassifier

# Create a pipeline with preprocessing and model
model = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('classifier', RandomForestClassifier(n_estimators=100, random_state=42))
])

# Train the model
model.fit(X_train, y_train)

In [None]:
from sklearn.metrics import accuracy_score, classification_report

# Make predictions on the test set
y_pred = model.predict(X_test)

# Evaluate the model
print("Accuracy:", accuracy_score(y_test, y_pred))
print("\nClassification Report:\n", classification_report(y_test, y_pred))

Accuracy: 0.69835

Classification Report:
               precision    recall  f1-score   support

    diseased       0.19      0.00      0.00      6023
     healthy       0.70      1.00      0.82     13977

    accuracy                           0.70     20000
   macro avg       0.44      0.50      0.41     20000
weighted avg       0.54      0.70      0.58     20000



In [None]:
import joblib

# Save the model
joblib.dump(model, 'health_prediction_model.pkl')

# To load the model later
# model = joblib.load('health_prediction_model.pkl')

['health_prediction_model.pkl']

In [None]:
import joblib

# Load the trained model
model = joblib.load('health_prediction_model.pkl')  # Update path if on Google Drive

In [None]:
import pandas as pd

# Create single test sample with provided values
single_sample = pd.DataFrame({
    'survey_code': [143],
    'age': [34],
    'gender': ['Male'],
    'height': [160.8129287],
    'weight': [87.99562442],
    'bmi': [34.02664692],
    'bmi_estimated': [34.02664692],
    'bmi_scaled': [102.0799408],
    'bmi_corrected': [34.13719895],
    'waist_size': [98.41608367],
    'blood_pressure': [111.98239],
    'heart_rate': [80.19802017],
    'cholesterol': [128.0392769],
    'glucose': [111.2221661],
    'insulin': [18.11037723],
    'sleep_hours': [7.479686019],
    'sleep_quality': ['Excellent'],
    'work_hours': [8.277346382],
    'physical_activity': [4.50562205],
    'daily_steps': [8502.977659],
    'calorie_intake': [2191.652711],
    'sugar_intake': [65.62886298],
    'alcohol_consumption': ['None'],
    'smoking_level': ['Light'],
    'water_intake': [2.947351133],
    'screen_time': [5.982054146],
    'stress_level': [2],
    'mental_health_score': [2],
    'mental_health_support': ['No'],
    'education_level': ['Bachelor'],
    'job_type': ['Healthcare'],
    'occupation': ['Doctor'],
    'income': [4639.158189],
    'diet_type': ['Omnivore'],
    'exercise_type': ['Mixed'],
    'device_usage': ['Low'],
    'healthcare_access': ['Moderate'],
    'insurance': ['No'],
    'sunlight_exposure': ['Moderate'],
    'meals_per_day': [4],
    'caffeine_intake': ['High'],
    'family_history': ['Yes'],
    'pet_owner': ['Yes'],
    'electrolyte_level': [0],
    'gene_marker_flag': [1],
    'environmental_risk_score': [5.5],
    'daily_supplement_dosage': [8.188736505]
})

# Display sample to verify
print(single_sample)

   survey_code  age gender      height     weight        bmi  bmi_estimated  \
0          143   34   Male  160.812929  87.995624  34.026647      34.026647   

   bmi_scaled  bmi_corrected  waist_size  ...  insurance  sunlight_exposure  \
0  102.079941      34.137199   98.416084  ...         No           Moderate   

   meals_per_day  caffeine_intake  family_history  pet_owner  \
0              4             High             Yes        Yes   

  electrolyte_level  gene_marker_flag  environmental_risk_score  \
0                 0                 1                       5.5   

   daily_supplement_dosage  
0                 8.188737  

[1 rows x 47 columns]


In [None]:
# Make prediction
prediction = model.predict(single_sample)

# Display the prediction
print("Prediction for single sample:", prediction[0])

Prediction for single sample: healthy


In [None]:
from sklearn.metrics import accuracy_score, classification_report
y_train_pred = model.predict(X_train)
y_test_pred = model.predict(X_test)
print("Training Accuracy:", accuracy_score(y_train, y_train_pred))
print("Test Accuracy:", accuracy_score(y_test, y_test_pred))
print("\nTest Classification Report:\n", classification_report(y_test, y_test_pred))

Training Accuracy: 1.0
Test Accuracy: 0.69835

Test Classification Report:
               precision    recall  f1-score   support

    diseased       0.19      0.00      0.00      6023
     healthy       0.70      1.00      0.82     13977

    accuracy                           0.70     20000
   macro avg       0.44      0.50      0.41     20000
weighted avg       0.54      0.70      0.58     20000



In [None]:
feature_importances = model.named_steps['classifier'].feature_importances_
numerical_cols = data.drop('target', axis=1).select_dtypes(include=['float64', 'int64']).columns
categorical_cols = data.drop('target', axis=1).select_dtypes(include=['object']).columns
feature_names = numerical_cols.tolist() + model.named_steps['preprocessor'].transformers_[1][1].named_steps['onehot'].get_feature_names_out(categorical_cols).tolist()
importance_df = pd.DataFrame({'Feature': feature_names, 'Importance': feature_importances})
print(importance_df.sort_values(by='Importance', ascending=False).head(10))

                    Feature  Importance
11              cholesterol    0.033510
0               survey_code    0.033303
15               work_hours    0.033215
18           calorie_intake    0.033183
29  daily_supplement_dosage    0.033162
19             sugar_intake    0.033122
8                waist_size    0.032991
14              sleep_hours    0.032808
12                  glucose    0.032801
20             water_intake    0.032778


In [None]:
from xgboost import XGBClassifier
from sklearn.preprocessing import LabelEncoder
le = LabelEncoder()
y_train_encoded = le.fit_transform(y_train)
y_test_encoded = le.transform(y_test)
model = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('classifier', XGBClassifier(n_estimators=100, random_state=42))
])
model.fit(X_train, y_train_encoded)
y_pred = le.inverse_transform(model.predict(X_test))
print("XGBoost Test Classification Report:\n", classification_report(y_test, y_pred))

XGBoost Test Classification Report:
               precision    recall  f1-score   support

    diseased       0.31      0.03      0.05      6023
     healthy       0.70      0.97      0.81     13977

    accuracy                           0.69     20000
   macro avg       0.51      0.50      0.43     20000
weighted avg       0.58      0.69      0.58     20000



In [None]:
print(data.groupby('target')[['bmi', 'cholesterol', 'glucose', 'blood_pressure', 'smoking_level']].describe())

              bmi                                                        \
            count       mean       std        min        25%        50%   
target                                                                    
diseased  29903.0  24.516903  5.930095  10.014093  20.311337  24.257080   
healthy   70097.0  24.484052  5.960009   9.988495  20.252486  24.111666   

                               cholesterol              ...     glucose  \
                75%        max       count        mean  ...         75%   
target                                                  ...               
diseased  28.295897  56.286478     29903.0  190.036612  ...  113.645041   
healthy   28.237929  59.234792     70097.0  189.936502  ...  113.462469   

                     blood_pressure                                    \
                 max          count        mean        std        min   
target                                                                  
diseased  179.855753        2

In [None]:
feature_importances = model.named_steps['classifier'].feature_importances_
feature_names = numerical_cols.tolist() + model.named_steps['preprocessor'].transformers_[1][1].named_steps['onehot'].get_feature_names_out(categorical_cols).tolist()
importance_df = pd.DataFrame({'Feature': feature_names, 'Importance': feature_importances})
print(importance_df.sort_values(by='Importance', ascending=False).head(10))

                       Feature  Importance
69      healthcare_access_Good    0.018156
30               gender_Female    0.016682
76  sunlight_exposure_Moderate    0.016620
47         job_type_Healthcare    0.016278
39         smoking_level_Light    0.016262
55           occupation_Driver    0.015966
70  healthcare_access_Moderate    0.015910
52         job_type_Unemployed    0.015646
35          sleep_quality_Poor    0.015631
60          diet_type_Omnivore    0.015252


In [None]:
import pandas as pd
import joblib

# Load new model
model = joblib.load('health_prediction_model.pkl')  # Update path

# Test sample
single_sample = pd.DataFrame({
    'survey_code': [143],  # Remove if dropped during retraining
    'age': [34],
    'gender': ['Male'],
    'height': [160.8129287],
    'weight': [87.99562442],
    'bmi': [34.02664692],
    'bmi_estimated': [34.02664692],
    'bmi_scaled': [102.0799408],
    'bmi_corrected': [34.13719895],
    'waist_size': [98.41608367],
    'blood_pressure': [111.98239],
    'heart_rate': [80.19802017],
    'cholesterol': [128.0392769],
    'glucose': [111.2221661],
    'insulin': [18.11037723],
    'sleep_hours': [7.479686019],
    'sleep_quality': ['Excellent'],
    'work_hours': [8.277346382],
    'physical_activity': [4.50562205],
    'daily_steps': [8502.977659],
    'calorie_intake': [2191.652711],
    'sugar_intake': [65.62886298],
    'alcohol_consumption': ['None'],
    'smoking_level': ['Light'],
    'water_intake': [2.947351133],
    'screen_time': [5.982054146],
    'stress_level': [2],
    'mental_health_score': [2],
    'mental_health_support': ['No'],
    'education_level': ['Bachelor'],
    'job_type': ['Healthcare'],
    'occupation': ['Doctor'],
    'income': [4639.158189],
    'diet_type': ['Omnivore'],
    'exercise_type': ['Mixed'],
    'device_usage': ['Low'],
    'healthcare_access': ['Moderate'],
    'insurance': ['No'],
    'sunlight_exposure': ['Moderate'],
    'meals_per_day': [4],
    'caffeine_intake': ['High'],
    'family_history': ['Yes'],
    'pet_owner': ['Yes'],
    'electrolyte_level': [0],
    'gene_marker_flag': [1],
    'environmental_risk_score': [5.5],
    'daily_supplement_dosage': [8.188736505]
})

# Drop survey_code if removed during retraining
# single_sample = single_sample.drop('survey_code', axis=1)

# Make prediction
prediction = model.predict(single_sample)
print("Prediction for single sample:", prediction[0])

Prediction for single sample: healthy


In [None]:
data['is_obese'] = (data['bmi'] >= 30).astype(int)  # WHO: BMI ≥ 30
data['is_hypertensive'] = (data['blood_pressure'] >= 130).astype(int)  # AHA: ≥130 mmHg
data['is_tachycardic'] = (data['heart_rate'] > 100).astype(int)  # Mayo Clinic: >100 bpm
data['is_high_cholesterol'] = (data['cholesterol'] >= 240).astype(int)  # CDC: ≥240 mg/dL
data['is_diabetic'] = (data['glucose'] >= 126).astype(int)  # ADA: ≥126 mg/dL
data['is_insufficient_sleep'] = (data['sleep_hours'] < 7).astype(int)  # NSF: <7 hours
data['is_low_activity'] = (data['physical_activity'] < 0.36).astype(int)  # WHO: <150 min/week
data['is_high_sugar'] = (data['sugar_intake'] > 36).astype(int)  # AHA: >36 g/day (men)
data['is_low_water'] = (data['water_intake'] < 2).astype(int)  # Mayo Clinic: <2 L/day
data['is_high_screen_time'] = (data['screen_time'] > 2).astype(int)  # AAP: >2 hours/day
data['is_high_stress'] = (data['stress_level'] >= 4).astype(int)  # Assumed high stress

# Define prioritized features
prioritized_features = [
    'age', 'gender', 'bmi', 'blood_pressure', 'heart_rate', 'cholesterol', 'glucose',
    'sleep_hours', 'physical_activity', 'sugar_intake', 'alcohol_consumption',
    'smoking_level', 'water_intake', 'screen_time', 'stress_level', 'family_history',
    'environmental_risk_score', 'is_obese', 'is_hypertensive', 'is_tachycardic',
    'is_high_cholesterol', 'is_diabetic', 'is_insufficient_sleep', 'is_low_activity',
    'is_high_sugar', 'is_low_water', 'is_high_screen_time', 'is_high_stress'
]

# Identify features to keep (prioritized + target)
all_features = data.columns.tolist()
features_to_keep = [col for col in all_features if col in prioritized_features or col == 'target']
data = data[features_to_keep]

# Split data
X = data.drop('target', axis=1)
y = data['target']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Preprocessing
numerical_cols = X.select_dtypes(include=['float64', 'int64']).columns
categorical_cols = X.select_dtypes(include=['object']).columns
numerical_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='median')),
    ('scaler', StandardScaler())
])
categorical_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('onehot', OneHotEncoder(handle_unknown='ignore'))
])
preprocessor = ColumnTransformer(
    transformers=[
        ('num', numerical_transformer, numerical_cols),
        ('cat', categorical_transformer, categorical_cols)
    ])

# Train model with class weights
model = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('classifier', RandomForestClassifier(n_estimators=100, class_weight='balanced', random_state=42))
])
model.fit(X_train, y_train)

# Evaluate
y_pred = model.predict(X_test)
print("Test Classification Report:\n", classification_report(y_test, y_pred))

# Feature importance
feature_importances = model.named_steps['classifier'].feature_importances_
feature_names = numerical_cols.tolist() + model.named_steps['preprocessor'].transformers_[1][1].named_steps['onehot'].get_feature_names_out(categorical_cols).tolist()
importance_df = pd.DataFrame({'Feature': feature_names, 'Importance': feature_importances})
print("\nTop 10 Feature Importances:\n", importance_df.sort_values(by='Importance', ascending=False).head(10))

# Save new model
joblib.dump(model, 'health_prediction_model_balanced.pkl')

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


Test Classification Report:
               precision    recall  f1-score   support

    diseased       0.00      0.00      0.00      6023
     healthy       0.70      1.00      0.82     13977

    accuracy                           0.70     20000
   macro avg       0.35      0.50      0.41     20000
weighted avg       0.49      0.70      0.57     20000


Top 10 Feature Importances:
               Feature  Importance
8        sugar_intake    0.078025
4         cholesterol    0.078016
5             glucose    0.077756
1                 bmi    0.077613
9        water_intake    0.077227
10        screen_time    0.076642
6         sleep_hours    0.076636
2      blood_pressure    0.076284
7   physical_activity    0.074337
3          heart_rate    0.074075


['health_prediction_model_balanced.pkl']

In [None]:
import joblib

# Save the model
joblib.dump(model, 'health_prediction_model_balanced.pkl')

# To load the model later
# model = joblib.load('health_prediction_model.pkl')

['health_prediction_model_balanced.pkl']

In [None]:
new_input = {
    "age": 45,
    "gender": "Male",
    "bmi": 31,
    "blood_pressure": 145,
    "heart_rate": 102,
    "cholesterol": 250,
    "glucose": 135,
    "sleep_hours": 6,
    "physical_activity": 0.2,
    "sugar_intake": 40,
    "alcohol_consumption": 2,
    "smoking_level": 1,
    "water_intake": 1.5,
    "screen_time": 4,
    "stress_level": 5,
    "family_history": "Yes",
    "environmental_risk_score": 3
}


In [None]:
print(model.feature_names_in_)
print(sample_df.columns)


['age' 'gender' 'bmi' 'blood_pressure' 'heart_rate' 'cholesterol'
 'glucose' 'sleep_hours' 'physical_activity' 'sugar_intake'
 'alcohol_consumption' 'smoking_level' 'water_intake' 'screen_time'
 'stress_level' 'family_history' 'environmental_risk_score' 'is_obese'
 'is_hypertensive' 'is_tachycardic' 'is_high_cholesterol' 'is_diabetic'
 'is_insufficient_sleep' 'is_low_activity' 'is_high_sugar' 'is_low_water'
 'is_high_screen_time' 'is_high_stress']


NameError: name 'sample_df' is not defined

In [None]:
import joblib
model = joblib.load('health_prediction_model_balanced.pkl')



In [None]:
import pandas as pd
import joblib

# Step 1: Load the model
model = joblib.load('health_prediction_model.pkl')  # Make sure the path is correct

# Step 2: Create a single test sample with the new values
single_sample = pd.DataFrame({
    'survey_code': [228],  # Remove if dropped during retraining
    'age': [37],
    'gender': ['Female'],
    'height': [165.2643742],
    'weight': [72.09867908],
    'bmi': [26.39786607],
    'bmi_estimated': [26.39786607],
    'bmi_scaled': [79.19359822],
    'bmi_corrected': [26.38340278],
    'waist_size': [82.70818341],
    'blood_pressure': [128.914122],
    'heart_rate': [64.77061449],
    'cholesterol': [207.3664026],
    'glucose': [114.5554672],
    'insulin': [10.83789669],
    'sleep_hours': [5.7898391],
    'sleep_quality': ['Excellent'],
    'work_hours': [7.967205574],
    'physical_activity': [7.741550245],
    'daily_steps': [6993.009947],
    'calorie_intake': [1886.938351],
    'sugar_intake': [33.84820361],
    'alcohol_consumption': ['Occasionally'],
    'smoking_level': ['Heavy'],
    'water_intake': [2.506330931],
    'screen_time': [3.519238703],
    'stress_level': [4],
    'mental_health_score': [2],
    'mental_health_support': ['Yes'],
    'education_level': ['Master'],
    'job_type': ['Healthcare'],
    'occupation': ['Farmer'],
    'income': [3066.46511],
    'diet_type': ['Vegan'],
    'exercise_type': ['None'],
    'device_usage': ['Low'],
    'healthcare_access': ['Poor'],
    'insurance': ['Yes'],
    'sunlight_exposure': ['Moderate'],
    'meals_per_day': [1],
    'caffeine_intake': ['Moderate'],
    'family_history': ['No'],
    'pet_owner': ['No'],
    'electrolyte_level': [0],
    'gene_marker_flag': [1],
    'environmental_risk_score': [5.5],
    'daily_supplement_dosage': [0.959270806]
})

# Step 3: Drop survey_code if it was dropped during training
# single_sample = single_sample.drop('survey_code', axis=1)

# Step 4: Make prediction
prediction = model.predict(single_sample)

# Step 5: Print result
print("Prediction for single sample:", prediction[0])

Prediction for single sample: healthy


In [None]:
import pandas as pd
import joblib

# Load model
model = joblib.load('health_prediction_model.pkl')

# Create sample
single_sample = pd.DataFrame({
    'survey_code': [439],
    'age': [62],
    'gender': ['Female'],
    'height': [177.7875646],
    'weight': [93.32502933],
    'bmi': [29.52537167],
    'bmi_estimated': [29.52537167],
    'bmi_scaled': [88.57611501],
    'bmi_corrected': [29.34003662],
    'waist_size': [89.32695097],
    'blood_pressure': [102.9024059],
    'heart_rate': [78.74454974],
    'cholesterol': [172.1856915],
    'glucose': [93.30532261],
    'insulin': [14.43922612],
    'sleep_hours': [5.686520077],
    'sleep_quality': ['Poor'],
    'work_hours': [7.131966471],
    'physical_activity': [2.118422661],
    'daily_steps': [4754.937587],
    'calorie_intake': [2797.524885],
    'sugar_intake': [9.63869538],
    'alcohol_consumption': [None],  # Was missing in original data
    'smoking_level': ['Heavy'],
    'water_intake': [1.695994361],
    'screen_time': [6.967413943],
    'stress_level': [9],
    'mental_health_score': [3],
    'mental_health_support': ['No'],
    'education_level': ['Bachelor'],
    'job_type': ['Labor'],
    'occupation': ['Engineer'],
    'income': [5820.958207],
    'diet_type': ['Vegan'],
    'exercise_type': ['Cardio'],
    'device_usage': ['High'],
    'healthcare_access': ['Moderate'],
    'insurance': ['Yes'],
    'sunlight_exposure': ['High'],
    'meals_per_day': [3],
    'caffeine_intake': ['High'],
    'family_history': ['Yes'],
    'pet_owner': ['Yes'],
    'electrolyte_level': [0],
    'gene_marker_flag': [1],
    'environmental_risk_score': [5.5],
    'daily_supplement_dosage': [-6.909213923]
})

# Make prediction
prediction = model.predict(single_sample)
print("Prediction:", prediction[0])

Prediction: diseased


In [None]:
import pandas as pd
import joblib

# Load model
model = joblib.load('health_prediction_model.pkl')

# Get user inputs
print("Please provide the following information (press Enter to skip any field):")
user_inputs = {
    'age': input("Age: ") or None,
    'gender': input("Gender (Male/Female/Other): ") or None,
    'height': input("Height (cm): ") or None,
    'weight': input("Weight (kg): ") or None,
    'blood_pressure': input("Blood Pressure: ") or None,
    'cholesterol': input("Cholesterol: ") or None,
    'glucose': input("Glucose: ") or None,
    'stress_level': input("Stress Level (1-10): ") or None
}

# Convert numeric inputs to appropriate types
for key in ['age', 'height', 'weight', 'blood_pressure', 'cholesterol', 'glucose', 'stress_level']:
    if user_inputs[key] is not None:
        user_inputs[key] = float(user_inputs[key])

# Calculate BMI if possible
if user_inputs['height'] and user_inputs['weight']:
    height_m = float(user_inputs['height'])/100
    user_inputs['bmi'] = float(user_inputs['weight'])/(height_m**2)
else:
    user_inputs['bmi'] = None

# Create DataFrame with all model-required columns
# First create dictionary with all possible columns set to None
all_columns = {col: None for col in model.feature_names_in_} if hasattr(model, 'feature_names_in_') else {}
# Update with user provided values
all_columns.update({k: v for k, v in user_inputs.items() if k in all_columns})

single_sample = pd.DataFrame([all_columns])

# Make prediction
try:
    prediction = model.predict(single_sample)
    print("\nPredicted Health Status:", prediction[0])
except Exception as e:
    print(f"\nPrediction failed: {str(e)}")
    print("The model requires more features to make a prediction.")


Please provide the following information (press Enter to skip any field):
Age: 29
Gender (Male/Female/Other): 
Height (cm): 170
Weight (kg): 78
Blood Pressure: 133.1043
Cholesterol: 91.4978
Glucose: 209.3348
Stress Level (1-10): 2

Predicted Health Status: healthy


In [None]:
import pandas as pd
import joblib

# Load model
model = joblib.load('health_prediction_model.pkl')

# Define all fields with their data types
field_types = {
    'survey_code': int,  # Added survey_code to field types
    'age': int,
    'gender': str,
    'height': float,
    'weight': float,
    'bmi': float,
    'blood_pressure': float,
    'cholesterol': float,
    'glucose': float,
    'sleep_hours': float,
    'physical_activity': float,
    'stress_level': int,
    'smoking_level': str,
    'alcohol_consumption': str,
    # Other non-compulsory fields...
    'bmi_estimated': float,
    'bmi_scaled': float,
    'bmi_corrected': float,
    'waist_size': float,
    'heart_rate': float,
    'insulin': float,
    'sleep_quality': str,
    'work_hours': float,
    'daily_steps': float,
    'calorie_intake': float,
    'sugar_intake': float,
    'water_intake': float,
    'screen_time': float,
    'mental_health_score': int,
    'mental_health_support': str,
    'education_level': str,
    'job_type': str,
    'occupation': str,
    'income': float,
    'diet_type': str,
    'exercise_type': str,
    'device_usage': str,
    'healthcare_access': str,
    'insurance': str,
    'sunlight_exposure': str,
    'meals_per_day': int,
    'caffeine_intake': str,
    'family_history': str,
    'pet_owner': str,
    'electrolyte_level': int,
    'gene_marker_flag': int,
    'environmental_risk_score': float,
    'daily_supplement_dosage': float
}

# Compulsory fields (13 most important)
compulsory_fields = [
    'age',
    'gender',
    'height',
    'weight',
    'blood_pressure',
    'cholesterol',
    'glucose',
    'bmi',
    'smoking_level',
    'alcohol_consumption',
    'physical_activity',
    'stress_level',
    'sleep_hours'
]

# Categorical options
categorical_options = {
    'gender': ['Male', 'Female', 'Other'],
    'smoking_level': ['None', 'Light', 'Moderate', 'Heavy'],
    'alcohol_consumption': ['None', 'Occasionally', 'Regularly', 'Heavy'],
    'sleep_quality': ['Poor', 'Fair', 'Good', 'Excellent'],
    'mental_health_support': ['Yes', 'No'],
    'diet_type': ['Omnivore', 'Vegetarian', 'Vegan'],
    'exercise_type': ['None', 'Cardio', 'Strength', 'Mixed']
}

def get_compulsory_input():
    print("\n=== COMPULSORY FIELDS ===")
    print("Please provide these 13 key health metrics:\n")

    user_inputs = {}
    for field in compulsory_fields:
        while True:
            if field in categorical_options:
                print(f"Options for {field}: {', '.join(categorical_options[field])}")
            value = input(f"Enter value for {field}: ").strip()

            if not value:
                print("This field is required. Please enter a value.")
                continue

            try:
                user_inputs[field] = field_types[field](value)
                break
            except ValueError:
                print(f"Invalid format for {field}. Please try again.")

    # Calculate BMI if not provided but height/weight available
    if 'bmi' not in user_inputs and 'height' in user_inputs and 'weight' in user_inputs:
        height_m = user_inputs['height']/100
        user_inputs['bmi'] = user_inputs['weight']/(height_m**2)

    return user_inputs

def get_optional_input():
    print("\n=== OPTIONAL FIELDS ===")
    optional_fields = [f for f in field_types.keys() if f not in compulsory_fields and f != 'survey_code']

    if not optional_fields:
        return {}

    print("\nAvailable additional health metrics:")
    for i, field in enumerate(optional_fields, 1):
        print(f"{i}. {field}")

    selected = input("\nEnter numbers of additional fields to provide (comma separated, or press Enter to skip): ")
    selected_indices = [int(x.strip()) for x in selected.split(',') if x.strip().isdigit()]

    additional_inputs = {}
    for idx in selected_indices:
        if 1 <= idx <= len(optional_fields):
            field = optional_fields[idx-1]
            if field in categorical_options:
                print(f"\nOptions for {field}: {', '.join(categorical_options[field])}")
            value = input(f"Enter value for {field}: ")
            if value:
                try:
                    additional_inputs[field] = field_types[field](value)
                except ValueError:
                    print(f"Invalid format for {field}. Skipping this field.")

    return additional_inputs

# Main execution
print("=== Health Prediction System ===")
print("First provide 13 key health metrics, then optional ones if desired.\n")

while True:
    # Get compulsory inputs
    user_inputs = get_compulsory_input()

    # Get optional inputs
    additional_inputs = get_optional_input()
    user_inputs.update(additional_inputs)

    # Add survey_code with None if not provided
    if 'survey_code' not in user_inputs:
        user_inputs['survey_code'] = None

    # Prepare complete input with all expected columns
    complete_input = {field: None for field in field_types}
    complete_input.update(user_inputs)

    # Create DataFrame ensuring all columns are present
    single_sample = pd.DataFrame([complete_input])[list(field_types.keys())]

    # Make prediction
    try:
        prediction = model.predict(single_sample)
        print("\n=== Prediction Result ===")
        print(f"Health Risk Level: {prediction[0]}")

        # Basic interpretation
        if prediction[0] == 1:
            print("Interpretation: High health risk detected")
        else:
            print("Interpretation: Lower health risk profile")

        break
    except Exception as e:
        print("\nPrediction failed:", str(e))
        print("Missing columns:", [col for col in field_types.keys() if col not in single_sample.columns])
        retry = input("Would you like to try again? (yes/no): ")
        if retry.lower() != 'yes':
            break

=== Health Prediction System ===
First provide 13 key health metrics, then optional ones if desired.


=== COMPULSORY FIELDS ===
Please provide these 13 key health metrics:



KeyboardInterrupt: Interrupted by user

In [None]:
import pandas as pd

# Load the CSV file (update the path if using Google Drive)
data = pd.read_csv('Medicine_Details.csv')

# Display the first few rows and data info
print(data.head())
print(data.info())

              Medicine Name  \
0   Avastin 400mg Injection   
1  Augmentin 625 Duo Tablet   
2       Azithral 500 Tablet   
3          Ascoril LS Syrup   
4         Aciloc 150 Tablet   

                                         Composition  \
0                                Bevacizumab (400mg)   
1    Amoxycillin  (500mg) +  Clavulanic Acid (125mg)   
2                               Azithromycin (500mg)   
3  Ambroxol (30mg/5ml) + Levosalbutamol (1mg/5ml)...   
4                                 Ranitidine (150mg)   

                                                Uses  \
0   Cancer of colon and rectum Non-small cell lun...   
1                  Treatment of Bacterial infections   
2                  Treatment of Bacterial infections   
3                      Treatment of Cough with mucus   
4  Treatment of Gastroesophageal reflux disease (...   

                                        Side_effects  \
0  Rectal bleeding Taste change Headache Noseblee...   
1  Vomiting Nausea Diarrhea

In [None]:
!pip install catboost

Collecting catboost
  Downloading catboost-1.2.8-cp311-cp311-manylinux2014_x86_64.whl.metadata (1.2 kB)
Downloading catboost-1.2.8-cp311-cp311-manylinux2014_x86_64.whl (99.2 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m99.2/99.2 MB[0m [31m7.4 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: catboost
Successfully installed catboost-1.2.8


In [107]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import r2_score, mean_squared_error
import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader

# 1. Load and prepare data with proper index handling
data = pd.read_csv('Medicine_Details.csv').reset_index(drop=True)  # Ensure clean 0-based index

# 2. Text preprocessing with index preservation
def prepare_text(data):
    text_cols = ['Medicine Name', 'Uses', 'Side_effects']
    for col in text_cols:
        data[col] = data[col].fillna('').astype(str)
    texts = data['Medicine Name'] + " " + data['Uses'] + " " + data['Side_effects']
    return texts.reset_index(drop=True)  # Force clean indices

texts = prepare_text(data)
y = data['Excellent Review %'].values

# 3. Train-test split with index reset
X_train, X_test, y_train, y_test = train_test_split(
    texts,
    y,
    test_size=0.2,
    random_state=42
)

# Verify indices
print("X_train index range:", X_train.index.min(), X_train.index.max())  # Should be 0 to N-1
print("X_test index range:", X_test.index.min(), X_test.index.max())     # Should be 0 to M-1

# 4. Neural Network Implementation
class MedicineDataset(Dataset):
    def __init__(self, texts, targets, vectorizer):
        self.texts = texts.reset_index(drop=True).values  # Force array conversion
        self.targets = targets
        self.vectorizer = vectorizer

    def __len__(self):
        return len(self.texts)

    def __getitem__(self, idx):
        text = self.texts[idx]
        features = self.vectorizer.transform([text]).toarray().squeeze()
        return torch.FloatTensor(features), torch.FloatTensor([self.targets[idx]])

# Initialize vectorizer on training data only
vectorizer = TfidfVectorizer(max_features=10000, ngram_range=(1, 2), stop_words='english')
vectorizer.fit(X_train)

# Create datasets with proper indexing
train_dataset = MedicineDataset(X_train, y_train, vectorizer)
test_dataset = MedicineDataset(X_test, y_test, vectorizer)

# 5. Model Training
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model = nn.Sequential(
    nn.Linear(10000, 512),
    nn.ReLU(),
    nn.Dropout(0.3),
    nn.Linear(512, 1)
).to(device)

# Training loop
train_loader = DataLoader(train_dataset, batch_size=32, shuffle=True)
optimizer = torch.optim.Adam(model.parameters(), lr=0.001)
criterion = nn.MSELoss()

for epoch in range(10):
    model.train()
    for features, targets in train_loader:
        features, targets = features.to(device), targets.to(device)
        optimizer.zero_grad()
        outputs = model(features)
        loss = criterion(outputs, targets)
        loss.backward()
        optimizer.step()

    # Evaluation
    model.eval()
    with torch.no_grad():
        test_features = torch.FloatTensor(vectorizer.transform(X_test).toarray()).to(device)
        preds = model(test_features).cpu().numpy()

    print(f"Epoch {epoch+1}")
    print(f"R2 Score: {r2_score(y_test, preds):.4f}")
    print(f"MSE: {mean_squared_error(y_test, preds):.4f}\n")

X_train index range: 1 11824
X_test index range: 0 11823
Epoch 1
R2 Score: 0.0003
MSE: 650.7282

Epoch 2
R2 Score: 0.0224
MSE: 636.3724

Epoch 3
R2 Score: 0.0202
MSE: 637.7720

Epoch 4
R2 Score: 0.0158
MSE: 640.6299

Epoch 5
R2 Score: 0.0052
MSE: 647.5676

Epoch 6
R2 Score: -0.0001
MSE: 651.0419

Epoch 7
R2 Score: -0.0112
MSE: 658.2241

Epoch 8
R2 Score: -0.0168
MSE: 661.8770

Epoch 9
R2 Score: -0.0300
MSE: 670.4484

Epoch 10
R2 Score: -0.0345
MSE: 673.4127

