In [188]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
# Number of samples
n_samples = 50000

# Seed for reproducibility
np.random.seed(42)

# Generate features
age = np.random.randint(18, 90, size=n_samples)
gender = np.random.choice(['Male', 'Female'], size=n_samples,p=[0.9,0.1])
income_level = np.random.choice(['Low', 'Medium', 'High'], size=n_samples,p=[0.6,0.25,.15])
employment_status = np.random.choice(['Employed', 'Unemployed'], size=n_samples,p=[0.7,0.3])
education_level = np.random.choice(['Graduate', 'Post graduation', 'No High School'], size=n_samples,p=[.7,.2,.1])
housing_status = np.random.choice(['Homeless', 'Rent', 'Own'], size=n_samples,p=[0.7,0.2,.1])
neighborhood_safety = np.random.choice(['Safe', 'Unsafe'], size=n_samples,p=[0.9,0.1])
access_to_healthcare = np.random.choice(['Poor', 'Good', 'Excellent'], size=n_samples,p=[0.6,0.2,.2])
social_support = np.random.choice(['Low', 'Moderate', 'High'], size=n_samples,p=[0.65,0.15,.2])
physical_activity_level = np.random.choice(['Low', 'Moderate', 'High'], size=n_samples,p=[.6,.2,.2])
dietary_habits = np.random.choice(['Unbalanced', 'Balanced'], size=n_samples,p=[0.9,0.1])
smoking_status = np.random.choice(['Non-smoker', 'Smoker'], size=n_samples,p=[0.9,0.1])
#alcohol_consumption = np.random.choice(['None', 'Light', 'Moderate', 'Heavy'], size=n_samples,p=[0.4,0.2,.2,.2])
#mental_health_status = np.random.choice(['Poor', 'Fair', 'Good', 'Excellent'], size=n_samples,p=[0.3,0.25,.15,.3])

# Generate target variable (disease prediction)
disease = np.random.choice(['Disease_A', 'Disease_B'], size=n_samples, p=[0.8,0.2])

# Create DataFrame
data = pd.DataFrame({
    'age': age,
    'gender': gender,
    'income_level': income_level,
    'employment_status': employment_status,
    'education_level': education_level,
    'housing_status': housing_status,
    'neighborhood_safety': neighborhood_safety,
    'access_to_healthcare': access_to_healthcare,
    'social_support': social_support,
    'physical_activity_level': physical_activity_level,
    'dietary_habits': dietary_habits,
    'smoking_status': smoking_status,
    #'alcohol_consumption': alcohol_consumption,
    #'mental_health_status': mental_health_status,
    'disease': disease
})

print(data.head())


   age gender income_level employment_status  education_level housing_status  \
0   69   Male         High          Employed  Post graduation       Homeless   
1   32   Male          Low        Unemployed         Graduate       Homeless   
2   89   Male          Low          Employed   No High School       Homeless   
3   78   Male          Low          Employed         Graduate            Own   
4   38   Male          Low          Employed  Post graduation            Own   

  neighborhood_safety access_to_healthcare social_support  \
0                Safe                 Good       Moderate   
1                Safe                 Poor            Low   
2                Safe                 Poor            Low   
3                Safe                 Poor       Moderate   
4                Safe                 Good            Low   

  physical_activity_level dietary_habits smoking_status    disease  
0                     Low     Unbalanced     Non-smoker  Disease_A  
1             

In [189]:
from sklearn.preprocessing import LabelEncoder, StandardScaler

# Encode categorical variables
label_encoders = {}
for column in data.select_dtypes(include=['object']).columns:
    label_encoders[column] = LabelEncoder()
    data[column] = label_encoders[column].fit_transform(data[column])
    joblib.dump(data[column], 'label_encoders_'+ column + '.pkl')

# Scale numerical features
scaler = StandardScaler()
data[['age']] = scaler.fit_transform(data[['age']])

print(data.head())


        age  gender  income_level  employment_status  education_level  \
0  0.750106       1             0                  0                2   
1 -1.028843       1             1                  1                0   
2  1.711700       1             1                  0                1   
3  1.182823       1             1                  0                0   
4 -0.740365       1             1                  0                2   

   housing_status  neighborhood_safety  access_to_healthcare  social_support  \
0               0                    0                     1               2   
1               0                    0                     2               1   
2               0                    0                     2               1   
3               1                    0                     2               2   
4               1                    0                     1               1   

   physical_activity_level  dietary_habits  smoking_status  disease  
0         

In [190]:
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier

# Split the data into features and target
X = data.drop('disease', axis=1)
y = data['disease']

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)


In [191]:

# Train a Random Forest classifier
clf = RandomForestClassifier(n_estimators=200,max_depth=10,random_state=42)
clf.fit(X_train, y_train)

In [192]:
from sklearn.metrics import accuracy_score, classification_report

# Predict on the test set
y_pred = clf.predict(X_test)

# Evaluate the model
accuracy = accuracy_score(y_test, y_pred)
report = classification_report(y_test, y_pred, target_names=label_encoders['disease'].classes_)

print("Accuracy:", accuracy)
#print("Classification Report:\n", report)


Accuracy: 0.7998


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


In [193]:
import joblib

# Save the trained model
joblib.dump(clf, 'rf_model.pkl')

# Save the scaler
joblib.dump(scaler, 'scaler.pkl')


['scaler.pkl']

In [68]:
pwd

'/Users/anubhav/Downloads/Final report'

In [199]:
df.head(2)

Unnamed: 0,age,gender,income_level,education_level,employment_status,housing_status,clinical_notes,disease_class
0,69,Male,Low,No High School,Employed,Homeless,Healthy,Disease_A
1,32,Other,Low,PhD,Unemployed,Rent,Minor Issues,Disease_B


In [205]:
df[['gender']].values

array([['Male'],
       ['Other'],
       ['Other'],
       ...,
       ['Female'],
       ['Other'],
       ['Other']], dtype=object)