In [3]:
import pandas as pd

# Read the disease/symptom patient-profile CSV into a DataFrame and preview
# its first rows as plain text.
dataset_path = 'Disease_symptom_and_patient_profile_dataset.csv'
df = pd.read_csv(dataset_path)
preview = df.head()
print(preview)

       Disease Fever Cough Fatigue Difficulty Breathing  Age  Gender  \
0    Influenza   Yes    No     Yes                  Yes   19  Female   
1  Common Cold    No   Yes     Yes                   No   25  Female   
2       Eczema    No   Yes     Yes                   No   25  Female   
3       Asthma   Yes   Yes      No                  Yes   25    Male   
4       Asthma   Yes   Yes      No                  Yes   25    Male   

  Blood Pressure Cholesterol Level Outcome Variable  
0            Low            Normal         Positive  
1         Normal            Normal         Negative  
2         Normal            Normal         Negative  
3         Normal            Normal         Positive  
4         Normal            Normal         Positive  


In [5]:
# Show the column labels of the loaded dataset.
column_labels = df.columns
print(column_labels)

Index(['Disease', 'Fever', 'Cough', 'Fatigue', 'Difficulty Breathing', 'Age',
       'Gender', 'Blood Pressure', 'Cholesterol Level', 'Outcome Variable'],
      dtype='object')


In [7]:
from sklearn.preprocessing import LabelEncoder

# Encode categorical variables
# Every column listed here is label-encoded to integer codes; 'Age' is not
# listed and is left untouched.
categorical_columns = ['Disease', 'Fever', 'Cough', 'Fatigue', 'Difficulty Breathing',
                       'Gender', 'Blood Pressure', 'Cholesterol Level', 'Outcome Variable']

# Guard against re-running this cell. `df` is overwritten in place below, so a
# second run would re-fit each encoder on integer codes and throw away the
# original string classes that later cells need for transform() and
# inverse_transform(). Fail loudly *before* resetting `label_encoders`, so the
# encoders from the first run stay usable.
already_encoded = [column for column in categorical_columns
                   if pd.api.types.is_numeric_dtype(df[column])]
if already_encoded:
    raise RuntimeError(
        f'Columns already encoded: {already_encoded}. '
        'Re-run the cell that loads the dataset before re-running this cell.'
    )

# One fitted encoder per column, kept so new samples can be encoded and
# predictions decoded with the same mapping.
label_encoders = {}
for column in categorical_columns:
    label_encoders[column] = LabelEncoder()
    df[column] = label_encoders[column].fit_transform(df[column])

# Verify the DataFrame after encoding
print(df.head())
print(df.columns)

   Disease  Fever  Cough  Fatigue  Difficulty Breathing  Age  Gender  \
0       56      1      0        1                     1   19       0   
1       24      0      1        1                     0   25       0   
2       37      0      1        1                     0   25       0   
3        6      1      1        0                     1   25       1   
4        6      1      1        0                     1   25       1   

   Blood Pressure  Cholesterol Level  Outcome Variable  
0               1                  2                 1  
1               2                  2                 0  
2               2                  2                 0  
3               2                  2                 1  
4               2                  2                 1  
Index(['Disease', 'Fever', 'Cough', 'Fatigue', 'Difficulty Breathing', 'Age',
       'Gender', 'Blood Pressure', 'Cholesterol Level', 'Outcome Variable'],
      dtype='object')


In [9]:
# Separate the prediction target from the feature columns.
target_column = 'Outcome Variable'
y = df[target_column]
X = df.drop(columns=[target_column])

In [11]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows for testing; the fixed seed keeps the split
# reproducible between runs.
split_options = {'test_size': 0.2, 'random_state': 42}
X_train, X_test, y_train, y_test = train_test_split(X, y, **split_options)


In [13]:
from sklearn.ensemble import RandomForestClassifier

# Build a random forest classifier with a fixed seed so training is repeatable.
forest_options = {'random_state': 42}
model = RandomForestClassifier(**forest_options)

# Fit the classifier on the training split.
model.fit(X_train, y_train)


In [15]:
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

# Predict outcomes for the held-out test rows.
y_pred = model.predict(X_test)

# Score the predictions against the true labels with each metric, in the
# order: accuracy, precision, recall, F1.
accuracy, precision, recall, f1 = (
    metric(y_test, y_pred)
    for metric in (accuracy_score, precision_score, recall_score, f1_score)
)

In [17]:
# Report the four evaluation metrics, one per line.
print(
    f'Accuracy: {accuracy}',
    f'Precision: {precision}',
    f'Recall: {recall}',
    f'F1 Score: {f1}',
    sep='\n',
)

Accuracy: 0.8428571428571429
Precision: 0.8717948717948718
Recall: 0.85
F1 Score: 0.8607594936708861


In [21]:
# Raw (un-encoded) feature values for a single patient to classify.
new_data = dict([
    ('Disease', 'Influenza'),
    ('Fever', 'Yes'),
    ('Cough', 'No'),
    ('Fatigue', 'Yes'),
    ('Difficulty Breathing', 'Yes'),
    ('Age', 20),
    ('Gender', 'Female'),
    ('Blood Pressure', 'Low'),
    ('Cholesterol Level', 'Normal'),
])

In [23]:
# Every model feature must be present in `new_data`: building the DataFrame
# with columns=X.columns would otherwise fill an absent (or misspelled) key
# with NaN instead of failing, hiding the mistake.
missing_features = [column for column in X.columns if column not in new_data]
if missing_features:
    raise KeyError(f'new_data is missing feature(s): {missing_features}')

# Encode each value with the encoder fitted for its column; values whose
# column has no fitted encoder (e.g. 'Age') pass through unchanged.
new_data_encoded = {
    column: label_encoders[column].transform([value])[0] if column in label_encoders else value
    for column, value in new_data.items()
}

# Convert the encoded new data into a single-row DataFrame with the same
# column order as X
new_data_df = pd.DataFrame([new_data_encoded], columns=X.columns)

# Make a prediction
prediction = model.predict(new_data_df)
# Decode the prediction back to its original label
prediction_decoded = label_encoders['Outcome Variable'].inverse_transform(prediction)
print(f'Prediction: {prediction_decoded[0]}')

Prediction: Positive


In [25]:
# Raw (un-encoded) feature values for a second patient to classify.
new_data = {}
new_data['Disease'] = 'Common Cold'
new_data['Fever'] = 'No'
new_data['Cough'] = 'Yes'
new_data['Fatigue'] = 'Yes'
new_data['Difficulty Breathing'] = 'No'
new_data['Age'] = 25
new_data['Gender'] = 'Female'
new_data['Blood Pressure'] = 'Normal'
new_data['Cholesterol Level'] = 'Normal'


In [27]:
# Every model feature must be present in `new_data`: building the DataFrame
# with columns=X.columns would otherwise fill an absent (or misspelled) key
# with NaN instead of failing, hiding the mistake.
missing_features = [column for column in X.columns if column not in new_data]
if missing_features:
    raise KeyError(f'new_data is missing feature(s): {missing_features}')

# Encode each value with the encoder fitted for its column; values whose
# column has no fitted encoder (e.g. 'Age') pass through unchanged.
new_data_encoded = {
    column: label_encoders[column].transform([value])[0] if column in label_encoders else value
    for column, value in new_data.items()
}

# Convert the encoded new data into a single-row DataFrame with the same
# column order as X
new_data_df = pd.DataFrame([new_data_encoded], columns=X.columns)

# Make a prediction
prediction = model.predict(new_data_df)
# Decode the prediction back to its original label
prediction_decoded = label_encoders['Outcome Variable'].inverse_transform(prediction)
print(f'Prediction: {prediction_decoded[0]}')

Prediction: Negative
