In [None]:
!pip install xgboost

In [5]:
import pandas as pd 
import numpy as np 
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
import xgboost as xgb 
from sklearn.metrics import accuracy_score, classification_report, accuracy_score, precision_score, recall_score, f1_score  

In [6]:
df = pd.read_csv('respiratory symptoms and treatment.csv')

In [14]:
df.isnull().sum()

Symptoms     0
Age          0
Sex          0
Disease      0
Treatment    0
Nature       0
dtype: int64

In [13]:
df = df.dropna()

In [6]:
categorical_cols = ['Symptoms', 'Sex', 'Disease', 'Treatment', 'Nature']

In [7]:

encoders = {} 
for col in categorical_cols: 
    encoder = LabelEncoder() 
    df[col] = df[col].astype(str)  # Convert to string to handle NaNs as 'nan'
    df[col + "_encoded"] = encoder.fit_transform(df[col])
    encoders[col] = encoder  # Store encoder for inverse transformation

In [8]:
df.sample(10)

Unnamed: 0,Symptoms,Age,Sex,Disease,Treatment,Nature,Symptoms_encoded,Sex_encoded,Disease_encoded,Treatment_encoded,Nature_encoded
17435,shortness of breath,50.0,male,Mesothelioma,consult a doctor,high,79,1,7,16,0
7955,chest pain,11.0,male,Pneumonia,Antibiotic,medium,46,1,8,1,2
4445,whistling sound while breathing,10.0,male,bronchitis,antibiotics,medium,87,1,14,14,2
20357,loss of appetite,56.0,female,bronchiolitis,saline nose drops,high,66,0,13,28,0
16059,Rapid breathing,63.0,female,Pneumothorax,isotonic sodium chloride solution,high,33,0,9,22,0
3236,Fever,51.0,male,Pneumonia,aspirin,medium,20,1,8,15,2
27874,sharp chest pain,67.0,male,Pneumothorax,isotonic sodium chloride solution,high,76,1,9,22,0
25634,cold,17.0,male,Pneumothorax,isotonic sodium chloride solution,high,48,1,9,22,0
23749,Fatigue,88.0,male,chronic obstructive pulmonary disease,oxyzen,high,17,1,15,25,0
32843,Wider and rounder than normal fingertips and toes,15.0,male,Asbestosis,stay away from cold places,high,43,1,1,29,0


In [9]:
X = df[['Symptoms_encoded', 'Sex_encoded', 'Age', 'Disease_encoded', 'Nature_encoded']]
y = df['Treatment_encoded']

In [10]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [11]:
# initialize and train XGBoost classifier 
model = xgb.XGBClassifier( 
    eval_metric = 'mlogloss', 
    random_state = 42
)

In [None]:
model.fit(X_train, y_train) 

In [63]:
y_pred = model.predict(X_test)
y_pred 

In [None]:
# Convert predictions back to original category
test_data_prediction = pd.DataFrame({
    'Predicted_Treatment_Encoded': predictions,
    'original_treatments': encoders['Treatment'].inverse_transform(predictions)
})
test_data_prediction

In [None]:
# load new unseen data which are generated from the severity prediction model  
unseen_data = pd.read_csv('newest_new_feeding_data.csv')

In [None]:
# changing name of cols for this model 
unseen_data['Disease_encoded'] = test_data['Asthma_Diagnosis_encoded']
unseen_data['Sex_encoded'] = test_data['Gender_encoded']
unseen_data['Nature_encoded'] = test_data['Severity_encoded']

In [None]:
unseen = unseen_data[['Symptoms_encoded', 'Sex_encoded', 'Age', 'Disease_encoded', 'Nature_encoded']]

In [None]:
unseen_data = unseen_data.drop(unseen_data[unseen_data['Disease_encoded']==0].index)

In [75]:
unseen_data 

Unnamed: 0,Symptoms_encoded,Sex_encoded,Age,Disease_encoded,Nature_encoded
1,0,0,43,3,0
4,0,1,30,3,2
10,0,0,45,3,2
12,0,1,27,3,0
14,0,0,41,3,0
15,0,1,44,3,2
17,0,1,23,3,0
20,0,0,63,3,0
22,0,0,54,3,0
23,0,0,21,3,0


In [76]:
new_pred = model.predict(unseen_data)

In [72]:
new_pred

array([10,  9, 10,  9, 10,  9,  9,  9, 10, 10,  9,  9, 10,  9, 10,  5, 10,
        9, 10, 10, 10,  9,  9,  9,  9, 10, 10,  9,  9, 10, 10,  9],
      dtype=int64)

In [None]:
# Convert predictions back to original category
unseen_data_predictions = pd.DataFrame({
    'Predicted_Treatment_Encoded': new_pred,
    'original_treatmens': encoders['Treatment'].inverse_transform(new_pred)
})
unseen_data_predictions

In [51]:
accuracy = accuracy_score(y_test, new_pred) 
classification_rep = classification_report(y_test, new_pred) 

In [52]:
print('accuracy - ',accuracy) 
print('classification_rep - ',classification_rep)  

accuracy -  0.9903743315508021
classification_rep -                precision    recall  f1-score   support

           0       1.00      1.00      1.00       151
           1       0.98      1.00      0.99       675
           2       1.00      1.00      1.00         8
           3       1.00      0.88      0.94       118
           4       1.00      1.00      1.00       577
           5       1.00      1.00      1.00       186
           6       1.00      1.00      1.00       326
           7       1.00      1.00      1.00        62
           8       1.00      1.00      1.00       101
           9       1.00      1.00      1.00        59
          10       1.00      1.00      1.00        52
          11       1.00      1.00      1.00       373
          12       1.00      0.86      0.93       212
          13       0.86      1.00      0.93        57
          14       0.99      1.00      1.00       984
          15       1.00      1.00      1.00        68
          16       0.95     

In [64]:
actual_labels = np.where(y_test == 0,1,0) 
predicted_labels = np.where(y_pred == 0,1,0) 

In [65]:
# checking specificity and sensitivity 

# creating confusion matrix 
from sklearn.metrics import confusion_matrix, ConfusionMatrixDisplay
tn, fp, fn, tp = confusion_matrix(actual_labels, predicted_labels).ravel()

In [78]:
ppv = tp / (tp + fp) if (tp + fp) != 0 else 0  # Avoid division by zero
npv = tn / (tn + fn) if (tn + fn) != 0 else 0  # Avoid division by zero

In [79]:
print(ppv, npv)

1.0 1.0


In [66]:
sensitivity = tp / (tp + fn) if (tp + fn) > 0 else 0 
specificity = tn / (tn + fp) if (tn + fp) > 0 else 0

In [67]:
print(sensitivity, specificity)

1.0 1.0


In [69]:
# cross validation 
from sklearn.model_selection import cross_val_predict 

cv_scores = cross_val_predict(model, X_train, y_train, cv=5) 


In [70]:
print(np.mean(cv_scores))

13.595286298177928


In [58]:
import pickle 
with open('xgboost_model_treat_pred.pkl', 'wb') as f: 
    pickle.dump(model, f) 


In [59]:
with open('xgboost_model_treat_pred.pkl', 'rb') as f: 
    loaded_model = pickle.load(f) 


In [60]:
loaded_model

Calcu

In [19]:
df = X

In [36]:
from scipy.stats import chi2_contingency, ttest_ind, f_oneway
def chi_squared_test(df, col1, col2):
    contingency_table = pd.crosstab(df[col1], df[col2])
    chi2, p_val, dof, expected = chi2_contingency(contingency_table)
    print(f"Chi-squared test for {col1} vs {col2}: p-value = {p_val}")

# Function to perform t-test (for comparing means of two groups)
def t_test(df, col, group_col, group1_val, group2_val):
    group1 = df[df[group_col] == group1_val][col]
    group2 = df[df[group_col] == group2_val][col]
    t_stat, p_val = ttest_ind(group1, group2)
    print(f"T-test for {col} between {group_col} == {group1_val} and {group_col} == {group2_val}: p-value = {p_val}")
    print('T-test value: ',t_stat)
# Function to perform ANOVA (for comparing means of multiple groups)
def anova_test(df, col, group_col):
    groups = [df[df[group_col] == val][col] for val in df[group_col].unique()]
    f_stat, p_val = f_oneway(*groups)
    print(f"ANOVA for {col} across different {group_col}: p-value = {p_val}")

In [33]:
chi_squared_test(df, 'Symptoms_encoded', 'Disease_encoded')
chi_squared_test(df, 'Sex_encoded', 'Disease_encoded')
chi_squared_test(df, 'Nature_encoded', 'Disease_encoded')

Chi-squared test for Symptoms_encoded vs Disease_encoded: p-value = 0.0
Chi-squared test for Sex_encoded vs Disease_encoded: p-value = 0.0
Chi-squared test for Nature_encoded vs Disease_encoded: p-value = 0.0


In [37]:
t_test(df, 'Age', 'Sex_encoded', 0, 1)  # Compare Age between Sex_encoded 0 

T-test for Age between Sex_encoded == 0 and Sex_encoded == 1: p-value = 6.491927233883918e-17
T-test value:  -8.360532294866154


In [8]:
df

Unnamed: 0,Symptoms,Age,Sex,Disease,Treatment,Nature
0,coughing,5.0,female,Asthma,Omalizumab,high
1,tight feeling in the chest,4.0,female,Asthma,Mepolizumab,high
2,wheezing,6.0,male,Asthma,Mepolizumab,high
3,shortness of breath,7.0,male,Asthma,Mepolizumab,high
4,shortness of breath,9.0,male,Asthma,Mepolizumab,high
...,...,...,...,...,...,...
38532,A cough that lasts more than three weeks,45.0,female,Tuberculosis,ethambutol,high
38533,Loss of appetite and unintentional weight loss,43.0,female,Tuberculosis,ethambutol,high
38534,Fever,41.0,female,Tuberculosis,ethambutol,high
38535,Chills,53.0,female,Tuberculosis,ethambutol,high


In [15]:
from scipy import stats
### **1️⃣ t-Test: Compare Age between Male & Female**
group1 = df[df["Sex"] == "male"]["Age"]
group2 = df[df["Sex"] == "female"]["Age"]

t_stat, p_value_ttest = stats.ttest_ind(group1, group2, equal_var=False)
print(f"T-Test: P-Value for Age vs. Sex = {p_value_ttest}")

### **2️⃣ ANOVA: Compare Age across different Diseases**
disease_groups = [df[df["Disease"] == disease]["Age"] for disease in df["Disease"].unique()]
f_stat, p_value_anova = stats.f_oneway(*disease_groups)
print(f"ANOVA: P-Value for Age vs. Disease = {p_value_anova}")

### **3️⃣ Chi-Square Test: Relationship between Sex & Disease**
contingency_table = pd.crosstab(df["Sex"], df["Disease"])
chi2_stat, p_value_chi2, dof, expected = stats.chi2_contingency(contingency_table)
print(f"Chi-Square: P-Value for Sex vs. Disease = {p_value_chi2}")

T-Test: P-Value for Age vs. Sex = 3.312426264476545e-17
ANOVA: P-Value for Age vs. Disease = 0.0
Chi-Square: P-Value for Sex vs. Disease = 0.0


In [11]:
print(f"Male count: {len(group1)}, Female count: {len(group2)}")

Male count: 21256, Female count: 15411


In [12]:
print(f"Variance in Male Age: {group1.var()}, Variance in Female Age: {group2.var()}")


Variance in Male Age: 816.7183930264317, Variance in Female Age: 715.2738727026726
