In [2]:
# import required libraries
import pandas as pd 
import numpy as np 
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
import xgboost as xgb 
from sklearn.metrics import accuracy_score, classification_report, accuracy_score, precision_score, recall_score, f1_score  

In [3]:
# load the dataframe 
df = pd.read_csv('severity_symptoms_asthma.csv')

In [19]:
# check data 
df

Unnamed: 0.1,Unnamed: 0,Patient_ID,Age,Gender,Smoking_Status,Asthma_Diagnosis,Medication,Peak_Flow,Symptoms,Severity
0,0,1,26,Female,Non-Smoker,Yes,Inhaler,175,Wheezing & Shortness of Breath,Severe
1,1,2,52,Female,Ex-Smoker,No,,156,Wheezing & Shortness of Breath,
2,2,3,56,Female,Ex-Smoker,Yes,Inhaler,236,Occasional Coughing,Moderate
3,3,4,55,Male,Current Smoker,Yes,Controller Medication,378,Occasional Coughing,Severe
4,4,5,43,Female,Current Smoker,No,,159,Wheezing & Shortness of Breath,
...,...,...,...,...,...,...,...,...,...,...
295,295,296,25,Female,Current Smoker,No,,283,No Symptoms,
296,296,297,20,Male,Ex-Smoker,No,,202,Occasional Coughing,
297,297,298,25,Female,Non-Smoker,No,,208,Occasional Coughing,
298,298,299,18,Male,Ex-Smoker,No,,175,Wheezing & Shortness of Breath,


In [4]:
# dropping unecessary columns
df = df.drop(columns=['Unnamed: 0', 'Patient_ID', 'Medication'])

In [5]:
df.isnull().sum()

Age                   0
Gender                0
Smoking_Status        0
Asthma_Diagnosis      0
Peak_Flow             0
Symptoms              0
Severity            143
dtype: int64

In [6]:
# since we have null values in only severity column which represents no severity I am assuming it as low
# so I replaced it with low
df = df.fillna('low')
df.isnull().sum()

Age                 0
Gender              0
Smoking_Status      0
Asthma_Diagnosis    0
Peak_Flow           0
Symptoms            0
Severity            0
dtype: int64

Here data are encoded manually

In [22]:
df['Gender_encoded'] = df['Gender']
df['Gender_encoded'] = df['Gender_encoded'].replace({'Female':0, 'Male':1})

In [23]:
df['Smoking_Status_encoded'] = df['Smoking_Status']
df['Smoking_Status_encoded'] = df['Smoking_Status_encoded'].replace({'Non-Smoker':0, 'Ex-Smoker':1, 'Current Smoker':2})

In [24]:
df['Asthma_Diagnosis_encoded'] = df['Asthma_Diagnosis']
df['Asthma_Diagnosis_encoded'] = df['Asthma_Diagnosis_encoded'].replace({'Yes':3, 'No':0})

In [25]:
df['Symptoms_encoded'] = df['Symptoms']
df['Symptoms_encoded'] = df['Symptoms_encoded'].replace({'No Symptoms':2,'Wheezing & Shortness of Breath':5, 'Occasional Coughing':0, 'Shortness of Breath & Fatigue':4})

In [26]:
df['Severity_encoded'] = df['Severity']
df['Severity_encoded'] = df['Severity_encoded'].replace({'Severe':0, 'Moderate':2, 'low':1})

In [12]:
# columns are arrenged 
df = df[['Age', 'Gender_encoded', 'Smoking_Status_encoded', 'Asthma_Diagnosis_encoded', 'Symptoms_encoded', 'Severity_encoded','Peak_Flow']]

Training the model 

In [13]:
X = df[['Age','Gender_encoded', 'Smoking_Status_encoded', 'Asthma_Diagnosis_encoded', 'Symptoms_encoded','Peak_Flow']]
y = df['Severity_encoded']

In [14]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state = 42) 

In [36]:
 model = xgb.XGBClassifier(
    eval_metric = 'mlogloss',
    random_state = 42
)

In [37]:
model.fit(X_train,y_train)

In [39]:
y_pred = model.predict(X_test)
y_pred

Checking accuracy of model 

In [47]:
accuracy = accuracy_score(y_test, y_pred)
print('accuracy', accuracy)
print(classification_report(y_test, y_pred))

accuracy 0.85
              precision    recall  f1-score   support

           0       0.83      0.80      0.82        25
           1       1.00      1.00      1.00        28
           2       0.38      0.43      0.40         7

    accuracy                           0.85        60
   macro avg       0.74      0.74      0.74        60
weighted avg       0.86      0.85      0.85        60



In [48]:
actual_labels = np.where(y_test == 0,1,0) 
predicted_labels = np.where(y_pred == 0,1,0) 

In [49]:
from sklearn.metrics import confusion_matrix, ConfusionMatrixDisplay
tn, fp, fn, tp = confusion_matrix(actual_labels, predicted_labels).ravel()

In [50]:
print(tn)
print(fn)

31
5


In [53]:
ppv = tp / (tp + fp) if (tp + fp) != 0 else 0  # Avoid division by zero
npv = tn / (tn + fn) if (tn + fn) != 0 else 0  # Avoid division by zero

In [54]:
print(ppv, npv)

0.8333333333333334 0.8611111111111112


In [55]:
sensitivity = tp / (tp + fn) if (tp + fn) > 0 else 0
specificity = tn / (tn + fp) if (tn + fp) > 0 else 0

In [56]:
print('sensitivity: ', sensitivity)
print('specificity: ', specificity)

sensitivity:  0.8
specificity:  0.8857142857142857


In [57]:
from sklearn.model_selection import cross_val_predict 

cv_scores = cross_val_predict(model, X_train, y_train, cv=5) 


In [58]:
print(np.mean(cv_scores))

0.7208333333333333


In [108]:
# testing data 
X_test

Unnamed: 0,Age,Gender_encoded,Smoking_Status_encoded,Asthma_Diagnosis_encoded,Symptoms_encoded
241,22,1,2,0,1
92,38,0,1,0,1
252,54,0,2,0,2
262,59,1,1,0,1
211,23,1,0,0,1
65,47,1,0,0,1
186,24,0,2,0,1
189,55,1,0,0,1
185,28,1,2,0,1
263,47,0,0,0,1


In [63]:
# making list of output y_pred 
l = [] 
for i in y_pred: 
    l.append(i)

In [64]:
test_data = X_test[['Symptoms_encoded', 'Gender_encoded', 'Age', 'Asthma_Diagnosis_encoded']]

In [65]:
# adding output y_pred to test data for giving input to treatment data.
test_data['Severity_encoded'] = l

In [66]:
test_data

Unnamed: 0,Symptoms_encoded,Gender_encoded,Age,Asthma_Diagnosis_encoded,Severity_encoded
203,5,1,20,0,1
266,0,0,43,3,0
152,2,0,31,0,1
9,2,1,27,0,1
233,0,1,30,3,2
226,2,0,61,0,1
196,2,0,26,0,1
109,5,1,52,0,1
5,2,1,23,0,1
175,2,0,41,0,1


In [67]:
# loading data as csv file 
test_data.to_csv('newest_new_feeding_data.csv')

Checking p-values of dataset 

In [None]:
import pandas as pd
import scipy.stats as stats

group1 = df[df["Asthma_Diagnosis"] == "Yes"]["Age"]
group2 = df[df["Asthma_Diagnosis"] == "No"]["Age"]

t_stat, p_value_ttest = stats.ttest_ind(group1, group2, equal_var=False)
print(f"T-Test: P-Value for Age vs. Asthma Diagnosis = {p_value_ttest}")

In [None]:
###  ANOVA: Compare Peak_Flow across Asthma Diagnosis categories**
moderate = df[df["Severity"] == "Moderate"]["Peak_Flow"]
severe = df[df["Severity"] == "Severe"]["Peak_Flow"]

f_stat, p_value_anova = stats.f_oneway(moderate, severe)
print(f"ANOVA: P-Value for Peak_Flow vs. Severity = {p_value_anova}")

In [None]:
### ** ANOVA: Compare Peak_Flow across Asthma Diagnosis categories**
ex = df[df["Smoking_Status"] == "Ex-Smoker"]["Peak_Flow"]
curr = df[df["Smoking_Status"] == "Current Smoker"]["Peak_Flow"]
non = df[df["Smoking_Status"] == "Non-Smoker"]["Peak_Flow"]

f_stat, p_value_anova = stats.f_oneway(ex,curr,non)
print(f"ANOVA: P-Value for Peak_Flow vs. Smoking_status = {p_value_anova}")

In [None]:
contingency_table = pd.crosstab(df["Gender"], df["Asthma_Diagnosis"])
chi2_stat, p_value_chi2, dof, expected = stats.chi2_contingency(contingency_table)
print(f"Chi-Square: P-Value for Gender vs. Asthma_Diagnosis = {p_value_chi2}")

In [None]:
# loadig ml models in pkl format

In [59]:
import pickle 
with open('xgboost_model_severity_pred2.pkl', 'wb') as f:
pickle.dump(model, f)

In [60]:
with open('xgboost_model_severity_pred2.pkl', 'rb') as f: 
    loaded_model = pickle.load(f)