In [1]:
from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = "all"

In [2]:
import numpy as np
import pandas as pd
df = pd.read_csv('lung cancer survey.csv')
df_no_na = df.dropna()

df_age = df_no_na[df_no_na["AGE"] > 21]
# Use df_age for all models, where clustering algorithm models, such as Kmeans, randomforest, decision tree as they would have already categorise an age threshold within the model
df_age

# Use df_cluster only for logistic regression, lasso, ridge and elastic net - cross comparison of these models with df_age
df_cluster = df_age.copy()
df_cluster['cluster'] = df_cluster['AGE'].apply(lambda x: 1 if x >= 61 else 0)
df_cluster = df_cluster.drop("AGE", axis = 1)
df_cluster

Unnamed: 0,GENDER,AGE,SMOKING,YELLOW_FINGERS,ANXIETY,PEER_PRESSURE,CHRONIC DISEASE,FATIGUE,ALLERGY,WHEEZING,ALCOHOL CONSUMING,COUGHING,SHORTNESS OF BREATH,SWALLOWING DIFFICULTY,CHEST PAIN,LUNG_CANCER
0,0.0,61.0,0.0,0.0,1.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0
1,1.0,70.0,1.0,1.0,0.0,0.0,1.0,0.0,1.0,1.0,0.0,1.0,1.0,0.0,0.0,1.0
2,1.0,59.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
3,1.0,54.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,1.0,1.0,1.0,1.0,0.0,1.0,1.0
4,0.0,54.0,1.0,0.0,0.0,1.0,0.0,1.0,1.0,0.0,1.0,0.0,0.0,1.0,0.0,1.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
8996,1.0,62.0,0.0,1.0,1.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,1.0
8997,0.0,71.0,1.0,1.0,1.0,0.0,0.0,0.0,1.0,0.0,1.0,1.0,1.0,0.0,1.0,1.0
8998,1.0,63.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0
8999,1.0,70.0,1.0,1.0,0.0,0.0,1.0,1.0,1.0,0.0,1.0,0.0,1.0,0.0,1.0,1.0


Unnamed: 0,GENDER,SMOKING,YELLOW_FINGERS,ANXIETY,PEER_PRESSURE,CHRONIC DISEASE,FATIGUE,ALLERGY,WHEEZING,ALCOHOL CONSUMING,COUGHING,SHORTNESS OF BREATH,SWALLOWING DIFFICULTY,CHEST PAIN,LUNG_CANCER,cluster
0,0.0,0.0,0.0,1.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,1
1,1.0,1.0,1.0,0.0,0.0,1.0,0.0,1.0,1.0,0.0,1.0,1.0,0.0,0.0,1.0,1
2,1.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0
3,1.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,1.0,1.0,1.0,1.0,0.0,1.0,1.0,0
4,0.0,1.0,0.0,0.0,1.0,0.0,1.0,1.0,0.0,1.0,0.0,0.0,1.0,0.0,1.0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
8996,1.0,0.0,1.0,1.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,1.0,1
8997,0.0,1.0,1.0,1.0,0.0,0.0,0.0,1.0,0.0,1.0,1.0,1.0,0.0,1.0,1.0,1
8998,1.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,1
8999,1.0,1.0,1.0,0.0,0.0,1.0,1.0,1.0,0.0,1.0,0.0,1.0,0.0,1.0,1.0,1


### model training

### non-PCA

In [4]:
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from sklearn.preprocessing import StandardScaler

X = df_age[['GENDER', 'AGE', 'SMOKING', 'YELLOW_FINGERS', 'ANXIETY',
       'PEER_PRESSURE', 'CHRONIC DISEASE', 'FATIGUE ', 'ALLERGY ', 'WHEEZING',
       'ALCOHOL CONSUMING', 'COUGHING', 'SHORTNESS OF BREATH',
       'SWALLOWING DIFFICULTY', 'CHEST PAIN']].values
y = df_age['LUNG_CANCER'].values

X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=999)

# Standardize the data (since KNN is sensitive to scale)
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_val = scaler.transform(X_val)

k = 3  # Number of neighbors
knn_classifier = KNeighborsClassifier(n_neighbors=k)

# Fit the model
knn_classifier.fit(X_train, y_train)

# Predict on the validation set
y_pred = knn_classifier.predict(X_val)

# Calculate accuracy, precision, recall, and F1-score
accuracy = accuracy_score(y_val, y_pred)
precision = precision_score(y_val, y_pred)
recall = recall_score(y_val, y_pred)
f1 = f1_score(y_val, y_pred)

# Display the results
print(f"Accuracy: {accuracy:.2f}")
print(f"Precision: {precision:.2f}")
print(f"Recall: {recall:.2f}")
print(f"F1 Score: {f1:.2f}")


Accuracy: 0.84
Precision: 0.86
Recall: 0.95
F1 Score: 0.90


#### try to beat baseline above

In [21]:
##### maximise accuracy

In [5]:
from sklearn.model_selection import train_test_split, GridSearchCV, KFold
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline

X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=999)

# Create a pipeline to standardize the data and apply KNeighborsClassifier
pipeline = Pipeline([
    ('scaler', StandardScaler()),  # Standardizes the data
    ('knn', KNeighborsClassifier())  # KNN Classifier without setting neighbors yet
])

# Define the parameter grid for GridSearchCV
param_grid = {
    'knn__n_neighbors': list(range(1, 21))  # Search for the best k in the range 1 to 20
}

# Define the custom K-Fold cross-validation strategy
kf = KFold(n_splits=5, shuffle=True, random_state=999)  # 5-fold CV with shuffling

# Initialize GridSearchCV with the custom KFold cross-validation
grid_search = GridSearchCV(pipeline, param_grid, cv=kf, scoring='accuracy')

# Fit the GridSearchCV to find the best k
grid_search.fit(X_train, y_train)

# Get the best parameter for k
best_k = grid_search.best_params_['knn__n_neighbors']

# Retrieve best model
best_knn_classifier =  grid_search.best_estimator_

# Use best model to predict on the validation set
y_pred = best_knn_classifier.predict(X_val)

# Calculate metrics
accuracy = accuracy_score(y_val, y_pred)
precision = precision_score(y_val, y_pred)
recall = recall_score(y_val, y_pred)
f1 = f1_score(y_val, y_pred)

# Display the results
print(f"Optimal number of neighbors (k): {best_k}")
print(f"Accuracy on validation set with k={best_k}: {accuracy:.2f}")
print(f"Precision on validation set with k={best_k}: {precision:.2f}")
print(f"Recall on validation set with k={best_k}: {recall:.2f}")
print(f"F1 Score on validation set with k={best_k}: {f1:.2f}")

Optimal number of neighbors (k): 14
Accuracy on validation set with k=14: 0.86
Precision on validation set with k=14: 0.86
Recall on validation set with k=14: 0.99
F1 Score on validation set with k=14: 0.92


##### maximise recall

In [6]:
from sklearn.model_selection import train_test_split, GridSearchCV, KFold
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline

X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=999)

# Create a pipeline to standardize the data and apply KNeighborsClassifier
pipeline = Pipeline([
    ('scaler', StandardScaler()),  # Standardizes the data
    ('knn', KNeighborsClassifier())  # KNN Classifier without setting neighbors yet
])

# Define the parameter grid for GridSearchCV
param_grid = {
    'knn__n_neighbors': list(range(1, 21))  # Search for the best k in the range 1 to 20
}

# Define the custom K-Fold cross-validation strategy
kf = KFold(n_splits=5, shuffle=True, random_state=999)  # 5-fold CV with shuffling

# Initialize GridSearchCV with the custom KFold cross-validation
grid_search = GridSearchCV(pipeline, param_grid, cv=kf, scoring='recall')

# Fit the GridSearchCV to find the best k
grid_search.fit(X_train, y_train)

# Get the best parameter for k
best_k = grid_search.best_params_['knn__n_neighbors']

# Retrieve best model
best_knn_classifier =  grid_search.best_estimator_

# Use best model to predict on the validation set
y_pred = best_knn_classifier.predict(X_val)

# Calculate metrics
accuracy = accuracy_score(y_val, y_pred)
precision = precision_score(y_val, y_pred)
recall = recall_score(y_val, y_pred)
f1 = f1_score(y_val, y_pred)

# Display the results
print(f"Optimal number of neighbors (k): {best_k}")
print(f"Accuracy on validation set with k={best_k}: {accuracy:.2f}")
print(f"Precision on validation set with k={best_k}: {precision:.2f}")
print(f"Recall on validation set with k={best_k}: {recall:.2f}")
print(f"F1 Score on validation set with k={best_k}: {f1:.2f}")

Optimal number of neighbors (k): 19
Accuracy on validation set with k=19: 0.85
Precision on validation set with k=19: 0.85
Recall on validation set with k=19: 1.00
F1 Score on validation set with k=19: 0.91


In [7]:
#### maximise f1

In [8]:
from sklearn.model_selection import train_test_split, GridSearchCV, KFold
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline

X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=999)

# Create a pipeline to standardize the data and apply KNeighborsClassifier
pipeline = Pipeline([
    ('scaler', StandardScaler()),  # Standardizes the data
    ('knn', KNeighborsClassifier())  # KNN Classifier without setting neighbors yet
])

# Define the parameter grid for GridSearchCV
param_grid = {
    'knn__n_neighbors': list(range(1, 21))  # Search for the best k in the range 1 to 20
}

# Define the custom K-Fold cross-validation strategy
kf = KFold(n_splits=5, shuffle=True, random_state=999)  # 5-fold CV with shuffling

# Initialize GridSearchCV with the custom KFold cross-validation
grid_search = GridSearchCV(pipeline, param_grid, cv=kf, scoring='f1')

# Fit the GridSearchCV to find the best k
grid_search.fit(X_train, y_train)

# Get the best parameter for k
best_k = grid_search.best_params_['knn__n_neighbors']

# Retrieve best model
best_knn_classifier =  grid_search.best_estimator_

# Use best model to predict on the validation set
y_pred = best_knn_classifier.predict(X_val)

# Calculate metrics
accuracy = accuracy_score(y_val, y_pred)
precision = precision_score(y_val, y_pred)
recall = recall_score(y_val, y_pred)
f1 = f1_score(y_val, y_pred)

# Display the results
print(f"Optimal number of neighbors (k): {best_k}")
print(f"Accuracy on validation set with k={best_k}: {accuracy:.2f}")
print(f"Precision on validation set with k={best_k}: {precision:.2f}")
print(f"Recall on validation set with k={best_k}: {recall:.2f}")
print(f"F1 Score on validation set with k={best_k}: {f1:.2f}")

Optimal number of neighbors (k): 14
Accuracy on validation set with k=14: 0.86
Precision on validation set with k=14: 0.86
Recall on validation set with k=14: 0.99
F1 Score on validation set with k=14: 0.92


Recall (Sensitivity):
Reason: In medical diagnostics, especially for severe conditions like cancer, false negatives (missing a cancer diagnosis) can have serious consequences. Prioritizing recall ensures that more true positives are detected, meaning fewer cancer cases are missed.

In [9]:
## accuracy increase by 0.02,precision remained, recall increased by 0.04, f1-score increased by 0.02

In [23]:
from sklearn.decomposition import PCA
from sklearn.model_selection import train_test_split, GridSearchCV, KFold
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline

# Split the dataset into training and validation sets
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=999)

# Create a pipeline to standardize the data, apply PCA, and then KNeighborsClassifier
pipeline = Pipeline([
    ('scaler', StandardScaler()),
    ('pca', PCA()),
    ('knn', KNeighborsClassifier()) 
])

param_grid = {
    'pca__n_components': list(range(1,15)),
    'knn__n_neighbors': list(range(1, 21))  # Search for the best k in the range 1 to 20
}


kf = KFold(n_splits=5, shuffle=True, random_state=999)  # 5-fold CV with shuffling

# Initialize GridSearchCV with the custom KFold cross-validation
grid_search = GridSearchCV(pipeline, param_grid, cv=kf, scoring='accuracy')

# Fit the GridSearchCV to find the best parameters
grid_search.fit(X_train, y_train)

# Get the best parameters for k and number of components
best_k = grid_search.best_params_['knn__n_neighbors']
best_n_components = grid_search.best_params_['pca__n_components']

# Retrieve the best model
best_knn_classifier = grid_search.best_estimator_

# Use the best model to predict on the validation set
y_pred = best_knn_classifier.predict(X_val)

# Calculate metrics
accuracy = accuracy_score(y_val, y_pred)
precision = precision_score(y_val, y_pred)
recall = recall_score(y_val, y_pred)
f1 = f1_score(y_val, y_pred)

# Display the results
print(f"Optimal number of neighbors (k): {best_k}")
print(f"Optimal number of PCA components: {best_n_components}")
print(f"Accuracy on validation set with k={best_k}, n_components={best_n_components}: {accuracy:.2f}")
print(f"Precision on validation set: {precision:.2f}")
print(f"Recall on validation set: {recall:.2f}")
print(f"F1 Score on validation set: {f1:.2f}")

  _data = np.array(data, dtype=dtype, copy=copy,


Optimal number of neighbors (k): 16
Optimal number of PCA components: 14
Accuracy on validation set with k=16, n_components=14: 0.85
Precision on validation set: 0.85
Recall on validation set: 0.99
F1 Score on validation set: 0.91


In [24]:
from sklearn.decomposition import PCA
from sklearn.model_selection import train_test_split, GridSearchCV, KFold
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline

# Split the dataset into training and validation sets
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=999)

# Create a pipeline to standardize the data, apply PCA, and then KNeighborsClassifier
pipeline = Pipeline([
    ('scaler', StandardScaler()),
    ('pca', PCA()),
    ('knn', KNeighborsClassifier()) 
])

param_grid = {
    'pca__n_components': list(range(1,15)),
    'knn__n_neighbors': list(range(1, 21))  # Search for the best k in the range 1 to 20
}


kf = KFold(n_splits=5, shuffle=True, random_state=999)  # 5-fold CV with shuffling

# Initialize GridSearchCV with the custom KFold cross-validation
grid_search = GridSearchCV(pipeline, param_grid, cv=kf, scoring='recall')

# Fit the GridSearchCV to find the best parameters
grid_search.fit(X_train, y_train)

# Get the best parameters for k and number of components
best_k = grid_search.best_params_['knn__n_neighbors']
best_n_components = grid_search.best_params_['pca__n_components']

# Retrieve the best model
best_knn_classifier = grid_search.best_estimator_

# Use the best model to predict on the validation set
y_pred = best_knn_classifier.predict(X_val)

# Calculate metrics
accuracy = accuracy_score(y_val, y_pred)
precision = precision_score(y_val, y_pred)
recall = recall_score(y_val, y_pred)
f1 = f1_score(y_val, y_pred)

# Display the results
print(f"Optimal number of neighbors (k): {best_k}")
print(f"Optimal number of PCA components: {best_n_components}")
print(f"Accuracy on validation set with k={best_k}, n_components={best_n_components}: {accuracy:.2f}")
print(f"Precision on validation set: {precision:.2f}")
print(f"Recall on validation set: {recall:.2f}")
print(f"F1 Score on validation set: {f1:.2f}")

  _data = np.array(data, dtype=dtype, copy=copy,


Optimal number of neighbors (k): 19
Optimal number of PCA components: 1
Accuracy on validation set with k=19, n_components=1: 0.79
Precision on validation set: 0.79
Recall on validation set: 1.00
F1 Score on validation set: 0.88


In [25]:
from sklearn.decomposition import PCA
from sklearn.model_selection import train_test_split, GridSearchCV, KFold
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline

# Split the dataset into training and validation sets
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=999)

# Create a pipeline to standardize the data, apply PCA, and then KNeighborsClassifier
pipeline = Pipeline([
    ('scaler', StandardScaler()),
    ('pca', PCA()),
    ('knn', KNeighborsClassifier()) 
])

param_grid = {
    'pca__n_components': list(range(1,15)),
    'knn__n_neighbors': list(range(1, 21))  # Search for the best k in the range 1 to 20
}


kf = KFold(n_splits=5, shuffle=True, random_state=999)  # 5-fold CV with shuffling

# Initialize GridSearchCV with the custom KFold cross-validation
grid_search = GridSearchCV(pipeline, param_grid, cv=kf, scoring='f1')

# Fit the GridSearchCV to find the best parameters
grid_search.fit(X_train, y_train)

# Get the best parameters for k and number of components
best_k = grid_search.best_params_['knn__n_neighbors']
best_n_components = grid_search.best_params_['pca__n_components']

# Retrieve the best model
best_knn_classifier = grid_search.best_estimator_

# Use the best model to predict on the validation set
y_pred = best_knn_classifier.predict(X_val)

# Calculate metrics
accuracy = accuracy_score(y_val, y_pred)
precision = precision_score(y_val, y_pred)
recall = recall_score(y_val, y_pred)
f1 = f1_score(y_val, y_pred)

# Display the results
print(f"Optimal number of neighbors (k): {best_k}")
print(f"Optimal number of PCA components: {best_n_components}")
print(f"Accuracy on validation set with k={best_k}, n_components={best_n_components}: {accuracy:.2f}")
print(f"Precision on validation set: {precision:.2f}")
print(f"Recall on validation set: {recall:.2f}")
print(f"F1 Score on validation set: {f1:.2f}")

  _data = np.array(data, dtype=dtype, copy=copy,


Optimal number of neighbors (k): 16
Optimal number of PCA components: 14
Accuracy on validation set with k=16, n_components=14: 0.85
Precision on validation set: 0.85
Recall on validation set: 0.99
F1 Score on validation set: 0.91


In [None]:
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=999)


scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_val = scaler.transform(X_val)

# Apply PCA to reduce dimensionality
pca = PCA(n_components=best_n_components)
X_train_pca = pca.fit_transform(X_train)
X_val_pca = pca.transform(X_val)

# Build the KNN classifier with the optimal number of neighbors
knn_classifier = KNeighborsClassifier(n_neighbors=best_k)

# Fit the model to the PCA-transformed training data
knn_classifier.fit(X_train_pca, y_train)

# Predict on the PCA-transformed validation set
y_pred = knn_classifier.predict(X_val_pca)

# Calculate accuracy, precision, recall, and F1-score
accuracy = accuracy_score(y_val, y_pred)
precision = precision_score(y_val, y_pred)
recall = recall_score(y_val, y_pred)
f1 = f1_score(y_val, y_pred)

# Display the results
print(f"Accuracy: {accuracy:.2f}")
print(f"Precision: {precision:.2f}")
print(f"Recall: {recall:.2f}")
print(f"F1 Score: {f1:.2f}")

### Method 1) try to get feature importance, then determine optimal k
### Method 2) will try the other way as well. determine optimal k, then feature importance
### what is permutation importance?  The permutation feature importance is defined to be the decrease in a model score when a single feature value is randomly shuffled 1.

##### Method 1

In [7]:
df_age.columns

Index(['GENDER', 'AGE', 'SMOKING', 'YELLOW_FINGERS', 'ANXIETY',
       'PEER_PRESSURE', 'CHRONIC DISEASE', 'FATIGUE ', 'ALLERGY ', 'WHEEZING',
       'ALCOHOL CONSUMING', 'COUGHING', 'SHORTNESS OF BREATH',
       'SWALLOWING DIFFICULTY', 'CHEST PAIN', 'LUNG_CANCER'],
      dtype='object')

In [9]:
from sklearn.neighbors import KNeighborsClassifier
from sklearn.inspection import permutation_importance
from sklearn.model_selection import train_test_split

X = df_age[['GENDER', 'AGE', 'SMOKING', 'YELLOW_FINGERS', 'ANXIETY',
       'PEER_PRESSURE', 'CHRONIC DISEASE', 'FATIGUE ', 'ALLERGY ', 'WHEEZING',
       'ALCOHOL CONSUMING', 'COUGHING', 'SHORTNESS OF BREATH',
       'SWALLOWING DIFFICULTY', 'CHEST PAIN']]
y = df_age['LUNG_CANCER'].values

# Split dataset into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=888)

# Fit the KNN model
knn = KNeighborsClassifier(n_neighbors=5)
knn.fit(X_train, y_train)

# Apply permutation importance
result = permutation_importance(knn, X_test, y_test, n_repeats=10, random_state=888)

# Display importance
importance = result.importances_mean
print(importance)

[ 0.01772222  0.00994444  0.00655556  0.03572222 -0.00466667  0.00205556
 -0.00116667  0.00138889  0.005       0.01094444  0.03194444 -0.00094444
  0.00077778  0.00833333 -0.00244444]


In [17]:
feature_names = ['GENDER', 'AGE', 'SMOKING', 'YELLOW_FINGERS', 'ANXIETY',
       'PEER_PRESSURE', 'CHRONIC DISEASE', 'FATIGUE ', 'ALLERGY ', 'WHEEZING',
       'ALCOHOL CONSUMING', 'COUGHING', 'SHORTNESS OF BREATH',
       'SWALLOWING DIFFICULTY', 'CHEST PAIN']

df_feature_importance = pd.DataFrame({
    'Feature': feature_names,
    'Importance': importance
})

# Display the DataFrame
df_feature_importance.sort_values(by = 'Importance', ascending = False)

Unnamed: 0,Feature,Importance
3,YELLOW_FINGERS,0.035722
10,ALCOHOL CONSUMING,0.031944
0,GENDER,0.017722
9,WHEEZING,0.010944
1,AGE,0.009944
13,SWALLOWING DIFFICULTY,0.008333
2,SMOKING,0.006556
8,ALLERGY,0.005
5,PEER_PRESSURE,0.002056
7,FATIGUE,0.001389


##### Order of importance: yellow fingers, alcohol consuming, gender, wheezing, age
kiv swallowing difficulty, smoking, allergy, peer pressure, fatigue, shortness of breath
negative variables: coughing, chronic disease, chest pain, anxiety, can try to exclude these first, might be negatively affecting model accuracy