In [12]:
import os
import numpy as np
import pandas as pd
from pathlib import Path
from sklearn.feature_selection import SelectKBest, mutual_info_classif
from sklearn.model_selection import RepeatedStratifiedKFold
from sklearn.linear_model import LogisticRegression
from sklearn import metrics
from sklearn.preprocessing import MinMaxScaler
from sklearn.experimental import enable_iterative_imputer
from sklearn.impute import IterativeImputer, KNNImputer

In [13]:
entries = Path("D:\Transcend_(E)\_IramS\Covid Challenge")
os.chdir(entries)
train_data = pd.read_table('trainSet.txt', delimiter=',', header='infer')
train_data.head()

Unnamed: 0,PatientID,ImageFile,Hospital,Age,Sex,Temp_C,Cough,DifficultyInBreathing,WBC,CRP,Fibrinogen,LDH,Ddimer,Ox_percentage,PaO2,SaO2,pH,CardiovascularDisease,RespiratoryFailure,Prognosis
0,P_131,P_131.png,D,35.913889,0,39.3,1.0,0.0,5.76,43.4,651.0,387.0,157.0,94.0,,,,0.0,,MILD
1,P_132,P_132.png,D,57.266667,0,37.0,0.0,0.0,11.48,64.0,700.0,338.0,601.0,94.0,75.0,96.9,7.42,0.0,,MILD
2,P_195,P_195.png,D,79.263889,0,37.8,1.0,0.0,6.21,115.3,698.0,356.0,448.0,94.0,63.0,94.6,7.39,1.0,,SEVERE
3,P_193,P_193.png,D,82.0,0,38.0,1.0,0.0,7.28,149.3,513.0,482.0,,97.0,68.0,96.3,7.46,0.0,,SEVERE
4,P_140,P_140.png,D,60.791667,1,37.0,1.0,0.0,6.37,20.7,,,210.0,93.0,,97.3,,0.0,,MILD


In [14]:
columns = train_data.keys()
columns = list(columns)
print(columns)

['PatientID', 'ImageFile', 'Hospital', 'Age', 'Sex', 'Temp_C', 'Cough', 'DifficultyInBreathing', 'WBC', 'CRP', 'Fibrinogen', 'LDH', 'Ddimer', 'Ox_percentage', 'PaO2', 'SaO2', 'pH', 'CardiovascularDisease', 'RespiratoryFailure', 'Prognosis']


In [15]:
columns.remove('ImageFile')
columns.remove('PatientID')
columns.remove('Prognosis')
columns.remove('Hospital')
# Get the total classes
print(f"There are {len(columns)} columns of Features: {columns}")

There are 16 columns of Features: ['Age', 'Sex', 'Temp_C', 'Cough', 'DifficultyInBreathing', 'WBC', 'CRP', 'Fibrinogen', 'LDH', 'Ddimer', 'Ox_percentage', 'PaO2', 'SaO2', 'pH', 'CardiovascularDisease', 'RespiratoryFailure']


In [16]:
# Binary data
columns_binary= ['Sex','Cough', 'CardiovascularDisease','DifficultyInBreathing','RespiratoryFailure']

In [17]:
for column_binary in columns_binary:
    print(f"The class {column_binary} has {train_data[column_binary].sum()} samples")

The class Sex has 291 samples
The class Cough has 436.0 samples
The class CardiovascularDisease has 234.0 samples
The class DifficultyInBreathing has 427.0 samples
The class RespiratoryFailure has 11.0 samples


In [18]:
# Numerical data 
from collections import Counter
columns_numerical = list((Counter(columns)-Counter(columns_binary)).elements())

In [19]:
for column_numerical in columns_numerical:
    print(f"The class {column_numerical} has {train_data[column_numerical].count()} samples")

The class Age has 862 samples
The class Temp_C has 709 samples
The class WBC has 854 samples
The class CRP has 830 samples
The class Fibrinogen has 272 samples
The class LDH has 727 samples
The class Ddimer has 242 samples
The class Ox_percentage has 620 samples
The class PaO2 has 693 samples
The class SaO2 has 280 samples
The class pH has 656 samples


In [20]:
train_df=train_data.drop(['ImageFile','PatientID','Hospital','Prognosis'],axis=1) # No imputation required 
# transform string output to binary 'MILD=0'
lables= pd.Series(np.where(train_data.Prognosis.values == 'MILD', 0, 1),train_data.index)

In [22]:
X= train_df    #X= dataset.iloc[:,1:47](when i have SO features as well) # Get features data in pd frame 
y= lables  # Get lables data in pd frame 

In [41]:
def impute_data(X_train, X_test):
    knn_imputer = KNNImputer(n_neighbors=20) # change value of k wihtin cv folds as long acurracy increases
    knn_imputer.fit(X_train)
    train_imputed = knn_imputer.transform(X_train)
    test_imputed = knn_imputer.transform(X_test)
    X_train_imputed = pd.DataFrame(train_imputed, columns = X_train.columns, index=X_train.index)
    X_test_imputed = pd.DataFrame(test_imputed, columns = X_train.columns, index=X_test.index)
    return X_train_imputed, X_test_imputed
    

In [42]:
def scaling_data(X_train, X_test):
    
    scaler= MinMaxScaler()
    #scaler = StandardScaler()
    scaler.fit(X_train)# fit to train 
    X_train_scaled= scaler.transform(X_train)# transform train
    X_train_scaled_df = pd.DataFrame(X_train_scaled, columns = X_train.columns, index=X_train.index) #convert train to dataframe 
    X_test_scaled= scaler.transform(X_test) #transform test
    X_test_scaled_df = pd.DataFrame(X_test_scaled, columns = X_test.columns, index=X_test.index) #convert test to dataframe 
    
    return X_train_scaled_df, X_test_scaled_df

In [43]:
def feature_selection(X_train,y_train,X_test): 

    fs= SelectKBest(score_func=mutual_info_classif, k=8)
    fs.fit(X_train, y_train)
    X_train_feat= fs.transform(X_train)
    X_test_feat= fs.transform(X_test)
    mask = fs.get_support() #get a mask of selected features
    sig_features = X_train.columns[mask] # get feature names
    X_train_fs_df = pd.DataFrame(X_train_feat, columns =sig_features.tolist(), index=X_train.index) #convert train to dataframe
    X_test_fs_df = pd.DataFrame(X_test_feat, columns =sig_features.tolist(), index=X_test.index) #convert train to dataframe 
    return X_train_fs_df, X_test_fs_df, sig_features.tolist()

In [44]:
def fit_compute_results(X_train, y_train, X_test, y_test):
        
        model = LogisticRegression(solver='lbfgs',penalty='none',class_weight='balanced')
        model.fit(X_train, y_train)
    

        # Getting Prediction Probabilties for Each Repetition on Training and Test Set of Selected Features
        proba_train= model.predict_proba(X_train)
        proba_test= model.predict_proba(X_test)
        
    
        # Getting Train and Test Probabilites for positive class to calculate train and test auc 
        yhat_train= proba_train[:,1]
        yhat_test= proba_test[:,1]
        

        #Getting Train and Test AUC
        auc_train  = metrics.roc_auc_score(y_train, yhat_train)
        auc_test= metrics.roc_auc_score(y_test, yhat_test)
        

        return auc_train,auc_test

In [45]:
kfold  = RepeatedStratifiedKFold(n_splits=3, n_repeats=33,random_state=0)
sig_feat_count=[]
ROC_train= []
ROC_test= []

In [46]:
for train, test in kfold.split(X,y):
    
    # Scaling features
    X_train_scaled_df, X_test_scaled_df = scaling_data(X.iloc[train.tolist()], X.iloc[test.tolist()])
    
    # Impute features 
    X_train_imputed_df, X_test_imputed_df = impute_data(X_train_scaled_df, X_test_scaled_df)
    
    # Selecting k-best features 
    X_train_fs, X_test_fs, final_feature_list= feature_selection(X_train_imputed_df,y[train],X_test_imputed_df)
    #Store filter selected feature in bootstraps of each fold
    sig_feat_count.append(final_feature_list)
        
    # Fitting Model on Single Feature in Training Batch
    auc_train, auc_test= fit_compute_results(X_train_fs,y[train],X_test_fs,y[test])
    ROC_train.append(auc_train)
    ROC_test.append(auc_test)

In [47]:
print('Train Accuray Median: %.2f%%' % (np.median(ROC_train)*100))
print('Test Accuracy Median: %.2f%%' % (np.median(ROC_test)*100))

Train Accuray Median: 83.35%
Test Accuracy Median: 82.24%


In [48]:
feat_count=pd.DataFrame(sig_feat_count).apply(pd.Series.value_counts).sum(axis=1) #sum occurence  of each features
final_feat_list= (feat_count[(feat_count >=40 )].index).tolist() # get features that have occurence above thresh

In [49]:
feat_count.sort_values(ascending=False) # most important features can be identified by their count in each cv fold 

LDH                      99.0
Ox_percentage            99.0
PaO2                     98.0
CRP                      97.0
SaO2                     96.0
Ddimer                   82.0
Age                      65.0
Fibrinogen               64.0
DifficultyInBreathing    30.0
WBC                      23.0
pH                       14.0
CardiovascularDisease     7.0
Sex                       6.0
RespiratoryFailure        5.0
Temp_C                    5.0
Cough                     2.0
dtype: float64