In [15]:
import os
import numpy as np
import pandas as pd
from pathlib import Path
from sklearn.feature_selection import SelectKBest, mutual_info_classif, f_classif, RFECV
from sklearn.model_selection import RepeatedStratifiedKFold
from sklearn.linear_model import LogisticRegression
from xgboost import XGBClassifier
from sklearn import metrics
from matplotlib import pyplot
from sklearn.preprocessing import PowerTransformer, Normalizer , StandardScaler, MinMaxScaler
import seaborn as sns
from sklearn.experimental import enable_iterative_imputer
from sklearn.impute import IterativeImputer, KNNImputer

In [16]:
# Raw string literal: the Windows path contains "\T", "\_" and "\C", which are
# invalid escape sequences in a normal string (a warning today, an error in
# future Python versions). The resulting path value is unchanged.
# NOTE(review): this absolute path ties the notebook to one machine; the chdir
# is kept because the data files are opened by bare file name.
entries = Path(r"D:\Transcend_(E)\_IramS\Covid Challenge")
os.chdir(entries)

# Comma-separated clinical table; the first row supplies the column names.
clinical_data = pd.read_csv('trainSet.txt', sep=',', header='infer')
clinical_data.head()

Unnamed: 0,PatientID,ImageFile,Hospital,Age,Sex,Temp_C,Cough,DifficultyInBreathing,WBC,CRP,Fibrinogen,LDH,Ddimer,Ox_percentage,PaO2,SaO2,pH,CardiovascularDisease,RespiratoryFailure,Prognosis
0,P_131,P_131.png,D,35.913889,0,39.3,1.0,0.0,5.76,43.4,651.0,387.0,157.0,94.0,,,,0.0,,MILD
1,P_132,P_132.png,D,57.266667,0,37.0,0.0,0.0,11.48,64.0,700.0,338.0,601.0,94.0,75.0,96.9,7.42,0.0,,MILD
2,P_195,P_195.png,D,79.263889,0,37.8,1.0,0.0,6.21,115.3,698.0,356.0,448.0,94.0,63.0,94.6,7.39,1.0,,SEVERE
3,P_193,P_193.png,D,82.0,0,38.0,1.0,0.0,7.28,149.3,513.0,482.0,,97.0,68.0,96.3,7.46,0.0,,SEVERE
4,P_140,P_140.png,D,60.791667,1,37.0,1.0,0.0,6.37,20.7,,,210.0,93.0,,97.3,,0.0,,MILD


In [17]:
# Load the feature table; the first row of the CSV supplies the column names.
train_data = pd.read_csv(
    'Densnet_Threshold_Image_features_and_clinical_features.csv',
    sep=',',
    header=0,
)
# Optional: restrict the table to a hand-picked subset of columns.
#train_data= train_data[['type_1021','type_179', 'type_296', 'type_336', 'type_508', 'type_880', 'type_971', 'type_974']]
train_data.head()

Unnamed: 0,ImageFile,type_1,type_2,type_3,type_4,type_5,type_6,type_7,type_8,type_9,...,type_1015,type_1016,type_1017,type_1018,type_1019,type_1020,type_1021,type_1022,type_1023,type_1024
0,P_131.png,0.000116,0.005017,0.003004,0.004077,0.091425,0.161575,0.00046,0.001935,0.20653,...,0.297045,1.24725,0.019518,0.595316,1.324033,1.954836,0.0,0.47885,0.343651,0.0
1,P_132.png,0.000109,0.007341,0.002726,0.003838,0.057239,0.193296,0.000274,0.00317,0.128739,...,1.247754,2.519083,0.024584,0.799841,3.991084,1.637588,0.012054,0.237635,0.972333,0.0
2,P_195.png,6.3e-05,0.004686,0.002076,0.004426,0.082883,0.25587,0.000354,0.003875,0.132531,...,2.26441,0.989827,0.043021,0.08702,1.8999,0.501653,0.0,1.54091,0.300492,0.001898
3,P_193.png,0.000101,0.006257,0.002484,0.004356,0.089308,0.127651,0.000383,0.002219,0.124669,...,2.971354,3.69225,0.072745,1.101624,3.889233,1.112687,0.0,0.029499,1.003721,0.0
4,P_140.png,5.3e-05,0.006186,0.002605,0.003957,0.094841,0.197833,0.000388,0.001924,0.161388,...,1.740142,3.068291,0.0,0.717635,2.338151,0.87365,0.0,0.911943,0.479188,0.0


In [18]:
# Binary target from the Prognosis column: 'MILD' -> 0, every other value -> 1.
# The labels reuse clinical_data's index.
lables = pd.Series(
    np.where(clinical_data['Prognosis'].values == 'MILD', 0, 1),
    index=clinical_data.index,
)

In [19]:
# Target vector: the binary labels derived from Prognosis.
y = lables
# Feature matrix: every column of train_data except the first one
# (a 1:47 slice was used instead when SO features were also present).
# NOTE(review): X and y come from two different files and are paired purely by
# row position - confirm both files list the patients in the same order.
X = train_data.iloc[:, 1:]

In [20]:
def scaling_data(X_train, X_test, scaler=None):
    """Scale train and test features with a scaler fitted on the train split only.

    Parameters
    ----------
    X_train : pandas.DataFrame
        Training features; the scaler is fitted on this frame.
    X_test : pandas.DataFrame
        Held-out features; transformed with the statistics learned from
        ``X_train`` so no information from the test split is used for fitting.
    scaler : object, optional
        Any object exposing ``fit(X)`` and ``transform(X)`` (for example
        ``StandardScaler()``). Defaults to a fresh ``MinMaxScaler()``.

    Returns
    -------
    tuple of pandas.DataFrame
        ``(X_train_scaled_df, X_test_scaled_df)`` carrying the same columns and
        index as the corresponding inputs.
    """
    if scaler is None:
        scaler = MinMaxScaler()

    # Fit on the training split only, then apply the same transform to both splits.
    scaler.fit(X_train)

    X_train_scaled_df = pd.DataFrame(
        scaler.transform(X_train), columns=X_train.columns, index=X_train.index
    )
    X_test_scaled_df = pd.DataFrame(
        scaler.transform(X_test), columns=X_test.columns, index=X_test.index
    )
    return X_train_scaled_df, X_test_scaled_df

In [21]:
def feature_selection(X_train, y_train, X_test, k=30, random_state=0):
    """Keep the ``k`` features with the highest mutual information with the target.

    The selector is fitted on the training split only and then applied to both
    splits.

    Parameters
    ----------
    X_train : pandas.DataFrame
        Training features used to score and select columns.
    y_train : array-like
        Training labels.
    X_test : pandas.DataFrame
        Held-out features; reduced to the columns selected on ``X_train``.
    k : int, optional
        Number of features to keep (default 30).
    random_state : int or None, optional
        Seed forwarded to ``mutual_info_classif``. The estimator is stochastic,
        so without a seed the selected columns can differ between runs.
        Pass ``None`` to get unseeded behaviour.

    Returns
    -------
    tuple
        ``(X_train_fs_df, X_test_fs_df, selected_names)`` - the two reduced
        frames (original indices preserved) and the list of kept column names.
    """
    from functools import partial  # local import keeps this cell self-contained

    # SelectKBest calls score_func(X, y); bind the seed so scoring is repeatable.
    score_func = partial(mutual_info_classif, random_state=random_state)
    selector = SelectKBest(score_func=score_func, k=k)
    selector.fit(X_train, y_train)

    # Boolean support mask -> names of the retained columns.
    selected_names = X_train.columns[selector.get_support()].tolist()

    X_train_fs_df = pd.DataFrame(
        selector.transform(X_train), columns=selected_names, index=X_train.index
    )
    X_test_fs_df = pd.DataFrame(
        selector.transform(X_test), columns=selected_names, index=X_test.index
    )
    return X_train_fs_df, X_test_fs_df, selected_names

In [22]:
def fit_compute_results(X_train, y_train, X_test, y_test):
    """Fit a class-balanced logistic regression and report its accuracy.

    The classifier is trained on ``(X_train, y_train)`` and its hard class
    predictions are scored with ``metrics.accuracy_score`` on both splits.
    To report ROC AUC instead, score ``predict_proba(...)[:, 1]`` with
    ``metrics.roc_auc_score``.

    Returns
    -------
    tuple
        ``(train_accuracy, test_accuracy)``.
    """
    # NOTE(review): newer scikit-learn releases deprecate the string 'none' in
    # favour of None - confirm against the installed version before upgrading.
    classifier = LogisticRegression(solver='lbfgs', penalty='none', class_weight='balanced')
    classifier.fit(X_train, y_train)

    # Class labels (not probabilities) are compared with the ground truth.
    train_accuracy = metrics.accuracy_score(y_train, classifier.predict(X_train))
    test_accuracy = metrics.accuracy_score(y_test, classifier.predict(X_test))

    return train_accuracy, test_accuracy

In [23]:
# Stratified 3-fold cross-validation, a single repeat, fixed seed for repeatable splits.
kfold = RepeatedStratifiedKFold(random_state=0, n_repeats=1, n_splits=3)

# Per-fold accumulators: selected feature names plus train/test scores.
# Despite their names, ROC_train / ROC_test receive accuracy values.
sig_feat_count, ROC_train, ROC_test = [], [], []

In [24]:
# Reset the accumulators in this cell so that re-running it on its own does not
# append a second set of fold results (which would skew the medians and the
# per-feature selection counts).
sig_feat_count, ROC_train, ROC_test = [], [], []

for train, test in kfold.split(X, y):
    # kfold.split yields positional row numbers, so both X and y are sliced
    # with .iloc; plain y[train] would be a label-based lookup instead.

    # Scale features (scaler fitted on the training fold only)
    X_train_scaled_df, X_test_scaled_df = scaling_data(X.iloc[train], X.iloc[test])

    # Select the k best features on the training fold
    X_train_fs, X_test_fs, final_feature_list = feature_selection(
        X_train_scaled_df, y.iloc[train], X_test_scaled_df
    )
    # Remember which features were selected in this fold
    sig_feat_count.append(final_feature_list)

    # Fit the model on the selected features and score both folds
    auc_train, auc_test = fit_compute_results(
        X_train_fs, y.iloc[train], X_test_fs, y.iloc[test]
    )
    ROC_train.append(auc_train)
    ROC_test.append(auc_test)

In [25]:
# ROC_train / ROC_test hold per-fold accuracy values; report their medians as percentages.
print('Train Accuracy Median: %.2f%%' % (100 * np.median(ROC_train)))
print('Test Accuracy Median: %.2f%%' % (100 * np.median(ROC_test)))

Train Accuracy Median: 67.65%
Test Accuracy Median: 67.01%


In [12]:
# One row per fold, one column per selection slot. Counting values column by
# column and summing across columns gives the number of times each feature
# name was selected over all folds.
feat_count = (
    pd.DataFrame(sig_feat_count)
    .apply(pd.Series.value_counts)
    .sum(axis=1)
)
# Keep the features whose total occurrence count is at least 2.
final_feat_list = feat_count.loc[feat_count >= 2].index.tolist()