In [60]:
import numpy as np
import pandas as pd
from sklearn import preprocessing
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import accuracy_score
from sklearn.metrics import f1_score
from sklearn.metrics import classification_report 
from sklearn.tree import DecisionTreeClassifier

In [61]:
df = pd.read_excel("Training_Data.xlsx")
df

Unnamed: 0,PatientId,EncounterId,DischargeDisposision,Gender,Race,DiabetesMellitus,ChronicKidneyDisease,Anemia,Depression,ChronicObstructivePulmonaryDisease,...,BetaBlockers,Diuretics,TotalMedicine,CardiacTroponin,Hemoglobin,SerumSodium,SerumCreatinine,BNP,NT-proBNP,ReadmissionWithin_90Days
0,4200412,199171333,Home,Male,White,DM,,Anemia,,COPD,...,0,0,0,0.0,0.00,0.0,0.000000,0.0,0.0,Yes
1,4055894,26704337,Home,Male,White,DM,CKD,Anemia,Depression,COPD,...,1,5,8,0.0,0.00,0.0,1.540000,0.0,0.0,No
2,4867407,60388216,Home,Male,White,DM,CKD,Anemia,,COPD,...,1,1,2,0.0,10.20,0.0,0.000000,0.0,0.0,No
3,4058064,274642265,Hospice - Home,Female,White,DM,,Anemia,,COPD,...,0,0,0,0.0,0.00,132.0,0.000000,0.0,0.0,No
4,4150623,70000001557327,SNF,Female,White,,,Anemia,,COPD,...,0,0,0,0.0,7.26,0.0,0.000000,0.0,0.0,No
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
8476,4152524,40004415567,Home Health,Female,White,DM,CKD,Anemia,,COPD,...,2,3,6,0.0,0.00,0.0,1.076667,0.0,0.0,Yes
8477,4042227,14347947026,SNF,Male,White,DM,CKD,,Depression,COPD,...,0,0,0,0.0,0.00,0.0,0.000000,0.0,0.0,Yes
8478,4603405,67117733,Hospice,Female,White,,,,,,...,1,0,1,0.0,0.00,0.0,0.000000,0.0,0.0,No
8479,4033677,68564389,Home,Female,White,DM,CKD,,Depression,,...,1,1,5,0.0,0.00,0.0,1.690000,0.0,0.0,Yes


In [62]:
df.isnull().sum() #to check which columns have nan values

PatientId                                0
EncounterId                              0
DischargeDisposision                     0
Gender                                   0
Race                                    93
DiabetesMellitus                      3857
ChronicKidneyDisease                  3906
Anemia                                3002
Depression                            5108
ChronicObstructivePulmonaryDisease    3954
Age                                      0
ChronicDiseaseCount                      0
LengthOfStay                             0
EmergencyVisit                           0
InpatientVisit                           0
OutpatientVisit                          0
TotalVisits                              0
BMIMin                                   0
BMIMax                                   0
BMIMedian                                0
BMIMean                                  0
BPDiastolicMin                           0
BPDiastolicMax                           0
BPDiastolic

In [63]:
def preProcessing(df):
    #replacing all the nan values with 0s in the non numerical columns
    df['Race'] = df['Race'].replace(np.nan, '0')
    df['DiabetesMellitus'] = df['DiabetesMellitus'].replace(np.nan, '0')
    df['ChronicKidneyDisease'] = df['ChronicKidneyDisease'].replace(np.nan, '0')
    df['Anemia'] = df['Anemia'].replace(np.nan, '0')
    df['Depression '] = df['Depression '].replace(np.nan, '0')
    df['ChronicObstructivePulmonaryDisease'] = df['ChronicObstructivePulmonaryDisease'].replace(np.nan, '0')

    #to check if nan values have been removed or not
    a = df.isnull().sum()
    print("Nan values:\n",a,"\n")
    #to see the number of zeros in each column
    for i in df.columns:
        count = (df[i]==0).sum()
        print(i)
        print(count,"\n")
        
    #for coulmns with less than 3500 0s, replace the 0s with the mean of that column. 
    #if we will replace 0s with mean in columns with more 0s, the data will become overfit

    df['ChronicDiseaseCount'] = df['ChronicDiseaseCount'].replace(0, df['ChronicDiseaseCount'].mean())
    df['Age'] = df['Age'].replace(0, df['Age'].mean())
    df['EmergencyVisit'] = df['EmergencyVisit'].replace(0, df['EmergencyVisit'].mean())
    df['InpatientVisit'] = df['InpatientVisit'].replace(0, df['InpatientVisit'].mean())
    df['TotalVisits'] = df['TotalVisits'].replace(0, df['TotalVisits'].mean())
    
#     #storing the names of all the columns with strings
#     ar = ['Race','DischargeDisposision','Gender','DiabetesMellitus','ChronicKidneyDisease','Anemia','Depression ','ChronicObstructivePulmonaryDisease','ReadmissionWithin_90Days']
#     print(ar, "\n")
    
    #we change str values to int as encoders work on uniform data 

    #if number written as a string , to numeric turns str into int
    df['PatientId']   = pd.to_numeric(df['PatientId'])
    #corece replaces values such as "PH1500179668" with nan
    df['EncounterId'] = pd.to_numeric(df['EncounterId'],errors='coerce')
    # df['EncounterID'] = df['EncounterID'].fillna(0)
    df['EncounterId'].fillna(0,inplace = True)
    
    le = LabelEncoder()   
    #labeling all the column names
    #storing the names of all the columns with strings
    ar = ['Race','DischargeDisposision','Gender','DiabetesMellitus','ChronicKidneyDisease','Anemia','Depression ','ChronicObstructivePulmonaryDisease','ReadmissionWithin_90Days']
    df[ar] = df[ar].apply(le.fit_transform)
    
    return df
preProcessing(df)

Nan values:
 PatientId                             0
EncounterId                           0
DischargeDisposision                  0
Gender                                0
Race                                  0
DiabetesMellitus                      0
ChronicKidneyDisease                  0
Anemia                                0
Depression                            0
ChronicObstructivePulmonaryDisease    0
Age                                   0
ChronicDiseaseCount                   0
LengthOfStay                          0
EmergencyVisit                        0
InpatientVisit                        0
OutpatientVisit                       0
TotalVisits                           0
BMIMin                                0
BMIMax                                0
BMIMedian                             0
BMIMean                               0
BPDiastolicMin                        0
BPDiastolicMax                        0
BPDiastolicMedian                     0
BPDiastolicMean            

Unnamed: 0,PatientId,EncounterId,DischargeDisposision,Gender,Race,DiabetesMellitus,ChronicKidneyDisease,Anemia,Depression,ChronicObstructivePulmonaryDisease,...,BetaBlockers,Diuretics,TotalMedicine,CardiacTroponin,Hemoglobin,SerumSodium,SerumCreatinine,BNP,NT-proBNP,ReadmissionWithin_90Days
0,4200412,1.991713e+08,7,1,7,1,0,1,0,1,...,0,0,0,0.0,0.00,0.0,0.000000,0.0,0.0,1
1,4055894,2.670434e+07,7,1,7,1,1,1,1,1,...,1,5,8,0.0,0.00,0.0,1.540000,0.0,0.0,0
2,4867407,6.038822e+07,7,1,7,1,1,1,0,1,...,1,1,2,0.0,10.20,0.0,0.000000,0.0,0.0,0
3,4058064,2.746423e+08,11,0,7,1,0,1,0,1,...,0,0,0,0.0,0.00,132.0,0.000000,0.0,0.0,0
4,4150623,7.000000e+13,19,0,7,0,0,1,0,1,...,0,0,0,0.0,7.26,0.0,0.000000,0.0,0.0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
8476,4152524,4.000442e+10,8,0,7,1,1,1,0,1,...,2,3,6,0.0,0.00,0.0,1.076667,0.0,0.0,1
8477,4042227,1.434795e+10,19,1,7,1,1,0,1,1,...,0,0,0,0.0,0.00,0.0,0.000000,0.0,0.0,1
8478,4603405,6.711773e+07,10,0,7,0,0,0,0,0,...,1,0,1,0.0,0.00,0.0,0.000000,0.0,0.0,0
8479,4033677,6.856439e+07,7,0,7,1,1,0,1,0,...,1,1,5,0.0,0.00,0.0,1.690000,0.0,0.0,1


In [64]:
X = df.drop('ReadmissionWithin_90Days',axis=1)
y = df['ReadmissionWithin_90Days']
X_train, X_test, y_train, y_test = train_test_split(X, y , test_size = 0.2 , random_state = 60)

In [65]:
def model(df):
    from sklearn.neighbors import KNeighborsClassifier #classifier 1 KNN
    from sklearn.metrics import accuracy_score
    from sklearn.metrics import f1_score
    from sklearn.metrics import classification_report
    
    X = df.drop('ReadmissionWithin_90Days',axis=1)
    y = df['ReadmissionWithin_90Days']

    X_train,X_test,y_train,y_test=train_test_split(X, y ,test_size=0.2,random_state=51)

    c1 = KNeighborsClassifier()
    c1.fit(X_train, y_train)
    p = c1.predict(X_test)
    c1.score(X_test,y_test)
    print ("Accuracy : " , accuracy_score(y_test,p)*100)  
    print("Report : \n", classification_report(y_test, p))
    print("F1 Score : ",f1_score(y_test, p, average='macro')*100)    
model(df)

Accuracy :  64.93812610489098
Report : 
               precision    recall  f1-score   support

           0       0.72      0.82      0.77      1195
           1       0.36      0.24      0.29       502

    accuracy                           0.65      1697
   macro avg       0.54      0.53      0.53      1697
weighted avg       0.61      0.65      0.63      1697

F1 Score :  52.89728476496809


In [66]:
from sklearn import tree #classifier 2 Decision Tree 

c2 = tree.DecisionTreeClassifier()
c2 = c2.fit(X_train, y_train)
p = c2.predict(X_test)
c2.score(X_test,y_test)

print ("Accuracy : " , accuracy_score(y_test,p)*100)  
print("Report : \n", classification_report(y_test, p))
print("F1 Score : ",f1_score(y_test, p, average='macro')*100)    

Accuracy :  65.46847377725398
Report : 
               precision    recall  f1-score   support

           0       0.75      0.75      0.75      1163
           1       0.45      0.46      0.45       534

    accuracy                           0.65      1697
   macro avg       0.60      0.60      0.60      1697
weighted avg       0.66      0.65      0.66      1697

F1 Score :  60.08949784884094


In [67]:
 #classifier 3
from sklearn.ensemble import RandomForestClassifier

c4 = RandomForestClassifier(max_depth=2, random_state=0)
c4.fit(X_train, y_train)
p = c4.predict(X_test)
print ("Accuracy : " , accuracy_score(y_test,p)*100)  
print("Report : \n", classification_report(y_test, p))
print("F1 Score : ",f1_score(y_test, p, average='macro')*100)
in_a = accuracy_score(y_test,p)*100

Accuracy :  68.53270477312905
Report : 
               precision    recall  f1-score   support

           0       0.69      1.00      0.81      1163
           1       0.00      0.00      0.00       534

    accuracy                           0.69      1697
   macro avg       0.34      0.50      0.41      1697
weighted avg       0.47      0.69      0.56      1697

F1 Score :  40.66433566433567


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


In [68]:
print('Shape = ', df.shape)

Shape =  (8481, 57)


In [78]:
def pop_init(size): #initialize popluation
    arr=[True,False]
    select = np.random.choice(arr,size=(size,56))
    return select
size = 20
select = pop_init(size)
print (select)
#X.loc[:,select[0]] prints col with true values at 0 index

[[False False False ... False False  True]
 [ True  True  True ... False False False]
 [False False False ... False False  True]
 ...
 [False  True False ... False False  True]
 [False  True False ... False  True False]
 [False  True  True ... False False  True]]


In [70]:
def fitnessFunc(s):
    X = df.drop('ReadmissionWithin_90Days',axis=1)
    y = df['ReadmissionWithin_90Days']
    X_train, X_test, y_train, y_test = train_test_split(s, y , test_size = 0.2 , random_state = 10)

    c4 = RandomForestClassifier(max_depth=None, random_state=0) #using random forest as it has the best accuracy
    c4.fit(X_train, y_train) #fitness function
    p = c4.predict(X_test)
    a = accuracy_score(y_test,p)*100 #finding accuracies for
    return a

In [71]:
#crossover
def crossOver(se): #passing the chromosome
    cross = []
    # se = np.random.choice(arr,size=(40,56))
    #print("select:\n",se,"\n")
    se = se[:10] #pop size is 20
    split =  np.random.randint(10)#flips randomly 10 elements
    #print("split:\n",split)
    for i in range (0,10,2): #2 steps
        if(i+1 != 10):
            first = se[i][:split]
            #f = first[:split] #double 
            second = se[i][split:]
            third = se[i+1][:split]
            fourth = se[i+1][split:]
            a = np.concatenate((first,fourth),axis = 0)
            b = np.concatenate((second,third),axis = 0)
            cross.append(a)
            cross.append(b)
    r = np.concatenate((cross,se),axis = 0) #make population size same again
    return r
#crossOver(se)

In [72]:
def mutation(newC):
    for j in range(10,20):  #10 till 20 are the child we made in crossover
        #mutate different values for eveery index
        arr2 = np.random.randint(0,56,size = 7) #mutating 10 out of 56 elements
        for i in arr2:
            #print(i)
            if newC[j][i] == True: #picking last 10 children for mutation
                newC[j][i] = False
            else:
                newC[j][i] = True
        return newC

In [77]:
def geneticAlgo(df,select) :
    maximum = []
    count = 0
    cmp = 0.0
    pop = df.drop('ReadmissionWithin_90Days', axis = 1)
    for a in range (0,5):
        accuracies = []
        from sklearn import tree
        #select[i] is chromosome
        for i in range(0,20): #pop size
            s = pop.loc[:,select[i]] #access boolean array picks only true values
            f = fitnessFunc(s)
            accuracies.append(f) 
            #print(i,f)
        print("\nMaximum Accuracy:", max(accuracies))
        maximum.append(max(accuracies))
        #sorting the accuracy list in descending order
        m_ac = max(accuracies) 
        #termination condition 2 for stability
        if cmp == m_ac:
            count = count + 1
        cmp = m_ac
        
        if count == 5: #for stability the similar accuracies occurs at least 5 time
            break
        
        for i in range(0,20):
            for j in range(0,20):
                if j+1 < 20:
                    if (accuracies[j+1]  > accuracies[j]):
                        temp = accuracies[j+1] 
                        accuracies[j+1] = accuracies[j]
                        accuracies[j] = temp
                        temp = select[j+1]
                        select[j+1] = select[j]
                        select[j] = temp
        #print("sorted:\n")
        #print(accuracies,"\n",select)
        o = crossOver(select)
        select = mutation(o) 
        if(max(accuracies)) > 72: #prints col of best accu
            print("Chromosome which gives best accuracy: ",pop.loc[:,select[0]])
            print("\nThe maximum acc is \n",max(accuracies))
        if(max(accuracies)) > 73: #termination condition
            break
    return max(maximum)
ga = geneticAlgo(df,select)


Maximum Accuracy: 70.77195050088392

Maximum Accuracy: 72.06835592221567
Chromosome which gives best accuracy:        PatientId  DischargeDisposision  Gender  Race  DiabetesMellitus  \
0       4200412                     7       1     7                 1   
1       4055894                     7       1     7                 1   
2       4867407                     7       1     7                 1   
3       4058064                    11       0     7                 1   
4       4150623                    19       0     7                 0   
...         ...                   ...     ...   ...               ...   
8476    4152524                     8       0     7                 1   
8477    4042227                    19       1     7                 1   
8478    4603405                    10       0     7                 0   
8479    4033677                     7       0     7                 1   
8480    4036902                     7       1     7                 0   

      Depr

In [53]:
print ("Accuarcy Initially: ", in_a)
print("Accuracy after GA: ", ga)

Accuarcy Initially:  68.53270477312905
Accuracy after GA:  72.30406599882146
