In [1]:
import numpy as np
import pandas as pd
from sklearn import preprocessing
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import accuracy_score
from sklearn.metrics import f1_score
from sklearn.metrics import classification_report 
from sklearn.tree import DecisionTreeClassifier

In [2]:
df = pd.read_excel("Training_Data.xlsx")
df

Unnamed: 0,PatientId,EncounterId,DischargeDisposision,Gender,Race,DiabetesMellitus,ChronicKidneyDisease,Anemia,Depression,ChronicObstructivePulmonaryDisease,...,BetaBlockers,Diuretics,TotalMedicine,CardiacTroponin,Hemoglobin,SerumSodium,SerumCreatinine,BNP,NT-proBNP,ReadmissionWithin_90Days
0,4200412,199171333,Home,Male,White,DM,,Anemia,,COPD,...,0,0,0,0.0,0.00,0.0,0.000000,0.0,0.0,Yes
1,4055894,26704337,Home,Male,White,DM,CKD,Anemia,Depression,COPD,...,1,5,8,0.0,0.00,0.0,1.540000,0.0,0.0,No
2,4867407,60388216,Home,Male,White,DM,CKD,Anemia,,COPD,...,1,1,2,0.0,10.20,0.0,0.000000,0.0,0.0,No
3,4058064,274642265,Hospice - Home,Female,White,DM,,Anemia,,COPD,...,0,0,0,0.0,0.00,132.0,0.000000,0.0,0.0,No
4,4150623,70000001557327,SNF,Female,White,,,Anemia,,COPD,...,0,0,0,0.0,7.26,0.0,0.000000,0.0,0.0,No
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
8476,4152524,40004415567,Home Health,Female,White,DM,CKD,Anemia,,COPD,...,2,3,6,0.0,0.00,0.0,1.076667,0.0,0.0,Yes
8477,4042227,14347947026,SNF,Male,White,DM,CKD,,Depression,COPD,...,0,0,0,0.0,0.00,0.0,0.000000,0.0,0.0,Yes
8478,4603405,67117733,Hospice,Female,White,,,,,,...,1,0,1,0.0,0.00,0.0,0.000000,0.0,0.0,No
8479,4033677,68564389,Home,Female,White,DM,CKD,,Depression,,...,1,1,5,0.0,0.00,0.0,1.690000,0.0,0.0,Yes


In [3]:
df.isnull().sum() #to check which columns have nan value

PatientId                                0
EncounterId                              0
DischargeDisposision                     0
Gender                                   0
Race                                    93
DiabetesMellitus                      3857
ChronicKidneyDisease                  3906
Anemia                                3002
Depression                            5108
ChronicObstructivePulmonaryDisease    3954
Age                                      0
ChronicDiseaseCount                      0
LengthOfStay                             0
EmergencyVisit                           0
InpatientVisit                           0
OutpatientVisit                          0
TotalVisits                              0
BMIMin                                   0
BMIMax                                   0
BMIMedian                                0
BMIMean                                  0
BPDiastolicMin                           0
BPDiastolicMax                           0
BPDiastolic

In [6]:
def preProcessing(df):
    #replacing all the nan values with 0s in the non numerical columns
    df['Race'] = df['Race'].replace(np.nan, '0')
    df['DiabetesMellitus'] = df['DiabetesMellitus'].replace(np.nan, '0')
    df['ChronicKidneyDisease'] = df['ChronicKidneyDisease'].replace(np.nan, '0')
    df['Anemia'] = df['Anemia'].replace(np.nan, '0')
    df['Depression '] = df['Depression '].replace(np.nan, '0')
    df['ChronicObstructivePulmonaryDisease'] = df['ChronicObstructivePulmonaryDisease'].replace(np.nan, '0')

    #to check if nan values have been removed or not
    a = df.isnull().sum()
    print("Nan values:\n",a,"\n")
    #to see the number of zeros in each column
    for i in df.columns:
        count = (df[i]==0).sum()
        print(i)
        print(count,"\n")
        
    #for coulmns with less than 3500 0s, replace the 0s with the mean of that column. 
    #if we will replace 0s with mean in columns with more 0s, the data will become overfit

    df['ChronicDiseaseCount'] = df['ChronicDiseaseCount'].replace(0, df['ChronicDiseaseCount'].mean())
    df['Age'] = df['Age'].replace(0, df['Age'].mean())
    df['EmergencyVisit'] = df['EmergencyVisit'].replace(0, df['EmergencyVisit'].mean())
    df['InpatientVisit'] = df['InpatientVisit'].replace(0, df['InpatientVisit'].mean())
    df['TotalVisits'] = df['TotalVisits'].replace(0, df['TotalVisits'].mean())
    
#     #storing the names of all the columns with strings
#     ar = ['Race','DischargeDisposision','Gender','DiabetesMellitus','ChronicKidneyDisease','Anemia','Depression ','ChronicObstructivePulmonaryDisease','ReadmissionWithin_90Days']
#     print(ar, "\n")
    
    #we change str values to int as encoders work on uniform data 

    #if number written as a string , to numeric turns str into int
    df['PatientId']   = pd.to_numeric(df['PatientId'])
    #corece replaces values such as "PH1500179668" with nan
    df['EncounterId'] = pd.to_numeric(df['EncounterId'],errors='coerce')
    # df['EncounterID'] = df['EncounterID'].fillna(0)
    df['EncounterId'].fillna(0,inplace = True)
    
    le = LabelEncoder()   
    #labeling all the column names
    #storing the names of all the columns with strings
    ar = ['Race','DischargeDisposision','Gender','DiabetesMellitus','ChronicKidneyDisease','Anemia','Depression ','ChronicObstructivePulmonaryDisease','ReadmissionWithin_90Days']
    df[ar] = df[ar].apply(le.fit_transform)
    
    return df
preProcessing(df)

Nan values:
 PatientId                             0
EncounterId                           0
DischargeDisposision                  0
Gender                                0
Race                                  0
DiabetesMellitus                      0
ChronicKidneyDisease                  0
Anemia                                0
Depression                            0
ChronicObstructivePulmonaryDisease    0
Age                                   0
ChronicDiseaseCount                   0
LengthOfStay                          0
EmergencyVisit                        0
InpatientVisit                        0
OutpatientVisit                       0
TotalVisits                           0
BMIMin                                0
BMIMax                                0
BMIMedian                             0
BMIMean                               0
BPDiastolicMin                        0
BPDiastolicMax                        0
BPDiastolicMedian                     0
BPDiastolicMean            

Unnamed: 0,PatientId,EncounterId,DischargeDisposision,Gender,Race,DiabetesMellitus,ChronicKidneyDisease,Anemia,Depression,ChronicObstructivePulmonaryDisease,...,BetaBlockers,Diuretics,TotalMedicine,CardiacTroponin,Hemoglobin,SerumSodium,SerumCreatinine,BNP,NT-proBNP,ReadmissionWithin_90Days
0,4200412,1.991713e+08,7,1,7,1,0,1,0,1,...,0,0,0,0.0,0.00,0.0,0.000000,0.0,0.0,1
1,4055894,2.670434e+07,7,1,7,1,1,1,1,1,...,1,5,8,0.0,0.00,0.0,1.540000,0.0,0.0,0
2,4867407,6.038822e+07,7,1,7,1,1,1,0,1,...,1,1,2,0.0,10.20,0.0,0.000000,0.0,0.0,0
3,4058064,2.746423e+08,11,0,7,1,0,1,0,1,...,0,0,0,0.0,0.00,132.0,0.000000,0.0,0.0,0
4,4150623,7.000000e+13,19,0,7,0,0,1,0,1,...,0,0,0,0.0,7.26,0.0,0.000000,0.0,0.0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
8476,4152524,4.000442e+10,8,0,7,1,1,1,0,1,...,2,3,6,0.0,0.00,0.0,1.076667,0.0,0.0,1
8477,4042227,1.434795e+10,19,1,7,1,1,0,1,1,...,0,0,0,0.0,0.00,0.0,0.000000,0.0,0.0,1
8478,4603405,6.711773e+07,10,0,7,0,0,0,0,0,...,1,0,1,0.0,0.00,0.0,0.000000,0.0,0.0,0
8479,4033677,6.856439e+07,7,0,7,1,1,0,1,0,...,1,1,5,0.0,0.00,0.0,1.690000,0.0,0.0,1


In [7]:
X = df.drop('ReadmissionWithin_90Days',axis=1)
y = df['ReadmissionWithin_90Days']
X_train, X_test, y_train, y_test = train_test_split(X, y , test_size = 0.2 , random_state = 60)

In [8]:
 #classifier 
from sklearn.ensemble import RandomForestClassifier

c4 = RandomForestClassifier(max_depth=2, random_state=0)
c4.fit(X_train, y_train)
p = c4.predict(X_test)
print ("Accuracy : " , accuracy_score(y_test,p)*100)  
print("Report : \n", classification_report(y_test, p))
print("F1 Score : ",f1_score(y_test, p, average='macro')*100)
in_a = accuracy_score(y_test,p)*100

Accuracy :  68.53270477312905
Report : 
               precision    recall  f1-score   support

           0       0.69      1.00      0.81      1163
           1       0.00      0.00      0.00       534

    accuracy                           0.69      1697
   macro avg       0.34      0.50      0.41      1697
weighted avg       0.47      0.69      0.56      1697

F1 Score :  40.66433566433567


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


In [9]:
def pop_init(size): #initialize popluation
    #making random arrays of 1s and 0s
    arr=[0,1] #1s and 0s becuase we need integer values to add velocities
    select = np.random.choice(arr,size=(size,56))
    return select 

size = 60
select = pop_init(size)
print (select)


[[1 1 0 ... 1 0 0]
 [1 1 0 ... 0 0 1]
 [0 0 1 ... 1 0 0]
 ...
 [0 0 0 ... 1 0 0]
 [0 0 0 ... 0 0 1]
 [1 0 1 ... 0 1 1]]


In [10]:
def colSelect(pop,df): #passing population of 1s and 0s and the df without the last col
    boolarr = pop.astype(bool) #converting array of 1s and 0s to true and false
    s = df.loc[:,boolarr] #using the true false to select the columns with true values
    return s #returning df with those true columns

In [11]:
def fitnessFunc(s): 
   #X = df.drop('ReadmissionWithin_90Days',axis=1) #this is s that is being passed in the parameter
    y = df['ReadmissionWithin_90Days']
    X_train, X_test, y_train, y_test = train_test_split(s, y , test_size = 0.2 , random_state = 60)
    c4 =  RandomForestClassifier(n_estimators = 20, max_depth=25, criterion = "entropy", min_samples_split=10) #using random forest as it has the best accuracy
    c4.fit(X_train, y_train) #fitness function
    p = c4.predict(X_test)
    a = accuracy_score(y_test,p)*100 #finding accuracies 
    return a

In [12]:
def Velocity(pbest,gbest,pop): #function to update the position of the particle
    X = pop
    r1 = np.random.uniform(0,1) #generating random numbers to use in formula
    c1 = 2 #we usually keep this value 2 but can be any other small number
    r2 = np.random.uniform(0,1)
    c2 = 1  #can be a small number 1-4
    arr = []
    l = len(X) #size is 60 so 60 rows, willcalculate velocity for 60 rows
    for i in range(l):
        v = (r1*c1)*(pbest[i]- X[i]) + (r2*c2)*(gbest - X[i]) #formula to calculate velocities
        arr.append(v)
    return arr
    
    

In [15]:
pop = pop_init(60)  # calling function where population is initialized of s and 0s
X = df.drop('ReadmissionWithin_90Days', axis = 1)

def PSOalgo(pop,X) :
    cmp = 0.0
    count = 0
    accuracies = [] #array to store accuracies
    for i in range(0,60): #size of pop was 60
        s = colSelect(pop[i],X) #storing the df with only columns with true values
        f = fitnessFunc(s) #finding accuracy
        accuracies.append(f) 
    print("\nMax Accuracy of population:", max(accuracies)) #finding maximum accuracy out of those 60 accuracies 
    oldpop = pop #storing the original popluation generated
    mat = pop 
    for a in range (0,70): #run for 70 iterations
        
        pbest  = mat
        gbestacc= max(accuracies) #storing the max accuracy
        index = np.argmax(accuracies, axis = 0) #returns the index of highest accuracy in the array accuracies
        gbest = pbest[index] #pbest has the population and this will give the value stored at the highest accuracy index of pop
        vel = Velocity(pbest,gbest, pop)  #finding velocity
        #gbest = np.array(gbest)
        pop = pop + vel #updating population (positions) but this gives values other than 1s and 0s
        
        #to convert back into 1s and 0s
        for i in range(len(pop)):
            maxx = max(pop[i]) 
            pop[i] = pop[i] / maxx 
             
        for i in range(60):
            for j in range(0,56):
                if(pop[i][j] > 0.5).any():
                    pop[i][j] = 1
                else:
                    pop[i][j] = 0
        
        neww = pop # new particles stored here 
    
        newBest = []
        newacc = [] #to find new accuracies of new particles
        fit = []
        #finding fitness values of new particles
        for i in range(0,60):#because size in pop init function is 60
            s = colSelect(neww[i],X)
            f = fitnessFunc(s)
            fit.append(f) #store in fitness array
         
        #comparing and assigning highest fitness values
        for i in range(60):
            if(fit[i] > accuracies[i]):
                newBest.append(neww[i])
                newacc.append(fit[i])
            else:
                newBest.append(oldpop[i])
                newacc.append(accuracies[i])
            
        mat =  np.array(newBest) #best accuracy one stored here
        accuracies = newacc
        
        m_ac = max(accuracies)
        
        #termination condition for stability
        
        if cmp == m_ac:
            count = count + 1
        cmp = m_ac
        
        if count == 7: #for stability the similar accuracies occurs at least 7 times
            p = colSelect(gbest,X) #gbest has 1s and 0s, we need true false
            print("Particle which gives best accuracy: ",p)
            print("\nThe maximum acc is \n",gbestacc)
            g = gbestacc
            return p,g
        
        print("\nMaximum Accuracy is: ", gbestacc) 
        
        if(gbestacc) > 73: #terminating coondition, our algo will try to reach this accuracy
            p = colSelect(gbest,X)
            print("Particle which gives best accuracy:\n ",p)
            print("\nThe maximum acc is \n",gbestacc)
            g = gbestacc
            return p, g 

p, g = PSOalgo(pop,X) 


Max Accuracy of population: 72.00942840306423

Maximum Accuracy is:  72.00942840306423

Maximum Accuracy is:  72.00942840306423

Maximum Accuracy is:  72.00942840306423

Maximum Accuracy is:  73.01119622863878
Particle which gives best accuracy:
         EncounterId  DischargeDisposision  ChronicKidneyDisease  \
0     1.991713e+08                     7                     0   
1     2.670434e+07                     7                     1   
2     6.038822e+07                     7                     1   
3     2.746423e+08                    11                     0   
4     7.000000e+13                    19                     0   
...            ...                   ...                   ...   
8476  4.000442e+10                     8                     1   
8477  1.434795e+10                    19                     1   
8478  6.711773e+07                    10                     0   
8479  6.856439e+07                     7                     1   
8480  1.452511e+09       

In [16]:
print("Initial accuracy without PSO : ",in_a)
print("Final accuracy after PSO: ",g)

Initial accuracy without PSO :  68.53270477312905
Final accuracy after PSO:  73.01119622863878
