In [35]:
#%config IPCompleter.greedy=True

# Logistic Regression Model

### Base code: Julien Courbebaisse
### Revisions and modifications: Matheus Faria and Luiz Resende Silva 

The entire code was implemented using two main libraries to handle data and perform the calculations: Pandas and NumPy.

In [56]:
import numpy as np
import pandas as pd
import time
import timeit

Firstly, the function was implemented to be used to import the data sets, remove malformed values and extract basic statistics and pairwise correlation coerfficients between the columns (features and true labels)

In [65]:
def Read_File_DF(File_Name, separation = ",", head = None, replace = [], drop=True):
    try:
        separation = separation.lower()
        if(separation == "space" or separation == "tab"):
            separation = "\t"
        Raw_Data_Set = pd.read_csv(File_Name, delimiter=separation, header=head, na_values=replace)
        RawRowsColumns = Raw_Data_Set.shape
        if(replace != None):
            Missing = Raw_Data_Set.isnull().sum().sum()
            print("Total number of missing/anomalous 'entries' in the data set: ",Missing)
            if(drop == True):
                Raw_Data_Set.dropna(axis=0, how='any', inplace=True)
                CleanRowsColumns = Raw_Data_Set.shape
                print("Number of examples with missing values deleted from data set: ",(RawRowsColumns[0]-CleanRowsColumns[0]))
        Data_Set = Raw_Data_Set.to_numpy()
        return Data_Set
    except:
        print("READ_FILE_ERROR\n")

Other few functions were implemented to perform the binary classifiacation of the true labels from the data sets. The function were designed to be more general and allow modifications on both datasets

In [66]:
#Function replaces both by boundary (False) and fixed values (True) - each will be used in a different data set
def replace_class(vector, index, flag=False):
    if(flag == True):
        if vector[index]==4:
            vector[index]=1
            return vector
        elif(vector[index]==2):
            vector[index]=0
            return vector
    elif(flag == False):
        if (vector[index]<=5):
            vector[index]=0
            return vector
        elif(vector[index]>5):
            vector[index]=1
            return vector

#Function either adds (False) or substitutes (True) column by the intercept bias weights
def substi_add(dataset, index, flag=True): #Adding column of 1 to determine bias term
    if(flag==True):
        for element in dataset:
            element[index] = 1
        return dataset
    else:
        temp = dataset
        temp = np.insert(temp, 0, 1, axis=1)
        return temp    

In [67]:
def preproc(Dataset, Index1, Index2=None, Flag=True):
    if(Flag==True):
        for element in Dataset:
            element = replace_class(element,index=Index1, flag=Flag)
        Dataset = substi_add(Dataset, index=Index2, flag=Flag)
        return Dataset
    else:
        for element in Dataset:
            element = replace_class(element,index=Index1, flag=Flag)
        Dataset = substi_add(Dataset, index=Index2, flag=Flag)
        return Dataset

Another function was implemented to retrieve information about the data sets, such as general statistics and Spearman Rank Correlation between columns

In [68]:
def Data_Stats(Data, QQ_DD = True, show = True):
    try:
        Data_Set = pd.DataFrame(Data, index = (list(range(0,Data.shape[0]))), columns = (list(range(0,Data.shape[1]))))
        if(QQ_DD == True):           
            quantiles = [0.00, 0.25, 0.50, 0.75] #Calculating quartiles
        else:
            quantiles = [0.00, 0.10, 0.20, 0.30, 0.40, 0.50, 0.60, 0.70, 0.80, 0.90, 1.00] #Calculating quartiles

        Describ = Data_Set.describe(percentiles = quantiles) #Data set general stats description
        Correlation = Data_Set.corr('spearman') #Computing pairwise feature correlation through Spearman rank correlation
        name = ("GeneralStats.xlsx")
        with pd.ExcelWriter(name) as writer: #Outputting Excel file with statistics
            Describ.to_excel(writer, sheet_name='Data_Description')
            Correlation.to_excel(writer, sheet_name='Column_Correlation')
        if(show == True):
            print(Data_Set)
            print(Describ) #Printing statistics to screen
            print(Correlation) #Printing statistics to screen
    except:
        print("STATS_FUNCTION_ERROR\n")

Lastly, the Linear Classification model was implemented as a Python class. The class was called logistiReg and all the functions needed to perform the Logistic Regression were implemented inside it

In [69]:
class logisticReg(object):
    """Class to build logistic regression model"""
        
    #Constructor called passing data set
    def __init__(self, dataset):
        self.dataset = dataset
    
    #Sigmoid function
    def sigmoid(self, a):
        sigma = (1/(1+(np.exp(-a))))
        return sigma
        
    #Update rule function
    def update(self, weights, dataset, Lrate):
        new_weights=weights #Initializing weights vector
        suma=0
        for element in dataset:
            suma += (((element[(self.Features_Selec)])*((element[(self.Outcome_Col)])-(self.sigmoid(new_weights.dot(element[(self.Features_Selec)]))))))
        new_weights = np.add(new_weights, (Lrate*suma)) #Updating weights
        return new_weights

    #Function to calculate the Cross-Entropy Loss
    def Cross_Entropy(self , data, weight):
        CE = 0
        for element in data:
            Term1 = np.dot(element[(self.Outcome_Col)], np.log(self.sigmoid(weight.dot(element[(self.Features_Selec)]))))
            Term2 = np.dot((1 - element[(self.Outcome_Col)]),np.log(1 - self.sigmoid(weight.dot(element[(self.Features_Selec)]))))
            CE += - np.add(Term1,Term2)
        return CE
    
    
    #Fitting model function    
    def fit_model(self, Training_Set, lstFeatures, outCol, rate=0.00001, maxIter=1000, reduct=1, ran=False):
        self.Features_Selec = lstFeatures #List of features selected to fit the model
        self.Outcome_Col = outCol #Index of outcome column
        
        if(ran == True):
            w = np.random.randint(0,10,len(self.Features_Selec)) #Starting vector of weights with random numbers between 0-10
        else:
            w = np.zeros(len(self.Features_Selec)) #Starting vector of weights with zeros
        CE_D = []
        i=0
        while i < maxIter:
            w = self.update(weights=w, dataset=Training_Set, Lrate=rate)
            cost = self.Cross_Entropy(data=Training_Set, weight=w)
            CE_D.append(cost)
            rate = rate/reduct
            i+=1
#        print(CE_D)
        return w
    
    #Function to use the weights calculated to perform predictions
    def predict(self, data, weights):
        truelabel = []
        predicted = []
        compare = []
        for example in data:
            realpred = np.dot(example[self.Features_Selec],weights)
            if(realpred <= 0.5):
                predicted.append(int(0))
            else:
                predicted.append(int(1))
            truelabel.append(int(example[self.Outcome_Col]))
        
        compare.append(truelabel)
        compare.append(predicted)
        return compare    

    def evaluate_acc(self, Preds):
        score = 0
        TP = 0
        FN = 0
        Predictions = np.transpose(Preds)
        for pred in Predictions:
            if pred[1]==pred[0]:
                score+=1
                if(pred[1]==1):
                    TP += 1
            else:
                if(pred[1]==0):
                    FN += 1
        
        print("Correct Predictions: ",score)
        print("Total Predictions: ",len(Predictions))
        print("Accuracy Percentage: ",round(((score/len(Predictions))*100),4),"%")
        accur = ((score/len(Predictions))*100)
        if(TP>0 or FN>0):
            print("Sensitivity Percentage: ",round(((TP/(TP+FN))*100),4),"%")
            sense = ((TP/(TP+FN))*100)
        else:
            print("Sensitivity Percentage: 0.0000%")
            sense = 0
        return accur, sense
    
    def kfold(self,k, data, LstFeatures, OutCol, Rate=0.00001, MaxIter=1000, Reduct=1, Ran=False):
        i=0
        score1=0
        score2=0
        batch_size=np.floor(data[:,1].size/k)
        while i < k: 
            if(i<k-1):
                training_data=np.delete(data, np.s_[(int)(i*batch_size):(int)((i+1)*batch_size)], 0)
                test_data= data[(int)(i*(batch_size)):(int)((i+1)*(batch_size))]
                weights = self.fit_model(training_data, LstFeatures, OutCol, rate=Rate, maxIter=MaxIter, reduct=Reduct, ran=Ran)
                preds = self.predict(test_data, weights)
                print("\nSegmentation #",i+1," Results:")
                temp1, temp2 = self.evaluate_acc(preds)
                score1+=temp1
                score2+=temp2
                i+=1
            else:
                training_data=np.delete(data, np.s_[(int)(i*batch_size):], 0)
                test_data= data[(int)(i*(batch_size)):]
                weights = self.fit_model(training_data, LstFeatures, OutCol, rate=Rate, maxIter=MaxIter, reduct=Reduct, ran=Ran)
                preds = self.predict(test_data, weights)
                print("\nSegmentation #",i+1," Results:")
                temp1, temp2 = self.evaluate_acc(preds)
                score1+=temp1
                score2+=temp2
                i+=1
        print("\n##### RESULTS FOR THE kFOLD CROSS VALIDATION WERE #####")
        print("Average Accuracy:",round((score1/k),3),"%")
        print("Average Sensitivity:",round((score2/k),3),"%")

With the above class and functions, we can perform the Logistic Regression in each of the data sets

## Breast Cancer Data Set

In the breast cancer data set, only 16 instances were found with incorrect values ("?"). These examples were removed from the data set when importing by flagging these "?" characters

In [42]:
#Load data and remove missing values
filename = 'breast-cancer-wisconsin.data'
databc = Read_File_DF(filename, separation=',', head=None, replace=["?"], drop=True)
databc

Total number of missing/anomalous 'entries' in the data set:  16
Number of examples with missing values deleted from data set:  16


array([[1.000025e+06, 5.000000e+00, 1.000000e+00, ..., 1.000000e+00,
        1.000000e+00, 2.000000e+00],
       [1.002945e+06, 5.000000e+00, 4.000000e+00, ..., 2.000000e+00,
        1.000000e+00, 2.000000e+00],
       [1.015425e+06, 3.000000e+00, 1.000000e+00, ..., 1.000000e+00,
        1.000000e+00, 2.000000e+00],
       ...,
       [8.888200e+05, 5.000000e+00, 1.000000e+01, ..., 1.000000e+01,
        2.000000e+00, 4.000000e+00],
       [8.974710e+05, 4.000000e+00, 8.000000e+00, ..., 6.000000e+00,
        1.000000e+00, 4.000000e+00],
       [8.974710e+05, 4.000000e+00, 8.000000e+00, ..., 4.000000e+00,
        1.000000e+00, 4.000000e+00]])

In the breast cancer data, the first column referent to the sample IDs was replaced by the intercept values since these information was not needed. Therefore, for this data set, the first weight in the vector $ w_{k}$ containing $k=m+1$ weights will be the intercept. The resulting data matrix will have6 683 rows and 11 columns: 9 features plus 1 intercept and the true labels

In [43]:
databc = preproc(databc, Index1=10, Index2=0, Flag=True)
databc

array([[ 1.,  5.,  1., ...,  1.,  1.,  0.],
       [ 1.,  5.,  4., ...,  2.,  1.,  0.],
       [ 1.,  3.,  1., ...,  1.,  1.,  0.],
       ...,
       [ 1.,  5., 10., ..., 10.,  2.,  1.],
       [ 1.,  4.,  8., ...,  6.,  1.,  1.],
       [ 1.,  4.,  8., ...,  4.,  1.,  1.]])

Statistics on the data set

In [44]:
Data_Stats(databc, QQ_DD = True, show = True)

      0     1     2     3     4    5     6     7     8    9    10
0    1.0   5.0   1.0   1.0   1.0  2.0   1.0   3.0   1.0  1.0  0.0
1    1.0   5.0   4.0   4.0   5.0  7.0  10.0   3.0   2.0  1.0  0.0
2    1.0   3.0   1.0   1.0   1.0  2.0   2.0   3.0   1.0  1.0  0.0
3    1.0   6.0   8.0   8.0   1.0  3.0   4.0   3.0   7.0  1.0  0.0
4    1.0   4.0   1.0   1.0   3.0  2.0   1.0   3.0   1.0  1.0  0.0
5    1.0   8.0  10.0  10.0   8.0  7.0  10.0   9.0   7.0  1.0  1.0
6    1.0   1.0   1.0   1.0   1.0  2.0  10.0   3.0   1.0  1.0  0.0
7    1.0   2.0   1.0   2.0   1.0  2.0   1.0   3.0   1.0  1.0  0.0
8    1.0   2.0   1.0   1.0   1.0  2.0   1.0   1.0   1.0  5.0  0.0
9    1.0   4.0   2.0   1.0   1.0  2.0   1.0   2.0   1.0  1.0  0.0
10   1.0   1.0   1.0   1.0   1.0  1.0   1.0   3.0   1.0  1.0  0.0
11   1.0   2.0   1.0   1.0   1.0  2.0   1.0   2.0   1.0  1.0  0.0
12   1.0   5.0   3.0   3.0   3.0  2.0   3.0   4.0   4.0  1.0  1.0
13   1.0   1.0   1.0   1.0   1.0  2.0   3.0   3.0   1.0  1.0  0.0
14   1.0  

Calling constructor for the 

In [74]:
modelBC = logisticReg(databc)

Defining the set/subset of features to be selected (_always between 0-9 with the 0 included_ - $w_{0}$)

In [78]:
feats = [0,1,2,3,4,5,6,7,8,9,10]

Defining the set/subset of examples to be used in training and validating the model ( _for uses outside kFold method_ )

In [79]:
training = databc[:400,:]
testing = databc[500:750,:]

Running model outside kFold method ( _single training and validation_ )

In [None]:
start = timeit.default_timer() #Starting clock to count time for training

weights = modelBC.fit_model(Training_Set=training, lstFeatures=feats, outCol=10, rate=0.00001, maxIter=1000, reduct=1, ran=False)

stop = timeit.default_timer()
print("Model trained in:",stop-start,"s")

weights

Validating the model

In [None]:
predictions = modelBC.predict(testing, weights)
predictions

Assessment of the model

In [None]:
modelBC.evaluate_acc(Preds=predictions)

Running kFolda Cross-Validation

In [83]:
start = timeit.default_timer() #Starting clock

modelBC.kfold(k=5,data=databc, LstFeatures=feats, OutCol=10, Rate=0.00001, MaxIter=1000, Reduct=1, Ran=False)

stop = timeit.default_timer()
print("kFold validation for Breast Cancer data ser performed in:",stop-start,"s")




Segmentation # 1  Results:
Correct Predictions:  126
Total Predictions:  136
Accuracy Percentage:  92.6471 %
Sensitivity Percentage:  88.5246 %

Segmentation # 2  Results:
Correct Predictions:  132
Total Predictions:  136
Accuracy Percentage:  97.0588 %
Sensitivity Percentage:  98.4375 %

Segmentation # 3  Results:
Correct Predictions:  135
Total Predictions:  136
Accuracy Percentage:  99.2647 %
Sensitivity Percentage:  97.9592 %

Segmentation # 4  Results:
Correct Predictions:  133
Total Predictions:  136
Accuracy Percentage:  97.7941 %
Sensitivity Percentage:  93.3333 %

Segmentation # 5  Results:
Correct Predictions:  136
Total Predictions:  139
Accuracy Percentage:  97.8417 %
Sensitivity Percentage:  91.4286 %

##### RESULTS FOR THE kFOLD CROSS VALIDATION WERE #####
Average Accuracy: 96.921 %
Average Sensitivity: 93.937 %
kFold validation for Breast Cancer data ser performed in: 24.679846299999554 s


## Red Wine Data Set

In the red wine quality data set, no instances with incorrect of missing values were found. Therefore, no example was removed from the data set

In [70]:
#Load data and remove missing values
filename = 'winequality_red.csv'
datawine = Read_File_DF(filename, separation=';', head=0, replace=["?"], drop=False)
datawine

Total number of missing/anomalous 'entries' in the data set:  0


array([[ 7.4  ,  0.7  ,  0.   , ...,  0.56 ,  9.4  ,  5.   ],
       [ 7.8  ,  0.88 ,  0.   , ...,  0.68 ,  9.8  ,  5.   ],
       [ 7.8  ,  0.76 ,  0.04 , ...,  0.65 ,  9.8  ,  5.   ],
       ...,
       [ 6.3  ,  0.51 ,  0.13 , ...,  0.75 , 11.   ,  6.   ],
       [ 5.9  ,  0.645,  0.12 , ...,  0.71 , 10.2  ,  5.   ],
       [ 6.   ,  0.31 ,  0.47 , ...,  0.66 , 11.   ,  6.   ]])

In this data set, the intercept value weights $ w_{0}$ were added as a new column to the beginning of the matrix. The resulting data matrix will have6 1599 rows and 12 columns: 10 features plus 1 intercept and the true labels

In [71]:
datawine = preproc(datawine, Index1=11, Index2=None, Flag=False)
datawine

array([[ 1.   ,  7.4  ,  0.7  , ...,  0.56 ,  9.4  ,  0.   ],
       [ 1.   ,  7.8  ,  0.88 , ...,  0.68 ,  9.8  ,  0.   ],
       [ 1.   ,  7.8  ,  0.76 , ...,  0.65 ,  9.8  ,  0.   ],
       ...,
       [ 1.   ,  6.3  ,  0.51 , ...,  0.75 , 11.   ,  1.   ],
       [ 1.   ,  5.9  ,  0.645, ...,  0.71 , 10.2  ,  0.   ],
       [ 1.   ,  6.   ,  0.31 , ...,  0.66 , 11.   ,  1.   ]])

Statistics on the data set

In [72]:
Data_Stats(datawine, QQ_DD = True, show = True)

       0     1      2     3     4      5     6      7        8     9     10  \
0     1.0   7.4  0.700  0.00   1.9  0.076  11.0   34.0  0.99780  3.51  0.56   
1     1.0   7.8  0.880  0.00   2.6  0.098  25.0   67.0  0.99680  3.20  0.68   
2     1.0   7.8  0.760  0.04   2.3  0.092  15.0   54.0  0.99700  3.26  0.65   
3     1.0  11.2  0.280  0.56   1.9  0.075  17.0   60.0  0.99800  3.16  0.58   
4     1.0   7.4  0.700  0.00   1.9  0.076  11.0   34.0  0.99780  3.51  0.56   
5     1.0   7.4  0.660  0.00   1.8  0.075  13.0   40.0  0.99780  3.51  0.56   
6     1.0   7.9  0.600  0.06   1.6  0.069  15.0   59.0  0.99640  3.30  0.46   
7     1.0   7.3  0.650  0.00   1.2  0.065  15.0   21.0  0.99460  3.39  0.47   
8     1.0   7.8  0.580  0.02   2.0  0.073   9.0   18.0  0.99680  3.36  0.57   
9     1.0   7.5  0.500  0.36   6.1  0.071  17.0  102.0  0.99780  3.35  0.80   
10    1.0   6.7  0.580  0.08   1.8  0.097  15.0   65.0  0.99590  3.28  0.54   
11    1.0   7.5  0.500  0.36   6.1  0.071  17.0  102

Calling constructor for the 

In [84]:
modelWine = logisticReg(datawine)

Defining the set/subset of features to be selected (_always between 0-10 with the 0 included_ - $w_{0}$)

In [96]:
feats = [0,1,2,3,4,5,6,7,8,9,11]

Defining the set/subset of examples to be used in training and validating the model ( _for uses outside kFold method_ )

In [95]:
training = datawine[250:750,:]
testing = datawine[1000:1350,:]

Running model outside kFold method ( _single training and validation_ )

In [None]:
start = timeit.default_timer() #Starting clock to count time for training

weights = modelWine.fit_model(Training_Set=training, lstFeatures=feats, outCol=12, rate=0.00001, maxIter=100, reduct=1, ran=False)

stop = timeit.default_timer()
print("Model trained in:",stop-start,"s")

weights

Validating the model

In [None]:
predictions = modelWine.predict(testing, weights)
predictions

Assessment of the model

In [None]:
modelWine.evaluate_acc(Preds=predictions)

Running kFolda Cross-Validation

In [97]:
start = timeit.default_timer() #Starting clock

modelWine.kfold(k=5,data=datawine, LstFeatures=feats, OutCol=12, Rate=0.00001, MaxIter=1000, Reduct=1, Ran=False)

stop = timeit.default_timer()
print("kFold validation for Wine Data set performed in:",stop-start,"s")




Segmentation # 1  Results:
Correct Predictions:  197
Total Predictions:  319
Accuracy Percentage:  61.7555 %
Sensitivity Percentage:  4.8387 %

Segmentation # 2  Results:
Correct Predictions:  207
Total Predictions:  319
Accuracy Percentage:  64.8903 %
Sensitivity Percentage:  93.8462 %

Segmentation # 3  Results:
Correct Predictions:  167
Total Predictions:  319
Accuracy Percentage:  52.3511 %
Sensitivity Percentage:  98.0132 %

Segmentation # 4  Results:
Correct Predictions:  225
Total Predictions:  319
Accuracy Percentage:  70.5329 %
Sensitivity Percentage:  93.2432 %

Segmentation # 5  Results:
Correct Predictions:  163
Total Predictions:  323
Accuracy Percentage:  50.4644 %
Sensitivity Percentage:  95.092 %

##### RESULTS FOR THE kFOLD CROSS VALIDATION WERE #####
Average Accuracy: 59.999 %
Average Sensitivity: 77.007 %
kFold validation for Wine Data set performed in: 475.23168780000015 s
