# Linear Discriminant Analysis (LDA) Model
## COMP 551 Project 01

#### Luiz Resende Silva - 260852243

As per the recommendation in the project instructions, the model was implemented as a Python class called LDA_Model(). Before this class was built, another one called WorkData() was coded with general functions to clean/modify data, transform outputs from multiple classes into a binary classification, and select and retrieve data from the imported data set (for later use in fitting and validation). The LDA class was designed to inherit methods and objects from the WorkData class.

In [16]:
### IMPORTING LIBRARIES AND METHODS ###

import numpy as np #NumPy library
import pandas as pd # Pandas library
import matplotlib.pyplot as plt # Matplot library
import random
import math

In [17]:
##############################################################################################################################
################################################            BEGINNING OF CLASS        ########################################

class WorkData(object):
    """ General helper class to load, inspect, binarize, slice and plot a numeric data set.
    The data lives in self.Data_Set (an empty list until Read_File or a caller assigns a NumPy array);
    To_Pandas_DF additionally stores a DataFrame copy in self.Data_Set_DF.
    Every public method reports failures by printing an error code and returning None instead of raising. """

    def __init__(self):
        """ Constructor for the general class to work with the data - Initializes empty list
        INPUT: Not required """
        self.Data_Set = []

    def _Select_Data(self, ListFeatures, ListExamples = None):
        """ Private helper shared by Get_Data and Get_Data_Internal: slices self.Data_Set.
        INPUT: list of column indices and optional list/array of row indices (None = every row).
        OUTPUT: the selected sub-matrix; indexing errors propagate to the caller """
        # "is None" (not "== None") so that NumPy arrays can be passed as row selectors
        if ListExamples is None:
            return self.Data_Set[:, ListFeatures]
        return self.Data_Set[ListExamples, :][:, ListFeatures]

    def Read_File(self, File_Name, separation = ",", header = 0):
        """ The function is designed to open a delimited text file (comma by default) as a NumPy array.
        INPUT: name of the file, the separator (the words "space" and "tab" are accepted as aliases)
                and the number of leading rows to skip (header)
        OUTPUT: function stores the data in self.Data_Set and returns the NumPy array (None on failure) """
        try:
            self.File_Name = File_Name
            separation = separation.lower()
            if separation == "space":
                separation = " "
            elif separation == "tab":
                separation = "\t" #A tab-delimited file needs a real tab, not a space
            Data_Set = np.loadtxt(File_Name, delimiter = separation, skiprows = header)
            self.Data_Set = Data_Set
            return Data_Set
        except Exception:
            print("READ_FILE_ERROR: error occured at function to read file of data set, look there.\tERROR_CODE:RF00000001PY\n")

    def To_Pandas_DF(self):
        """ The function is designed to convert the NumPy array to a Pandas DataFrame.
        INPUT: not required
        OUTPUT: nothing is returned; the DataFrame is stored in self.Data_Set_DF with integer row/column labels """
        try:
            rows, columns = self.Data_Set.shape[0], self.Data_Set.shape[1]
            self.Data_Set_DF = pd.DataFrame(self.Data_Set, index = list(range(0, rows)), columns = list(range(0, columns)))
        except Exception:
            print("DATAFRAME_CONVERT_ERROR: error occured at function to convert Numpy structure to Pandas DataFrame, look there.\tERROR_CODE:DF00000001PY\n")

    def QQ_File_Stats(self, name = "QuartileStats_"):
        """ The function is designed to calculate general statistics in Pandas' DataFrame and report in quartiles.
        INPUT: name for saving file - Default value = "QuartileStats_" (the data file name is appended to the default).
                Requires To_Pandas_DF to have been called first.
        OUTPUT: Function writes the stats to a csv file and prints them """
        try:
            percent = [0.00, 0.25, 0.50, 0.75] #Describing the quartiles to be calculated
            quartiles = self.Data_Set_DF.describe(percentiles = percent)
            if name == "QuartileStats_":
                name = name + self.File_Name
            quartiles.to_csv(name) #Outputting csv file with statistics
            print(quartiles) #Printing statistics to screen
        except Exception:
            print("PRINT_FILE_ERROR: error occured at function to print general stats from data set, look there.\tERROR_CODE:PF00000001PY\n")

    def DD_File_Stats(self, name = "DecileStats_"):
        """ The function is designed to calculate general statistics in Pandas' DataFrame and report in deciles.
        INPUT: name for saving file - Default value = "DecileStats_" (the data file name is appended to the default).
                Requires To_Pandas_DF to have been called first.
        OUTPUT: Function writes the stats to a csv file and prints them """
        try:
            percent = [0.00, 0.10, 0.20, 0.30, 0.40, 0.50, 0.60, 0.70, 0.80, 0.90, 1.00] #Describing the deciles to be calculated
            deciles = self.Data_Set_DF.describe(percentiles = percent)
            if name == "DecileStats_":
                name = name + self.File_Name
            deciles.to_csv(name) #Outputting csv file with statistics
            print(deciles) #Printing statistics to screen
        except Exception:
            print("PRINT_FILE_ERROR: error occured at function to print general stats from data set, look there.\tERROR_CODE:PF00000002PY\n")

    def Binary_Classification(self, index, specific = False, boundary = None, MinMax = None):
        """ The function is designed to overwrite one column of the data set, in place, with its binary classification
        INPUT: index of the column to classify, then either
                - specific = False: a boundary (values <= boundary become 0, values > boundary become 1);
                  if no boundary is passed the function asks for one and then applies it, or
                - specific = True: MinMax, a two-item list with the value mapped to 0 and the value mapped to 1
                  (any other value is left untouched).
        OUTPUT: the data set with the column replaced by binary values (None on failure) """
        try:
            if not specific:
                if boundary is None:
                    # The prompted threshold must actually be used below (it used to be discarded)
                    boundary = int(input("What is the threshold ultil which values are classified as 0? " ))
                for row in self.Data_Set: #Each row is a view, so the assignment updates self.Data_Set
                    if row[index] <= boundary:
                        row[index] = 0
                    elif row[index] > boundary:
                        row[index] = 1
            else:
                for row in self.Data_Set:
                    if row[index] == MinMax[0]:
                        row[index] = 0
                    elif row[index] == MinMax[1]:
                        row[index] = 1
            return self.Data_Set
        except Exception:
            print("BINARY_DATA_HANDLING_ERROR: error occured at function to convert outputs to binary values, look there.\tERROR_CODE:BF00000001PY\n")

    def Get_Data(self, other, ListFeatures, ListExamples = None, numb = True):
        """ The function is designed to retrieve selected data and hand it to another WorkData object.
        INPUT: object receiving the selection (its Data_Set is overwritten), list of indices for features (columns)
                and another for examples (rows; if not passed retrieves all examples).
                numb is kept for compatibility only - both values select by integer index.
        OUTPUT: Function returns the selected data (None on failure) """
        try:
            other.Data_Set = self._Select_Data(ListFeatures, ListExamples)
            return other.Data_Set
        except Exception:
            print("RETRIEVE_DATA_ERROR: error occured at function to retrive the data from object and pass it to variable, look there.\tERROR_CODE:SD00000001PY\n")

    def Get_Data_Internal(self, ListFeatures, ListExamples = None, numb = True):
        """ The function is designed to retrieve selected data without the need of a receiving object.
        INPUT: list of indices for features (columns) and another for examples (rows; if not passed retrieves
                all examples). numb is kept for compatibility only - both values select by integer index.
        OUTPUT: Function returns the selected data (None on failure) """
        try:
            return self._Select_Data(ListFeatures, ListExamples)
        except Exception:
            print("RETRIEVE_DATA_ERROR: error occured at function to filter data from data set internaly on the class, look there.\tERROR_CODE:SD00000001PY\n")

    def Binary_Values_Dist(self, LookFeatures, LookExamples = None, Show = False, numb = True):
        """ The function is designed to count binary values and report the share of each class
        INPUT: list with the index of the binary column, optional list of examples (rows), Show flag to print
                the percentages and numb flag (only numb = True, integer indices, is supported)
        OUTPUT: Function prints out the percentages when Show = True; nothing is returned """
        try:
            Temp_Vector1 = np.transpose(self.Get_Data_Internal(ListFeatures = LookFeatures, ListExamples = LookExamples, numb = numb)) #Getting vector of binary variables
            if not numb:
                print("FUNCTION RECEIVES ONLY NUMPY STRUCTURE")
            else:
                N_1 = np.count_nonzero(Temp_Vector1 == 1) #Counting numbers equal 1
                N_0 = Temp_Vector1.shape[1] - N_1 #Inferring numbers equal 0: everything that is not 1
                Positive = N_1 / (N_1 + N_0)
                Negative = N_0 / (N_1 + N_0)
                if Show:
                    print("The percentage of positive values is: ",round(float(Positive*100),2),"%")
                    print("The percentage of negative values is: ",round(float(Negative*100),2),"%")
        except Exception:
            print("CALCULATION_ERROR: error occured at function to calculate percentages of binary classes, look there.\tERROR_CODE:DE00000001PY\n")

    def To_Screen(self):
        """ The function is designed to print objects to screen
        INPUT: None
        OUTPUT: prints matrix/vector """
        try:
            print(self.Data_Set)
        except Exception:
            print("PRINT_ERROR: error occured at function to print objects' values to screen, look there.\tERROR_CODE:PS00000001PY\n")

    def Get_Matrix(self):
        """ The function is designed to return an independent NumPy copy of the data set
        INPUT: None
        OUTPUT: returns NumPy array/matrix, whether the data set is stored as a DataFrame or as a NumPy array """
        try:
            if hasattr(self.Data_Set, "to_numpy"): #Data set stored as a Pandas structure
                return self.Data_Set.to_numpy(copy = True)
            return np.array(self.Data_Set, copy = True) #Data set already NumPy (what Read_File stores)
        except Exception:
            print("GET_MATRIX_ERROR: error occured at function to convert DataFrame to Numpy, look there.\tERROR_CODE:NP00000001PY\n")

    def Scatter_Plot(self, XY, Title = "MyScatterPlot", LabelX = "X axis", LabelY = "Y axis", LabelZ = None, ColorCode = "magma"):
        """ The function is designed to make scatter plots, optionally colored by a third data column
        INPUT: list type with x and y (and z) column indices, name of the plot (default MyScatterPlot), axis labels,
                label of the coloring column (default None = single color) and colormap (default magma)
        OUTPUT: saves the scatter plot as <Title>.png and shows it """
        try:
            Matrix = np.transpose(self.Data_Set) #One row per data column
            if LabelZ is None:
                ColorCode = None
                Data = {LabelX:Matrix[XY[0]], LabelY:Matrix[XY[1]]}
            else:
                Data = {LabelX:Matrix[XY[0]], LabelY:Matrix[XY[1]], LabelZ:Matrix[XY[2]]}

            fig, ax = plt.subplots(figsize = [15,10])
            ax.grid(True)
            ax.scatter(LabelX, LabelY, c = LabelZ, s = 100, data = Data, cmap = ColorCode, marker = 'o')
            fig.suptitle(Title,fontsize = 26)
            ax.set_xlabel(LabelX, fontsize = 20)
            ax.set_ylabel(LabelY, fontsize = 20)
            # Save before showing: once show() returns the figure may already be released,
            # which would leave an empty PNG on disk
            fig.savefig(Title+".png") #Saves as a PNG file in the directory folder
            plt.show()
        except Exception:
            print("SAVE_PLOT_ERROR: error occured at plotting function, look there.\tERROR_CODE:SP00000001PY\n")

########################################################################################################################
################################################            END OF CLASS        ########################################

In [18]:
##############################################################################################################################
################################################            BEGINNING OF CLASS        ########################################

class LDA_Model(WorkData):
    """ Two-class Linear Discriminant Analysis built on top of WorkData.
    Fit_Model estimates the class probabilities, the class means and the covariance matrix shared by both
    classes from the training rows; Predict_Function then classifies rows of the same data set.
    Every method reports failures by printing an error code and returning None instead of raising. """

    def __init__(self, Entire_Data_Set, Training_Features, Training_Outputs, Training_Examples = None,):
        """ Constructor for the Linear Discriminant Analysis class - Initializes with examples NumPy array to store data,
            list type with indices for Output column, list with indices for the chosen features and list of
            chosen examples to train the model
        INPUT: NumPy array and three lists (Training_Examples = None uses every row) """
        super().__init__()
        self.Data_Set = Entire_Data_Set #Data set after preprocessing
        self.Train_Examples = Training_Examples
        self.Train_Output = Training_Outputs
        self.Train_Features = Training_Features

    def Prob_Classes(self, show = False, numb = True):
        """ The function is designed to look through binary classes and return the P(y=0) and P(y=1) probabilities
        INPUT: boolean flagging to Show (=True) or not (=False) the partial results and another flagging use of
                strings (numb = False) or integer indexes (numb = True) - ONLY numb = True IS IMPLEMENTED
        OUTPUT: Function returns the tuple (P(y=0), P(y=1)) and stores N_0, N_1, N_T, Prob_Y0 and Prob_Y1
                on the object (None on failure or when numb = False) """
        try:
            self.Output_Selected_Training = self.Get_Data_Internal(self.Train_Output, self.Train_Examples, numb = True) #Retrieving outputs used for fitting the model
            if not numb:
                print("FUNCTION RECEIVES ONLY NUMPY STRUCTURE")
                return None
            Array = np.array(np.transpose(self.Output_Selected_Training)) #Row vector of binary outputs

            self.N_1 = np.count_nonzero(Array == 1) #Counting numbers equal 1
            self.N_0 = Array.shape[1] - self.N_1 #Inferring numbers equal 0: everything that is not 1
            self.N_T = self.N_1 + self.N_0
            self.Prob_Y1 = self.N_1 / self.N_T #Assigning probability P(y=1)
            self.Prob_Y0 = self.N_0 / self.N_T #Assigning probability P(y=0)

            if show:
                print("The probability P(y=1) = ",float(self.Prob_Y1))
                print("The probability P(y=0) = ",float(self.Prob_Y0))
            return self.Prob_Y0, self.Prob_Y1
        except Exception:
            print("CALCULATION_ERROR: error occured at the classes's probability function, look there.\tERROR_CODE:PROB_FUNC0001\n")

    def Mean_Classes (self, show = False, numb = True):
        """ The function is designed to calculate the per-feature mean of each binary class
        INPUT: boolean flagging to Show (=True) or not (=False) the partial results and another flagging use of
                strings (numb = False) or integer indexes (numb = True) - ONLY numb = True IS IMPLEMENTED.
                Prob_Classes must have run first: it provides the training outputs and the counts N_0 and N_1.
        OUTPUT: Function returns the lists (Mu_0, Mu_1), one mean per selected feature, and stores them
                as self.Mu_k = [Mu_0, Mu_1] (None on failure or when numb = False) """
        try:
            self.Features_Selected_Training = self.Get_Data_Internal(self.Train_Features, self.Train_Examples, numb = True)
            if not numb:
                print("FUNCTION RECEIVES ONLY NUMPY STRUCTURE")
                return None
            Features_Trans = np.transpose(self.Features_Selected_Training) #One row per feature
            Labels = np.transpose(self.Output_Selected_Training)[0] #Outputs of the training examples

            Mu_0 = [] #Declaring vector for class 0 means
            Mu_1 = [] #Declaring vector for class 1 means
            for feature_values in Features_Trans: #Calculating the means of each class for each feature
                TempMu0 = 0
                TempMu1 = 0
                for value, label in zip(feature_values, Labels): #Indicator function replaced by conditional
                    if label == 0:
                        TempMu0 += value / self.N_0
                    elif label == 1:
                        TempMu1 += value / self.N_1
                Mu_0.append(TempMu0)
                Mu_1.append(TempMu1)
            if show:
                print("Vector of means Mu_0 = ",Mu_0)
                print("Vector of means Mu_1 = ",Mu_1)
            self.Mu_k = [Mu_0, Mu_1]
            return Mu_0, Mu_1
        except Exception:
            print("CALCULATION_ERROR: error occured at the classes's means function, look there.\tERROR_CODE:MEAN_FUNC0001\n")

    def Covariance_Matrix(self, show = False, numb = True):
        """ The function is designed to calculate covariance matrix shared by the two different classes:
            the sum over both classes k of (x_i - Mu_k)(x_i - Mu_k)^T for the examples of class k, divided by (N_T - 2)
        INPUT: boolean flagging to Show (=True) or not (=False) the partial results and another flagging use of
                strings (numb = False) or integer indexes (numb = True) - ONLY numb = True IS IMPLEMENTED.
                Prob_Classes and Mean_Classes must have run first.
        OUTPUT: Function returns the covariance matrix shared by both classes and stores it as self.Matrix_Cov
                (None on failure or when numb = False) """
        try:
            Features_Selected = self.Features_Selected_Training #2D array, one row per example
            Output_Selected = np.ravel( np.transpose( self.Output_Selected_Training ) ) #1D array of outputs

            if show: #Partial results, only printed on request
                print(Features_Selected)
                print(Output_Selected)

            if not numb:
                print("FUNCTION RECEIVES ONLY NUMPY STRUCTURE")
                return None

            if self.N_T <= 2:
                raise ValueError("at least three training examples are needed to divide by (N_T - 2)")

            Feature_Count = len(Features_Selected[0])
            Scatter = np.zeros([Feature_Count, Feature_Count]) #Matrix with zeros to perform the summation
            for k, Mu in enumerate(self.Mu_k):
                # Each class is centred on its OWN mean; both classes contribute exactly once
                Centered = Features_Selected[Output_Selected == k] - np.asarray(Mu)
                Scatter = Scatter + np.dot(np.transpose(Centered), Centered)
            Temp_Matrix = Scatter * (1/(self.N_T - 2))

            if show:
                print("Covariance matrix Sigma = ",Temp_Matrix)

            self.Matrix_Cov = Temp_Matrix
            return Temp_Matrix
        except Exception:
            print("CALCULATION_ERROR: error occurred at the covariance matrix function, look at it.\tERROR_CODE:COVA_FUNC0001\n")

    def Fit_Model(self, Show = False, Numb = True):
        """ The function was designed to call all other functions (Probabilities, Means and Covariance Matrix) and fit the model with the data initialized in the constructor
        INPUT: boolean flagging to Show (=True) or not (=False) the partial results and another flagging use of
                strings (Numb = False) or integer indexes (Numb = True) - ONLY Numb = True IS IMPLEMENTED
        OUTPUT: Function returns the Wo term (one-item array, also stored as self.Bias_Wo) to be used in the
                Predict_Function: Wo = log(P(y=1)/P(y=0)) - Mu_1 Sigma^-1 Mu_1^T / 2 + Mu_0 Sigma^-1 Mu_0^T / 2
                (None on failure) """
        try:
            P0, P1 = self.Prob_Classes(show = Show, numb = Numb) #Scalars
            M_0, M_1 = self.Mean_Classes(show = Show, numb = Numb) #Lists with one mean per feature
            Covar = self.Covariance_Matrix(show = Show, numb = Numb) #2D array
            M0 = [M_0] #2D list horizontal
            M1 = [M_1] #2D list horizontal
            Covar_Inv = np.linalg.inv(Covar) #Inverted once and reused for both quadratic terms

            Log_Odds = np.log((P1/P0)) #Scalar
            Part2 = np.ravel( np.dot( (np.dot(M1,Covar_Inv)) , np.transpose(M1) ) ) #1x1 array unit
            Part3 = np.ravel( np.dot( (np.dot(M0,Covar_Inv)) , np.transpose(M0) ) ) #1x1 array unit
            Part4 = np.ravel( - (Part2 * (1/2)) + (Part3 * (1/2)) )
            if Show:
                print(Part4)
            Bias_Wo = Log_Odds + Part4
            self.Bias_Wo = Bias_Wo #1x1 array unit
            if Show:
                print("The value of bias weight Wo = ", Bias_Wo)
            return Bias_Wo
        except Exception:
            print("TRAINING_ERROR: error occurred at the Fit_Model function, look at it.\tERROR_CODE:FIT_FUNC0001\n")

    def Predict_Function(self, Validation_Data_Features, Validation_Data_Examples, Validation_Data_Outputs, Show=False, NUMB = True):
        """ The function was designed to use the fitted model to predict outputs for rows of the stored data set
        INPUT: lists with the indices of the feature columns, of the examples (rows) and of the output column used
                for validation, boolean flagging to Show (=True) or not (=False) the fitted weights and another
                flagging use of strings (NUMB = False) or integer indexes (NUMB = True) - ONLY NUMB = True IS IMPLEMENTED.
                Fit_Model must have run first.
        OUTPUT: Function prints predictions, actual values and accuracy, and returns the list of predictions
                (1.0 when Wo + x W > 0, otherwise 0.0; None on failure) """
        try:
            #Retrieving validation data
            Features_Selected_Validation = self.Get_Data_Internal( Validation_Data_Features, Validation_Data_Examples, numb = NUMB) #2D array, one row per example
            Outputs_Selected_Validation = np.ravel( np.transpose( self.Get_Data_Internal( Validation_Data_Outputs, Validation_Data_Examples, numb = NUMB) ) ) #1D array

            Mu_0 = [self.Mu_k[0]] #2D list horizontal
            Mu_1 = [self.Mu_k[1]] #2D list horizontal

            Bias_Wo_Unit = self.Bias_Wo[0] #Scalar taken from the 1x1 array unit
            Sigma = np.linalg.inv(self.Matrix_Cov) #Inverse of the shared covariance matrix
            Delta_Mu_T = np.transpose( np.subtract( Mu_1, Mu_0 ) ) #2D array vertical

            Weights = np.dot(Sigma, Delta_Mu_T) #2D array vertical
            if Show:
                print("The fitted weights were W = ")
                print(Weights)

            """ PERFORMING VALIDATION AND GENERATING PREDICTIONS """
            Scores = np.ravel( np.dot( Features_Selected_Validation, Weights ) ) + Bias_Wo_Unit #Log-odds of each example
            Predictions = [1.0 if score > 0.0 else 0.0 for score in Scores]
            TrueValue = list(Outputs_Selected_Validation)

            """ CALCULATING ACCURACY OF THE FITTED MODEL WITH THE SELECTED FEATURES """
            Hits = sum(1 for predicted, actual in zip(Predictions, TrueValue) if predicted == actual)

            """ PRINTING RESULTS """
            print("\nThe results for the model are:")
            print("\nPredictions made: *y = ",Predictions)
            print("\nActual values Y = ",TrueValue)
            print("\nAccuracy of the model is",round(float(((Hits/len(Predictions))*100)),4),"%")
            return Predictions

        except Exception:
            print("VALIDATION_ERROR: error occurred at the Predict_Function, look at it.\tERROR_CODE:PRED_FUNC0001\n")

########################################################################################################################
################################################            END OF CLASS        ########################################

Objects and methods were called sequentially.

### RED WINE DATA

In [19]:
# Red wine cell: load the csv, binarize the output column and report basic statistics.
FileName = "winequality_red.csv"

RedWine = WorkData() #Object that reads and holds the raw data set
RedWine_Binary = WorkData() #Empty object; a later cell passes it to Get_Data to receive the selected columns

RedWine.Read_File(File_Name=FileName, separation=";", header=1) #Reading the ";"-delimited file, skipping its first row
RedWine.Binary_Classification(index=11, specific=False, boundary=5, MinMax=None) #Column 11 is overwritten in place: values <= 5 become 0, values > 5 become 1

RedWine.To_Pandas_DF() #Needed by QQ_File_Stats, which works on the DataFrame copy
RedWine.QQ_File_Stats() #General stats on the data (printed and written to a csv file)
RedWine.Binary_Values_Dist(LookFeatures=[11], LookExamples=None, Show=True) #Share of each binary class in column 11

                0            1            2            3            4   \
count  1599.000000  1599.000000  1599.000000  1599.000000  1599.000000   
mean      8.319637     0.527821     0.270976     2.538806     0.087467   
std       1.741096     0.179060     0.194801     1.409928     0.047065   
min       4.600000     0.120000     0.000000     0.900000     0.012000   
0%        4.600000     0.120000     0.000000     0.900000     0.012000   
25%       7.100000     0.390000     0.090000     1.900000     0.070000   
50%       7.900000     0.520000     0.260000     2.200000     0.079000   
75%       9.200000     0.640000     0.420000     2.600000     0.090000   
max      15.900000     1.580000     1.000000    15.500000     0.611000   

                5            6            7            8            9   \
count  1599.000000  1599.000000  1599.000000  1599.000000  1599.000000   
mean     15.874922    46.467792     0.996747     3.311113     0.658149   
std      10.460157    32.895324     0

In [20]:
#RedWine_Update contains the updated matrix with binary values and it will be used in the Fitting function
# Red wine cell: select columns 0-11 (all rows), then fit and validate the LDA model.
# Get_Data stores the selection in RedWine_Binary.Data_Set and also returns it.
RedWine_Update = RedWine.Get_Data(RedWine_Binary, ListFeatures = list(range(0,12)), ListExamples = None) 

#Calling LDA Class with updated matrix: features are columns 0,2,4,6,8,10, output is column 11, training rows are 0-19
RedWineLDA = LDA_Model(Entire_Data_Set=RedWine_Update, Training_Features=list(range(0,11,2)), Training_Outputs=[11], Training_Examples=list(range(0,20,1)))

RedWineLDA.Fit_Model(Show = True, Numb = True) #Calling fit function

#Calling Predict_Function to validate the model on seven hand-picked rows, using the same feature columns as in training
RedWineLDA.Predict_Function(Validation_Data_Features=list(range(0,11,2)), Validation_Data_Examples=[5, 250, 500, 750, 1000, 1500, 1598], Validation_Data_Outputs=[11], Show=True, NUMB=True)

The probability P(y=1) =  0.25
The probability P(y=1) =  0.75
Vector of means Mu_0 =  [7.606666666666667, 0.128, 0.11520000000000001, 68.19999999999999, 3.328666666666666, 9.54]
Vector of means Mu_1 =  [8.54, 0.33000000000000007, 0.1292, 51.60000000000001, 3.25, 9.799999999999999]
[[7.40e+00 0.00e+00 7.60e-02 3.40e+01 3.51e+00 9.40e+00]
 [7.80e+00 0.00e+00 9.80e-02 6.70e+01 3.20e+00 9.80e+00]
 [7.80e+00 4.00e-02 9.20e-02 5.40e+01 3.26e+00 9.80e+00]
 [1.12e+01 5.60e-01 7.50e-02 6.00e+01 3.16e+00 9.80e+00]
 [7.40e+00 0.00e+00 7.60e-02 3.40e+01 3.51e+00 9.40e+00]
 [7.40e+00 0.00e+00 7.50e-02 4.00e+01 3.51e+00 9.40e+00]
 [7.90e+00 6.00e-02 6.90e-02 5.90e+01 3.30e+00 9.40e+00]
 [7.30e+00 0.00e+00 6.50e-02 2.10e+01 3.39e+00 1.00e+01]
 [7.80e+00 2.00e-02 7.30e-02 1.80e+01 3.36e+00 9.50e+00]
 [7.50e+00 3.60e-01 7.10e-02 1.02e+02 3.35e+00 1.05e+01]
 [6.70e+00 8.00e-02 9.70e-02 6.50e+01 3.28e+00 9.20e+00]
 [7.50e+00 3.60e-01 7.10e-02 1.02e+02 3.35e+00 1.05e+01]
 [5.60e+00 0.00e+00 8.90e-02 5.90e

### Breast Cancer Data Set

In [21]:
# Breast cancer cell: load the data file, binarize the output column and report basic statistics.
FileName = "breast-cancer-wisconsin.data"

BreastCancer = WorkData() #Object that reads and holds the raw data set
BreastCancer_Binary = WorkData() #Empty object; a later cell passes it to Get_Data to receive the selected columns

BreastCancer.Read_File(File_Name=FileName, separation=",", header=0) #Reading the comma-delimited file, no rows skipped
BreastCancer.Binary_Classification(index=10, specific=True, boundary=None, MinMax=[2,4]) #Column 10 is overwritten in place: value 2 becomes 0, value 4 becomes 1

BreastCancer.To_Pandas_DF() #Needed by QQ_File_Stats, which works on the DataFrame copy
BreastCancer.QQ_File_Stats() #General stats on the data (printed and written to a csv file)

BreastCancer.Binary_Values_Dist(LookFeatures=[10], LookExamples=None, Show=True) #Share of each binary class in column 10

                 0           1           2           3           4   \
count  6.830000e+02  683.000000  683.000000  683.000000  683.000000   
mean   1.076720e+06    4.442167    3.150805    3.215227    2.830161   
std    6.206440e+05    2.820761    3.065145    2.988581    2.864562   
min    6.337500e+04    1.000000    1.000000    1.000000    1.000000   
0%     6.337500e+04    1.000000    1.000000    1.000000    1.000000   
25%    8.776170e+05    2.000000    1.000000    1.000000    1.000000   
50%    1.171795e+06    4.000000    1.000000    1.000000    1.000000   
75%    1.238705e+06    6.000000    5.000000    5.000000    4.000000   
max    1.345435e+07   10.000000   10.000000   10.000000   10.000000   

               5           6           7           8           9           10  
count  683.000000  683.000000  683.000000  683.000000  683.000000  683.000000  
mean     3.234261    3.544656    3.445095    2.869693    1.603221    0.349927  
std      2.223085    3.643857    2.449697    3.05

In [22]:

# Breast cancer cell: select columns 0-10 (all rows), then fit and validate the LDA model.
# Get_Data stores the selection in BreastCancer_Binary.Data_Set and also returns it.
BreastCancer_Update = BreastCancer.Get_Data(BreastCancer_Binary, ListFeatures = list(range(0,11)), ListExamples = None)

# Features are columns 1,3,5,7,9, output is column 10, training rows are the even rows 0-148
BreastCancerLDA = LDA_Model(Entire_Data_Set=BreastCancer_Update, Training_Features=list(range(1,10,2)), Training_Outputs=[10], Training_Examples=list(range(0,150,2))) #Calling LDA Class with updated matrix

BreastCancerLDA.Fit_Model(Show = True, Numb = True) #Calling fit function


# Validating the model on seven hand-picked rows, using the same feature columns as in training
BreastCancerLDA.Predict_Function(Validation_Data_Features=list(range(1,10,2)), Validation_Data_Examples=[5, 50, 100, 250, 300, 450, 500], Validation_Data_Outputs=[10], Show=True, NUMB=True)

The probability P(y=1) =  0.4266666666666667
The probability P(y=1) =  0.5733333333333334
Vector of means Mu_0 =  [2.8372093023255824, 1.4186046511627912, 2.1162790697674425, 2.813953488372094, 1.1162790697674425]
Vector of means Mu_1 =  [7.28125, 6.375, 5.65625, 4.90625, 2.78125]
[[ 5.  1.  2.  3.  1.]
 [ 3.  1.  2.  3.  1.]
 [ 4.  1.  2.  3.  1.]
 [ 1.  1.  2.  3.  1.]
 [ 2.  1.  2.  1.  5.]
 [ 1.  1.  1.  3.  1.]
 [ 5.  3.  2.  4.  1.]
 [ 8.  5.  7.  5.  4.]
 [ 4.  1.  2.  2.  1.]
 [10.  7.  4.  4.  2.]
 [ 7.  2.  5.  5.  4.]
 [ 3.  1.  2.  2.  1.]
 [ 5.  3.  2.  3.  1.]
 [ 5.  1.  2.  2.  1.]
 [ 1.  3.  2.  1.  1.]
 [ 2.  1.  2.  3.  1.]
 [ 2.  1.  2.  3.  1.]
 [ 2.  1.  2.  2.  1.]
 [ 6.  1.  1.  7.  1.]
 [ 2.  3.  6.  7.  1.]
 [ 6. 10.  8.  7.  3.]
 [10. 10.  8.  8.  1.]
 [ 3.  7.  4.  4.  1.]
 [ 4.  1.  2.  3.  1.]
 [ 9.  8.  2.  2.  5.]
 [10.  6.  3.  4.  2.]
 [10.  5.  8.  7.  1.]
 [ 8. 10.  3.  3.  1.]
 [ 5.  3.  6.  5.  1.]
 [ 5.  5.  3.  4.  1.]
 [ 9. 10. 10.  3.  1.]
 [ 1.