In [1029]:
import numpy as np
import matplotlib.pyplot as plt
import random
from sklearn import model_selection
from sklearn.metrics import  accuracy_score
from numpy.linalg import inv
import pandas as pd
import math
import re
from sklearn.feature_extraction.text import CountVectorizer
%matplotlib inline
from sklearn.model_selection import train_test_split
import nltk 
from sklearn.feature_extraction.text import TfidfTransformer
from collections import defaultdict
from nltk.corpus import stopwords

## Data exploration and Features Engineering

In [1030]:
#data reading and concatanation
data_train=pd.read_table('drugLibTrain_raw.tsv')
data_test=pd.read_table('drugLibTest_raw.tsv')
data=pd.concat([data_train,data_test])
data.head()

Unnamed: 0.1,Unnamed: 0,urlDrugName,rating,effectiveness,sideEffects,condition,benefitsReview,sideEffectsReview,commentsReview
0,2202,enalapril,4,Highly Effective,Mild Side Effects,management of congestive heart failure,slowed the progression of left ventricular dys...,"cough, hypotension , proteinuria, impotence , ...","monitor blood pressure , weight and asses for ..."
1,3117,ortho-tri-cyclen,1,Highly Effective,Severe Side Effects,birth prevention,Although this type of birth control has more c...,"Heavy Cycle, Cramps, Hot Flashes, Fatigue, Lon...","I Hate This Birth Control, I Would Not Suggest..."
2,1146,ponstel,10,Highly Effective,No Side Effects,menstrual cramps,I was used to having cramps so badly that they...,Heavier bleeding and clotting than normal.,I took 2 pills at the onset of my menstrual cr...
3,3947,prilosec,3,Marginally Effective,Mild Side Effects,acid reflux,The acid reflux went away for a few months aft...,"Constipation, dry mouth and some mild dizzines...",I was given Prilosec prescription at a dose of...
4,1951,lyrica,2,Marginally Effective,Severe Side Effects,fibromyalgia,I think that the Lyrica was starting to help w...,I felt extremely drugged and dopey. Could not...,See above


In [1031]:
# concatanatio of columns in order to get on text column
message= data['urlDrugName'] + " " + data['effectiveness'] + " " + data['sideEffects'] 
message

0            enalapril Highly Effective Mild Side Effects
1       ortho-tri-cyclen Highly Effective Severe Side ...
2                ponstel Highly Effective No Side Effects
3         prilosec Marginally Effective Mild Side Effects
4         lyrica Marginally Effective Severe Side Effects
                              ...                        
1031    accutane Considerably Effective Severe Side Ef...
1032          proair-hfa Highly Effective No Side Effects
1033    accutane Considerably Effective Moderate Side ...
1034             divigel Highly Effective No Side Effects
1035    claripel-cream Considerably Effective Mild Sid...
Length: 4143, dtype: object

In [1032]:
message.shape

(4143,)

In [1043]:
# transofrmation onto a dataframe and NAN values
doc= pd.DataFrame(message,columns=['message'])
doc['rating']=data['rating']
doc.dropna(axis=0,inplace=True)

In [1044]:
# Function to clean data before processing
def preprocess_string(str_arg):
        cleaned_str=re.sub('[^a-z\s]+',' ',str_arg,flags=re.IGNORECASE) #every char except alphabets is replaced
        cleaned_str=re.sub('(\s+)',' ',cleaned_str) #multiple spaces are replaced by single space
        cleaned_str=cleaned_str.lower() #converting the cleaned string to lower case
        return cleaned_str

In [1045]:
# apply process_string function to clean our data
doc['message'].head(5).apply(preprocess_string)

0         enalapril highly effective mild side effects
1    ortho tri cyclen highly effective severe side ...
2             ponstel highly effective no side effects
3      prilosec marginally effective mild side effects
4      lyrica marginally effective severe side effects
Name: message, dtype: object

In [1046]:
doc.head()

Unnamed: 0,message,rating
0,enalapril Highly Effective Mild Side Effects,4
1,ortho-tri-cyclen Highly Effective Severe Side ...,1
2,ponstel Highly Effective No Side Effects,10
3,prilosec Marginally Effective Mild Side Effects,3
4,lyrica Marginally Effective Severe Side Effects,2


In [1047]:
#check if there is any null value
doc['message'].isnull().values.any()

False

In [1048]:
# tokenizing text column with countVectorizer
bow_transformer = CountVectorizer(analyzer=preprocess_string).fit(doc['message'])
# Print total number of vocab words
print(len(bow_transformer.vocabulary_))

27


In [1049]:
#message column after transformation
messages_bow = bow_transformer.transform(doc['message'])

In [1050]:
print('Shape of Sparse Matrix: ', messages_bow.shape)
print('Amount of Non-Zero occurences: ', messages_bow.nnz)

Shape of Sparse Matrix:  (4143, 27)
Amount of Non-Zero occurences:  71665


In [1051]:
sparsity = (100.0 * messages_bow.nnz / (messages_bow.shape[0] * messages_bow.shape[1]))
print('sparsity: {}'.format(round(sparsity)))

sparsity: 64


In [1052]:
#TFIDF application with word frequency representation
tfidf_transformer = TfidfTransformer().fit(messages_bow)
messages_tfidf = tfidf_transformer.transform(messages_bow)
print(messages_tfidf.shape)

(4143, 27)


In [1053]:
#transformation into a dataframe 
X1 = pd.DataFrame(messages_tfidf.toarray(), columns=bow_transformer.get_feature_names())

In [1054]:
X=doc['message']
y=doc['rating']

In [1055]:
# cross validation function for model selection and validation
def k_fold_cross_validation_sets(X, y, k, shuffle=True):
    """ Split the data into k sets of training / test data """
    if shuffle:
        X, y = shuffle_data(X, y)

    n_samples = len(y)
    left_overs = {}
    n_left_overs = (n_samples % k)
    if n_left_overs != 0:
        left_overs["X"] = X[-n_left_overs:]
        left_overs["y"] = y[-n_left_overs:]
        X = X[:-n_left_overs]
        y = y[:-n_left_overs]

    X_split = np.split(X, k)
    y_split = np.split(y, k)
    sets = []
    for i in range(k):
        X_test, y_test = X_split[i], y_split[i]
        X_train = np.concatenate(X_split[:i] + X_split[i + 1:], axis=0)
        y_train = np.concatenate(y_split[:i] + y_split[i + 1:], axis=0)
        sets.append([X_train, X_test, y_train, y_test])

    # Add left over samples to last set as training samples
    if n_left_overs != 0:
        np.append(sets[-1][0], left_overs["X"], axis=0)
        np.append(sets[-1][2], left_overs["y"], axis=0)

    return np.array(sets)

def shuffle_data(X, y, seed=None):
    """ Random shuffle of the samples in X and y """
    if seed:
        np.random.seed(seed)
    idx = np.arange(X.shape[0])
    np.random.shuffle(idx)
    return X[idx], y[idx]

In [1056]:
## function to divid dataset in X %
def divid_dataset(x,y,pourcentage):
    x1 = x.sample(frac=pourcentage)
    y1 = y.sample(frac=pourcentage)
    return x1,y1

## Applications: Naive Bayes, Gaussian Discriminant Analysis and Comparison with Logistic Regression

### 1. Naive Bayes

In [1057]:
class NaiveBayes:
    
    def __init__(self,unique_labels):
        self.classes = unique_labels #number of class 
    
    # add sentence to bag of words
    def addSentToBow(self,example_sentence,bow_dict_index):
        ## example_sentance : is the sentence we pass to test/train
        ## bow_dict_index : index of each bag of word corresponding to example_sentence
        # tokenize every sentence and add it to the dictionnary
        
        #checking if example sent is of type nd array i.e array([]) and getting the sentence
        if(isinstance(example_sentence,np.ndarray)):
            example_sentence = example_sentence[0]
        for word in example_sentence.split():
           ## creating dic {word1: freq1,....,wordn:freqn}
            self.bow_label_dict[bow_dict_index][word]+=1
            
    
    def train(self,X,y):
        self.data = X
        self.labels=y
        # Creating dict of n dicts, where n is no.of classes
        self.bow_label_dict = np.array([defaultdict(lambda:0) for index in range(self.classes.shape[0])])
        
        # converting data to numpy arrays
        if not isinstance(self.data,np.ndarray): 
            self.data=np.array(self.data)
        if not isinstance(self.labels,np.ndarray): 
            self.labels=np.array(self.labels)
        
        #Creating bag of words for each Class
        #enumerate method will return tuple (counter,element in collection)
        for index,label in enumerate(self.classes):
            partticular_label = self.data[self.labels==label]
            
            #apply process function to clean every row of text
            cleaned_sent = [preprocess_string(sent) for sent in partticular_label]
            cleaned_sent=pd.DataFrame(data=cleaned_sent)
            
            #now we can construct Bag of words of each particular label
            np.apply_along_axis(self.addSentToBow,1,cleaned_sent,index)
            
            '''
            Now we need calculate the terms of our formula:
            -> Prior Probability of each class - P(c) -> (no.of sentences belongning to class c)/(total no.of sent)
            -> Word Corpus/ Vocabulary |V|
            '''   
        # Creating an array to store the probability of sentence for each class
        proba_label = np.empty(self.classes.shape[0])   
        all_words = []
        count_of_words_in_each_class = np.empty(self.classes.shape[0])
            
        for index, label in enumerate(self.classes):
            #Prior probability P(c) for each class
            proba_label[index] = np.sum(self.labels==label)/float(self.labels.shape[0])
            
            # getting total counts of all words in each class
            count = list(self.bow_label_dict[index].values())
            
            count_of_words_in_each_class[index] = np.sum(np.array(count))+1
            
            all_words+=self.bow_label_dict[index].keys()
            
        # now combining all the words of all the classes to get |V|
        self.vocab = np.unique(np.array(all_words))
        
        self.lenOfVocab = self.vocab.shape[0]
        
        #Computing denominators of each class i.e (count(c) + |v| + 1) i.e
        # count of words of that class + Vocab|v|(adding this to avoide 0 probability 
        # when word is unknown )+1 :  Laplace Smoothing
        denom = np.array([count_of_words_in_each_class[index]+self.lenOfVocab+1 for index,label in enumerate(self.classes)])
        
        # putting pieces of formula in organized way in this way (bow dict of each class,prior probability,denominator)
        self.classes_info = [(self.bow_label_dict[index],proba_label[index],denom[index]) for index,labels in enumerate(self.classes)]
        self.classes_info = np.array(self.classes_info)
    
    
    def getPosteriorProba(self,test_sent):
        
        # To store probabilities of each class
        likelihood_prob = np.zeros(self.classes.shape[0])
        
        # Calculating Probabilities to each label/class
        # Formula -> (count of word class c +1)/(count wof words in that class + vocab_len + 1)
        for index,labels in enumerate(self.classes):
            
            for word in test_sent.split():
                
                # numerator in the above formula
                # adding +1 to get rid of zero probability i.e even when the word is not present in our training vocab
                # then also probability will not be zero as we are adding 1 it will be 1/something.
                word_count = self.classes_info[index][0].get(word,0)+1
                word_proba = word_count/float(self.classes_info[index][2])
                
                # log application of each word to compute easier the probability
                likelihood_prob[index]+=np.log(word_proba)
                
        post_proba = np.empty(self.classes.shape[0])
        for index,labels in enumerate(self.classes):
            post_proba[index] = likelihood_prob[index]+np.log(self.classes_info[index][1])
        
        return post_proba;
    
    
    # predict with new sentence text
    def predict(self,test_data):
        
        pred = []
        
        for sent in test_data:
            
            cleaned_sent = preprocess_string(sent)
            
            post_proba = self.getPosteriorProba(cleaned_sent)
            
            pred.append(self.classes[np.argmax(post_proba)])
        return np.array(pred)

In [1058]:
train_data,test_data,train_labels,test_labels=train_test_split(X,y,shuffle=True,test_size=0.2,random_state=42,stratify=y)
classes=np.unique(train_labels)

In [1059]:
nb1 = NaiveBayes(classes)
nb1.train(train_data,train_labels)

In [1060]:
proba_sent1=nb1.predict(test_data)
#accuracy
test_acc=np.sum(proba_sent1==test_labels)/float(len(test_labels)) 

print ("Test Set Accuracy: ",test_acc*100,"%") 

Test Set Accuracy:  45.59710494571773 %


## 2. Gaussian Discriminant Analysis

In [1065]:
#implementating Gaussian Discriminant Analysis
class GDA():
    def __init__(self):
        self.__phi = None
        self.__means = None
        self.__sigma = None
        
    def fit(self, X, y):
        n_samples, n_features = X.shape
        self.__classes = np.unique(y)
        n_classes = len(self.__classes)
        
        self.__phi = np.zeros((n_classes, 1))
        self.__means = np.zeros((n_classes, n_features))
        self.__sigma = 0
        for i in range(n_classes):
            indexes = np.flatnonzero(y == self.__classes[i])

            self.__phi[i] = len(indexes) / n_samples
            self.__means[i] = np.mean(X[indexes], axis=0)
            self.__sigma += np.cov(X[indexes].shape) * (len(indexes) - 1)
            #self.__sigma += np.cov(X[indexes].T) * (len(indexes) - 1)

            self.__sigma /= n_samples
                    
    def calculate_px_py(self,X,u,sigma):
        n = X.shape[0]
        pi = 3.14
        x1=1/((2*np.pi)**(n/2)*np.linalg.det(np.abs(sigma)))
        D=np.matrix(X-u)
        x2=np.exp(-np.dot(np.matmul(D,np.linalg.inv(sigma)),D))[1]
        return x1*x2 
    
    def compute_Pxyi(self, X, idx):
        """Probability of X given y"""
        m = X.shape[1]
        from scipy.stats import multivariate_normal
        sigma_inv = np.linalg.inv(self.sigma[idx])
        det_sigma = np.linalg.det(self.sigma[idx])
        #mu_i = mu(X, y, idx)
        Pxi = (1/((2*np.pi)**(m/2))) \
                *(1/(det_sigma**0.5)) \
                * np.exp(- 0.5*np.sum(((X-self.mu[idx])@sigma_inv)*(X-self.mu[idx]), axis=1))
    #     Pxi = np.log(1) \
    #             - np.log((2*np.pi)**(m/2)) \
    #             - np.log(np.sqrt(det_sigma)) \
    #             - np.sum(((X-mu_i)@sigma_inv)*(X-mu_i), axis=1)
        return Pxi
    
    def calculate_py(self,y):
        return np.where(y==1,self.__phi,1-self.__phi)                              
        
    def predict(self, X):
        #pdf= calculate_px_py(X,mean,self.__sigma)
        pdf = lambda mean: multivariate_normal.pdf(X, mean=mean, cov=self.__sigma)
        y_probs = np.apply_along_axis(pdf, 1, self.__means) * self.__phi

        return self.__classes[np.argmax(y_probs, axis=0)]
   

In [1066]:
tr_gda,tst_gda,lbls_gda,ltst_gda=train_test_split(X1,y,shuffle=True,test_size=0.2,random_state=42,stratify=y)

In [1068]:
model = GDA()
model.fit(tr_gda.values,lbls_gda)
predicted = model.predict(tst_gda.values)
#accuracy
test_acc=np.sum(predicted==ltst_gda)/float(len(ltst_gda)) 
print ("Test Set Accuracy: ",test_acc*100,"%") 

Test Set Accuracy:  23.401688781664657 %


## 3. Logistic Regression 

In [1023]:
class LogisticRegressionClass():
    def __init__(self, learning_rate=0.01, stopping_criterion=0.01, max_iterations=1000, max_epochs=10):
        self.learning_rate = learning_rate
        self.stopping_criterion = stopping_criterion
        self.max_iterations = max_iterations
        self.max_epochs = max_epochs
        self.w = None
    
    def softmax(z):
        z -= np.max(z)
        sm = (np.exp(z).T / np.sum(np.exp(z))).T
        return sm
    
    def MultinomialRegression_fit(self, x, y):
        if(x.shape[0]!=y.shape[0]):
            #print ("Number of examples of features and outputs don't match")
            return
        
        m = x.shape[0]
        n = x.shape[1]
        k = y.shape[1]
        
        self.w = np.random.randn(k,n)
        gradient = np.zeros((k,n))
        cost = np.zeros(self.max_epochs)
        
        for epoch in range(0,self.max_epochs):
            #print (epoch)
            for i in range(0,m):
                y_hat = softmax(np.reshape(np.matmul(self.w,x[i,:]),k))
                for j in range(0,k):
                    gradient = (y_hat[j]-y[i,j])*x[i,:]
                    self.w[j,:]-=self.learning_rate*gradient
                    cost[epoch]-=y[i,j]*np.log(y_hat[j])
        
#         plt.plot(cost)
        return self.w
    
    def MultinomialRegression_predict(self, x):
        m = x.shape[0]
        n = x.shape[1]
        
        k = self.w.shape[0]
        y = np.zeros((m,k))
        
        for i in range(0,m):
            y[i,:] = softmax(np.reshape(np.matmul(self.w,x[i,:]),k))
        
            y[y >= 0.5 ] = 1
            y[y < 0.5 ] = 0
        return  y
    
    def evaluate(self,test_data,labels):
        accuracy = accuracy_score(self.MultinomialRegression_predict(test_data),labels)
        return accuracy

## Comparison of differents algorithms

## 1.    10 % of the dataset

In [985]:
# 10 % of the dataset and split into train and test set
data1,label1=divid_dataset(X1,y,0.1)
train_data1,test_data1,train_labels1,test_labels1=train_test_split(data1,label1,shuffle=True,test_size=0.2,random_state=42,stratify=label1)

 Naive Bayes 

In [976]:
data11,label11=divid_dataset(X,y,0.1)
train_data,test_data,train_labels,test_labels=train_test_split(data11,label11,shuffle=True,test_size=0.25,random_state=42,stratify=label11)
classes=np.unique(train_labels)
model = NaiveBayes(classes)
model.train(train_data,train_labels)
predicted = model.predict(test_data)
#accuracy
nb1_test_acc=np.sum(predicted==test_labels)/float(len(test_labels)) 
print ("Test Set Accuracy for Naive Bayes: ",nb1_test_acc*100,"%") 

Test Set Accuracy for Naive Bayes:  28.846153846153843 %


Gaussian Discriminant Analysis

In [1025]:
model = GDA()
model.fit(train_data1.values,train_labels1)
predicted = model.predict(test_data1)
#accuracy
gda_test_acc=np.sum(predicted==test_labels1)/float(len(test_labels1)) 
print ("Test Set Accuracy for Gaussian Discriminant Analysis: ",gda_test_acc*100,"%") 

Test Set Accuracy for Gaussian Discriminant Analysis:  22.89156626506024 %


Logistic Regression

In [846]:
label12 = (np.arange(len(np.unique(label1))) == label1[:, None]).astype(float)
train_data12,test_data12,train_labels12,test_labels12=train_test_split(data1,label12,shuffle=True,test_size=0.25,random_state=42,stratify=label12)

In [847]:
classifier = LogisticRegressionClass()
classifier.max_epochs=100
w_predicted = classifier.MultinomialRegression_fit(train_data12.values,train_labels12)
y_hat = classifier.MultinomialRegression_predict(test_data12.values)
#accuracy
lr_test_acc=classifier.evaluate(test_data12.values,test_labels12)
print ("Test Set Accuracy for Logistic regression: ",lr_test_acc.mean()*100,"%") 

Test Set Accuracy for Logistic regression:  22.115384615384613 %


Comparison1: 10% dataset
    1. Gaussian Discriminant Analaysis/Logistic Regression
    Test Set Accuracy for Gaussian Discriminant Analysis:  22.115384615384613 %
    Test Set Accuracy for Logistic regression:  22.115384615384613 %
 
         We get exactly the same result for 10% of the dataset.
    
    2. Naive Bayes/Logistic Regression
      Test Set Accuracy for Naive Bayes:  29.807692307692307 %
      Test Set Accuracy for Logistic regression:  22.115384615384613 %
      
         Naive Bayes perform better for 10 %

## 2.    30 % of the dataset

In [848]:
# 30 % of the dataset and split into train and test set
data2,label2=divid_dataset(X1,y,0.3)
train_data2,test_data2,train_labels2,test_labels2=train_test_split(data2,label2,shuffle=True,test_size=0.25,random_state=42,stratify=label2)

Naive Bayes:

In [884]:
data21,label21=divid_dataset(X,y,0.3)
train_data21,test_data21,train_labels21,test_labels21=train_test_split(data21,label21,shuffle=True,test_size=0.25,random_state=42,stratify=label21)
classes21=np.unique(train_labels21)
model = NaiveBayes(classes21)
model.train(train_data21,train_labels21)
predicted = model.predict(test_data21)
#accuracy
nb21_test_acc=np.sum(predicted==test_labels21)/float(len(test_labels21)) 
print ("Test Set Accuracy for Naive Bayes: ",nb21_test_acc*100,"%") 

Test Set Accuracy for Naive Bayes:  26.04501607717042 %


Gaussian Discriminant Analysis

In [885]:
model = GDA()
model.fit(train_data2.values,train_labels2)
predicted = model.predict(test_data2)
#accuracy
gda2_test_acc=np.sum(predicted==test_labels2)/float(len(test_labels2)) 
print ("Test Set Accuracy for Gaussian Discriminant Analysis: ",gda2_test_acc*100,"%") 

Test Set Accuracy for Gaussian Discriminant Analysis:  24.115755627009648 %


Logistic Regression

In [886]:
label22 = (np.arange(len(np.unique(label2))) == label2[:, None]).astype(float)
train_data22,test_data22,train_labels22,test_labels22=train_test_split(data2,label22,shuffle=True,test_size=0.25,random_state=42,stratify=label22)

In [887]:
classifier = LogisticRegressionClass()
classifier.max_epochs=100
w_predicted = classifier.MultinomialRegression_fit(train_data22.values,train_labels22)
y_hat = classifier.MultinomialRegression_predict(test_data22.values)
#accuracy
lr2_test_acc=classifier.evaluate(test_data22.values,test_labels22)
print ("Test Set Accuracy for Logistic regression: ",lr2_test_acc.mean()*100,"%") 

Test Set Accuracy for Logistic regression:  24.115755627009648 %


Comparison2: 30 % dataset
    1. Gaussian Discriminant Analaysis/Logistic Regression
    Test Set Accuracy for Gaussian Discriminant Analysis:  24.115755627009648 %
    Test Set Accuracy for Logistic regression:  24.115755627009648 %
    
     We get almost the same result between GDA and Logistic Regression
    
    2. Naive Bayes/Logistic Regression
      Test Set Accuracy for Naive Bayes:  26.04501607717042 %
      Test Set Accuracy for Logistic regression:  24.115755627009648 %
      
      Naive bayes perform better but his performence decrease when the size of   the dataset increase

## 3.      60 % of the dataset

In [888]:
# 60 % of the dataset and split into train and test set
data3,label3=divid_dataset(X1,y,0.6)
train_data3,test_data3,train_labels3,test_labels3=train_test_split(data3,label3,shuffle=True,test_size=0.25,random_state=42,stratify=label3)

Naive Bayes

In [898]:
data31,label31=divid_dataset(X,y,0.6)
train_data31,test_data31,train_labels31,test_labels31=train_test_split(data31,label31,shuffle=True,test_size=0.25,random_state=42,stratify=label31)
classes31=np.unique(train_labels31)
model = NaiveBayes(classes31)
model.train(train_data31,train_labels31)
predicted = model.predict(test_data31)
#accuracy
nb31_test_acc=np.sum(predicted==test_labels31)/float(len(test_labels31)) 
print ("Test Set Accuracy for Naive Bayes: ",nb31_test_acc*100,"%") 

Test Set Accuracy for Naive Bayes:  24.27652733118971 %


Gaussian Discriminant Analysis

In [899]:
model = GDA()
model.fit(train_data3.values,train_labels3)
predicted = model.predict(test_data3)
#accuracy
gda3_test_acc=np.sum(predicted==test_labels3)/float(len(test_labels3)) 
print ("Test Set Accuracy for Gaussian Discriminant Analysis: ",gda3_test_acc*100,"%") 

Test Set Accuracy for Gaussian Discriminant Analysis:  24.115755627009648 %


Logistic Regression

In [900]:
label33 = (np.arange(len(np.unique(label3))) == label3[:, None]).astype(float)
train_data33,test_data33,train_labels33,test_labels33=train_test_split(data3,label33,shuffle=True,test_size=0.25,random_state=42,stratify=label33)

In [901]:
classifier = LogisticRegressionClass()
classifier.max_epochs=100
w_predicted = classifier.MultinomialRegression_fit(train_data33.values,train_labels33)
y_hat = classifier.MultinomialRegression_predict(test_data33.values)
#accuracy
lr3_test_acc=classifier.evaluate(test_data33.values,test_labels33)
print ("Test Set Accuracy for Logistic regression: ",lr3_test_acc.mean()*100,"%") 

Test Set Accuracy for Logistic regression:  24.115755627009648 %


Comparison3: 60 % dataset
    1. Gaussian Discriminant Analaysis/Logistic Regression
    Test Set Accuracy for Gaussian Discriminant Analysis:  23.15112540192926 %
    Test Set Accuracy for Logistic regression:  24.115755627009648 %
    
    Logistic Regression obtain legerely better result than GDA
    
    2. Naive Bayes/Logistic Regression
      Test Set Accuracy for Naive Bayes:  24.27652733118971 %
      Test Set Accuracy for Logistic regression:  24.115755627009648 %
      
     Naive Bayes and Logistic regression have almost the same result but the performane of the Naive Bayes decrease on 60 % on the datatset whereas Logistic Regression's score is improving. 

## 4.      100 % of the dataset

In [902]:
# 100 % of the dataset and split into train and test set
data4,label4=divid_dataset(X1,y,1)
train_data4,test_data4,train_labels4,test_labels4=train_test_split(data4,label4,shuffle=True,test_size=0.25,random_state=42,stratify=label4)

Naive Bayes

In [904]:
data41,label41=divid_dataset(X,y,1)
train_data41,test_data41,train_labels41,test_labels41=train_test_split(data41,label41,shuffle=True,test_size=0.25,random_state=42,stratify=label41)
classes41=np.unique(train_labels41)
model = NaiveBayes(classes41)
model.train(train_data41,train_labels41)
predicted = model.predict(test_data41)
#accuracy
nb41_test_acc=np.sum(predicted==test_labels41)/float(len(test_labels41)) 
print ("Test Set Accuracy for Naive Bayes: ",nb41_test_acc*100,"%") 

Test Set Accuracy for Naive Bayes:  21.428571428571427 %


Gaussian Discriminant Analysis

In [905]:
model = GDA()
model.fit(train_data4.values,train_labels4)
predicted = model.predict(test_data4)
#accuracy
gda4_test_acc=np.sum(predicted==test_labels4)/float(len(test_labels4)) 
print ("Test Set Accuracy for Gaussian Discriminant Analysis: ",gda4_test_acc*100,"%") 

Test Set Accuracy for Gaussian Discriminant Analysis:  23.35907335907336 %


Logistic Regression

In [906]:
label44 = (np.arange(len(np.unique(label4))) == label4[:, None]).astype(float)
train_data44,test_data44,train_labels44,test_labels44=train_test_split(data4,label44,shuffle=True,test_size=0.25,random_state=42,stratify=label44)

In [907]:
classifier = LogisticRegressionClass()
classifier.max_epochs=100
w_predicted = classifier.MultinomialRegression_fit(train_data44.values,train_labels44)
y_hat = classifier.MultinomialRegression_predict(test_data44.values)
#accuracy
lr4_test_acc=classifier.evaluate(test_data44.values,test_labels44)
print ("Test Set Accuracy for Logistic regression: ",lr4_test_acc.mean()*100,"%") 

Test Set Accuracy for Logistic regression:  23.35907335907336 %


Comparison4: 100 % dataset
    1. Gaussian Discriminant Analaysis/Logistic Regression
    Test Set Accuracy for Gaussian Discriminant Analysis:  23.35907335907336 %
    Test Set Accuracy for Logistic regression:  23.35907335907336 %
    
    when we use all the dataset Logistic Regression and GDA still have almost the same performence.
    
    2. Naive Bayes/Logistic Regression
      Test Set Accuracy for Naive Bayes:  21.428571428571427 %
      Test Set Accuracy for Logistic regression:  23.35907335907336 %
      
      with all the dataset Logistic Regression get better score than Naive Bayes

## General Conclusion

Logistic Regression, Naive Bayes and Gaussiant Discriminant Analysis are all used for classification problems. Indeed the difference is about their leearning mechanism: 
Naive Bayes and Gaussian Discriminant Analysis are generative model(Bayes Rules and join the distrinution of the features to the target) and Logistic Regression is discriminative model(learning the input to output mapping by minimising the error). 
Durant our comparison we saw that in general, both logistic regression and discriminant analyses converged in similar results even if we change the size of the dataset.
Indeed, When the size of the dataset small, Naive Bayes have better performance than Logistic Regression. So Naive Bayes make supposition that all features are lineary independant. That's why it performe well with small size of data but if the size increase, his performence decrease that's to complexy of the dataset. On the other hand, Logistic regression predict well when the dataset is large enough but if the size of the dataset is small relative to the number of features, including regularisation such as Lasso and Ridge regression can help reduce overfitting and result in a more generalised model. 