In [None]:
'''
The Car data set used in the code to see how we can handle both categorical and continuous data. Both types of data are handled
differently. After reading the data in the data frame, data is split in X(input) and Y(output/label)data. The X data and Y data
are then train-tet split to get training and testing data. Till now training and testing both have categorical and continuous 
data. We now need to separate categorical and continuous data from training data and store them in a different data frame.


For categorical data, we calculate the conditional probability. Training data is iterated over columns. For every unique value
of column conditional probability is calculated with every unique value of the Y data label
eg For column Color there are 2 unique value Red, Yellow.Y data as 2 unique value Yes and No.We calculate the conditional
probability of Red|No,Red|Yes,Yellow|No,Yellow|Yes etc. 
This data is stored in an dictionary object against every column ({'Color': {'YellowYes': 0.5,
  'YellowNo': 0.57,'RedYes': 0.5,........etc)

For continuous data, training data is iterated over the column. The conditional probability is calculated for every 
column|unique class.
Both std and mean are calculated this way for every column|unique class and stored in a dictionary object.
eg {'Age': {'StdYes': 0.0,'MeanYes': 10.0,......etc

For Y data, probabilty is calculated for every unique value and stored in dictionary.eg Yes:80,No:76 etc..

The model is now ready to predict the test data. The test data is iterated over row. For every row, the corresponding columns
are matched with continuous and categorical training data frames. If the columns of the given row belong to continuous type
then respective column|class probability is obtained from the dictionary object(Check paragraph 3). For the categorical column,
the probability is obtained from the dictionary object(Check paragraph 2). The probability of each column |class is multiplied to
get a cumulative probability for a given row. The same process is repeated for all unique class. The class|probabilty is stored
in the dictionary object. The class having maximum probability is will be the prediction of the given row. A list is populated with
prediction got this way for every row of testing data.

The prediction got for every row from above along with actual class is used to calculate the accuracy score.




'''


In [21]:
import pandas as pd
from sklearn.model_selection import cross_validate
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import GaussianNB
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn import metrics
from IPython.display import display
import math




In [57]:
def read_data():
    
    '''
    This function is to read data using Pandas read_csv function and convert into dataframe.
    
      
    Return :- Dataframe
    '''
    
    data=pd.read_csv('D:\Dataset\Car-dataset-mixed.csv')
    
    for col in data.columns:

        data[col].replace(' ?',data[col].mode()[0],inplace=True)
        
        return data
    
    

In [58]:
def preprocess_data(df):
    
    '''
    The function is to split data first into the Input (X) and Output (Y) dataset. The X data is further train-test split into
     x_train,y_train,x_test,y_test data.
    
    Argument :-
    df :-Modified adult dataframe
    
    Return :- 
    xtrain:-training data
    xtest :- Validation data
    ytrain :- Label data of training data
    ytest :- label data of validation data
    
    '''
    
    
    x=df.iloc[:,:-1]
    y=df.iloc[:,-1]
    
    xtrain,xtest,ytrain,ytest=train_test_split(x,y,test_size=.3,random_state=0)
    
    return xtrain,xtest,ytrain,ytest

In [59]:
def train_cat_cont_split(x_train,y_train):
    '''
    Split the train data into catogorical and continous data.
    Concatenate ytrain data to above created new data frame.
    
    
    Argument :- 
    x_train :- Training data.
    y_train :- Y data of training data set.
    
    Return :- 
    x_cont_df :- xtrain continous data
    x_cat_df :- Data frame containing categorical data of training data set and respective Y data.
    x_cont_data :- Data frame containing continuous columns of training data set
    x_cat_dat :- Data frame containing categorical columns of training data set.
    
    '''
    
    #split x_train data into continuous data and concatenate with y_train.Converting series to a data frame.
    x_cont_data=x_train.iloc[:,0]
    x_cont_df=pd.concat([x_cont_data,y_train],axis=1)
    x_cont_data=x_cont_data.to_frame()
    
    
    #split x_train data into categorical data and concatenate with y_train.

    x_cat_data=x_train.iloc[:,1:4]
    x_cat_df=pd.concat([x_cat_data,y_train],axis=1)
    
    return x_cont_df,x_cat_df,x_cont_data,x_cat_data
    
 

In [60]:
def class_prob(y_train):
    
    '''
    This function is to calculate the probability of unique values of the Output data.
    
    Argument :- 
    y_train :- Y train data.
    
    Return :-  
    outdict :-Dictionary holding probabilty of each unique value of output data.
    unq_out :- Unique value of Y data.
    '''
    
    #Find unique value of Output column and rowcount of y_train data 
    unq_out=y_train.unique()
    rowcnt=y_train.shape[0]
    outdict={}
    
    
    # Getting probablity of ouput column for unqiue values
    for k in unq_out:
        prob=y_train[y_train==k].shape[0]/rowcnt
        outdict.update({k:prob})

    return outdict,unq_out

In [61]:
def std_mean_cal(xtrain_cont_data,xtrain_cont_df,y_train):
    
    '''
For continous data,training data is iterated over column.The conditional probability is calculated for every column|unique class
Both std and mean are calculated this way and stored in an dictionary object.
eg {'Age': {'StdYes': 0.0,'MeanYes': 10.0,......etc
   
    
    Argument :- 
    xtrain_cont_data :- Data frame with continuous columns of training data set
    xtran_cont_df    :- Data frame with continuous columns and respective Y data.
    y_train :- Y data
    
    Return :- 
    train_cont_std_mean   :- Dictionary containing std and mean.
    '''
    
    
    #Calculate manually std and mean for every continuous column data of training data set and store in a dictionary against 
    #each column name.

    unq_ytrain=y_train.unique()
    train_cont_std_mean={}

    for col in xtrain_cont_data.columns:
        newdict={}
    
        for unq in unq_ytrain:
        
            col_std=xtrain_cont_df[xtrain_cont_df[y_train.name]==unq][col].std()
            newdict.update({'Std'+unq:col_std})
            col_mean=xtrain_cont_df[xtrain_cont_df[y_train.name]==unq][col].mean()
            newdict.update({'Mean'+unq:col_mean})
    
    
        train_cont_std_mean.update({col:newdict})
    return train_cont_std_mean


In [62]:
def train_cat_prob(xtrain_cat_data,y_train,xtrain_cat_df):
    '''
    This function is to calculate the probability of categorical data of training eg Red nYes,Red nNo etc.
    The conditional probability is calculated from every unique value of column | unique class (Red|yes.Red|No..etc).The Laplace
    smoothing is implemented to avoid 0 probability.
    This data is stored in an dictionary object against every column ({'Color': {'YellowYes': 0.5,'YellowNo': 0.57,'RedYes': 0.5,..
    ....etc)
      
    
    Argument :-
    xtrain_cat_data :- Data frame containing categorical columns of training data set
    y_train         :- Y train data.
    xtrain_cat_df   :- Data frame containing categorical columns of training data set and respective Y data.
    
    Return :- 
    xcat_prob  :- dictionary holding probabilty:{RedNyes:prob} 
    
    '''
 
    #Finding probablity of categorical data with output column
    xcat_prob={}
    unq_ytrain=y_train.unique()
    for col in xtrain_cat_data.columns:
            colunq=xtrain_cat_data[col].unique()
    
            newdict={}
    
            for val in colunq:
     
                for val1 in unq_ytrain:
            
                    result = (xtrain_cat_df[(xtrain_cat_df[col]==val) & (xtrain_cat_df[y_train.name]==val1)].shape[0]+1)
                    prob=result/(xtrain_cat_df[xtrain_cat_df[y_train.name]==val1].shape[0]+len(unq_ytrain))
            
                    newdict.update({val+val1:prob})
        
            xcat_prob.update({col:newdict})
    return xcat_prob
 

In [63]:
def predict_test(x_test,y_train,train_std_mean,xtrain_cat_prob,xtrain_cont_data,xtrain_cat_data):
    
    '''
    The test data is iterated over row. For every row, the corresponding columns are matched with continuous and categorical 
    training data frames.If the columns of the given row belong to continuous type then respective column|class probability is
    obtained from the dictionary object. For the categorical column,probability is obtained from the dictionary object obtained
    above.The probability of each column |class is multiplied to get a cumulative probability for a given row.
    The same process is repeated for all unique class. The class|probabilty is stored in the dictionary object.The class having
    maximum probability is predicted for that row. 
    
    
    Argument:- 
    x_test           :- Validation data
    y_train           :- Output columnn data
    train_std_mean    :- dictionary holding std and mean for unique value of the training data columns.
    xtrain_cat_prob   :- Dictionary holding probabilty of categorical column.
    xtrain_cont_data  :- Data frame of columns with continuous data and respective Y data
    xtrain_cat_data   :- Data frame of columns with categorical data and respective Y data.
    
    
    Return :- 
    Ypred            :- List of prediction for every row of validation data.
    
    
    '''
    
    p=0
    d={}
    xtest_prob={}
    unq_ytrain=y_train.unique()
    Ypred=[]
    val=''
    
    #Iterating every row in x_test data
    for index,row in x_test.iterrows():
    
        for val in unq_ytrain:
            prob=1
        
                
            #for every col in x_test rows:
            for ind in row.index:
                
                 
                 # Check whether that row exist in training contionus data or in training catogorical data.   
                if ind in(xtrain_cont_data.columns):
                
                
                    pstd=0
                    pmean=0
                    denom=0
                    num=0
                    d=train_std_mean[ind]
                
                    pstd=d['Std'+val]
                
                    pstd=pstd**2
                
                    pmean=d['Mean'+val]
                    
                    if pstd ==0 :
                        pstd=1
                
                
                    denom=math.sqrt(2*math.pi*pstd)
                
                    num=math.exp(-(row.get(ind)-pmean)**2/(2*pstd))
                   
                    p=num/denom
                    prob=prob*p
            
                if ind in(xtrain_cat_data.columns):
                    
                
                    d=xtrain_cat_prob[ind]
                    
                    
                    
                    if row.get(ind)+val in d.keys() :
                        p=d[row.get(ind)+val]
                        
                    
                        prob=prob*p
                    
                #Multipling all columns probabilty(both continious and catogorical data P(A1)*P(A2)*P(A3)……P(An)),
                #to get final row wise probabilty.
               
            xtest_prob.update({val:prob})
           
       
        #Get the class with highest probability.This will be prediction of the given row.
        
        
        max_v = max(zip(xtest_prob.values(), xtest_prob.keys()))
        
        Ypred.append(max_v[1])
    
    return Ypred
                
  

In [64]:
def accuracyfunc(ypredect,ytest):
    
    '''
    The function is to calculate the accuracy score with predicted data and actual labels.
    
    Argument :- 
    ypredect  :-List of prediction for every row of validation data.
    y_test    :- Actual class for every row of validation data.
    
    Return :- 
    accuracy  :- Accuracy score of the algorithm
    '''
    
    ytest=ytest.tolist()

    ytestrow=len(ytest)

    cntrow=0
    accuracy=0
    
    for x,y in zip(ytest,ypredect):
        
        if x==y:
            cntrow=cntrow+1
  
    accuracy=cntrow*100/ytestrow
    return accuracy

In [65]:
# Read data into pandas data frame
df=read_data()

In [66]:
#Train-test split the X and Y data.
x_train,x_test,y_train,y_test=preprocess_data(df)

In [67]:
#Separate out continuous and categorical data of training data set.
xtrain_cont_df,xtrain_cat_df,xtrain_cont_data,xtrain_cat_data=train_cat_cont_split(x_train,y_train)

In [68]:
#Calculate probability of unqiue values of Y data.
class_dict,unqout=class_prob(y_train)

In [69]:
#Calculate std and mean of continuous columns of training data set
train_std_mean=std_mean_cal(xtrain_cont_data,xtrain_cont_df,y_train)

In [70]:
#Calculate probability of every unique value of categorical columns of training data set given unique value of Y data(Red|Yes)
xtrain_cat_prob=train_cat_prob(xtrain_cat_data,y_train,xtrain_cat_df)

In [71]:
#Predict the test data.
ypred=predict_test(x_test,y_train,train_std_mean,xtrain_cat_prob,xtrain_cont_data,xtrain_cat_data)

In [72]:
#Find out the accuracy score.
accuracy_val=accuracyfunc(ypred,y_test)

In [73]:
accuracy_val

25.0