In [None]:
'''
The adult dataset is used to study a method of handling continuous data with the Naive Bayes classifier algorithm.
The code handles the continuous columns manually and calculates their conditional probabilities using the Gaussian
formula. The adult dataset contains both categorical and continuous columns. The accuracy score is calculated on the
continuous columns only. To implement this we will use the Naive Bayes classifier algorithm.
The Naive Bayes algorithm is based on conditional probability calculation.

Data is read using the pandas read_csv function and converted into a data frame. The data is preprocessed to replace
the ' ?' values of the columns with the mode values of those columns. Thereafter the categorical columns are dropped so
that only the continuous columns remain. The remaining data is separated into input (X) and output/label (Y) columns,
which are then train-test split.

The probability of each unique class is calculated.
For every column of the training data, the std and mean are calculated per class label and stored in a dictionary,
e.g. {'Age': {'StdYes': 0.0, 'MeanYes': 10.0, ...}, ...}
This way a training model is ready.

Validation data is run over this training model for prediction. For a given row, the conditional probability (using the
Gaussian formula) of every column value given a class (column|class) is calculated. All of these column|class
probabilities are multiplied together to get a cumulative probability for that row. The cumulative value is then
multiplied by the probability of that class. This process is repeated for all unique values of the Y data. Thus every
row will have as many probabilities as there are unique classes in the Y data. The probability of each class for that
row (eg Yes:81, No:76) is stored in a dictionary.
The class with the maximum probability will be the prediction for that row.


The accuracy score is calculated with predicted data and actual class data.



Gaussian Formula :-
eg :- Stolen = Y, N (class)
      Age    = column value (input)

P(Stolen=Y | Age) is proportional to P(Stolen=Y) * P(Age | Stolen=Y)

             exp(-(x - mean)**2 / (2 * std**2))
P(x | y) =  ------------------------------------
                  sqrt(2 * pi * std**2)

    where x is the row value, and mean and std are the column mean and standard deviation
    calculated over the training rows with Stolen=y.

    a. The above calculation is then multiplied by P(Stolen=Y) to complete the P(Stolen=Y) * P(Age | Stolen=Y)
    conditional probability expression of Naive Bayes.
    b. A similar probability calculation is done for Stolen=N.
    c. The class with the larger of the two values, i.e. Stolen=Y or Stolen=N, is taken as the prediction for that row.



'''

In [1]:
import pandas as pd
from sklearn.model_selection import cross_validate
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import GaussianNB
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn import metrics
from IPython.display import display
import math
pd.set_option('display.max_columns', None)


In [2]:
def read_data():
    
    '''
    Load the adult data set from its remote CSV file into a pandas dataframe.
    
    The fifteen column names are passed to read_csv explicitly through `names`, so every line of the file is
    treated as data.
    
    Return :- 
    data :- Dataframe of adult data set
    '''
    
    source_url='http://mlr.cs.umass.edu/ml/machine-learning-databases/adult/adult.data'
    column_names=[
        'Age','Workclass','FNLWGT','Education','Education-Num',
        'Marital Status','Occupation','Relationship','Race','Sex',
        'Capital Gain','Caplital Loss','Hrs-Per-Week','Native-Country','Sal',
    ]
    
    return pd.read_csv(source_url,names=column_names)



def format_data(df):
    
    '''
    The function is for data preprocessing. The ' ?' placeholder values of the columns are replaced with the mode
    value of those columns, and the categorical columns of the data frame are then dropped.
    
    The mode is calculated over the entries that are NOT ' ?', so a column in which ' ?' is the most frequent
    value is still filled with a real value. A column that consists only of ' ?' (or ' ?' and NaN) has nothing
    to impute with and is left untouched.
    
    The dataframe passed in is modified in place and the same object is returned. Because the categorical
    columns are removed in place, calling the function twice on the same frame raises a KeyError.
    
        
    Argument :- 
    df :- Adult dataframe
    
    Return :- 
    df :- Modified dataframe.
    
    Raises :-
    KeyError :- If any of the categorical columns to drop is not present in df.
    '''
    
    missing_marker=' ?'
    categorical_cols=['Workclass','Education','Marital Status','Occupation','Relationship','Race','Sex','Native-Country']
    
    for col in df.columns:
        
        is_missing=df[col]==missing_marker
        if not is_missing.any():
            continue
        
        # Exclude the placeholder itself, otherwise ' ?' could be picked as the "mode".
        observed_modes=df.loc[~is_missing,col].mode()
        if observed_modes.empty:
            continue
        
        # Assign through .loc on the frame: calling replace(..., inplace=True) on df[col] acts on a
        # temporary Series and does not update df when pandas Copy-on-Write is active.
        df.loc[is_missing,col]=observed_modes.iloc[0]
    
    df.drop(columns=categorical_cols,inplace=True)
    
    return df


def split_data(dfnew):
    
    '''
    Separate the dataframe into Input (X) and Output (Y) data and train-test split them.
    
    Every column except the last one is used as input; the last column is the output/label. 30% of the rows
    are held out as validation data, with a fixed random_state of 0.
    
    Argument :-
    dfnew :-Modified adult dataframe
    
    Return :- 
    x_train:-training data
    x_test :- Validation data
    y_train :- Label data of training data
    y_test :- label data of validation data
    
    '''
    
    label_pos=dfnew.shape[1]-1
    inputs=dfnew.iloc[:,:label_pos]
    labels=dfnew.iloc[:,label_pos]
    
    parts=train_test_split(inputs,labels,test_size=.3,random_state=0)
    x_train,x_test,y_train,y_test=parts
    
    return x_train,x_test,y_train,y_test




def output_uniq(y_train):
    
    '''
    Calculate, for each unique value of the Output data, the fraction of rows that carry that value.
    
    Argument :- 
    y_train :- Ytrain data to get probabilty of output columns
    
    Return :-  
    ytrain_unq_dict :-Dictionary mapping each unique value of the output data to its probability.
    '''
    
    total_rows=len(y_train)
    
    return {label:len(y_train[y_train==label])/total_rows for label in y_train.unique()}




def std_mean(x_train,y_train):
    
    '''
    Build the training model: the std and mean of every training column, calculated separately for the rows
    of each unique class.
    
    The result is a nested dictionary keyed by column name; the inner keys are 'Std'+class and 'Mean'+class.
    eg {'Age': {'StdYes': 0.0,'MeanYes': 10.0,......etc
    
    
    Argument :- 
    x_train :- Training data 
    y_train :- Y data
    
    Return :- 
    stdmean   :- Dictionary containing std and mean.
    '''

    joined=pd.concat([x_train,y_train],axis=1)  # inputs and label side by side so rows can be filtered by class
    label_col=y_train.name
    
    # Rows belonging to each class, in order of first appearance of the class
    rows_by_class=[(label,joined[joined[label_col]==label]) for label in y_train.unique()]
    
    stdmean={}
    for feature in x_train.columns:
        
        per_class={}
        for label,class_rows in rows_by_class:
            per_class['Std'+label]=class_rows[feature].std()
            per_class['Mean'+label]=class_rows[feature].mean()
        
        stdmean[feature]=per_class
    
    return stdmean





def finalypred(x_test,train_std_mean,y_train,ytrain_unq_dict,var_floor=1e-9):
    
    
    '''
    
    In this function the validation data is run over the training model for prediction.
    For every row and every class, the Gaussian conditional probability of each column value given the class is
    combined with the probability of the class, and the class with the highest combined value is the prediction
    for that row.
    
    The calculation is done with logarithms: the log of every column|class probability is added to the log of
    the class probability. This ranks the classes exactly as multiplying the probabilities would, but it does
    not underflow to 0.0 for every class when a row value lies many standard deviations away from all class
    means (in which case the prediction used to be decided by comparing the class names).
    
    A variance that is zero, NaN or smaller than var_floor is replaced by var_floor, so a column that is
    constant within a class no longer causes a division by zero.
    
    When two classes score exactly the same, or a score is NaN, the class that appears first in
    y_train.unique() is kept.
    
    
    Argument:- 
    x_test           :- Validation data
    train_std_mean    :- dictionary holding std and mean for unique value of the training data columns.
    y_train           :- Output columnn data
    ytrain_unq_dict   :- Dictionary holding probabilties of unqiue class of output data.
    var_floor         :- Smallest variance used in the Gaussian formula (default 1e-9).
    
    Return :- 
    Ypred            :- List of prediction for every row of validation data.
    
    Raises :-
    ValueError       :- If there are rows to predict but y_train contains no class.
    KeyError         :- If train_std_mean or ytrain_unq_dict lacks an entry for a column or class.
    
    
    '''
    
    Ypred=[]  # List to hold the prediction for every row.
    
    if len(x_test)==0:
        return Ypred
    
    unq_ytrain=list(y_train.unique())
    if not unq_ytrain:
        raise ValueError('y_train contains no class to predict')
    
    # The Gaussian parameters depend only on (class, column), not on the row, so work them out once:
    # for every class, its log probability and one (mean, log of normalising term, 2*variance) per column.
    class_params=[]
    for val in unq_ytrain:
        
        prior=ytrain_unq_dict[val]
        log_prior=math.log(prior) if prior>0 else float('-inf')
        
        col_params=[]
        for col in x_test.columns:
            d=train_std_mean[col]
            
            variance=d['Std'+val]**2
            if not variance>var_floor:  # written this way so that NaN is caught as well
                variance=var_floor
            
            log_norm=-0.5*math.log(2*math.pi*variance)
            col_params.append((d['Mean'+val],log_norm,2*variance))
        
        class_params.append((val,log_prior,col_params))
    
    # Iterating every row in x_test data; the values of a row come in the same order as x_test.columns
    for row_values in x_test.itertuples(index=False,name=None):
        
        best_val=unq_ytrain[0]
        best_score=float('-inf')
        
        for val,log_prior,col_params in class_params:
            
            # log( P(class) * product of P(column value|class) )
            score=log_prior
            for value,(pmean,log_norm,two_var) in zip(row_values,col_params):
                score=score+log_norm-(value-pmean)**2/two_var
            
            if score>best_score:
                best_val=val
                best_score=score
        
        Ypred.append(best_val)
    
    return Ypred




def accuracyfunc(ypredect,y_test):
    
    '''
    The function is to calculate the accuracy score with predicted data and actual labels.
    
    The score is the percentage (0 to 100) of rows whose prediction equals the actual class.
    When there are no rows at all the score is reported as 0.0 instead of dividing by zero.
    
    Argument :- 
    ypredect  :-List of prediction for every row of validation data.
    y_test    :- Actual class for every row of validation data.
    
    Return :- 
    accuracy  :- Accuracy score of the algorithm
    
    Raises :-
    ValueError :- If the number of predictions differs from the number of actual classes. Pairing the two
                  up anyway would silently ignore the extra entries and give a misleading score.
    '''
    
    actual=y_test.tolist()
    predicted=list(ypredect)
    
    if len(predicted)!=len(actual):
        raise ValueError('ypredect has %d entries but y_test has %d'%(len(predicted),len(actual)))
    
    if not actual:
        return 0.0
    
    matches=sum(1 for truth,guess in zip(actual,predicted) if truth==guess)
    
    return matches*100/len(actual)




In [3]:
# Load the adult data set into a pandas data frame.
# read_data reads the CSV from a remote URL, so this cell needs network access.
df=read_data()

In [4]:
# Replace the ' ?' values of all the columns with the column mode and drop the categorical columns.
# format_data modifies the frame it is given, so re-running this cell without first re-running the
# load cell raises a KeyError (the categorical columns are already gone).
df=format_data(df)

In [5]:
# Use the last column as the label (Y) and the rest as input (X), then hold out 30% of the rows
# as validation data (fixed random_state, so the split is repeatable).
xtrain,xtest,ytrain,ytest=split_data(df)

In [6]:
# Calculate the probability of each unique value of the training Y data (fraction of training rows per class).
output_prob=output_uniq(ytrain)

In [7]:
# Calculate the std and mean of every column of the training data, separately for each class.
std_mean_val=std_mean(xtrain,ytrain)

In [8]:
# Predict a class for every row of the validation data from the per-class std/mean and class probabilities.
ypredect=finalypred(xtest,std_mean_val,ytrain,output_prob)

In [9]:
# Find the accuracy score: the percentage of validation rows whose prediction matches the actual class.
accuracy_val=accuracyfunc(ypredect,ytest)

In [10]:
# Display the accuracy score (a percentage) as the cell output.
accuracy_val

79.76251407513563