In [None]:
'''
The Adult data set used in code to see how we can handle both categorical and continuous data. Both types of data are handled
differently. After reading the data in the data frame, data is split in X(input) and Y(output/label)data. The X data and Y data
are then train-test split to get training and testing data. Till now training and testing both have categorical and continuous 
data. We now need to separate categorical and continuous data from training data and store them in a different data frame.


For categorical data, we calculate the conditional probability. Training data is iterated over columns. For every unique value of column conditional probability is calculated with every unique value of the Y data label
eg For column Color there are 2 unique value Red, Yellow.Y data as 2 unique value Yes and No.We calculate the conditional
probability of Red|No,Red|Yes,Yellow|No,Yellow|Yes etc. 
This data is stored in an dictionary object against every column ({'Color': {'YellowYes': 0.5,
  'YellowNo': 0.57,'RedYes': 0.5,........etc)

For continuous data, training data is fit on GaussianNB object. Std and mean are calculated for every column. The column values
are then scaled to Z -score using the Z-score formula. Training data columns are transformed into Z-score after this process. A model is ready to predict 
continuous data. On the same object, test data is predicted. For every unique value of Y data, test data is predicted. The prediction
of test data for every class is populated in a data frame.


For Y data, probabilty is calculated for every unique value and stored in dictionary.eg Yes:80,No:76 etc..

The model is now ready to predict the test data. The test data is iterated over row. For every row, the corresponding columns
are matched with continuous and categorical training data frames. If the columns of the given row belong to continuous type
then respective column|class probability is obtained from the dictionary object(Check paragraph 3). For the categorical column,
the probability is obtained from the dictionary object(Check paragraph 2). The probability of each column |class is multiplied to
get a cumulative probability for a given row. The same process is repeated for all unique class. The class|probabilty is stored
in the dictionary object. The class having maximum probability is will be the prediction of the given row. A list is populated with
prediction got this way for every row of testing data.

The prediction got for every row from above along with actual class is used to calculate the accuracy score.

'''


In [2]:
import pandas as pd
from sklearn.model_selection import cross_validate
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import GaussianNB
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn import metrics
from IPython.display import display
import math




In [3]:
def read_data():
    
    
    '''
    This function is to read data using Pandas read_csv function and convert into dataframe.
    
      
    Return :- Dataframe
    '''
    
    data=pd.read_csv('http://mlr.cs.umass.edu/ml/machine-learning-databases/adult/adult.data',names=['Age','Workclass','FNLWGT','Education','Education-Num','Marital Status','Occupation','Relationship','Race','Sex','Capital Gain','Caplital Loss','Hrs-Per-Week','Native-Country','Sal'])
    
    for col in data.columns:

        data[col].replace(' ?',data[col].mode()[0],inplace=True)
        
    return data
    
    

In [4]:
def preprocess_data(df):
    
    '''
    The function is to split data first into the Input (X) and Output (Y) dataset. The X data is further train-test split into
     x_train,y_train,x_test,y_test data.
    
    Argument :-
    dfnew :-Modified adult dataframe
    
    Return :- 
    xtrain:-training data
    xtest :- Validation data
    ytrain :- Label data of training data
    ytest :- label data of validation data
    
    '''
    
    
    x=df.iloc[:,:-1]
    y=df.iloc[:,-1]
    
    xtrain,xtest,ytrain,ytest=train_test_split(x,y,test_size=.3,random_state=0)
    
    return xtrain,xtest,ytrain,ytest

In [5]:
def train_cat_cont_split(x_train,y_train,x_test,y_test):
    '''
    The function is to separate continuous and categorical columns of training and test data into different data frames.
    The class of respective training and test data are concatenated with the above data frame for further use.
    
    
    Argument :- 
    x_train  :- Training data
    y_train   :- class of training data
    x_test    :- Testing data
    y_test    :- class of testing data
    
    Return :- 
    x_cont_df  :- Continuous columns of training data concatenated with x_test
    x_cat_df   :- Categorical columns of training data concatenated with x_test
    x_cont_data :-Continuous columns of training data
    x_cat_data :- Categorical columns of training data
    x_test_cont_data :-Continuous columns of test data
    x_test_cat_data :-Categorical columns of test data
    x_test_cat_df  :-Categorical columns of test data concatenated with y_test
    
      
    '''
    
    #split x_train data into continuous data and concatenate with y_train to find the probability.Converting series to a data frame.
    x_cont_data=x_train[['Age','FNLWGT','Education-Num','Capital Gain','Caplital Loss','Hrs-Per-Week']]
    x_cont_df=pd.concat([x_cont_data,y_train],axis=1)
   
    
    
    #split x_train data into categorical data and concatenate with y_train.
    x_cat_data=x_train[['Workclass','Education','Marital Status','Occupation','Relationship','Race','Sex','Native-Country']]
    x_cat_df=pd.concat([x_cat_data,y_train],axis=1)
    
    #split x_test data into continuous data
    x_test_cont_data=x_test[['Age','FNLWGT','Education-Num','Capital Gain','Caplital Loss','Hrs-Per-Week']]
    
    
    #split x_test data into categorical data

    x_test_cat_data=x_test[['Workclass','Education','Marital Status','Occupation','Relationship','Race','Sex','Native-Country']]
    x_test_cat_df=pd.concat([x_test_cat_data,y_test],axis=1)
    
    
    return x_cont_df,x_cat_df,x_cont_data,x_cat_data,x_test_cont_data,x_test_cat_data,x_test_cat_df
    
 

In [6]:
def class_prob(y_train):
    
    '''
    This function is to calculate the probability of unique values of the Output data.
    
    Argument :- 
    y_train :- Ytrain data to get probabilty of output columns
    
    Return :-  
    outdict :-Dictionary holding probabilty of each unique value of output data.
    unq_out :- Unique value of Y data.
    '''
    
    #Find unique value of Output column and rowcount of y_train data 
    unq_out=y_train.unique()
    rowcnt=y_train.shape[0]
    outdict={}
    
    
    # Getting probablity of ouput column for unqiue values
    for k in unq_out:
        prob=y_train[y_train==k].shape[0]/rowcnt
        outdict.update({k:prob})

    return outdict,unq_out

In [7]:
def Gaussian_std_mean(xtrain_cont_data,y_train,xtest_cont_data):
    '''
    For continuous data, training data is fit on GaussianNB object. Std and mean are calculated for every column. The column values
    are then scaled to Z -score using the Z-score formula. Training data columns are transformed into Z-score after this process
    A model is ready to predict continuous data. On the same object, test data is predicted. For every unique value of Y data,
    test data is predicted. The prediction of test data for every class is populated in a data frame.
    
    Argument :- 
    xtrain_cont_data :-Continuous data of training data set
    y_train          :- class of training data
    xtest_cont_data  :-Continuous data of test data
    
    
    Return :-
    xtrain_cont_data :- Transformed continuos data of training data set
    x_test_cont_df :- Data frame with only class prediction for continuous data of test data set.
    
    '''
    # Creating object of gaussianNB calss
    clf = GaussianNB()
    
    #This is to calculate sigma and Mu of every column of x_train
    clf=clf.fit(xtrain_cont_data,y_train)
    
    # On the same object ,fit the test data on training.Create a df with zcore values.
    x_test_cont_df=pd.DataFrame(clf.predict_proba(xtest_cont_data), columns=clf.classes_)
    
    return xtrain_cont_data,x_test_cont_df
    


In [8]:
def train_cat_prob(xtrain_cat_data,y_train,xtrain_cat_df):
    '''
    Calculate probability of training catogorical data eg Red nYes,Red nNo etc
    Populate dictionary with column and probabilty of unique values of column n class label unique values.
    Laplace smoothing in taken in to consideration for zero values in columns.
    
    
    Argument :- training catogorical data,ytrain data
    
    Return :- dictionary holding column:{RedNyes:prob} 
    
    '''
 
    #Finding probablity of categorical data with output column
    xcat_prob={}
    unq_ytrain=y_train.unique()
    for col in xtrain_cat_data.columns:
            colunq=xtrain_cat_data[col].unique()
    
            newdict={}
    
            for val in colunq:
     
                for val1 in unq_ytrain:
            
                    result = (xtrain_cat_df[(xtrain_cat_df[col]==val) & (xtrain_cat_df[y_train.name]==val1)].shape[0]+1)
                    prob=result/(xtrain_cat_df[xtrain_cat_df[y_train.name]==val1].shape[0]+len(unq_ytrain))
            
                    newdict.update({val+val1:prob})
        
            xcat_prob.update({col:newdict})
    return xcat_prob
 

In [9]:
def predict_test(xtestcat_cont,xtest_cont_df,xtest_cat_data,xtrain_cat_prob,y_train):
    
    '''
    Predict test data both catogorical and continious data against.
    Iterate every row of test data.For every row check the column if catogorical use training catgorical probablity,
    if column of the row is continous data,use test continous probabilty df.
    Need to multiple every column probabilty for a given row.
    Each row will have 2 probabilty or more based on nos of unique values in class.Update the class lable:prob in a dictinary.
    Before the start of new row ,get the max probabilty and store the corresponding class lable in a dict
    The class value predict for a given row ,at the end of test data is used to calculate Accuracy score.
    
    
    Argument :- test cat and cont prob data,test probabilty df,test catogorical data,training conditional probabilty of
                catogorical data,ytrain data
    
    Return :- Array of predicted value.
    
    '''
    
    p=0
    d={}
    xtest_prob={}
    unq_ytrain=y_train.unique()
    Ypred=[]
    val=''
    
    for index,row in xtestcat_cont.iterrows():
    
       
        # Iterating to every unique values of output column
        for val in unq_ytrain:
        
            prob=1
        
                
            #Iterating through every col in xtestcat_cont row:
            for ind in row.index:
                
                   
                if ind in(xtest_cont_df.columns):
                    prob=xtest_cont_df[val].loc[index]
                    
            
                    prob=prob*p
                
            # Check whether that row exist in training contionus data or in training catogorical data.According handle.
            
                if ind in(xtest_cat_data.columns):
                    
                
                    d=xtrain_cat_prob[ind]
                    
               
                    p=d[row.get(ind)+val]
                    prob=prob*p
                    
                xtest_prob.update({val:prob})
                ##Get the class with highest probability.This will be prediction of the given row.
        max_v = max(zip(xtest_prob.values(), xtest_prob.keys()))
        Ypred.append(max_v[1])
    return Ypred
  

In [10]:
def accuracyfunc(ypredect,ytest):
    
    '''
    The function is to calculate the accuracy score with predicted data and actual labels.
    
    Argument :- 
    ypredect  :-List of prediction for every row of validation data.
    y_test    :- Actual class for every row of validation data.
    
    Return :- 
    accuracy  :- Accuracy score of the algorithm
    '''
    
    ytest=ytest.tolist()

    ytestrow=len(ytest)

    cntrow=0
    accuracy=0
    
    for x,y in zip(ytest,ypredect):
        
        if x==y:
            cntrow=cntrow+1
  
    accuracy=cntrow*100/ytestrow
    return accuracy

In [11]:
#Read data in pandas data frame
df=read_data()

In [12]:
#Train-test split the data into training and testing data.
x_train,x_test,y_train,y_test=preprocess_data(df)

In [13]:
#Store continuous and categorical columns of training and testing data into different data frames.
xtrain_cont_df,xtrain_cat_df,xtrain_cont_data,xtrain_cat_data,xtest_cont_data,xtest_cat_data,xtest_cat_df=train_cat_cont_split(x_train,y_train,x_test,y_test)


In [14]:
#Built a training model.Find the std and mean for every column|class.Find the probability of continuous data of testing data.
xtrain_cont_data,xtest_cont_df=Gaussian_std_mean(xtrain_cont_data,y_train,xtest_cont_data)

In [15]:
#Concatenate the prediction of continuous data of testing with categorical data of testing.
xtest_cat_data=xtest_cat_data.reset_index(drop=True)
xtest_cont_df.index=xtest_cat_data.index
xtestcat_cont=pd.concat([xtest_cat_data,xtest_cont_df],axis=1)

In [16]:
#Calculate probability of every unique value of Y data.
class_dict,unqout=class_prob(y_train)

In [17]:
#Calcualate probability of categorical columns of training data.
xtrain_cat_prob=train_cat_prob(xtrain_cat_data,y_train,xtrain_cat_df)

In [18]:
#Predict the test data
ypred=predict_test(xtestcat_cont,xtest_cont_df,xtest_cat_data,xtrain_cat_prob,y_train)

In [19]:
#Find out the accuracy score.
accuracy_val=accuracyfunc(ypred,y_test)

In [20]:
accuracy_val

79.752277612857