In [None]:
'''
The code is to see how we can handle categorical data using the Adult Data set. The code also implements Laplace smoothing
to avoid 0 probability.
After reading the data in the data frame, continuous columns are dropped from the data frame. leaving only categorical columns.
After that, all columns having '?' as value is replaced with the mode of that column.
The data is split in X(input) and Y(output/label)data.The X data and Y data are then train-tet split to get training and testing 
data.
For probability, for every unique value of Y data, the probability is calculated and populated in a dictionary object.
The probability of the categorical columns of the training data set is calculated. The data set is iterated over every column.
For every unique value of the column, conditional probability is calculated ie column|unique class eg(Red|Yes,Red|No...etc).
This data is stored in an dictionary object against every column ({'Color': {'YellowYes': 0.5,'YellowNo': 0.57...etc).

The model is now ready to predict the test data. The test data is iterated over row. For every row, the corresponding columns
are iterated. The column|unique class probability for all columns is obtained from the above dictionary, all probability are
multiplied to get a cumulative probability for a given class. The final probability for that row is multiplied with
the probability of that class The same process is repeated for every unique class. Thus every row will have as many probabilities
as a unique class in Y data. The class which has the maximum probability will be the prediction for that row. 
All prediction is stored in a list

The accuracy score is calculated with the above-predicted data and actual class values.

'''

In [2]:
import pandas as pd
from sklearn.model_selection import cross_validate
from sklearn.model_selection import train_test_split



In [3]:
def read_data():
    
    '''
    This function is to read data using Pandas read_csv function and convert into dataframe.
    
      
    Return :- 
    data :- Dataframe
    '''
    
    data=pd.read_csv('http://mlr.cs.umass.edu/ml/machine-learning-databases/adult/adult.data',names=['Age','Workclass','FNLWGT','Education','Education-Num','Marital Status','Occupation','Relationship','Race','Sex','Capital Gain','Caplital Loss','Hrs-Per-Week','Native-Country','Sal'])
    
    
    return data
    
    
    

In [4]:
def preprocess_data(df):
    
    '''
    This code is used work on catogorical data. Therefore numerical columns are dropped before processing further.
    The categoriacal column '?' value is replaced with mode of that column.
    
    Argument :-
    df :- Data frame holding Adult data set
    
    Return :-
    df :- Data frame holding categorical columns of Adult data set.
    
    '''
    
    
    df.drop(columns=['Age','FNLWGT','Education-Num','Capital Gain','Caplital Loss','Hrs-Per-Week'],inplace=True)
    
    
    for col in df.columns:

        df[col].replace(' ?',df[col].mode()[0],inplace=True)
        
        
    return df    

In [5]:
def train_test(dfnew):
    
    '''
    The function is to split data first into the Input (X) and Output (Y) dataset. The X data is further train-test split into
     x_train,y_train,x_test,y_test data.
    
    Argument :-
    dfnew :-Modified adult dataframe
    
    Return :- 
    xtrain:-training data
    xtest :- Validation data
    ytrain :- Label data of training data
    ytest :- label data of validation data
    
    '''
    
    
    x=dfnew.iloc[:,:-1]
    y=dfnew.iloc[:,-1]
    
    
    xtrain, xtest, ytrain, ytest = train_test_split(x, y,test_size=.3, random_state=0)
    
    return xtrain,xtest,ytrain,ytest
    

In [6]:
def prob_class(y_train):
    
    '''
   This function is to calculate the probability of unique values of the Output data.
    
    Argument :- 
    y_train :- Y train data.
    
    Return :-  
    inp_dict :-Dictionary holding probabilty of each unique value of output data.
    unq_inp :- Unique value of Y data.
    
    
    '''
    
    #Find unique value of Output column and rowcount of X_train data   
    rowcnt=x_train.shape[0]
    unq_inp=y_train.unique()
    inp_dict={}

# Getting probablity of ouput column for every unqiue values
    for k in unq_inp:
        prob=y_train[y_train==k].shape[0]/rowcnt
        inp_dict.update({k:prob})
        
        
        
    return unq_inp,inp_dict
    


In [7]:

def train_cat_prob(x_train,y_train,unqinp,inpdict):
    
    '''
    This function is to calculate the probability of categorical data of training eg Red nYes,Red nNo etc.
    The conditional probability is calculated from every unique value of column | unique class (Red|yes.Red|No..etc).The Laplace
    smoothing is considered while calculating the probabilities.
    This data is stored in an dictionary object against every column ({'Color': {'YellowYes': 0.5,'YellowNo': 0.57,'RedYes': 0.5,..
    ....etc)
    
    Argument:
    x_train:-training data
    y_train :- Y data
    unqinp :- Unique value of Y data.
    inpdict :-Dictionary holding probabilty of each unique value of output data.
    
    Return:-
    finalans :- Dictionary holding column|class probability.
    
    
    
    '''
    concatdf=pd.concat([x_train,y_train],axis=1)
    
    finalans={}
 
    result=0
    prob=0

    
 
    
    for col in x_train.columns:
        colunq=x_train[col].unique()
    
        newdict={}
    
    
        for val in colunq:
       
        
            for val1 in unqinp:
            
                result = (concatdf[(concatdf[col]==val) & (concatdf[y_train.name]==val1)].shape[0]+1)
                prob=result/(concatdf[concatdf[y_train.name]==val1].shape[0]+len(unqinp))
            
                newdict.update({val+val1:prob})
        
        finalans.update({col:newdict})
    

    return finalans
    
    

In [8]:
def finalypred(x_test,resultdata):
    
    '''
    The function is to predict the test data. The test data is iterated over row. For every column of the row, the respective
    column|class probability is obtained from the dictionary holding all probabilities. All column|class probability is
    multiplied to get a cumulative probability.The final probability for that row is multiplied with probabilty of that class.
    The same process is repeated for every unique class. Thus every row will have as many probabilities as a unique class in Y 
    data. The class which has the maximum probability will be the prediction for that row. All prediction is stored in a list
    
    Argument :-
    x_test :- test data.
    resultdata :- Dictionary holding all column|class probability.
    
    Return :-
    Ypred :- List holding prediction for all rows of testing data.
    
    
    
    '''
    
    p=0
    d={}
    finalprob={}
    prob=1
    Ypred=[]
    

# Iterating every row in x_test data
    for index,row in x_test.iterrows():
  
        # Iterating through unqiue values of Output column
        for val in unqinp:
            prob=1
            
            #Iterating through unqiue columns of input column.
            for col in x_test.columns:
                                
                d=resultdata[col]
                
                if (row.get(col)!=None) & (d.get(row[col]+val)!=None):  # taking into account records whose attribute's value may not be present in train data

                    p=d[row[col]+val]
                                   
                    prob=prob*p   #eg RednY * SportnY *domesticnY
        
            
            prob=prob*inpdict[val]   #eg RednY * SportnY *domesticnY
            finalprob.update({val:prob})
        max_v = max(zip(finalprob.values(), finalprob.keys()))  # find maximum value between Yes and N of the same row
    
        Ypred.append(max_v[1])
        
    return Ypred

In [9]:
def accuracyfunc(ypredect,y_test):
    
    '''
    
   The function is to calculate the accuracy score with predicted data and actual labels.
   
   Argument :-
   ypredect :- Predicted test data.
   y_test   : Y data of test data set.
    
    '''

    ytest=y_test.tolist()

    ytestrow=len(ytest)


    cntrow=0
    accuracy=0
    
    for x,y in zip(ytest,ypredect):
        
        if x==y:
            cntrow=cntrow+1
  
    accuracy=cntrow*100/ytestrow
    print(accuracy)

In [10]:
#Read the data in pandas data frame.
df=read_data()

In [11]:
#Replace all '?' column value 
df=preprocess_data(df)

In [12]:
#Make a copy of original data frame (df)
dfnew=df

In [13]:
#The X data is train-test split into training and testing data
x_train,x_test,y_train,y_test=train_test(dfnew)

In [14]:
#Calculate probability of every unique value of Y data.
unqinp,inpdict=prob_class(y_train)

In [16]:
#Calculate column|class probability of training data set
resultdata=train_cat_prob(x_train,y_train,unqinp,inpdict)

In [17]:
#Run test data over model to obtain prediction for every row.
ypredect=finalypred(x_test,resultdata)

In [18]:
#Calcualte the accuracy score
accuracyfunc(ypredect,y_test)

78.59555737537107
