In [None]:
'''
The adult dataset is used to study one way of handling continuous data: the Gaussian Naive Bayes classifier,
which is designed for features that take continuous values.
The adult dataset contains both categorical and continuous columns.
The accuracy score is calculated using only the continuous columns.


Data is read using pandas function and converted into a data frame. The data is preprocessed to replace the '?' values of the 
columns with the mode values of those columns. Thereafter the categorical columns are separated from continuous columns.

The continuous columns are split into X (feature) and Y (label) data. X and Y are then split together into training and 
testing sets. A GaussianNB object is fit on the training data, which estimates the mean and variance of every feature for 
each class. The fitted model is then used on the validation data to get a list of predictions.

Finally, the accuracy score is calculated with the predicted and actual labels.



'''

In [2]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import GaussianNB
from sklearn import metrics
from IPython.display import display
# Display every column of a DataFrame instead of truncating wide frames.
pd.set_option('display.max_columns', None)



In [3]:
def read_data(source=None):
    
    '''
    This function reads the adult dataset using Pandas read_csv function and converts it into a dataframe.
    
    The file has no header row, so the fifteen column names are supplied here. No whitespace stripping is
    requested, so string values are returned exactly as they are stored in the file.
    
    Argument :- 
    source :- optional path, URL or file-like object accepted by pandas.read_csv. When it is omitted (None)
              the data is read from the default URL defined below.
    
    Return :- 
    data :- Dataframe 
    '''
    
    default_url='http://mlr.cs.umass.edu/ml/machine-learning-databases/adult/adult.data'
    
    # 'Caplital Loss' is spelled as in the original schema on purpose: renaming it would change the
    # column names callers receive.
    column_names=['Age','Workclass','FNLWGT','Education','Education-Num','Marital Status','Occupation',
                  'Relationship','Race','Sex','Capital Gain','Caplital Loss','Hrs-Per-Week',
                  'Native-Country','Sal']
    
    if source is None:
        source=default_url
    
    data=pd.read_csv(source,names=column_names)
    
    return data



def format_data(df):
    
    '''
    The function is for data preprocessing. The ' ?' values of each column are replaced with the mode value of that
    column, where the mode is computed over the remaining (non-' ?') values. A column that contains nothing but ' ?'
    (or no values at all) is left as it is because there is no value to substitute.
    
    The categorical columns of the data frame are then dropped.
    
    The work is done on a copy, so the dataframe passed in is not modified and the function can be called again on the
    same input.
    
    Argument :- 
    df :- DataFrame containing the eight categorical columns listed below
    
    Return :- 
    
    cleaned :- new dataframe with the ' ?' values replaced and the categorical columns removed.
    
    Raises :-
    KeyError :- if any of the categorical columns is missing from df.
    '''
    
    missing_marker=' ?'
    categorical_columns=['Workclass','Education','Marital Status','Occupation','Relationship','Race','Sex','Native-Country']
    
    cleaned=df.copy()
    
    for col in cleaned.columns:
        
        is_missing=cleaned[col]==missing_marker
        if not is_missing.any():
            continue
        
        # Exclude the placeholder itself so that it can never be chosen as the fill value.
        observed_mode=cleaned.loc[~is_missing,col].mode()
        if observed_mode.empty:
            continue
        
        # Assign through .loc on the frame; an in-place replace on cleaned[col] may act on a temporary object.
        cleaned.loc[is_missing,col]=observed_mode.iloc[0]
    
    return cleaned.drop(columns=categorical_columns)




def split_data(dfnew):
    
    '''
    The function separates the label column from the feature columns and then makes a train/validation split.
    
    The last column of dfnew is taken as the Output (Y) data and every column before it as the Input (X) data.
    30% of the rows are held out for validation, with a fixed random_state of 0.
    
    Argument :- 
    dfnew :- modified Dataframe whose last column holds the labels
    
    Return :- 
    x_train :- training data
    x_test :- validation data
    y_train :- label data of training data
    y_test :- label data of validation data
    '''
    
    features,labels=dfnew.iloc[:,:-1],dfnew.iloc[:,-1]
    
    partitions=train_test_split(features,labels,test_size=.3,random_state=0)
    train_features,test_features,train_labels,test_labels=partitions
    
    return train_features,test_features,train_labels,test_labels




def gaussianNB(x_train,y_train):
    
    '''
    The function creates a GaussianNB object with default settings and fits it on the training data.
    
    Argument :-
    x_train :- training data
    y_train :- label data of training data
    
    Return :- 
    the fitted GaussianNB object
    '''
    
    # fit() hands back the estimator itself, so the fitted object can be returned directly.
    return GaussianNB().fit(x_train,y_train)



def pred_accuracy(clf_pred,x_test,y_test):
    
    '''
    The function runs the test data over the model to predict its labels. It then calculates the accuracy score from the
    predicted labels and the actual labels, prints it and returns it.
    
    Argument :- 
    clf_pred :- fitted classifier object (anything with a predict method, here a GaussianNB)
    x_test :- validation data
    y_test :- label data of validation data
    
    Return :- 
    accuracy :- Accuracy Score as computed by metrics.accuracy_score
    '''
    y_pred_class=clf_pred.predict(x_test)
    accuracy=metrics.accuracy_score(y_test, y_pred_class)
    
    # Keep printing so existing cells that rely on the printed score behave as before.
    print(accuracy)
    
    return accuracy



    
    

In [4]:
# Load the adult dataset into a DataFrame with the column names assigned by read_data.
df=read_data()

In [5]:
# Replace the ' ?' placeholders and keep only the continuous columns plus the 'Sal' label column.
continous_df=format_data(df)

In [6]:
# The last column is the label; the remaining columns are split 70/30 into training and validation sets.
xtrain,xtest,ytrain,ytest=split_data(continous_df)

In [7]:
# Fit a Gaussian Naive Bayes classifier on the training split.
clf_pred =gaussianNB(xtrain,ytrain)

In [8]:
# Predict the validation split and print the accuracy against the actual labels.
pred_accuracy(clf_pred,xtest,ytest)

0.7975227761285699
