In [None]:
'''
This notebook handles the categorical and continuous features of the Adult dataset for a K-nearest Neighbors classifier.
The two kinds of data need different preprocessing before they can be used with this classifier. The continuous data is
transformed into Z-score values. The categorical data is transformed into 0's and 1's using scikit-learn's OneHotEncoder.
With OneHotEncoder, every unique value of a categorical column becomes a feature (column) of its own. For each row, the new
column holds 1 if the row had that value in the original data frame and 0 otherwise. The encoded data frame therefore
has as many columns as there are unique values across the categorical columns of the original data set.

E.g., suppose a data frame has 2 columns. After OneHotEncoder is applied to the categorical column (Color), the data frame
is transformed into a new data frame as shown below.
    Color   Price                     
    Red	    2000
    Green   5000
    Yellow  9000
    
Color_Red    Color_Green   Color_Yellow   Price
    1           0               0        2000
    0           1               0        5000
    0           0               1        9000

The details of the algorithm are given below.

1. Data is read into a pandas data frame. The columns having '?' are replaced with the mode of those columns. The data is then 
split into X(input) and Y(output/class).

2. The X data are separated respectively into a continuous and categorical data frame.

3. The categorical data is transformed into integer format using the OneHotEncoder library. The categorical X data 
   is fit and transformed on the object of OneHotEncoder. The output is an array where every unique value of the 
   column is transformed into a feature. The original rows in the array are now replaced with 0's and 1's as values.
   
4. Concatenate transformed categorical and continuous data. Train-test split this data into training and test data.

5. Standardize the continuous column data by transforming and replacing the values with Zscore using the
   StandardScaler library. The training data is fit and transformed on the StandardScaler object. The testing data is
   transformed and replaced with Zscore over the same object on which the training data was fit.
   
6. Create a KNeighborsClassifier object with n_neighbors=43 and fit it on the training data.
   This gets the model ready.
   
7.  Run the test data over the model to get the prediction of every row. The knn object is used to predict 
    and calculate the accuracy score.

'''

In [1]:
import pandas as pd
import numpy as np
import math
from sklearn.preprocessing import OneHotEncoder
from sklearn import metrics
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler

In [1]:
def read_data(source=None):
    
    '''
    This function reads the Adult data set with the pandas read_csv function and returns it as a dataframe.
    The ' ?' placeholder values of each column are replaced with the mode of the remaining (known) values
    of that column, and the ' Trinadad&Tobago' value of Native-Country is renamed to 'TrinadadTobago'.
    
    Argument :-
    source :- Path, URL or file-like object accepted by read_csv. When None (default), the adult.data URL
              originally hard-coded in this notebook is used.
    
    Return :- 
    data :- Dataframe of adult data set
    
    '''
    
    if source is None:
        source='http://mlr.cs.umass.edu/ml/machine-learning-databases/adult/adult.data'
    
    column_names=['Age','Workclass','FNLWGT','Education','Education-Num','Marital Status','Occupation',
                  'Relationship','Race','Sex','Capital Gain','Caplital Loss','Hrs-Per-Week','Native-Country','Sal']
    
    data=pd.read_csv(source,names=column_names)
    
    for col in data.columns:
        
        missing=data[col]==' ?'
        
        # Skip columns with nothing to impute, or with no known value to impute from.
        if not missing.any() or missing.all():
            continue
        
        # The mode is taken over the known values only, so the placeholder can never be picked as its own
        # replacement. Writing through .loc updates the frame itself; a chained
        # data[col].replace(..., inplace=True) may only modify a temporary copy.
        data.loc[missing,col]=data.loc[~missing,col].mode()[0]
        
    data['Native-Country']=data['Native-Country'].replace(' Trinadad&Tobago','TrinadadTobago')
    
    return data

In [4]:
def preprocess_data(df):
    '''
    Separate a data frame into its input columns and its output column.

    The last column of the frame is taken as the output (Y); every column
    before it is taken as input (X).

    Argument :-
    df :- Adult data frame.

    Return :-
    x :- Data frame holding all columns except the last one.
    y :- Series holding the last column.
    '''
    label_position = -1

    features = df.iloc[:, :label_position]
    labels = df.iloc[:, label_position]

    return features, labels

In [6]:
def cat_cont_data_split(xdata):
    '''
    Split the X data column-wise into a categorical frame and a continuous frame.

    The column names of both groups are fixed inside the function, so xdata
    must contain every one of them.

    Argument :-
    xdata :- X data frame.

    Return :-
    x_cat_data  :- Data frame with the categorical columns.
    x_cont_data :- Data frame with the continuous columns.
    '''
    categorical_columns = [
        'Workclass',
        'Education',
        'Marital Status',
        'Occupation',
        'Relationship',
        'Race',
        'Sex',
        'Native-Country',
    ]
    continuous_columns = [
        'Age',
        'FNLWGT',
        'Education-Num',
        'Capital Gain',
        'Caplital Loss',
        'Hrs-Per-Week',
    ]

    return xdata[categorical_columns], xdata[continuous_columns]
    
    

In [8]:
def onehotencode(datatofit):
    '''
    One-hot encode the categorical data with a newly created OneHotEncoder.

    The encoder is fit on datatofit and the same data is transformed in one
    step; the result is converted to a dense array with toarray(), so each
    unique value of each column becomes a column of 0's and 1's.

    Argument :-
    datatofit :- X categorical data.

    Return :-
    x_data_oe :- One-hot encoded categorical data (array).
    oe        :- The fitted OneHotEncoder object.
    '''
    encoder = OneHotEncoder()
    encoded = encoder.fit_transform(datatofit)

    return encoded.toarray(), encoder

In [10]:
def concat_cat_cont(xcat_ohe,xcont_data):
    
    '''
    The function is to concatenate the transformed categorical data and the continuous data column-wise.
    
    The one-hot rows are matched to the continuous rows by position: the one-hot frame is given the index
    of xcont_data, so the result never contains NaN padding caused by index misalignment. The one-hot
    columns are labelled '0', '1', ... (strings), so that all column labels of the result are strings.
    
    
    Argument :- 
    xcat_ohe   :- onehotencoder data (array with one row per row of xcont_data)
    xcont_data :-Continuous data.
    
    Return :- 
    concat_df :-Dataframe with both categorical and continuous data (continuous columns first).
    
    Raises :-
    ValueError if xcat_ohe and xcont_data do not have the same number of rows.
             
    '''
    ohe_values=np.asarray(xcat_ohe)
    
    # A 1-D input is treated as a single encoded column.
    if ohe_values.ndim==1:
        ohe_values=ohe_values.reshape(-1,1)
    
    # String labels keep the combined frame free of mixed str/int column names.
    ohe_columns=[str(i) for i in range(ohe_values.shape[1])]
    
    # Reusing the continuous frame's index makes concat pair the rows positionally instead of
    # aligning a fresh 0..n-1 index against whatever index xcont_data carries.
    xcat_ohe=pd.DataFrame(ohe_values,index=xcont_data.index,columns=ohe_columns)
    concat_df=pd.concat([xcont_data,xcat_ohe],axis=1)
    
    return concat_df

In [12]:
def train_test(concat_cat_cont_df,ydata):
    '''
    Train-test split the combined (categorical + continuous) frame and its labels.

    30% of the rows go to the test set (test_size=0.3) and the split is
    seeded with random_state=0.

    Argument :-
    concat_cat_cont_df :- Concatenated cat and cont data.
    ydata              :- Y data.

    Return :-
    x_train :- Training data.
    x_test  :- Validation data.
    y_train :- Label data of the training data.
    y_test  :- Label data of the validation data.
    '''
    parts = train_test_split(
        concat_cat_cont_df,
        ydata,
        test_size=0.3,
        random_state=0,
    )

    # Returned in the order x_train, x_test, y_train, y_test.
    return tuple(parts)
    

In [14]:
def scale_data(fitdata,datatotransform):
    
    '''
    The function is to standardize data by replacing the values with their Zscore using the StandardScaler library.
    A new StandardScaler is fit on fitdata and then used to transform datatotransform, so test data can be
    scaled with the statistics of the training data. Every column of datatotransform is scaled, not only the
    continuous ones.
    
    Argument :- 
    fitdata  :- Training data the scaler is fit on.
    datatotransform :- data frame to transform,training/testing
    
    Return :- 
    datatranformdf :- Transformed training/test data, with the same column labels and index as datatotransform.
       
    '''
        
    scaler = StandardScaler()
    scaler.fit(fitdata)
    t_data=scaler.transform(datatotransform)
    
    # Pass the column Index itself: wrapping it in a list makes pandas build MultiIndex column labels.
    datatranformdf=pd.DataFrame(t_data,columns=datatotransform.columns,index=datatotransform.index)
    
    return datatranformdf

In [17]:
def fit_data(datatotransform_df,y_train,n_neighbors=43):
    
    '''
    The function is to fit the training data on an object of KNeighborsClassifier. This gets the model ready.
    
    Argument :- 
    datatotransform_df :- training data.
    y_train            :- Y data of training
    n_neighbors        :- Number of neighbors passed to KNeighborsClassifier. Defaults to 43.
    
    Return :- 
    clf :- Fitted object of Knn
    
    '''
        
    clf=KNeighborsClassifier(n_neighbors=n_neighbors)
    clf.fit(datatotransform_df,y_train)
    
    return clf

In [21]:
def pred_accuracy(clf_pred,test_transform_df,y_test):
    '''
    Print the test-set predictions and the test-set accuracy of a fitted model.

    The predictions come from clf_pred.predict and the accuracy from
    clf_pred.score; the accuracy is printed with two decimals. Nothing is
    returned.

    Argument :-
    clf_pred          :- Knn object.
    test_transform_df :- Test data.
    y_test            :- Y data of test.
    '''
    predictions = clf_pred.predict(test_transform_df)
    print("Test set prediction:{}".format(predictions))

    accuracy = clf_pred.score(test_transform_df, y_test)
    print("Test set accuracy:{:.2f}".format(accuracy))
    

In [None]:
# Load the Adult data set into a data frame.
df = read_data()

In [None]:
# Separate the frame into input (X) and output (Y) data.
xdata, ydata = preprocess_data(df)

In [None]:
# Divide the X data into a categorical frame and a continuous frame.
xcat_data, xcont_data = cat_cont_data_split(xdata)

In [None]:
# One-hot encode the categorical data; the fitted encoder is kept as well.
xcat_ohe, oe_obj = onehotencode(xcat_data)

In [None]:
# Join the encoded categorical data with the continuous data.
concat_cat_cont_df = concat_cat_cont(xcat_ohe, xcont_data)

In [None]:
# Split the combined data and its labels into training and test sets.
xtrain, xtest, ytrain, ytest = train_test(concat_cat_cont_df, ydata)

In [None]:
# Standardize the training data to Zscore values, using a scaler fit on the training data itself.
datatransform_df = scale_data(xtrain, xtrain)

In [None]:
# Standardize the test data to Zscore values, using a scaler fit on the training data.
test_transform_df = scale_data(xtrain, xtest)

In [None]:
# Fit the k-nearest neighbors model on the standardized training data.
clf_pred = fit_data(datatransform_df, ytrain)

In [22]:
# Predict every row of the test data with the model and print the accuracy against the actual Y data of the test set.
pred_accuracy(clf_pred, test_transform_df, ytest)

Test set prediction:[' <=50K' ' <=50K' ' <=50K' ... ' >50K' ' <=50K' ' <=50K']
Test set accuracy:0.83
