In [None]:
'''
The code is to handle categorical and continuous data of the Adult dataset using K-nearest Neighbors. Both types of data require
different ways of handling when used with this classifier. The continuous data is transformed into z-score values. The categorical
data is transformed using the scikit-learn LabelEncoder. The distance metric also differs between the two types of data:
for continuous data we use Euclidean distance and for categorical data we use Hamming distance.
The Euclidean distance is the square root of the sum of the squared differences between the coordinates of two points
(SQRT((X1-X2)**2 + (Y1-Y2)**2)), whereas the Hamming distance between two strings of equal length is the number of
positions at which the corresponding symbols are different.
Finally, the two distances are added and the k nearest training rows are selected (k-nearest is implemented manually in this
code, as the 3 smallest distances); the mode of their labels is the final prediction. The accuracy score is calculated from
the predicted and actual Y data.

Details of the above are given below.

Data is read into a pandas data frame. The columns having '?' are replaced with the mode of those columns. The data is then 
split into X(input) and Y(output/class).This X and Y are used to train-test split the data into training and testing data with 
their respective Y data. The training and testing data are separated respectively into a continuous and categorical data frame.

For continuous data, the std and mean of every column of the training data are calculated and stored in a dictionary. These
values are used to transform the training data columns to z-score values. The continuous columns of the testing data are
transformed using the std and mean of the training data.

For categorical data, a LabelEncoder object is fitted on each column to obtain the labels of that column.
The training data is then transformed using these labels. The categorical data of the testing data is also transformed
with the same labels.
If the test data contains labels that do not occur in the training data, the encoder should instead be fitted on the
unique values of the columns of the complete data set.

Once both training and test data are transformed, we calculate the distance of every row of testing data to all rows of training
data, i.e. in the output each row corresponds to one row of the test data and each column to one row of the training data.

For continuous data, we use the Euclidean Distance metric and for categorical data,
we use the Hamming Distance metric.
The two sets of distances are stored in 2 different numpy arrays. We need 1 value to predict, therefore we add the continuous
and categorical distance arrays element-wise. The new array is then iterated row-wise. Each row is converted into a series with
index value = Y train index. From this series, we find the k-nearest. From the k-nearest, we find the mode. The corresponding
mode location in the series and Y-train are matched. The label at this matching index in Y train is the prediction for that row.
In the same manner, we get a prediction for every row of the test data.
The final populated list of test predictions and the actual Y data are used to calculate the accuracy score.


'''

In [2]:
import pandas as pd
import numpy as np
import time
from IPython.display import display

from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics.pairwise import euclidean_distances
from sklearn import preprocessing
from scipy.spatial import distance
from sklearn.neighbors import DistanceMetric
from sklearn.preprocessing import LabelEncoder

# Lift pandas' column display limit so wide frames are shown with every column.
pd.set_option('display.max_columns', None)



In [3]:
def read_data(source='http://mlr.cs.umass.edu/ml/machine-learning-databases/adult/adult.data'):
    
    '''
    This function reads the Adult data set with pandas read_csv and returns it as a dataframe.
    The file has no header row, so the 15 column names are supplied here.
    Missing entries are stored as the string ' ?' (note the leading space); every such entry is
    replaced with the most frequent non-missing value of its column.
    
    Argument :-
    source :- Path, URL or file-like object accepted by pandas.read_csv. Defaults to the
              location the notebook has always downloaded from.
    
    Return :- 
    data :- Dataframe of adult data set
    
    '''
    
    missing_marker=' ?'
    column_names=['Age','Workclass','FNLWGT','Education','Education-Num','Marital Status','Occupation',
                  'Relationship','Race','Sex','Capital Gain','Caplital Loss','Hrs-Per-Week','Native-Country','Sal']
    
    data=pd.read_csv(source,names=column_names)
    
    for col in data.columns:
        is_missing=data[col]==missing_marker
        if not is_missing.any():
            continue
        
        # The mode is taken over observed values only; otherwise a column whose most frequent
        # entry is the placeholder itself would be "imputed" with the placeholder.
        observed=data.loc[~is_missing,col]
        if observed.empty:
            # Nothing to impute from; leave the column as it is.
            continue
        
        # Assign the result back instead of using replace(inplace=True) on data[col]: the in-place
        # form acts on a temporary object and is not guaranteed to update the dataframe.
        data[col]=data[col].where(~is_missing,observed.mode()[0])

    return data


def split_data(df,test_size=0.3,random_state=0):
    
    '''
    The function splits the data into the Input (X) and Output (Y) dataset, where Y is the last column
    and X is every column before it. X and Y are then train-test split into
    x_train,y_train,x_test,y_test data.
    
    Argument :-
    df :- Adult data frame
    test_size :- Share of the rows held out for testing, passed to train_test_split (default 0.3).
    random_state :- Seed passed to train_test_split so that the split is reproducible (default 0).
    
    Return :- 
    xtrain:-training data
    xtest :- Validation data
    ytrain :- Label data of training data
    ytest :- label data of validation data
    cat_data :- Categorical columns of complete data set before split.

    
    '''
    categorical_columns=['Workclass','Education','Marital Status','Occupation','Relationship','Race','Sex','Native-Country']
    
    # Categorical columns of the complete data set. They are kept so that the label encoder can be
    # fitted on every value, including values that only occur in the test split.
    cat_data=df[categorical_columns]
    
    # The class label is the last column; everything before it is an input feature.
    x=df.iloc[:,:-1]
    y=df.iloc[:,-1]
    
    xtrain,xtest,ytrain,ytest=train_test_split(x,y,test_size=test_size,random_state=random_state)
    
    return xtrain,xtest,ytrain,ytest,cat_data


def process_data(x_train,x_test):
    
    '''
    The function separates train and test data into categorical and continuous data respectively.
    Each returned frame is an independent copy, so later column assignments on it neither touch
    x_train/x_test nor raise pandas' SettingWithCopy warning.
    
    Argument:-
    x_train :- Training data.
    x_test  :- Testing data
 
    
    Return :-
    xtrain_cat  :- Categorical data of training data set
    xtrain_cont :- Continuous data of training data set.
    xtest_cat :-  Categorical data of test data.
    xtest_cont :- Continuous data of test data.
    '''
    
    # Column names exactly as assigned when the file is read ('Caplital Loss' is spelled that way there).
    categorical_columns=['Workclass','Education','Marital Status','Occupation','Relationship','Race','Sex','Native-Country']
    continuous_columns=['Age','FNLWGT','Education-Num','Capital Gain','Caplital Loss','Hrs-Per-Week']
    
    # Training data is separated into categorical and continuous data
    xtrain_cat=x_train[categorical_columns].copy()
    xtrain_cont=x_train[continuous_columns].copy()
    
    # Testing data is separated into categorical and continuous data
    xtest_cat=x_test[categorical_columns].copy()
    xtest_cont=x_test[continuous_columns].copy()
    
    return xtrain_cat,xtrain_cont,xtest_cat,xtest_cont



def cont_cal_stdmean(data):
    
    '''
    This function collects the mean and the standard deviation of every column of the given data.
    
    Argument :-
    data :- training data set (continuous columns)
    
    Return :-
    stdmean :- Dictionary with two entries per column: 'Mean'+column name and 'Std'+column name.
    '''
    
    column_stats={}
    
    for name,values in data.items():
        # Same insertion order as before: mean first, then std, column by column.
        column_stats['Mean'+name]=values.mean()
        column_stats['Std'+name]=values.std()
    
    return column_stats


def cont_transform_data(datatotransform,std_mean):
    
    '''
        This function converts every column of the given data to z-score values, (value - mean) / std,
        using the dictionary of std and mean for every column. It is used for both training and test
        data; testing data are transformed with the std and mean of the training data.
        
        The input frame is left untouched and a transformed copy is returned, so running the
        transformation again on the original data gives the same result instead of standardising twice.
        
        Argument:- 
        datatotransform=training data/testing data
        std_mean=Std and mean of every column of training data, keyed 'Mean'+column and 'Std'+column.
    
        Return:
        transformed=Transformed copy of the training/test data.
    
       
    '''
    
    transformed=datatotransform.copy()
    
    for col in transformed.columns:
        col_mean=std_mean['Mean'+col]
        col_std=std_mean['Std'+col]
        
        # A zero or undefined std (constant or single-row column) would turn the whole column
        # into inf/NaN; such a column is only centred, i.e. scaled by 1.
        if pd.isna(col_std) or col_std==0:
            col_std=1.0
        
        transformed[col]=(transformed[col]-col_mean)/col_std
    
    return transformed



def cont_distance_cal(xcont_test,xcont_train):
    
    '''
    Euclidean distance between every row of the test data and every row of the training data.
    
    Argument:-
    xcont_test :- Continuous data of test data set
    xcont_train :- Continuous data of training data set
    
    Return :-
    Array of distances with one row per test row and one column per training row.
    '''
    
    # Test data goes first so that the rows of the result follow the test rows.
    return euclidean_distances(xcont_test,xcont_train)



def cat_label_encode(cato_data,datatotransform):
    
    '''
    The function converts the categorical data into integer format. For every column of cato_data a
    LabelEncoder is fitted on the unique values of that column, and the same column of
    datatotransform is replaced by the resulting integer labels. Because the encoder is fitted on
    cato_data, training and test data passed with the same cato_data receive the same labels.
    
    The input frame is left untouched and an encoded copy is returned, so the function can be run
    again on the original data without failing on values that are already encoded.
    
    
    Argument:- 
    cato_data=complete categorical dataset
    datatotransform=training/test data
    
    Return :-
    encoded :- Transformed copy of the training/test data
    
    '''
    
    encoded=datatotransform.copy()
    
    for col in cato_data.columns:
        
        # A fresh encoder per column keeps the fitted classes of one column from leaking into the next.
        encoder=preprocessing.LabelEncoder()
        encoder.fit(cato_data[col].unique())
        encoded[col]=encoder.transform(datatotransform[col])
        
    return encoded


    
    

def cat_hamming_dist(cat_train,cat_test):
    
    '''
    The function calculates the Hamming distance between training and testing categorical data.
    The distance of every test row to every training row is computed with scipy's cdist using the
    'hamming' metric, which gives the proportion of columns in which the two rows differ
    (a value between 0 and 1).
    
    Note that the test data is the second argument but forms the rows of the result.
    
    Argument :-
    cat_train :- Categorical data of training data set (label encoded).
    cat_test :- Categorical data of testing data set (label encoded).
    
    Return :-
    dist_arr :- Array of distance. For each row of testing, there are corresponding all rows of training as columns.
    '''
    
    # cdist comes from the scipy.spatial.distance module already imported at the top of the notebook;
    # this avoids the DistanceMetric class, whose import location in sklearn.neighbors is deprecated.
    dist_arr=distance.cdist(cat_test,cat_train,metric='hamming')
    
    return dist_arr




def get_predict(total_distance,y_train,k=3):
    
    '''
    Predict the class of one test row from its distances to all training rows.
    The k smallest distances are located, the labels of the corresponding training rows are looked up
    in y_train by position, and the mode of those labels is returned. If several labels are equally
    frequent, the first one in sorted order is returned.
    
    Argument :-
    total_distance :- One row of added categorical and continuous distances; entry i is the distance
                      to the i-th row of the training data.
    y_train     :- Y data of the training set, in the same row order as the distances.
    k           :- Number of nearest neighbours that vote (default 3).
    
    Return :-
    ypred :- Prediction for the row.
    
    '''
    
    if k<1:
        raise ValueError('k must be a positive integer')
    
    # A default integer index makes the index of the k smallest entries equal to their positions.
    nearest=pd.Series(np.asarray(total_distance)).nsmallest(k)
    
    # Positional lookup gives exactly one vote per neighbour even if the index of y_train
    # contains repeated labels.
    votes=y_train.iloc[nearest.index.to_numpy()]
    ypred=votes.mode().values.item(0)
    
    return ypred



def pred_accuracy(y_pred_final,y_test):
    
    '''
    Print the accuracy score, i.e. the percentage of rows whose predicted label equals the actual label.
    
    Argument :- 
    y_pred_final :- Predictions for every row of validation data.
    y_test       :- Actual class for every row of validation data.
    
    Nothing is returned; the score is only printed.
    '''
    
    # Element-wise comparison gives one boolean per row; summing counts the correct predictions.
    correct=sum(y_pred_final==y_test)
    total=y_test.size
    
    print((correct*100)/total)


In [4]:
# Read the Adult data into a pandas data frame; read_data also replaces the ' ?' entries with the column mode.
df=read_data()

In [5]:
# Train-test split the data. The categorical columns of the complete data set are returned as well,
# for fitting the label encoder later on.
x_train,x_test,y_train,y_test,catogorical_data=split_data(df)

In [6]:
# Separate the training and the testing data into their categorical and continuous columns.
xcat_train,xcont_train,xcat_test,xcont_test=process_data(x_train,x_test)


In [7]:
# Calculate std and mean of every continuous column of the training data.
xtrain_cont_stdmean=cont_cal_stdmean(xcont_train)

In [None]:
# Transform the continuous training data to z-score values.
xcont_train=cont_transform_data(xcont_train,xtrain_cont_stdmean)

In [None]:
# Transform the continuous testing data to z-score values using the std and mean of the training data.
xcont_test=cont_transform_data(xcont_test,xtrain_cont_stdmean)

In [10]:
# Calculate the Euclidean distance between the transformed continuous test data and training data
# (rows follow the test data, columns follow the training data).
cont_dist_arr=cont_distance_cal(xcont_test,xcont_train)

In [None]:
# Transform the categorical training data to integer labels using LabelEncoder,
# fitted on the categorical columns of the complete data set.
xcat_train=cat_label_encode(catogorical_data,xcat_train)

In [None]:
# Transform the categorical test data to integer labels using LabelEncoder,
# fitted on the same complete data set so the labels match those of the training data.
xcat_test=cat_label_encode(catogorical_data,xcat_test)

In [13]:
# Calculate the Hamming distance between the transformed categorical test and training data.
# Note the argument order: training data first, test data second.
cat_dist_arr=cat_hamming_dist(xcat_train,xcat_test)

In [None]:
# Add the continuous and categorical distances element-wise. Both arrays have one row per test
# sample and one column per training sample, so a single array addition is enough.
total_distance=np.add(cat_dist_arr,cont_dist_arr)

# Pass each row of the numpy array to get_predict to predict a class for every row of test data.
# A plain loop is used instead of np.apply_along_axis: that function sizes its output array from
# the first prediction, so a short first label (' >50K') would silently truncate every longer
# label (' <=50K') that follows and make those predictions count as wrong.
y_pred=[get_predict(distance_row,y_train) for distance_row in total_distance]

# Convert the final predictions of the test data into a series object aligned with the test index.
y_pred_final=pd.Series(y_pred,index=x_test.index.values)



In [17]:
# Calculate and print the accuracy score (percentage of test rows predicted correctly).
pred_accuracy(y_pred_final,y_test)

81.5743678984543
