# In [268]:
import numpy as np
import bottleneck
import pandas as pd
from pandas import DataFrame, read_csv
from bokeh.models import ColumnDataSource, Select
from bokeh.layouts import widgetbox, row, column, layout
from bokeh.plotting import figure
from bokeh.io import output_notebook, show, output_file, curdoc
from bokeh.plotting import figure
from sklearn import cluster, datasets, mixture
from sklearn.cluster import KMeans,DBSCAN,Birch
from sklearn.neighbors import kneighbors_graph
from sklearn.preprocessing import StandardScaler
from itertools import cycle, islice
from sklearn.neighbors import KNeighborsClassifier,KNeighborsRegressor
from sklearn.linear_model import Lasso, Ridge
from sklearn.cluster import KMeans,DBSCAN,Birch

def label_encoding(df,df2):
    """Label-encode the object-dtype columns of ``df`` into ``df2`` in place.

    Every column of ``df`` whose dtype is ``object`` is converted to a pandas
    categorical and its integer category codes are written to the column of
    the same name in ``df2``. Other columns of ``df2`` are left untouched and
    ``df`` itself is not modified.

    Parameters
    ----------
    df : pandas.DataFrame
        Source frame holding the raw (possibly string-valued) columns.
    df2 : pandas.DataFrame
        Frame that receives the encoded columns; mutated in place.

    Returns
    -------
    None
    """
    categorical_columns = df.select_dtypes(include=['object']).columns
    for column in categorical_columns:
        # .cat.codes is the position of each value in the column's category
        # list; pandas uses -1 for missing values.
        df2[column] = df[column].astype('category').cat.codes

def KMeansclustering(df2,features_selected,clusters):
    """Fit a KMeans estimator on the selected columns of ``df2`` and return it.

    Parameters
    ----------
    df2 : pandas.DataFrame
        Frame holding the feature columns.
    features_selected : column selector accepted by ``df2[...]``
        Columns to cluster on.
    clusters : int
        Passed to ``KMeans`` as ``n_clusters``.

    Returns
    -------
    sklearn.cluster.KMeans
        The fitted estimator (constructed with ``random_state=0``).
    """
    training_view = df2[features_selected]
    estimator = KMeans(random_state=0, n_clusters=clusters)
    # The predicted labels are not returned; only the estimator is handed back.
    estimator.fit_predict(training_view)
    return estimator

def KNN_predictor(x_train,y_train,neighbours):
    """Build a k-nearest-neighbours classifier and fit it on the training data.

    Parameters
    ----------
    x_train : training features, passed to ``fit``.
    y_train : training targets, passed to ``fit``.
    neighbours : int
        Passed to ``KNeighborsClassifier`` as ``n_neighbors``.

    Returns
    -------
    The object returned by ``KNeighborsClassifier.fit``.
    """
    return KNeighborsClassifier(n_neighbors=neighbours).fit(x_train, y_train)

def get_top_k_corr_attributes(arr,k):
    """Return the indices of the ``k + 1`` largest-magnitude entries of ``arr``.

    The ranking uses the absolute value, so strong negative correlations rank
    as high as strong positive ones. (The previous implementation computed the
    absolute values and then discarded them.)

    Parameters
    ----------
    arr : array-like
        Correlation values. Either a 1-D sequence or a 2-D object such as a
        one-row slice of a correlation matrix; for 2-D input only the first
        row is ranked.
    k : int
        ``k + 1`` indices are returned (as before), or fewer if ``arr`` has
        fewer entries.

    Returns
    -------
    numpy.ndarray
        Integer positions ordered from largest to smallest absolute value.
        NaN entries rank last; ties keep their original order.
    """
    scores = np.abs(np.asarray(arr, dtype=float))
    # Accept both a flat vector and a (1, n) matrix slice.
    scores = np.atleast_2d(scores)[0]
    # A stable sort of the negated scores gives a deterministic descending
    # order; numpy places NaN at the end.
    ranked = np.argsort(-scores, kind='stable')
    return ranked[:k + 1]

# Load the raw survey data; read_csv without index_col gives a default 0..n-1 row index.
df = pd.read_csv('nutrition_raw_anonymized_data.csv')
print (df.shape)
# df2 is the working copy that receives the encoded columns; df keeps the raw values.
df2 = df.copy()
# Replace every object-dtype (categorical) column in df2 with integer category codes.
label_encoding(df,df2)

# Names of all columns, in file order.
attribute_list= (list(df2.columns))


# Pairwise correlation matrix of the encoded columns.
corr_mat = df2.corr() 
# corr_mat[1:2] is the second row of the matrix as a one-row frame.
# NOTE(review): this is treated as the 'cancer' row, which assumes 'cancer' is
# the second column of the correlation matrix - confirm against the CSV.
n_top_attributes = 10
# Indices of the columns most correlated with that row; the helper returns
# n_top_attributes + 1 indices.
corr_attr_list = get_top_k_corr_attributes(corr_mat[1:2],n_top_attributes)
features_index = corr_attr_list
# Map the selected positions back to column names (a pandas Index).
features_list = df2.columns[features_index]

# Build the training set. .loc slicing is label-based and inclusive, so with
# the default index rows 0..40 (41 rows) are used for training.
db = df2[features_list]
x_train = db.loc[:40,:]
# Target is the second column of the file (attribute_list[1]).
y_train = df2.loc[:40,attribute_list[1]]
print (x_train.shape,y_train.shape)
# Fit a 3-nearest-neighbours classifier on the training rows.
clf = KNN_predictor(x_train,y_train,3)
# Predict for every row of the dataset, including the rows used for training.
x_test =   db.loc[:,:]
y_hat = clf.predict(x_test)
y_hat = list(y_hat)

# Scatter plot comparing actual and predicted labels, drawn on the second and
# third selected features (features_list[1] vs features_list[2]).
x1=df2[features_list[1]]
y1=df2[features_list[2]]
a=figure(plot_height=500, plot_width=700, title='KNN for Cancer')

# Actual labels: split the points by the value of the 'cancer' column (1 = yes, 0 = no).
actual_data_yes = ColumnDataSource(data=dict(x=x1.loc[df2['cancer']==1],
                                             y=y1.loc[df2['cancer']==1]))
actual_data_no = ColumnDataSource(data=dict(x=x1.loc[df2['cancer']==0],
                                             y=y1.loc[df2['cancer']==0]))
# Large, semi-transparent markers for the actual labels.
a.square('x','y',fill_color='green',line_color='green',source=actual_data_no,legend='Actual Not cancer',
         alpha=0.4,size=10)
a.circle('x','y',fill_color='firebrick',line_color='firebrick',source=actual_data_yes, legend='Actual Yes cancer',
         alpha=0.4,size=10)

# Positions in y_hat predicted as 1 (yes) and 0 (no). They are used below as
# .loc labels, which matches row positions only while df2 keeps its default index.
indices_yes = [i for i, x in enumerate(y_hat) if x == 1]
indices_no = [i for i, x in enumerate(y_hat) if x == 0]

predict_data_yes = ColumnDataSource(data=dict(x=x1.loc[indices_yes],
                                             y=y1.loc[indices_yes]))
predict_data_no = ColumnDataSource(data=dict(x=x1.loc[indices_no],
                                             y=y1.loc[indices_no]))
# Small, opaque markers for the predictions, drawn on top of the actual labels.
a.diamond('x','y',fill_color='blue',line_color='blue',source=predict_data_no,legend='Predicted Not cancer',
          fill_alpha=1,size=5)
a.triangle('x','y',fill_color='firebrick',line_color='firebrick',source=predict_data_yes, legend='Predicted Yes cancer', 
          fill_alpha=1,size=5)

a.xaxis.axis_label = features_list[1]
a.yaxis.axis_label = features_list[2]
show(a)



def get_colors(model):
    """Return one colour name per entry of ``model.labels_``.

    Label 0 maps to 'red' and label 1 to 'blue', as before. Further labels
    take the next colours of a fixed palette (wrapping around) and negative
    labels map to 'gray'. The previous version silently skipped every label
    other than 0 and 1, so the result was shorter than the data whenever
    more than two clusters were used.

    Parameters
    ----------
    model : object with a ``labels_`` iterable of integer cluster labels.

    Returns
    -------
    list of str
        Colour names, the same length as ``model.labels_``.
    """
    palette = ['red', 'blue', 'green', 'orange', 'purple',
               'brown', 'pink', 'olive', 'cyan']
    negative_label_color = 'gray'
    colors = []
    for label in model.labels_:
        label = int(label)
        if label < 0:
            colors.append(negative_label_color)
        else:
            colors.append(palette[label % len(palette)])
    return colors

def _stack_bar(df2, attr1, target, title):
    """Draw, show and return a stacked yes/no bar chart of ``target`` per ``attr1`` value.

    Parameters
    ----------
    df2 : pandas.DataFrame
        Encoded data; must contain the columns ``attr1`` and ``target``.
    attr1 : str
        Column whose distinct values become the bars.
    target : str
        Column counted within each bar: rows equal to 1 go to the 'yes'
        stack, rows equal to 0 to the 'no' stack.
    title : str
        Figure title.

    Returns
    -------
    The bokeh figure, after it has been passed to ``show``.
    """
    categories = []
    yes_counts = []
    no_counts = []
    # Group by the scalar column name, not a one-element list: pandas >= 2.0
    # yields 1-tuples as group names for list keys, which are not usable as
    # an x_range.
    for name, group in df2.groupby(attr1):
        categories.append(name)
        yes_counts.append(int((group[target] == 1).sum()))
        no_counts.append(int((group[target] == 0).sum()))

    counts = {attr1: categories,
              'yes': yes_counts,
              'no': no_counts}

    plot = figure(x_range=categories, plot_height=250, plot_width=300, title=title)
    # The legend labels carry a trailing space, as in the original code -
    # presumably so bokeh does not read them as the 'yes'/'no' column names.
    plot.vbar_stack(['yes', 'no'], x=attr1, width=0.5, color=['red', 'blue'],
                    source=ColumnDataSource(counts),
                    legend=['yes ', 'no '])
    plot.ygrid.grid_line_color = None
    plot.xaxis.axis_label = attr1 + '  (0-No\n 1-Yes)'
    plot.legend.location = 'top_center'
    show(plot)
    return plot


def stack_bar_cancer(df2,attr1):
    """Stacked bar chart of 'cancer' yes/no counts for each value of ``attr1``."""
    return _stack_bar(df2, attr1, 'cancer', "Cancer")


def stack_bar_diabetes(df2,attr1):
    """Stacked bar chart of 'diabetes' yes/no counts for each value of ``attr1``."""
    return _stack_bar(df2, attr1, 'diabetes', "Diabetes")


def stack_bar_heart_disease(df2,attr1):
    """Stacked bar chart of 'heart_disease' yes/no counts for each value of ``attr1``."""
    return _stack_bar(df2, attr1, 'heart_disease', "Heart Disease")
    


# Initial stacked bar charts; each call also shows its figure immediately.
r=stack_bar_cancer(df2,'dog')
q=stack_bar_diabetes(df2,'cat')
s=stack_bar_heart_disease(df2,'diabetes')

# KMeans clustering with two clusters on the correlation-selected features.
model = KMeansclustering(df2,features_list,2)
# One colour name per row, derived from the cluster labels.
colorKMeans = get_colors(model)
# Scatter of the second vs third column of the file, coloured by cluster.
# actual_data1 is kept at module level so the select callbacks can swap its data.
p=figure(plot_height=200, plot_width=500, title='KMeans')
actual_data1 = ColumnDataSource(data=dict(x=df2.loc[:,attribute_list[1]],y=df2.loc[:,attribute_list[2]], colors=colorKMeans))
p.circle('x','y',fill_color='colors',line_color='colors',source=actual_data1)
p.xaxis.axis_label = attribute_list[1]
p.yaxis.axis_label = attribute_list[2]

# Widgets
# Attribute shown on the x axis of the three stacked bar charts; only the
# originally categorical (object-dtype) columns are offered.
# NOTE(review): the initial value 'Cancer' does not match the lowercase
# 'cancer' column used elsewhere - confirm it is actually one of the options.
x_attribute_select_stackedbars = Select(value='Cancer',
                          title='Select X attribute:',
                          width=200,
                          options=list(df.select_dtypes(include=['object']).columns))

# X / Y attribute of the KMeans scatter plot p; every column is offered.
# NOTE(review): the initial values 'Cancer' / 'Heart Disease' differ from the
# axes p is first drawn with (attribute_list[1] / attribute_list[2]) - verify.
x_attribute_select = Select(value='Cancer',
                          title='Select X attribute:',
                          width=200,
                          options=attribute_list)

y_attribute_select = Select(value='Heart Disease',
                          title='Select Y attribute:',
                          width=200,
                          options=attribute_list)

# X / Y attribute of the KNN plot a; only the correlation-selected features are offered.
# NOTE(review): a is first drawn with features_list[1] / features_list[2],
# while these widgets start at features_list[0] / features_list[1] - verify.
x_attribute_predictor = Select(value=features_list[0],
                          title='Select X attribute:',
                          width=200,
                          options=list(features_list))
y_attribute_predictor = Select(value=features_list[1],
                          title='Select Y attribute:',
                          width=200,
                          options=list(features_list))



#callbacks
def update_x_attribute_stackedbars(attrname,old,new):
    """Rebuild the three stacked bar charts for the newly selected attribute.

    Bokeh ``on_change`` callback for ``x_attribute_select_stackedbars``. The
    charts are regenerated and the document is rebuilt with the same layout
    structure as the initial page. The previous version added a second root
    on every change, so old charts accumulated on the page, and its layout
    left out the KNN plot and its selectors.

    Parameters
    ----------
    attrname, old, new :
        Standard bokeh callback arguments; the selected value is read from
        the widget itself.
    """
    attr_new = x_attribute_select_stackedbars.value
    print (attr_new)

    r=stack_bar_cancer(df2,attr_new)
    q=stack_bar_diabetes(df2,attr_new)
    s=stack_bar_heart_disease(df2,attr_new)
    l=layout([
    [row(r,q)],
    [row(s)],
    [row(x_attribute_select_stackedbars)],
    [row(p)],
    [row(x_attribute_select)],[row(y_attribute_select)],
    [row(a)],
    [row(x_attribute_predictor)],[row(y_attribute_predictor)]
         ])
    # Replace the existing roots instead of appending another copy of the page.
    doc = curdoc()
    doc.clear()
    doc.add_root(l)
    
def _refresh_predictor_sources():
    """Push the currently selected x/y columns into the KNN plot's data sources.

    Updates the four module-level ColumnDataSources behind the glyphs of plot
    ``a`` in place, the same way ``update_x_attribute`` updates
    ``actual_data1``. The previous callbacks instead added four new glyph
    renderers on every change, so stale points and duplicate legend entries
    piled up on the plot.
    """
    x1 = df2[x_attribute_predictor.value]
    y1 = df2[y_attribute_predictor.value]
    is_cancer = df2['cancer'] == 1
    is_not_cancer = df2['cancer'] == 0

    actual_data_yes.data = dict(x=x1.loc[is_cancer], y=y1.loc[is_cancer])
    actual_data_no.data = dict(x=x1.loc[is_not_cancer], y=y1.loc[is_not_cancer])
    # indices_yes / indices_no are the row positions predicted as 1 / 0 by the KNN model.
    predict_data_yes.data = dict(x=x1.loc[indices_yes], y=y1.loc[indices_yes])
    predict_data_no.data = dict(x=x1.loc[indices_no], y=y1.loc[indices_no])


def update_x_attribute_predictor(attrname,old,new):
    """Bokeh callback: the x attribute of the KNN plot changed."""
    a.xaxis.axis_label = x_attribute_predictor.value
    _refresh_predictor_sources()


def update_y_attribute_predictor(attrname,old,new):
    """Bokeh callback: the y attribute of the KNN plot changed."""
    a.yaxis.axis_label = y_attribute_predictor.value
    _refresh_predictor_sources()

    
def update_x_attribute(attrname,old,new):
    """Bokeh callback: the x attribute of the KMeans scatter plot changed.

    Reads both select widgets, relabels the x axis of ``p`` and replaces the
    data of ``actual_data1`` with the chosen columns and the cluster colours.
    """
    x_values = df2.loc[:, x_attribute_select.value]
    y_values = df2.loc[:, y_attribute_select.value]
    p.xaxis.axis_label = x_attribute_select.value
    actual_data1.data = {
        'x': x_values,
        'y': y_values,
        'colors': colorKMeans,
    }

def update_y_attribute(attrname,old,new):
    """Bokeh callback: the y attribute of the KMeans scatter plot changed.

    Reads both select widgets, relabels the y axis of ``p`` and replaces the
    data of ``actual_data1`` with the chosen columns and the cluster colours.
    """
    x_values = df2.loc[:, x_attribute_select.value]
    y_values = df2.loc[:, y_attribute_select.value]
    p.yaxis.axis_label = y_attribute_select.value
    actual_data1.data = {
        'x': x_values,
        'y': y_values,
        'colors': colorKMeans,
    }
    
# Wire each select widget's 'value' property to its callback.
x_attribute_select.on_change('value', update_x_attribute)
y_attribute_select.on_change('value', update_y_attribute)
x_attribute_select_stackedbars.on_change('value',update_x_attribute_stackedbars)
x_attribute_predictor.on_change('value',update_x_attribute_predictor)
y_attribute_predictor.on_change('value',update_y_attribute_predictor)

# Page layout, top to bottom: stacked bars and their selector, the KMeans
# scatter and its two selectors, then the KNN plot and its two selectors.
l=layout([
    [row(r,q)],
    [row(s)],
    [row(x_attribute_select_stackedbars)],
    [row(p)],
    [row(x_attribute_select)],[row(y_attribute_select)],
    [row(a)],
    [row(x_attribute_predictor)],[row(y_attribute_predictor)]
         ])
# Register the layout as a root of the current bokeh document.
curdoc().add_root(l)


# Output:
# (54, 1093)
# (41, 7) (41,)
