In [103]:
import numpy as np

#Using Bokeh 0.12.5 and Python 3.6
from bokeh.io import output_notebook, show
from bokeh.models import Select,CustomJS
import warnings
warnings.filterwarnings('ignore')
from bokeh.layouts import row,column
from bokeh.models import HoverTool
from bokeh.plotting import figure, ColumnDataSource
output_notebook()

In [104]:
# Load the complete CSV as text: row 0 becomes the header row, every later row is data.
data_orig = np.genfromtxt(
    'Data/nutrition_raw_anonymized_data.csv',
    delimiter=",",
    dtype=str,
)
headers, data = data_orig[0, :], data_orig[1:, :]
print(data.shape)

(54, 1093)


## Data Preprocessing
* Column 1 is ID Number for the anonymous patient
* Columns 2,3,4,5 are Cancer,Diabetes,Heart Disease and Belly (Overweight).
* All other columns are patient food or lifestyle habits. I aim to establish correlations between these feature columns and the medical conditions.
* Many feature columns exist as Food Frequency and Food Quantity pairs. I assume that it is frequency of having that food per week and the quantity is the quantity taken per meal. Hence, frequency*quantity would give total units of the food consumed per week.
* I would preprocess the data to convert every "Yes" or "Outie" to 1 (condition exists) and everything else to 0. An "Outie" belly is treated as an indicator of obesity.
* I would also convert the data to standard normal form for good correlation results.

In [105]:
#Converting the medical condition columns into binary numpy array
# Columns 1-4 hold the four medical conditions; "Yes" / "Outie" mark the condition as present.
# The builtin int/float are used as dtypes because the np.int / np.float aliases
# were removed in NumPy 1.24 and raise AttributeError there.
condition_cells = data[:, 1:5]
med_condition = ((condition_cells == "Yes") | (condition_cells == "Outie")).astype(int)

med_condition_name = headers[1:5]


def _cell_to_float(cell):
    """Return 1.0 for "Yes", 0.0 for "No", otherwise the cell parsed as a float.

    Raises ValueError when the cell is neither of the two words nor a number.
    """
    if cell == "Yes":
        return 1.0
    if cell == "No":
        return 0.0
    return float(cell)


def _standardize(matrix):
    """Column-wise z-score of a 2-D array.

    Columns with zero standard deviation are divided by 1 instead of 0, so a
    constant column becomes all zeros rather than NaN.
    """
    centered = matrix - np.average(matrix, axis=0)
    spread = np.std(matrix, axis=0)
    return centered / np.where(spread == 0, 1.0, spread)


#Converting feature columns into numbers, multiplying Frequency and Quantity into one column
n_rows, n_cols = data.shape
feature_columns = []   # one list of floats per output feature, stacked once after the loop
feature_names = []
curr_index = 5         # columns 0-4 are the ID and the four conditions

while curr_index < n_cols:
    header = headers[curr_index]
    freq_at = header.find("FREQ")
    # Guard the look-ahead: the last column has no successor to inspect.
    quan_at = headers[curr_index + 1].find("QUAN") if curr_index + 1 < n_cols else -1

    if freq_at != -1 and freq_at == quan_at:
        # A "...FREQ..." column directly followed by the matching "...QUAN..." column
        # (keyword at the same offset): merge both into one frequency*quantity feature.
        column = [
            float(freq) * float(quan)
            for freq, quan in zip(data[:, curr_index], data[:, curr_index + 1])
        ]
        feature_names.append(header.replace("FREQ", "TOTAL"))
        curr_index += 2
    else:
        column = [_cell_to_float(cell) for cell in data[:, curr_index]]
        feature_names.append(header)
        curr_index += 1

    feature_columns.append(column)

# Build the (rows x features) matrix in one step instead of concatenating per column.
if feature_columns:
    features = np.column_stack(feature_columns).astype(float)
else:
    features = np.zeros((n_rows, 0), dtype=float)

#Converting into standard normal form
med_condition_norm = _standardize(med_condition)
features_norm = _standardize(features)



### Activity 1 : Finding Pearson Correlation Coefficients and p-values
#### Finding top 10 features that prevent and cause Cancer, Diabetes, Heart Disease and Obesity
* I would evaluate Pearson Correlation Coefficient and p-values for the 4 medical conditions wrt all the food habits. 
* Correlation coefficients range from -1 to 1. 0 denotes no correlation, while 1 and -1 denote full correlation with an increase-with-increase or a decrease-with-increase relation respectively.
* The p-value is the probability of observing a correlation at least as strong as the measured one when the Null Hypothesis (no correlation) is actually true; a small p-value means the correlation is unlikely to be a chance finding.
* I would treat a correlation as significant if its p-value is less than 0.1, i.e. if there is less than a 10% chance of seeing a correlation at least this strong when no real correlation exists.
* Iterating over all features, a dictionary would hold the Pearson Correlation coefficients in sorted arrays, separately for positive and negative values. The 10 highest and the 10 lowest correlation values would be assumed to cause and prevent that disease respectively.
* All columns would be converted to standard normal form for best correlation results.


In [106]:
#Calculating Pearson Correlations for the 4 medical conditions
from scipy.stats import pearsonr

best=10
corr_results={}

# Marker sizes grow with correlation strength: the strongest positive value sits last
# in its column, the strongest negative value sits first in its column.
marker_sizes_pos = list(range(5, 5 + best))
marker_sizes_neg = marker_sizes_pos[::-1]

for i in range(med_condition_norm.shape[1]):
    positive = []   # (coefficient, feature name) pairs with c >= 0
    negative = []   # (coefficient, feature name) pairs with c < 0

    for j in range(features_norm.shape[1]):
        c, p = pearsonr(med_condition_norm[:, i], features_norm[:, j])
        # A NaN p-value fails this comparison, so such features are skipped.
        if p < 0.1:
            (negative if c < 0 else positive).append((c, feature_names[j]))

    # Sort once by coefficient and keep the `best` strongest entries of each sign.
    positive.sort(key=lambda pair: pair[0])
    negative.sort(key=lambda pair: pair[0])
    top_pos = positive[-best:]
    top_neg = negative[:best]

    # Every column handed to the ColumnDataSource must have the same length, and the
    # size columns always have `best` entries. Pad short result lists with NaN / ""
    # on the weak end so values, labels and sizes stay aligned.
    pad_pos = best - len(top_pos)
    pad_neg = best - len(top_neg)

    prefix = med_condition_name[i]
    corr_results[prefix+'_val_pos'] = np.array([np.nan] * pad_pos + [value for value, _ in top_pos], dtype=float)
    corr_results[prefix+'_label_pos'] = np.array([""] * pad_pos + [name for _, name in top_pos])
    corr_results[prefix+'_size_pos'] = list(marker_sizes_pos)
    corr_results[prefix+'_val_neg'] = np.array([value for value, _ in top_neg] + [np.nan] * pad_neg, dtype=float)
    corr_results[prefix+'_label_neg'] = np.array([name for _, name in top_neg] + [""] * pad_neg)
    corr_results[prefix+'_size_neg'] = list(marker_sizes_neg)
        



In [108]:
#Plotting chart

source=ColumnDataSource(data=corr_results)

fig=figure(plot_width=700, plot_height=450,title='Pearson Correlations',y_range=["Cancer","Diabetes","Heart Disease","Obesity"])

fig.ray(x=[0], y=[0], length=5, angle=[90],
      angle_units="deg", color="#808B96", line_width=0.8)

# One entry per medical condition:
# (column prefix in `source`, y position, colour for positive values, colour for negative values)
_correlation_rows = [
    ("cancer", 1, "#CB4335", "#EC7063"),
    ("diabetes", 2, "#2E86C1", "#5DADE2"),
    ("heart_disease", 3, "#28B463", "#58D68D"),
    ("belly", 4, "#E67E22", "#E59866"),
]


def _add_correlation_circles(prefix, y_pos, sign, colour):
    """Draw one row of circles for `prefix`/`sign` and attach a hover tool to it.

    Returns the circle renderer created on `fig`.
    """
    renderer = fig.circle(
        x="{0}_val_{1}".format(prefix, sign),
        y=y_pos,
        size="{0}_size_{1}".format(prefix, sign),
        color=colour,
        source=source,
    )
    hover = HoverTool(
        tooltips=[
            ("Feature", "@{0}_label_{1}".format(prefix, sign)),
            ("Correlation", "@{0}_val_{1}".format(prefix, sign)),
        ],
        renderers=[renderer],
    )
    fig.add_tools(hover)
    return renderer


# Positive circles are added before negative ones for each condition, top to bottom of the table.
_correlation_renderers = []
for _prefix, _y_pos, _colour_pos, _colour_neg in _correlation_rows:
    _correlation_renderers.append(_add_correlation_circles(_prefix, _y_pos, "pos", _colour_pos))
    _correlation_renderers.append(_add_correlation_circles(_prefix, _y_pos, "neg", _colour_neg))

c1, c2, c3, c4, c5, c6, c7, c8 = _correlation_renderers

#Axes and Labels
fig.xaxis.axis_label = 'Correlation Strength'

fig.axis.major_label_text_color = "#4B5C8A"
for _grid, _alpha in ((fig.ygrid, 0.9), (fig.xgrid, 0.2)):
    _grid.grid_line_alpha = _alpha
    _grid.grid_line_dash = [5, 3]


show(fig)



* Hovering on the above chart, we can deduce some strong positive and negative correlations:

* If you smoke rarely, use vegetable or any other cooking oil, eat low-fat cake, eat low-fat salad dressing or consume iced-tea even without sugar, you would get obese.
* If you drink a lot of Iced-Tea, use Canola cooking fat or consume grape-fruit juice, you have more chances to have a heart disease.
* If you consume more sweets, sugary beverages and diet soda, you are more likely to have diabetes. This makes a lot of sense.

* If you consume a lot of vitamin E, Whole-grain cracker and carrots, you aren't likely to be obese.
* If you use Pamornone cooking fat, you are less likely to have a heart disease.
* If you consume more alcoholic beverages, you are less likely to be diabetic.
* If you eat less salt, corn chips, hamburgers, icecream, pancakes, you are less likely to get cancer.

* There are results that challenge intuition:

* People eating more pancakes, cookies, bagels and pastries are less likely to suffer from obesity.
* If you have a dog, you are more likely to suffer from a heart disease, whereas if you consume more sugar in coffee and peanut butter, you are less likely to suffer from a heart disease.
* If you eat more carrots,potatoes, greens and cottage cheese, you are more likely to have cancer.
* If you are a latino, you have less chances of getting cancer...


* Results do make sense in a few cases, especially for diabetes, but many results seem absurd: dogs giving you a heart disease, Latinos being less likely to have cancer. The reason is probably the small sample (54 patients) combined with a huge list of features; with so many features tested, some will pass the p-value threshold purely by chance, so a robust correlation is unlikely to emerge from such a small data-set. Although the results may be analytically correct, this is a bad example of analytics where absurd conclusions can be drawn.

* I will try a Random Forest Classifier to further analyse the data.


### Activity 2 : Applying Random Forest Classifier
* The dataset would be divided into two parts, train set (45 rows) and test set (9 rows).
* Random Forest classifier would be used to train on all features for train set one by one for cancer, diabetes,heart_disease and belly.
* Algorithm would provide feature importance scores to all features. I would use the top 10 non-zero feature importances, and plot them in order of their importance. These are our best order of features that affect the disease in question.
* Accuracy of the Random Forest classifier would also be calculated.

In [109]:
#Training Random Forest Classifiers for the 4 medical conditions and 
#finding feature importance and accuracy
from sklearn.ensemble import RandomForestClassifier

train_rows=45   # the first 45 rows train the model, all remaining rows are held out for testing
feature_train=features[0:train_rows,:]
feature_test=features[train_rows:,:]
feaure_test=feature_test   # original (misspelled) name kept so existing references keep working
best=10
rand_forest_results={}
accuracy_list=[]
for i in range(4):
    med_condition_train=med_condition[0:train_rows,i]
    med_condition_test=med_condition[train_rows:,i]
    rf = RandomForestClassifier(max_depth=2, random_state=0)
    rf.fit(feature_train,med_condition_train)

    # Use feature_importances_ directly; the n_features_ attribute was removed in scikit-learn 1.2.
    importances=np.asarray(rf.feature_importances_)
    used=np.flatnonzero(importances>0)
    # Indices of the (at most) `best` most important features that the forest used, in ascending order.
    top=used[np.argsort(importances[used],kind="stable")][-best:]

    # Size and colour columns always have `best` entries, and every column of a
    # ColumnDataSource must have the same length: pad at the low-importance end with NaN / "".
    pad=best-len(top)
    prefix=med_condition_name[i]
    rand_forest_results[prefix+'_val']=np.concatenate((np.full(pad,np.nan),importances[top]))
    rand_forest_results[prefix+'_label']=np.array([""]*pad+[feature_names[k] for k in top])
    rand_forest_results[prefix+'_size']=list(range(10,10+best))
    med_condition_pred=rf.predict(feature_test)

    # Accuracy in percent on the held-out rows
    correct=int(np.count_nonzero(med_condition_pred==med_condition_test))
    accuracy=correct/len(med_condition_pred)*100
    accuracy_list.append(accuracy)

# Ten-step light-to-dark palettes; a darker colour marks a more important feature.
rand_forest_results['cancer_color']=["#F4ECF7", "#E8DAEF", "#D2B4DE", "#BB8FCE", "#A569BD", "#8E44AD", "#7D3C98", "#6C3483", "#5B2C6F", "#4A235A"]
rand_forest_results['diabetes_color']=["#E9F7EF", "#D5F5E3", "#ABEBC6", "#82E0AA", "#58D68D", "#2ECC71", "#28B463", "#239B56", "#1D8348", "#186A3B"]
# The original list repeated "#85C1E9", giving two ranks the same colour; each step is now distinct.
rand_forest_results['heart_disease_color']=["#EBF5FB", "#D6EAF8", "#AED6F1", "#85C1E9", "#5DADE2", "#3498DB", "#2E86C1", "#2874A6", "#21618C", "#1B4F72"]
rand_forest_results['belly_color']=["#FDEDEC", "#FADBD8", "#F5B7B1", "#F1948A", "#EC7063", "#E74C3C", "#CB4335", "#B03A2E", "#943126", "#78281F"]

source_rf=ColumnDataSource(data=rand_forest_results)

In [110]:

fig2=figure(plot_width=700, plot_height=450,title='Random Forest Classifier Feature Importances',y_range=["Cancer","Diabetes","Heart Disease","Obesity"])


def _add_importance_circles(prefix, y_pos):
    """Draw the importance circles for one condition on `fig2` and attach a hover tool.

    Returns the circle renderer that was created.
    """
    renderer = fig2.circle(
        x=prefix + '_val',
        y=y_pos,
        size=prefix + '_size',
        color=prefix + '_color',
        source=source_rf,
    )
    hover = HoverTool(
        tooltips=[
            ("Feature", "@" + prefix + "_label"),
            ("Importance", "@" + prefix + "_val"),
        ],
        renderers=[renderer],
    )
    fig2.add_tools(hover)
    return renderer


# Conditions are drawn at y positions 1..4 in this order.
c1_rf, c2_rf, c3_rf, c4_rf = [
    _add_importance_circles(_prefix, _y_pos)
    for _y_pos, _prefix in enumerate(["cancer", "diabetes", "heart_disease", "belly"], start=1)
]

#Axes and Labels
fig2.xaxis.axis_label = 'Feature Importance'

fig2.axis.major_label_text_color = "#4B5C8A"
for _grid, _alpha in ((fig2.ygrid, 0.9), (fig2.xgrid, 0.2)):
    _grid.grid_line_alpha = _alpha
    _grid.grid_line_dash = [5, 3]


show(fig2)









* In the above plot, the top 10 important features that affect the disease have been plotted
* Increasing size and deeper color signify more importance. Hovering over the circles gives the name of the feature and its importance.
* The Random Forest Classifier accuracy has been 33.3% on Obesity, 66.6% on Heart Disease, 55.6% on Diabetes and 100% on cancer.

* Having more Job Stand Time, consuming more popcorn and cream in coffee have high importance for the cancer classifier.
* Consuming more soda diet and apples is likely to be a sign of diabetes.
* Drinking more iced-tea can give you heart disease.

* The results are somewhat similar to the ones given by the Pearson Correlation, but are sometimes equally vague.
* I conclude by saying that a small sample does not result in good analytics. Results that are mathematically correct can still lead to misconceptions and hoaxes, just because the sample set used was either too small or too skewed.
