In [1]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn import linear_model 
import numpy as np
from pyspark.sql import functions as fn
from pyspark.ml import Pipeline
from pyspark.ml import regression
from pyspark.ml import feature
from pyspark.sql import SQLContext
import pyspark.ml.tuning as tune
from pyspark.ml.evaluation import RegressionEvaluator

In [2]:
%sh wget "https://www.dropbox.com/s/9xf7gwlal3dj4bv/HEALTHRESPONACTISUMMARYMERGED.csv?dl=1" -O combinedHealth.csv

In [3]:
healthCombined = pd.read_csv('combinedHealth.csv')
healthCombined.head()

In [4]:
list(healthCombined)

In [5]:
healthCombined.ERBMI.value_counts(dropna = False), healthCombined.ERBMI.count()

In [6]:
# Percentage of non-null ERBMI entries that carry the -1 placeholder.
# Built-in float() replaces the removed np.float alias; the explicit cast keeps
# true division under Python 2 as well.
float(healthCombined.loc[healthCombined.ERBMI.isin([-1])].ERBMI.count()) / float(healthCombined.ERBMI.count()) * 100

Fewer than 5 percent of the ERBMI values are -1 (null), so those rows are removed

In [8]:
healthCombinedEdited = healthCombined.loc[healthCombined['ERBMI'] != -1]

In [9]:
healthCombined.EUPRPMEL.value_counts(dropna = False), healthCombined.EUPRPMEL.count()
#healthCombinedEdited.loc[healthCombinedEdited.tewhere == -3].tewhere

Remove all nulls without replacement first. Check the row count of the result and choose a different approach if too few rows remain

In [11]:
# Columns where only the listed answer codes are kept.
keepOnlyCodes = [
    ('EUDIETSODA', [1, 2, 3]),
    ('EUEXERCISE', [1, 2]),
    ('EEINCOME1', [1, 2, 3]),
]
# Columns where rows carrying the listed codes are dropped.
dropCodes = [
    ('EUEXFREQ', [-1]),
    ('EUFASTFD', [-2, -3]),
    ('EUFDSIT', [-2]),
    ('EUGENHTH', [-2]),
    ('EUMEAT', [-1, -2]),
    ('tewhere', [-1]),
]

# Apply the filters one after another, in the same order as listed above.
for columnName, codes in keepOnlyCodes:
    healthCombinedEdited = healthCombinedEdited.loc[healthCombinedEdited[columnName].isin(codes)]
for columnName, codes in dropCodes:
    healthCombinedEdited = healthCombinedEdited.loc[~healthCombinedEdited[columnName].isin(codes)]

# Label (ERBMI) followed by the feature columns used for modelling.
# Not selected: 'TRERNWA', 'EUFASTFDFRQ', 'EUMILK', 'EUEATSUM'
modelColumns = [
    'ERBMI', 'ERTPREAT', 'ERTSEAT', 'EUDIETSODA', 'EUEXERCISE', 'TEAGE',
    'EEINCOME1', 'EUEXFREQ', 'EUFASTFD', 'EUFFYDAY', 'EUFDSIT', 'EUGENHTH',
    'EUGROSHP', 'EUMEAT', 'EUPRPMEL', 'TUACTIVITY_N', 'tuactdur24', 'tewhere',
    'TESEX',
]
healthCombinedCleaned = healthCombinedEdited[modelColumns]
healthCombinedCleaned.info()

Enough data after removing null to continue with analysis

In [13]:
healthCombinedCleaned.describe()

Visualize to check if scaling required

In [15]:
plt.hist(healthCombinedCleaned.ERTPREAT)
display()

In [16]:
sns.jointplot( x = 'ERTPREAT', y = 'ERBMI', data = healthCombinedCleaned)
display()

In [17]:
plt.figure()
pair = sns.pairplot(healthCombinedCleaned[['ERBMI', 'ERTPREAT', 'ERTSEAT', 'EUDIETSODA',  'EUEXERCISE', 'TEAGE',  'EEINCOME1', 'EUEXFREQ', 'EUFASTFD',  'EUFFYDAY', 'EUFDSIT', 'EUGENHTH'
                                             , 'EUGROSHP', 'EUMEAT',  'EUPRPMEL', 'TUACTIVITY_N',  'tuactdur24', 'tewhere', 'TESEX']])
display()

Convert the pandas DataFrame to a Spark DataFrame for further analysis

In [19]:
sqlContext = SQLContext(sc)

In [20]:
health_df = sqlContext.createDataFrame(healthCombinedCleaned)
display(health_df)

Split data in train and test for modeling. 70% train, 30% test

In [22]:
training, test = health_df.randomSplit([0.7,0.3],0)

In [23]:
training.count()

In [24]:
test.count()

2 Vector Assembler Pipeline stages   
1 - With all features   
2 - With just the intercept

In [26]:
vecScaled = feature.VectorAssembler(inputCols = [ 'ERTPREAT', 'ERTSEAT', 'EUDIETSODA',  'EUEXERCISE', 'TEAGE',  'EEINCOME1', 'EUEXFREQ', 'EUFASTFD',  'EUFFYDAY', 'EUFDSIT', 'EUGENHTH'
                                             , 'EUGROSHP', 'EUMEAT',  'EUPRPMEL', 'TUACTIVITY_N',  'tuactdur24', 'tewhere', 'TESEX'], outputCol = 'features')

In [27]:
vecIntercept = feature.VectorAssembler(inputCols=[], outputCol='emptyFeatures')

Scaling stage to scale features from Vector Assembler

In [29]:
scaled = feature.StandardScaler(inputCol='features', outputCol='sclaedFeatures')

Three Linear Regression Pipeline stages   
1 - LR with just the intercept   
2 - LR with all features unscaled   
3 - LR with all features and scaled stage

In [31]:
regIntercept = regression.LinearRegression(labelCol= 'ERBMI', featuresCol= 'emptyFeatures')

In [32]:
regUnscaled = regression.LinearRegression(labelCol = 'ERBMI', featuresCol = 'features', regParam=0, elasticNetParam = 0)

In [33]:
regScaled = regression.LinearRegression(labelCol = 'ERBMI', featuresCol = 'sclaedFeatures', maxIter=5)

3 Pipelines for the different Linear Regressions

In [35]:
pipeIntercept = Pipeline(stages = [vecIntercept, regIntercept])

In [36]:
PipeUnscaled = Pipeline(stages = [vecScaled, regUnscaled])

In [37]:
PipeScaled = Pipeline(stages = [vecScaled, scaled, regScaled])

RMSE declaration for measuring model accuracy

In [39]:
rmse = fn.sqrt(fn.avg((fn.col('ERBMI') - fn.col('prediction'))**2))

Starting with the Intercept model analysis

In [41]:
interceptModel = pipeIntercept.fit(training)

In [42]:
rmse = fn.sqrt(fn.avg((fn.col('ERBMI') - fn.col('prediction'))**2))

In [43]:
interceptModel.transform(test).select(rmse).show()

RMSE of 5.71 with just the intercept   
Trying to LR with unscaled features

In [45]:
PipeUnscaled = Pipeline(stages = [vecScaled, regUnscaled])

In [46]:
unScaledModel = PipeUnscaled.fit(training)

In [47]:
unScaledModel.transform(test).select(rmse).show()

Observed a reduced RMSE with the features   
Now, scaling the features before fitting to the model

In [49]:
PipeScaled = Pipeline(stages = [vecScaled, scaled, regScaled])

In [50]:
scaledModel = PipeScaled.fit(training)

In [51]:
scaledModel.transform(test).select(rmse).show()

Scaled features resulted with a very small increase in the intercept

Analysing coefficients of scaled and unscaled model

In [54]:
linModelUnscaled = unScaledModel.stages[-1]

In [55]:
linModelUnscaled.coefficients

In [56]:
linModelScaled = scaledModel.stages[-1]

In [57]:
linModelScaled.coefficients

Adding values to DataFrame for plotting the coefficients

In [59]:
valuesDF = pd.DataFrame( healthCombinedCleaned.drop('ERBMI', axis = 1).columns)

In [60]:
valuesDF['NotScaled'] = linModelUnscaled.coefficients

In [61]:
valuesDF['Scaled'] = linModelScaled.coefficients

In [62]:
valuesDF

In [63]:
valuesDF.columns = ['feature', 'notScaled', 'Scaled']
valuesDF.columns

Plotting scaled model with seaborn

In [65]:
plt.figure()
fig=plt.figure(figsize=(18, 16), dpi= 80, facecolor='w', edgecolor='k')
sns.barplot( y = 'Scaled', x = 'feature', data = valuesDF)
plt.xticks(rotation = 60)
display()

Plotting unscaled model with seaborn

In [67]:
plt.figure()
fig=plt.figure(figsize=(18, 16), dpi= 80, facecolor='w', edgecolor='k')
sns.barplot( y = 'notScaled', x = 'feature', data = valuesDF)
plt.xticks(rotation = 60)
display()

Plotting scaled and unscaled for comparison

In [69]:
indexDf = valuesDF.set_index('feature')
indexDf

In [70]:
# Create exactly one figure; a preceding bare plt.figure() call would leave an
# empty, never-used figure open.
fig=plt.figure(figsize=(18, 16), dpi= 80, facecolor='w', edgecolor='k')
ax = fig.add_subplot(111) 
# NOTE(review): the twin axis is created but nothing is drawn on it -- both
# series below are plotted on `ax`. Confirm whether one was meant for `ax2`.
ax2 = ax.twinx() 
width = 0.4
# position=0 / position=1 place the two bar series side by side per feature.
indexDf.notScaled.plot(kind='bar', color='red', ax=ax, width=width, position=0, legend = True)
indexDf.Scaled.plot(kind='bar', color='blue', ax=ax, width=width, position=1, legend = True)
display()

Applying regularization to further improve our RMSE   
First step to build a grid of two parameters - ElasticNetRegularization

In [72]:
grid = tune.ParamGridBuilder()

In [73]:
grid = grid.addGrid(regScaled.elasticNetParam, [0, 0.2, 0.4, 0.6, 0.8, 1])

In [74]:
grid = grid.addGrid(regScaled.regParam, np.arange(0,.1,.01))

In [75]:
grid = grid.build()

Defining evaluator for Cross Validation

In [77]:
evaluator = RegressionEvaluator(labelCol=regScaled.getLabelCol(), predictionCol=regScaled.getPredictionCol())

Import cross validator, cross validator model and rand to build a custom class CrossValidatorVerbose on top of CrossValidator

In [79]:
from pyspark.sql.functions import rand
from pyspark.ml.tuning import CrossValidator, CrossValidatorModel

In [80]:
a = list()
b = list()
c = list()

CrossValidatorVerbose builds on top of CrossValidator by displaying and storing the output from each fold for all the regularization parameters   
These values then can be used to compare the models with different regularizations

In [82]:
class CrossValidatorVerbose(CrossValidator):
    """CrossValidator that prints and records every fold / parameter-map result.

    For each fold and each parameter map, the fold metric and the running
    average are printed. In addition, one entry per parameter in the map is
    appended to the module-level lists ``a`` (parameter name), ``b``
    (parameter value) and ``c`` (fold metric).
    """

    def _fit(self, dataset):
        """Cross-validate on ``dataset`` and return a ``CrossValidatorModel``.

        The returned model wraps the estimator refit on the full ``dataset``
        with the best-scoring parameter map, together with the per-map metrics
        averaged over the folds.
        """
        baseEstimator = self.getOrDefault(self.estimator)
        paramMaps = self.getOrDefault(self.estimatorParamMaps)
        scorer = self.getOrDefault(self.evaluator)
        metricName = scorer.getMetricName()
        foldCount = self.getOrDefault(self.numFolds)
        seed = self.getOrDefault(self.seed)

        modelCount = len(paramMaps)
        foldWidth = 1.0 / foldCount

        # A seeded uniform random column assigns every row to one fold.
        randCol = self.uid + "_rand"
        tagged = dataset.select("*", rand(seed).alias(randCol))
        metricTotals = [0.0] * modelCount

        for foldIdx in range(foldCount):
            foldNum = foldIdx + 1
            print("Comparing models on fold %d" % foldNum)

            lowerBound = foldIdx * foldWidth
            upperBound = (foldIdx + 1) * foldWidth
            inFold = (tagged[randCol] >= lowerBound) & (tagged[randCol] < upperBound)
            validation = tagged.filter(inFold)
            train = tagged.filter(~inFold)

            for modelIdx, paramMap in enumerate(paramMaps):
                fitted = baseEstimator.fit(train, paramMap)
                # TODO: duplicate evaluator to take extra params from input
                metric = scorer.evaluate(fitted.transform(validation, paramMap))
                metricTotals[modelIdx] += metric

                runningAvg = metricTotals[modelIdx] / foldNum
                shownParams = {param.name: val for (param, val) in paramMap.items()}
                print("params: %s\t%s: %f\tavg: %f" % (
                    shownParams, metricName, metric, runningAvg))

                # Record one (name, value, metric) triple per parameter so the
                # results can be inspected after fitting.
                for (param, val) in paramMap.items():
                    a.append(param.name)
                    b.append(val)
                    c.append(metric)

        # Totals and averages rank identically, so pick the winner on totals.
        chooseBest = np.argmax if scorer.isLargerBetter() else np.argmin
        bestIndex = chooseBest(metricTotals)

        bestParams = paramMaps[bestIndex]
        bestModel = baseEstimator.fit(dataset, bestParams)
        avgMetrics = [total / foldCount for total in metricTotals]
        bestAvg = avgMetrics[bestIndex]
        print("Best model:\nparams: %s\t%s: %f" % (
            {param.name: val for (param, val) in bestParams.items()},
            metricName, bestAvg))

        return self._copyValues(CrossValidatorModel(bestModel, avgMetrics))

Passing the pipeline, grid of HyperParameters, and evaluator to Cross Validation with three folds

In [84]:
cvVer = CrossValidatorVerbose(estimator = PipeScaled, estimatorParamMaps = grid, evaluator= evaluator, numFolds = 3)

In [85]:
varStore = cvVer.fit(training)

In [86]:
testStore = varStore.transform(test)

In [87]:
testStore.select(rmse).show()

RMSE reduced by a very low value.   
Analyzing models from cross validation

Extract all models from Cross Validation

In [90]:
# a/b/c hold two consecutive entries (one per hyper-parameter) for every
# model evaluation, and the same parameter combination appears once per fold.
# Collect every fold's metric per combination and average them, instead of
# letting later folds silently overwrite earlier ones.
linearFoldMetrics = {}
for i in range(0, len(a), 2):
    # repr() is the portable spelling of the Python 2-only backtick syntax.
    comboKey = a[i] + " " + repr(b[i]) + " " + a[i + 1] + " " + repr(b[i + 1])
    linearFoldMetrics.setdefault(comboKey, []).append(c[i])

linearDict = {}
for comboKey, foldValues in linearFoldMetrics.items():
    linearDict[comboKey] = sum(foldValues) / float(len(foldValues))

Sort the models with lowest rmse first

In [92]:
# Sort by metric first, then by key to break ties. Written without
# tuple-parameter lambdas, iteritems() or the print statement so it runs on
# both Python 2 and Python 3.
for key, value in sorted(linearDict.items(), key=lambda kv: (kv[1], kv[0])):
    print("%s: %s" % (key, value))

Extract the Best Model from cross validation

In [94]:
BestModel = varStore.bestModel.stages[-1]
BestModel

In [95]:
BestModel.coefficients

Add Coefficients from best model to the DataFrame for plotting

In [97]:
indexDf['BestScaled'] = BestModel.coefficients

In [98]:
valuesDF['BestScaled'] = BestModel.coefficients

Comparing coefficients of Best Model vs Model with regularization

In [100]:
plt.figure()
fig=plt.figure(figsize=(18, 16), dpi= 80, facecolor='w', edgecolor='k')

ax = fig.add_subplot(111) 
ax2 = ax.twinx() 

width = 0.4
indexDf.BestScaled.plot(kind='bar', color='red', ax=ax, width=width, position=0, legend = True)
indexDf.Scaled.plot(kind='bar', color='blue', ax=ax, width=width, position=1, legend = True)

display()

Coefficients of best model

In [102]:
plt.figure()
fig=plt.figure(figsize=(18, 16), dpi= 80, facecolor='w', edgecolor='k')
sns.barplot( y = 'BestScaled', x = 'feature', data = valuesDF)
plt.xticks(rotation = 60)
display()

In [103]:
BestModel.intercept

In [104]:
# Three best cross-validated parameter combinations (lowest metric first)...
rankedLinear = sorted(linearDict.items(), key=lambda kv: (kv[1], kv[0]))
top5SortedDict = dict(rankedLinear[:3])
# ...plus the two baseline models evaluated on the test set, five bars in total.
top5SortedDict['Intercept'] = evaluator.evaluate(interceptModel.transform(test))
top5SortedDict['UnscaledModel'] = evaluator.evaluate(unScaledModel.transform(test))
top5SortedDict

In [105]:
# Bar chart of the metric for the selected linear models and baselines.
# (The stray call on the undefined name `cfplt` raised NameError and is gone;
# the figure is created by the line below.)
fig=plt.figure(figsize=(20, 9), dpi= 80, facecolor='w', edgecolor='k')
plt.bar(range(len(top5SortedDict)), list(top5SortedDict.values()), align='center')
plt.xticks(range(len(top5SortedDict)), list(top5SortedDict.keys()))
plt.xticks(rotation = 60)
plt.ylabel("RMSE")
# Zoom the y-axis around the observed values so small differences are visible.
plt.ylim([min(top5SortedDict.values()) - 0.2 , max(top5SortedDict.values()) + 0.2])
display()

Running Regression with Random Forest Regression

Pipeline stage for Random forest regression with scaled features

In [108]:
rfRegression = regression.RandomForestRegressor(featuresCol='sclaedFeatures', labelCol='ERBMI')

Building grid with hyper-parameters for Random forest.   
Grid for choosing the number of trees and the maximum depth of each tree

In [110]:
gridRandom = tune.ParamGridBuilder()

In [111]:
gridRandom = gridRandom.addGrid(rfRegression.numTrees, [2,4,5,8])

In [112]:
gridRandom = gridRandom.addGrid(rfRegression.maxDepth, [2,3,4,5,6])

In [113]:
gridRandom = gridRandom.build()

In [114]:
pipeRandom = Pipeline(stages = [vecScaled, scaled, rfRegression])

In [115]:
randomModel = pipeRandom.fit(training)

In [116]:
randomTested = randomModel.transform(test)

Defining evaluator for Random Forest

In [118]:
evaluatorRandom = RegressionEvaluator(labelCol= rfRegression.getLabelCol() , predictionCol= rfRegression.getPredictionCol())

In [119]:
evaluatorRandom.evaluate(randomTested)

A much reduced rmse with Random Forest Regression

Extracting feature importances for the variables passed on

In [122]:
randomStage = randomModel.stages[-1]

In [123]:
randomStage.featureImportances

Adding feature importances to DataFrame for plotting

In [125]:
valuesDF['randomForestFeatures'] = randomStage.featureImportances

In [126]:
indexDf = valuesDF.set_index('feature')
indexDf

TODO: Decide whether there is a point to this.

In [128]:
plt.figure()
fig=plt.figure(figsize=(18, 16), dpi= 80, facecolor='w', edgecolor='k')

ax = fig.add_subplot(111) 
ax2 = ax.twinx() 

width = 0.4
indexDf.abs().Scaled.plot(kind='bar', color='red', ax=ax, width=width, position=0, legend = True)
indexDf.randomForestFeatures.plot(kind='bar', color='blue', ax=ax, width=width, position=1, legend = True)

display()

Passing Random forest through cross validation with 4 folds

In [130]:
cvVdVerboseRandom = CrossValidatorVerbose(estimator=pipeRandom, estimatorParamMaps=gridRandom, evaluator=evaluatorRandom, numFolds=4)

In [131]:
a = list()
b = list()
c = list()

In [132]:
randomFit = cvVdVerboseRandom.fit(training)

In [133]:
# a/b/c hold two consecutive entries (one per hyper-parameter) for every
# model evaluation, and the same parameter combination appears once per fold.
# Collect every fold's metric per combination and average them, instead of
# letting later folds silently overwrite earlier ones.
randomFoldMetrics = {}
for i in range(0, len(a), 2):
    # repr() is the portable spelling of the Python 2-only backtick syntax.
    comboKey = a[i] + " " + repr(b[i]) + " " + a[i + 1] + " " + repr(b[i + 1])
    randomFoldMetrics.setdefault(comboKey, []).append(c[i])

randomDict = {}
for comboKey, foldValues in randomFoldMetrics.items():
    randomDict[comboKey] = sum(foldValues) / float(len(foldValues))

In [134]:
# Sort by metric first, then by key to break ties. Written without
# tuple-parameter lambdas, iteritems() or the print statement so it runs on
# both Python 2 and Python 3.
for key, value in sorted(randomDict.items(), key=lambda kv: (kv[1], kv[0])):
    print("%s: %s" % (key, value))

In [135]:
randomTransform = randomFit.transform(test)

In [136]:
randomForestBestModel = randomFit.bestModel

Evaluating the best Random forest model results in a lower rmse

After hyper parametrizing we observe further reduction in the RMSE to 4.46

In [139]:
evaluatorRandom.evaluate(randomFit.bestModel.transform(test))

In [140]:
randomForestRegressionModel = randomForestBestModel.stages[-1]

In [141]:
bestModelFeatures = randomForestRegressionModel.featureImportances

In [142]:
valuesDF['randomForestBestFeatures'] = randomForestRegressionModel.featureImportances

In [143]:
indexDf = valuesDF.set_index('feature')
indexDf

In [144]:
plt.figure()
fig=plt.figure(figsize=(18, 16), dpi= 80, facecolor='w', edgecolor='k')

ax = fig.add_subplot(111) 
ax2 = ax.twinx() 

width = 0.4
indexDf.randomForestBestFeatures.plot(kind='bar', color='red', ax=ax, width=width, position=0, legend = True)
indexDf.randomForestFeatures.plot(kind='bar', color='blue', ax=ax, width=width, position=1, legend = True)

display()

In [145]:
plt.figure()
fig=plt.figure(figsize=(18, 16), dpi= 80, facecolor='w', edgecolor='k')
sns.barplot( y = 'randomForestBestFeatures', x = 'feature', data = valuesDF)
plt.xticks(rotation = 60)
display()

In [146]:
# Five best random-forest parameter combinations, lowest metric first.
rankedRandom = sorted(randomDict.items(), key=lambda kv: (kv[1], kv[0]))
top5SortedDict = dict(rankedRandom[:5])
top5SortedDict

In [147]:
plt.figure()
fig=plt.figure(figsize=(20, 9), dpi= 80, facecolor='w', edgecolor='k')
plt.bar(range(len(top5SortedDict)), list(top5SortedDict.values()), align='center')
plt.xticks(range(len(top5SortedDict)), list(top5SortedDict.keys()))
plt.xticks(rotation = 60)
plt.ylim([min(top5SortedDict.values()) - 0.2 , max(top5SortedDict.values()) + 0.2])
display()

In [148]:
evaluatorRandom.evaluate(randomFit.bestModel.transform(test))/min(randomFit.avgMetrics)

Plotting top 5 Random forest models

In [150]:
# NOTE(review): `dfa` is not assigned in any cell visible in this notebook, so
# this cell presumably depends on state from a deleted or unseen cell -- confirm
# where `dfa` (with 'prediction' and 're' columns) is supposed to come from.
dfa.plot.scatter('prediction', 're')
display() 

In [151]:
healthCombinedCleaned = healthCombinedEdited.loc[~healthCombinedEdited['TRERNWA'].isin([-1])]
healthCombinedCleaned.TRERNWA.value_counts(dropna = False)

In [152]:
X_train, X_test, y_train, y_test = train_test_split(healthCombinedCleaned.drop('ERBMI', axis = 1), healthCombinedCleaned.ERBMI, test_size = 0.3, random_state = 21 )

In [153]:
reg = linear_model.LinearRegression()

In [154]:
reg.fit(X_train, y_train)

In [155]:
reg.score(X_test, y_test)

In [156]:
healthCombinedEdited.head()

In [157]:
healthCombinedEdited.ERINCOME.value_counts()

In [158]:
# NOTE(review): this indexes the frame with the ERINCOME values themselves, not
# with a boolean condition; pandas will presumably treat them as column labels
# and fail -- confirm which filter was intended.
healthCombinedEdited[healthCombinedEdited.ERINCOME]

In [159]:
mostEating = healthCombinedEdited.ERTSEAT.value_counts(dropna = False)[0:25].index.tolist()
tryingEating = healthCombinedEdited.loc[healthCombinedEdited['ERTSEAT'].isin(mostEating)]

In [160]:
healthCombinedEdited.loc[healthCombinedEdited['ERTSEAT'].isin(mostEating)].ERTSEAT.value_counts(dropna = False)

In [161]:
fig=plt.figure(figsize=(18, 16), dpi= 80, facecolor='w', edgecolor='k')
sns.regplot( x = tryingEating["ERTSEAT"], y = tryingEating["ERBMI"], fit_reg = False)
display()

In [162]:
fig=plt.figure(figsize=(18, 16), dpi= 80, facecolor='w', edgecolor='k')
sns.boxplot( x = "ERTSEAT", y = "ERBMI", data = tryingEating)
display()

In [163]:
fig=plt.figure(figsize=(18, 16), dpi= 80, facecolor='w', edgecolor='k')
tryingExer = healthCombinedEdited.loc[healthCombinedEdited['EUEXERCISE'].isin([1,2])]
sns.boxplot( x = "EUEXERCISE", y = "ERBMI", data = tryingExer)
display()

In [164]:
fig=plt.figure(figsize=(18, 16), dpi= 80, facecolor='w', edgecolor='k')
sns.boxplot( x = "TESEX", y = "ERBMI", data = healthCombinedEdited)
display()

In [165]:
fig=plt.figure(figsize=(18, 16), dpi= 80, facecolor='w', edgecolor='k')
sns.boxplot(x = 'EEINCOME1', y =  "ERBMI", data = healthCombinedEdited.loc[healthCombinedEdited['EEINCOME1'].isin([1,2, 3])])
display()

In [166]:
healthCombinedEdited.ERTPREAT.max()

In [167]:
# pd.cut assigns labels to bins in ascending order of the binned values, so the
# list must run from the lowest bin to the highest one.
labels = ['Very Low', 'Low', 'Med', 'High', 'Very High']
labels

In [168]:
healthCombinedEdited['TimeSecondaryEating'] = pd.cut(healthCombinedEdited.ERTPREAT, 5, right = False, labels = labels)

In [169]:
healthCombinedEdited['TimeSecondaryEating'].value_counts()

In [170]:
fig=plt.figure(figsize=(18, 16), dpi= 80, facecolor='w', edgecolor='k')
sns.boxplot(x = 'TimeSecondaryEating', y =  "ERBMI", data = healthCombinedEdited)
display()

In [171]:
fig=plt.figure(figsize=(18, 16), dpi= 80, facecolor='w', edgecolor='k')
sns.boxplot(x = 'EUDIETSODA', y =  "ERBMI", data = healthCombinedEdited.loc[healthCombinedEdited['EUDIETSODA'].isin([1,2, 3])])
display()

In [172]:
fig=plt.figure(figsize=(18, 16), dpi= 80, facecolor='w', edgecolor='k')
sns.boxplot(x = 'EUDRINK', y =  "ERBMI", data = healthCombinedEdited.loc[healthCombinedEdited['EUDRINK'].isin([1,2])])
display()

In [173]:
healthCombinedEdited.EUFASTFD.value_counts()
fig=plt.figure(figsize=(18, 16), dpi= 80, facecolor='w', edgecolor='k')
sns.boxplot(x = 'EUFASTFD', y =  "ERBMI", data = healthCombinedEdited.loc[healthCombinedEdited['EUFASTFD'].isin([1,2])])
display()

In [174]:
healthCombinedEdited.EUFASTFDFRQ.value_counts()
healthCombinedEdited['FastFoodFrequrency'] = pd.cut(healthCombinedEdited.EUFASTFDFRQ, 5, right = False, labels = labels)
fig=plt.figure(figsize=(18, 16), dpi= 80, facecolor='w', edgecolor='k')
sns.boxplot(x = 'FastFoodFrequrency', y =  "ERBMI", data = healthCombinedEdited)
display()

In [175]:
healthCombinedEdited.EUGENHTH.value_counts(dropna = False)
fig=plt.figure(figsize=(18, 16), dpi= 80, facecolor='w', edgecolor='k')
sns.boxplot(x = 'EUGENHTH', y =  "ERBMI", data = healthCombinedEdited.loc[healthCombinedEdited['EUGENHTH'].isin([1,2,3,4,5])])
display()

In [176]:
healthCombinedEdited.TESCHENR.value_counts(dropna = False)
fig=plt.figure(figsize=(18, 16), dpi= 80, facecolor='w', edgecolor='k')
sns.boxplot(x = 'TESCHENR', y =  "ERBMI", data = healthCombinedEdited.loc[healthCombinedEdited['TESCHENR'].isin([1,2])])
display()

In [177]:
healthCombinedEdited.EUFASTFDFRQ.value_counts()
healthCombinedEdited['FastFoodFrequrency'] = pd.cut(healthCombinedEdited.EUFASTFDFRQ, 5, right = False, labels = labels)
fig=plt.figure(figsize=(18, 16), dpi= 80, facecolor='w', edgecolor='k')
sns.boxplot(x = 'FastFoodFrequrency', y =  "ERBMI", data = healthCombinedEdited)
display()

In [178]:
# Copy the filtered rows so the new column is added to an independent frame and
# not to a slice of healthCombinedEdited (avoids SettingWithCopyWarning).
tempDF = healthCombinedEdited.loc[healthCombinedEdited['TRERNWA'] != -1].copy()
# Bin TRERNWA into five equal-width groups named by `labels`.
tempDF['Earnings'] = pd.cut(tempDF.TRERNWA, 5, right = False, labels = labels)
fig=plt.figure(figsize=(18, 16), dpi= 80, facecolor='w', edgecolor='k')
sns.boxplot(x = 'Earnings', y =  "ERBMI", data = tempDF)
display()