### Import and Configuration:

In [None]:
import numpy as np
import pandas as pd
import random
import matplotlib.pyplot as plt
from wordcloud import WordCloud
from sklearn.decomposition import PCA
from IPython.core.display import display, HTML

import seaborn as sns
from sklearn.cluster import KMeans
from IPython.html.widgets import interact

# Widen the notebook container and left-align rendered PNG outputs.
display(HTML("""
<style>
.output_png{
    display: table-cell;
    text-align: left;
    vertical-align: middle;
}
.container{width:90% !important;}
</style>"""))

# Seed both generators: the stdlib seed alone does not affect numpy, and the
# random draws in this notebook come from np.random.
random.seed(149)
np.random.seed(149)

In [None]:
def loaddata():
    """Read 'food_coded.csv' from the working directory into a DataFrame."""
    frame = pd.read_csv('food_coded.csv')
    return frame


data = loaddata()

## Preprocessing:

#### Employment:

In [None]:
# employment: fill the missing values by sampling from the empirical
# distribution of the answers that are present.
employment_counts = data['employment'].value_counts()
probibility = employment_counts / employment_counts.sum()
missing_employment = data['employment'].isnull()
count_of_nas = int(missing_employment.sum())

# Inverse-CDF sampling: each uniform draw selects the first category whose
# cumulative probability reaches it.
draws = np.random.uniform(size=count_of_nas)
positions = probibility.cumsum().searchsorted(draws)
# The cumulative sum can end a hair below 1.0 through float rounding; keep
# the position inside the valid range.
positions = np.minimum(positions, len(probibility) - 1)

# Translate positions into employment codes through the counts' own index
# instead of hard-coding the order value_counts() happened to return.
values = probibility.index.values[positions]
data.loc[missing_employment, 'employment'] = values

#### Sports:

We looked at the students' answers and compiled a list of the sports they mentioned, then replaced each free-text answer with a comma-separated list of the sports found in it. Moreover, answers that mention none of the listed sports (e.g. "I don't play sports", "not really") and missing answers are replaced with the placeholder string 'None'.

In [None]:
# Known sports, longest name first so that a multi-word sport such as
# 'field hockey' is matched before a shorter name it contains ('hockey').
sports = ['car racing', 'basketball', 'softball', 'soccer', 'field hockey', 'running', 'volleyball', 'hockey', \
          'dancing', 'tennis', 'gym', 'football', 'lacrosse', 'snowboarding', 'wrestling', 'rowing', 'skiing', \
          'water polo', 'baseball', 'horse back riding', 'golf']
sports.sort(key=len, reverse=True)

def func(x):
    """Reduce a free-text sports answer to a comma-separated list of known sports.

    Parameters
    ----------
    x : str or missing value
        The raw answer; matching is case-insensitive substring search
        against the module-level ``sports`` list.

    Returns
    -------
    str
        The matched sports joined by ',' in the order of ``sports``, or the
        string 'None' when the answer is missing or matches nothing.
    """
    if pd.isnull(x):
        return 'None'
    remaining = x.lower()
    found = []
    for sport in sports:
        if sport in remaining:
            found.append(sport)
            # Strings are immutable, so the result must be kept; otherwise
            # 'field hockey' would also be counted as 'hockey'. A space is
            # left behind so the surrounding text cannot fuse into a new match.
            remaining = remaining.replace(sport, ' ')
    if not found:
        return 'None'
    return ','.join(found)
# Normalise every free-text answer in the type_sports column with func.
data.type_sports = data.type_sports.apply(func=func)

In [None]:
# Columns selected from `data` to build the feature table used by the PCA /
# K-means cells further down (see `clustering_data`).
column_names = ['GPA',
 'Gender',
 'breakfast',
 'calories_chicken',
 'calories_scone',
 'coffee',
 'cook',
 'comfort_food_reasons_coded',
 'cuisine',
 'diet_current_coded',
 'drink',
 'eating_changes_coded',
 'eating_changes_coded1',
 'eating_out',
 'employment',
 'ethnic_food',
 'exercise',
 'father_education',
 'fav_cuisine_coded',
 'fav_food',
 'fries',
 'fruit_day',
 'grade_level',
 'greek_food',
 'healthy_feeling',
 'ideal_diet_coded',
 'income',
 'indian_food',
 'italian_food',
 'life_rewarding',
 'marital_status',
 'mother_education',
 'nutritional_check',
 'on_off_campus',
 'parents_cook',
 'pay_meal_out',
 'persian_food',
 'self_perception_weight',
 'soup',
 'sports',
 'thai_food',
 'tortilla_calories',
 'turkey_calories',
 'veggies_day',
 'waffle_calories',
 'weight']

In [None]:
# Missing weights get the mean of the known weights; every remaining gap in
# any column is then filled with that column's most frequent value.
weight_mean = np.nanmean(data['weight'])
data['weight'] = data['weight'].fillna(weight_mean)
for col in data.columns:
    most_common = data[col].mode()[0]
    data.loc[:, col] = data[col].fillna(most_common)

## Plots:
### First: descriptive statistics:

In [None]:
def bar_chart(attribute, ticks_values,label):
    """Plot how many rows of `data` fall into each value of `attribute`.

    Parameters
    ----------
    attribute : str
        Column of the module-level `data` frame to count.
    ticks_values : sequence of str
        One tick label per distinct value, given in ascending order of the
        coded value (the same convention grouped_barchart uses for its
        x_labels).
    label : str
        Text for the x axis.
    """
    plt.clf()
    plt.rcParams['figure.figsize'] = (7,7)
    # value_counts() alone orders by frequency, which would pair the tick
    # labels with the wrong bars; sort by the coded value instead.
    counts = data[attribute].value_counts().sort_index()
    positions = np.arange(len(counts))
    width = 0.5
    plt.bar(positions, counts, width, color=sns.color_palette("Pastel1"))
    plt.xlabel(label, fontsize=16)
    plt.ylabel('Count', fontsize=16)
    plt.tick_params(axis='both',which='major', labelsize=15)

    plt.xticks(positions, ticks_values)

    plt.tight_layout()
    plt.show()

def piechart():
    """Draw a pie chart of the row counts for Gender code 1 ('Female') and 2 ('Male')."""
    gender_labels = ['Female', 'Male']
    gender_counts = [sum(data.Gender == code) for code in (1, 2)]
    plt.pie(
        gender_counts,
        labels=gender_labels,
        colors=sns.color_palette("Pastel1"),
        autopct='%1.1f%%',
    )
    #plt.axis('equal')
    plt.show()

# Gender split, followed by one count chart per coded attribute.
piechart()
bar_chart(attribute='grade_level',
          ticks_values=('freshman', 'Sophomore', 'Junior', 'Senior'),
          label='Grade level')
bar_chart(attribute='diet_current_coded',
          ticks_values=('healthy', 'unhelathy', 'same thing', 'unclear'),
          label='current diet')
bar_chart(attribute='eating_changes_coded',
          ticks_values=('worse', 'better', 'same', 'unclear'),
          label='eating changes coded')

In [None]:
def word_cloud(attribute,title):
    """Show a word cloud built from the text answers in one column of `data`.

    Parameters
    ----------
    attribute : str
        Column of the module-level `data` frame holding the text answers.
    title : str
        Title drawn above the image.
    """
    # str(Series) is the console repr: it carries index labels and a
    # "Name: ..., dtype: ..." footer and is shortened for long Series, so it
    # is not the answers themselves. Join the actual values instead.
    text = ' '.join(data[attribute].dropna().astype(str))
    wordcloud = WordCloud(background_color='white',colormap='Dark2').generate(text)
    plt.rcParams['figure.figsize'] = (12,10)
    plt.imshow(wordcloud)
    plt.title(title, fontsize=20)
    plt.box(on=True)
    plt.axis('off')
    plt.show()
# One word cloud per free-text column.
word_cloud(attribute='comfort_food', title='Comfort Food')
word_cloud(attribute='comfort_food_reasons', title='Comfort Food Reasons')
word_cloud(attribute='food_childhood', title='Favorite food in childhood')
word_cloud(attribute='type_sports', title='Type of sport practiced')

### Second: relation between attributes:

In [None]:
def boxplot(attribute):
    """Draw side-by-side box plots of `attribute` for Gender code 1 and code 2.

    Parameters
    ----------
    attribute : str
        Numeric column of the module-level `data` frame; missing values are
        dropped per group before plotting.
    """
    plt.figure(dpi=100, figsize=(10,10))
    plt.rc('axes', axisbelow=True)
    gender_1 = list(data.query('Gender==1')[attribute].dropna())
    gender_2 = list(data.query('Gender==2')[attribute].dropna())
    # Gender 1 is Female and 2 is Male elsewhere in this notebook (see
    # piechart), so the labels must follow that order.
    plt.boxplot([gender_1, gender_2], labels=["Female", "Male"])
    plt.xlabel(attribute)
    plt.show()

def bubble_chart(x_attribute, x_ticks, y_attribute, y_ticks, size):
    """Scatter one bubble per (x, y) value pair, sized by how many rows share it.

    x_attribute / y_attribute name columns of the module-level `data` frame.
    x_ticks and y_ticks are tick labels; an empty y_ticks falls back to the
    distinct rounded y values, and an empty x_ticks leaves the x labels
    untouched. `size` scales the bubbles: each marker size is
    (size * 20 * row_count) ** 2.
    """
    plt.clf()
    plt.rcParams['figure.figsize'] = (25,15)
    fig, ax = plt.subplots()

    distinct_x = data[x_attribute].round(decimals=1).dropna().unique()
    for x_value in distinct_x:
        rows_at_x = data.query('%s == %f' % (x_attribute, x_value))
        y_values = rows_at_x[y_attribute].dropna().unique()
        y_values.sort()

        bubble_sizes = []
        for y_value in y_values:
            matching = data.query('%s == %f & %s == %f' % (x_attribute, x_value, y_attribute, y_value))
            bubble_sizes.append((size * 20 * len(matching)) ** 2)

        plt.scatter([x_value] * len(y_values), y_values, s=bubble_sizes,
                    color="#00ace6", alpha=0.6, edgecolors="black", linewidth=2)

    if len(y_ticks) == 0:
        y_ticks = data[y_attribute].round(decimals=1).dropna().unique()
    # Tick positions are 0..len(y_ticks)-1, one label per position.
    plt.yticks(range(len(y_ticks)), y_ticks)

    if len(x_ticks) != 0:
        ax.set_xticklabels(x_ticks)
    plt.ylabel(y_attribute, fontsize= 16)
    plt.xlabel(x_attribute, fontsize= 16)
    plt.tick_params(axis='both', which='major', labelsize=15)
    plt.margins(0.1)
    plt.show()
    
def grouped_barchart(x_attribute, x_labels, y_attribute, legend, legend_title, plot_title=''):
    """Plot, for every x value, one bar per y value showing the row count.

    Parameters
    ----------
    x_attribute, y_attribute : str
        Columns of the module-level `data` frame. Their distinct values
        (rounded to one decimal, sorted ascending) define the groups and the
        bars within each group.
    x_labels : sequence of str
        Tick labels for the groups, in ascending order of the x value.
    legend : sequence of str
        Legend entries for the bars, in ascending order of the y value.
    legend_title : str
        Title shown above the legend.
    plot_title : str, optional
        Figure title; when empty, '<x> vs <y>' is built from the attribute
        names with underscores replaced by spaces.
    """
    plt.clf()
    plt.rcParams['figure.figsize'] = (25,10)
    fig, ax = plt.subplots()

    x_axis_unique = data[x_attribute].round(decimals=1).dropna().unique()
    x_axis_unique.sort()

    y_axis_unique = data[y_attribute].round(decimals=1).dropna().unique()
    y_axis_unique.sort()

    ind = np.arange(len(x_axis_unique))    # the x locations for the groups
    width = 0.15                           # the width of the bars
    likert_colors = sns.color_palette("Pastel1")
    for i, y in enumerate(y_axis_unique):
        counts = [len(data.query('%s == %f & %s == %f' % (x_attribute, x, y_attribute, y)))
                  for x in x_axis_unique]
        # Wrap around the palette rather than fail when there are more y
        # values than colours.
        bar_color = likert_colors[i % len(likert_colors)]
        plt.bar(ind + i*width, counts, width, bottom=0, color=bar_color, align='center')

    if plot_title == '': plot_title = ('%s vs %s' % (x_attribute.replace('_', ' '), y_attribute.replace('_', ' ')))
    ax.set_title(plot_title,  fontsize=20)
    # Bar i is centred at ind + i*width, so the middle of a group of n bars
    # sits at ind + (n - 1) * width / 2 (the previous width / 2 was only
    # right for two bars per group).
    ax.set_xticks(ind + (len(y_axis_unique) - 1) * width / 2.0)
    ax.set_xticklabels(x_labels)

    plt.xlabel(x_attribute, fontsize=20)
    plt.ylabel('Count', fontsize=20)
    plt.tick_params(axis='both', which='major', labelsize=15)
    plt_legend = plt.legend(legend, title=legend_title, fontsize=15)
    plt_legend.get_title().set_fontsize('15')
    plt.show()
    
def horizontal_barchart(y_attribute, y_ticks, x_attribute, legend=[], legend_title='', plot_title=''):
    """Draw a two-sided horizontal bar chart of row counts split by Gender.

    For every (y value, x value) pair the rows with Gender == 1 are counted
    and drawn to the right of zero, and the rows with Gender == 2 are drawn
    to the left as negative counts. Elsewhere in this notebook Gender 1 is
    labelled Female and 2 Male.

    Parameters
    ----------
    y_attribute : str
        Column of `data` whose distinct values become the bar groups on the
        y axis.
    y_ticks : sequence of str
        Labels for the y groups, placed at positions 0..len(y_ticks)-1.
    x_attribute : str
        Column of `data` whose distinct values become the coloured bars
        inside each group.
    legend : sequence of str, optional
        Legend entries, in ascending order of the x value. Never mutated.
    legend_title : str, optional
        Title shown above the legend.
    plot_title : str, optional
        Figure title; nothing is drawn when empty.
    """
    plt.clf()
    plt.rcParams['figure.figsize'] = (25,10)

    x_axis_unique = data[x_attribute].round(decimals=1).dropna().unique()
    x_axis_unique.sort()

    y_axis_unique = data[y_attribute].round(decimals=1).dropna().unique()
    y_axis_unique.sort()

    likert_colors = sns.color_palette('Pastel1')

    def _count(x, y, gender):
        # Number of rows with this x value, y value and Gender code.
        return len(data.query('%s == %f & %s == %f & Gender == %d'
                              % (x_attribute, x, y_attribute, y, gender)))

    # One row per y value, one column per x value.
    gender1_counts = [[_count(x, y, 1) for x in x_axis_unique] for y in y_axis_unique]
    # Negated so that these bars extend to the left of the zero line.
    gender2_counts = [[-_count(x, y, 2) for x in x_axis_unique] for y in y_axis_unique]

    gender1_data = pd.DataFrame(gender1_counts,
                    columns=x_axis_unique,
                    index=y_axis_unique)
    ax = gender1_data.plot.barh(color=likert_colors,  edgecolor='none', legend=False)

    gender2_data = pd.DataFrame(gender2_counts,
                    columns=x_axis_unique,
                    index=y_axis_unique)
    gender2_data.plot.barh(color= likert_colors, edgecolor='none', ax = ax)

    plt.yticks(range(len(y_ticks)), y_ticks)
    plt.tick_params(axis='both', which='major', labelsize=15)
    if plot_title:
        plt.title(plot_title, fontsize=20)

    plt_legend = plt.legend(legend, title=legend_title, fontsize=15)
    plt_legend.get_title().set_fontsize('15')
    plt.axvline(x=0, color='black', alpha=0.6, linestyle='-')
    plt.show()

In [None]:
# Eating-out frequency against how often nutrition labels are checked.
grouped_barchart(
    x_attribute='eating_out',
    x_labels=['Never', '1-2 times', '2-3 times', '3-5 times', 'everyday'],
    y_attribute='nutritional_check',
    legend=['Never', 'on certain products only', 'very rarely', 'on most products', 'on everything'],
    legend_title='How Often do you Check Nutrition',
)

# Eating-out frequency split by gender.
grouped_barchart(
    x_attribute='eating_out',
    x_labels=['Never', '1-2 times', '2-3 times', '3-5 times', 'everyday'],
    y_attribute='Gender',
    legend=['female','male'],
    legend_title='Eating out for Males & Females',
)
# grouped_barchart('eating_out', ['Never', '1-2 times', '2-3 times', '3-5 times', 'everyday'], 'on_off_campus', ['On campus', 'rent out of campus', 'with parents', 'own house'], 'Accommodation', 'Students accommodation and eating out habbits')

# Current diet against reported eating changes.
grouped_barchart(
    x_attribute='diet_current_coded',
    x_labels=['healthy' ,'unhelathy' ,'same thing' ,'unclear'],
    y_attribute='eating_changes_coded',
    legend=['worse' ,'better' ,'same' ,'unclear'],
    legend_title='Current Diet ',
)

# Weight against favourite cuisine, bubble size showing the row count.
bubble_chart(
    x_attribute='weight',
    x_ticks=[],
    y_attribute='fav_cuisine_coded',
    y_ticks=["none", "Italian/French/greek", "Spanish/mexican", "Arabic/Turkish", "asian/chineses/thai/nepal", "American", "African", "Jamaican", "indian"],
    size=0.9,
)

### TODO: ADD comments

In [None]:
# Weight distribution per gender.
boxplot(attribute='weight')

### TODO: Comments

In [None]:
# Exercise frequency against eating out, split by gender.
horizontal_barchart(
    y_attribute='exercise',
    y_ticks=['Daily', 'once a week', 'twice a week'],
    x_attribute='eating_out',
    legend=['Never', '1-2 times', '2-3 times', '3-5 times', 'everyday'],
)

# Gender coding: Female = 1, Male = 2






### TODO: ADD comments

In [None]:
# Same exercise / eating-out chart as the earlier cell.
horizontal_barchart(
    y_attribute='exercise',
    y_ticks=['Daily', 'once a week', 'twice a week'],
    x_attribute='eating_out',
    legend=['Never', '1-2 times', '2-3 times', '3-5 times', 'everyday'],
)

### TODO: ADD comments

## PCA:

In [None]:
# Project the selected feature columns onto their first two principal
# components; `matrix` holds one row per component for plotting.
clustering_data = pd.DataFrame(data.loc[:, column_names])
pca = PCA(n_components=2)
pca.fit(clustering_data)
fit = pca.transform(clustering_data)
matrix = fit.T

In [None]:
colors = sns.color_palette(n_colors=15)
# Columns with fewer than five distinct values are offered as grouping choices.
categorical_columns = [col for col in clustering_data.columns if len(clustering_data[col].unique()) < 5]

@interact(n_clusters=(1,10), group_by=categorical_columns)
def draw_plot(n_clusters, group_by):
    """Show the PCA projection twice: coloured by K-means cluster and by a column.

    Parameters
    ----------
    n_clusters : int
        Number of K-means clusters fitted on `clustering_data`.
    group_by : str
        Column of `clustering_data` whose value decides each point's colour
        in the second panel.
    """
    plt.clf()

    plt.rcParams['figure.figsize'] = (18, 15)
    instance = KMeans(n_clusters=n_clusters, random_state = 102)
    clusters_assignment = instance.fit_predict(clustering_data)

    plt.subplot(221)
    plt.scatter(matrix[0], matrix[1], s=50, c=[colors[i] for i in clusters_assignment])
    plt.title("K-means Clusters", fontsize=20)

    plt.subplot(222)
    # One colour per ROW, chosen by that row's value in the grouping column.
    # The previous list held one colour per distinct value, so its length did
    # not match the number of points. Codes start at 0 in order of first
    # appearance (-1 for missing values); the +1 keeps the palette offset the
    # original enumerate(..., 1) used.
    group_codes, _ = pd.factorize(clustering_data[group_by])
    plt.scatter(matrix[0], matrix[1], s=50, c=[colors[code + 1] for code in group_codes])
    plt.title("Clusters By '%s' Column" % group_by, fontsize=20)
    plt.show()

In [None]:
# colors = sns.color_palette(n_colors=15)
# categorical_columns = [col for col in clustering_data.columns if len(clustering_data[col].unique()) < 5]
# clusters_assignment = []


# @interact(n_clusters=(1,10), group_by=categorical_columns, choice=(1, 10), reset=False)
# def draw_plot(n_clusters, group_by, choice, reset):
#     global clustering_data
#     global clusters_assignment
#     plt.clf()
#     if choice > n_clusters:
#         print "Your choosen cluster cannot exceed n_clusters"
#         return
#     if reset:
#         clustering_data = pd.DataFrame(data.loc[:, column_names])
#     else:
#         indices = list(np.where(clusters_assignment == choice)[0])
#         print indices
#         clustering_data = clustering_data.loc[indices, :]
#     print clustering_data
    
    
#     pca =  PCA(n_components=2)
#     fit = pca.fit(clustering_data).transform(clustering_data)

#     matrix = fit.transpose()


#     plt.rcParams['figure.figsize'] = (17, 15)
#     instance = KMeans(n_clusters=n_clusters, random_state = 102)
#     clusters_assignment = instance.fit_predict(clustering_data)
#     print clusters_assignment
    
# #         print reset
# # #         reset = False
# #         return
# #     else:
# #         clustering_data = clustering_data.loc[clusters_assignment == choice,:]
    
#     plt.subplot(221)
#     plt.scatter(matrix[0], matrix[1], s=20, c=[colors[i] for i in clusters_assignment])

#     plt.subplot(222)
#     column_unique_values = clustering_data[group_by].unique()
#     plt.scatter(matrix[0], matrix[1], s=40, c=[colors[ind] for ind, val in enumerate(column_unique_values, 1)])
#     plt.show()

In [None]:
#TODO
# cuisine & weight

# current diet & eating changes
# weight & eating out
# weight & exercise 
# nutritional_check
# on_off_campus
# parents_cook
# sports


#
#data.Gender.value_counts()
