In [1]:
import pandas as pd
import numpy as np

In [2]:
df = pd.read_csv('data/food_coded.csv')

## Objective Questions

1. Is there any correlation between favorite cuisine with the type of cuisine that the student eat growing up?
2. What image do students associate the word 'coffee' with?
3. What comfort food do students like?
4. Does being employed affect students' frequency of eating out?
5. Does more income mean more frequency of eating out?
6. Does self perception in weight affect how students see how much calories are in food?
7. Reasons for eating comfort food

In [53]:
used_columns = ['calories_scone', 'coffee', 'comfort_food', 'comfort_food_reasons', 'cuisine', 'eating_out', 'employment', 'fav_cuisine_coded',
               'income', 'self_perception_weight', 'tortilla_calories', 'waffle_calories']

In [54]:
df_used = df[used_columns].copy()

## Data Assessment

In [55]:
df_used.head()

Unnamed: 0,calories_scone,coffee,comfort_food,comfort_food_reasons,cuisine,eating_out,employment,fav_cuisine_coded,income,self_perception_weight,tortilla_calories,waffle_calories
0,315.0,1,none,we dont have comfort,,3,3.0,3,5.0,3.0,1165.0,1315
1,420.0,2,"chocolate, chips, ice cream","Stress, bored, anger",1.0,2,2.0,1,4.0,3.0,725.0,900
2,420.0,2,"frozen yogurt, pizza, fast food","stress, sadness",3.0,2,3.0,1,6.0,6.0,1165.0,900
3,420.0,2,"Pizza, Mac and cheese, ice cream",Boredom,2.0,2,3.0,3,6.0,5.0,725.0,1315
4,420.0,2,"Ice cream, chocolate, chips","Stress, boredom, cravings",2.0,2,2.0,1,6.0,4.0,940.0,760


In [56]:
df_used.shape

(125, 12)

In [57]:
df_used.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 125 entries, 0 to 124
Data columns (total 12 columns):
calories_scone            124 non-null float64
coffee                    125 non-null int64
comfort_food              124 non-null object
comfort_food_reasons      124 non-null object
cuisine                   108 non-null float64
eating_out                125 non-null int64
employment                116 non-null float64
fav_cuisine_coded         125 non-null int64
income                    124 non-null float64
self_perception_weight    124 non-null float64
tortilla_calories         124 non-null float64
waffle_calories           125 non-null int64
dtypes: float64(6), int64(4), object(2)
memory usage: 11.8+ KB


In [58]:
missing_val_count_by_column = (df_used.isnull().sum())
missing_cols = missing_val_count_by_column[missing_val_count_by_column > 0].index.values

In [59]:
missing_cols

array(['calories_scone', 'comfort_food', 'comfort_food_reasons',
       'cuisine', 'employment', 'income', 'self_perception_weight',
       'tortilla_calories'], dtype=object)

In [61]:
for col in used_columns:
    if col == 'comfort_food' or col == 'comfort_food_reasons':
        continue
    print(col, '\n', df_used[col].value_counts(), '\n')

calories_scone 
 420.0    79
980.0    23
315.0    22
Name: calories_scone, dtype: int64 

coffee 
 2    94
1    31
Name: coffee, dtype: int64 

cuisine 
 1.0    86
2.0    13
4.0     3
3.0     3
6.0     2
5.0     1
Name: cuisine, dtype: int64 

eating_out 
 2    60
3    24
1    16
4    13
5    12
Name: eating_out, dtype: int64 

employment 
 2.0    60
3.0    54
1.0     2
Name: employment, dtype: int64 

fav_cuisine_coded 
 1    59
4    22
5    15
2    15
0     6
8     4
3     2
7     1
6     1
Name: fav_cuisine_coded, dtype: int64 

income 
 6.0    41
5.0    33
4.0    20
3.0    17
2.0     7
1.0     6
Name: income, dtype: int64 

self_perception_weight 
 3.0    45
2.0    31
4.0    31
1.0     6
5.0     6
6.0     5
Name: self_perception_weight, dtype: int64 

tortilla_calories 
 1165.0    46
940.0     43
725.0     22
580.0     13
Name: tortilla_calories, dtype: int64 

waffle_calories 
 1315    62
900     38
760     22
575      3
Name: waffle_calories, dtype: int64 



## Data Cleaning

### Quality

* Some columns have missing data, the column names with missing values are saved as `missing_cols`. As we will do exploratory with visualization, imputing the missing values will give wrong visualization. Thus rows with missing values will be removed on the go during the visual creation. We will fill missing vals with -99 for identification
* calories_scone is ordinal, it will be converted to 1: 107, 2: 315, 3: 420, 4: 980
* tortilla_calories is ordinal, it will be converted to 1: 520, 2: 725, 3: 940, 4: 1165
* waffle_calories is ordinal, it will be converted to 1: 575, 2:760, 3:900, 4:1315    

### Tidiness

* There are 2 columns with similar info, comfort_food_reasons_coded.1 and comfort_food_reasons_coded. However the data is inconsistent, as such we will extract the comfort_food_reasons on our own


In [62]:
df_clean = df_used.copy()

In [63]:
def convert_calories_scone(x):
    if x == 107:
        return 1
    if x == 315:
        return 2
    if x == 420:
        return 3
    if x == 980:
        return 4
    # indicate missing values
    return -99

def convert_tortilla_calories(x):
    if x == 520:
        return 1
    if x == 725:
        return 2
    if x == 940:
        return 3
    if x == 1165:
        return 4
    # indicate missing values
    return -99

def convert_waffle_calories(x):
    if x == 575:
        return 1
    if x == 760:
        return 2
    if x == 900:
        return 3
    if x == 1315:
        return 4
    # indicate missing values
    return -99

In [64]:
# Convert to ordinal
df_clean['calories_scone'] = df_clean['calories_scone'].apply(convert_calories_scone)

In [67]:
df_clean['calories_scone'].value_counts()

 3     79
 4     23
 2     22
-99     1
Name: calories_scone, dtype: int64

In [65]:
df_clean['tortilla_calories'] = df_clean['tortilla_calories'].apply(convert_tortilla_calories)

In [68]:
df_clean['tortilla_calories'].value_counts()

 4     46
 3     43
 2     22
-99    14
Name: tortilla_calories, dtype: int64

In [66]:
df_clean['waffle_calories'] = df_clean['waffle_calories'].apply(convert_waffle_calories)

In [69]:
df_clean['waffle_calories'].value_counts()

4    62
3    38
2    22
1     3
Name: waffle_calories, dtype: int64

In [73]:
# fill missing values with -99 for numerical
for col in used_columns:
    if col == 'comfort_food' or col == 'comfort_food_reasons':
        continue
    df_clean[col].fillna(-99, inplace=True)

In [74]:
df_clean.isnull().sum()

calories_scone            0
coffee                    0
comfort_food              1
comfort_food_reasons      1
cuisine                   0
eating_out                0
employment                0
fav_cuisine_coded         0
income                    0
self_perception_weight    0
tortilla_calories         0
waffle_calories           0
dtype: int64

In [76]:
df_clean['comfort_food'].head(20)

0                                                  none
1                           chocolate, chips, ice cream
2                       frozen yogurt, pizza, fast food
3                      Pizza, Mac and cheese, ice cream
4                          Ice cream, chocolate, chips 
5                             Candy, brownies and soda.
6          Chocolate, ice cream, french fries, pretzels
7                      Ice cream, cheeseburgers, chips.
8                              Donuts, ice cream, chips
9                 Mac and cheese, chocolate, and pasta 
10    Pasta, grandma homemade chocolate cake anythin...
11               chocolate, pasta, soup, chips, popcorn
12                          Cookies, popcorn, and chips
13                           ice cream, cake, chocolate
14      Pizza, fruit, spaghetti, chicken and Potatoes  
15                          cookies, donuts, candy bars
16                         Saltfish, Candy and Kit Kat 
17                            chips, cookies, ic

In [None]:
# join all rows in the comfort food into a single vector
# get most frequent food mentioned
# get only first fav food mentioned