# Tidy Data

In [1]:
import pandas as pd

## Exercise 1

Attendance Data

Load the `attendance.csv` file and calculate an attendance percentage for each student. One half day is worth 50% of a full day, and 10 tardies are equal to one absence.

You should end up with something like this:
```
name
Billy    0.5250
Jane     0.6875
John     0.9125
Sally    0.7625
Name: grade, dtype: float64
```

In [4]:
# Read the raw attendance sheet: one row per student, one column per date.
attendance_csv = 'untidy-data/attendance.csv'
df = pd.read_csv(attendance_csv)
df

Unnamed: 0.1,Unnamed: 0,2018-01-01,2018-01-02,2018-01-03,2018-01-04,2018-01-05,2018-01-06,2018-01-07,2018-01-08
0,Sally,P,T,T,H,P,A,T,T
1,Jane,A,P,T,T,T,T,A,T
2,Billy,A,T,A,A,H,T,P,T
3,John,P,T,H,P,P,T,P,P


In [12]:
# Inspect the column labels; the student-name column has no header in the
# CSV, so it shows up as 'Unnamed: 0'.
df.columns

Index(['Unnamed: 0', '2018-01-01', '2018-01-02', '2018-01-03', '2018-01-04',
       '2018-01-05', '2018-01-06', '2018-01-07', '2018-01-08'],
      dtype='object')

In [14]:
# The student-name column arrives without a header ('Unnamed: 0'); label it 'name'.
column_labels = {'Unnamed: 0': 'name'}
df = df.rename(columns=column_labels)
df

Unnamed: 0,name,2018-01-01,2018-01-02,2018-01-03,2018-01-04,2018-01-05,2018-01-06,2018-01-07,2018-01-08
0,Sally,P,T,T,H,P,A,T,T
1,Jane,A,P,T,T,T,T,A,T
2,Billy,A,T,A,A,H,T,P,T
3,John,P,T,H,P,P,T,P,P


In [16]:
# Reshape wide -> long: one row per (student, date), with the attendance
# code stored in 'record'.
df = df.melt(
    id_vars='name',
    var_name='date',
    value_name='record',
)
df.head()

Unnamed: 0,name,date,record
0,Sally,2018-01-01,P
1,Jane,2018-01-01,A
2,Billy,2018-01-01,A
3,John,2018-01-01,P
4,Sally,2018-01-02,T


In [17]:
def convert_attendance(record):
    """Convert a one-letter attendance code into the fraction of a day attended.

    Parameters
    ----------
    record : str
        One of "P" (present), "A" (absent), "H" (half day) or "T" (tardy).

    Returns
    -------
    int or float
        1 for "P", 0 for "A", 0.5 for "H" and 0.9 for "T" (ten tardies add up
        to one absence, so each tardy costs a tenth of a day).

    Raises
    ------
    ValueError
        If ``record`` is not one of the four known codes. Unknown or missing
        values are rejected instead of being silently scored as a tardy.
    """
    credit_by_code = {"P": 1, "A": 0, "H": 0.5, "T": 0.9}
    try:
        return credit_by_code[record]
    except (KeyError, TypeError):
        # KeyError: unknown code (including NaN); TypeError: unhashable value.
        raise ValueError(
            f"Unrecognized attendance code: {record!r} "
            f"(expected one of {sorted(credit_by_code)})"
        ) from None

In [18]:
# Score every attendance code with convert_attendance and store it in 'rate'.
df["rate"] = df["record"].apply(convert_attendance)
df.head()

Unnamed: 0,name,date,record,rate
0,Sally,2018-01-01,P,1.0
1,Jane,2018-01-01,A,0.0
2,Billy,2018-01-01,A,0.0
3,John,2018-01-01,P,1.0
4,Sally,2018-01-02,T,0.9


In [19]:
# Average attendance: mean daily rate for each student.
per_student = df.groupby('name')
per_student['rate'].mean()

name
Billy    0.5250
Jane     0.6875
John     0.9125
Sally    0.7625
Name: rate, dtype: float64

## Exercise 2
Coffee Levels

1. Read the `coffee_levels.csv` file.
- Transform the data so that each carafe is in its own column.
- Is this the best shape for the data?

In [54]:
# Load the coffee readings (columns: hour, coffee_carafe, coffee_amount).
df = pd.read_csv('untidy-data/coffee_levels.csv')
# Preview five random rows; the fixed seed keeps the preview identical on
# every Restart & Run All instead of changing with each execution.
df.sample(5, random_state=42)

Unnamed: 0,hour,coffee_carafe,coffee_amount
11,9,y,0.521502
1,9,x,0.451018
7,15,x,0.215043
26,14,z,0.864464
4,12,x,0.898291


In [25]:
# How many readings does each carafe have?
df['coffee_carafe'].value_counts()

x    10
z    10
y    10
Name: coffee_carafe, dtype: int64

In [26]:
# How many readings were taken at each hour?
df['hour'].value_counts()

17    3
16    3
15    3
14    3
13    3
12    3
11    3
10    3
9     3
8     3
Name: hour, dtype: int64

In [55]:
# Transform the data so that each carafe is in its own column:
# rows are hours, columns are carafes, cells hold the coffee amount.
df = df.pivot_table(
    index='hour',
    columns=['coffee_carafe'],
    values='coffee_amount',
)
df

coffee_carafe,x,y,z
hour,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
8,0.816164,0.189297,0.999264
9,0.451018,0.521502,0.91599
10,0.843279,0.023163,0.144928
11,0.335533,0.235529,0.311495
12,0.898291,0.017009,0.771947
13,0.310711,0.997464,0.39852
14,0.507288,0.058361,0.864464
15,0.215043,0.144644,0.436364
16,0.183891,0.544676,0.280621
17,0.39156,0.594126,0.436677


In [62]:
# Is this the best shape for the data?
# Flip the table to compare: one row per carafe, one column per hour.
df = df.T
df

hour,8,9,10,11,12,13,14,15,16,17
coffee_carafe,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
x,0.816164,0.451018,0.843279,0.335533,0.898291,0.310711,0.507288,0.215043,0.183891,0.39156
y,0.189297,0.521502,0.023163,0.235529,0.017009,0.997464,0.058361,0.144644,0.544676,0.594126
z,0.999264,0.91599,0.144928,0.311495,0.771947,0.39852,0.864464,0.436364,0.280621,0.436677


## Exercise 3
Cake Recipes

1. Read the `cake_recipes.csv` data. This data set contains cake tastiness scores for combinations of different recipes, oven rack positions, and oven temperatures.
- Tidy the data as necessary.
- Which recipe, on average, is the best? recipe b
- Which oven temperature, on average, produces the best results? 275
- Which combination of recipe, rack position, and temperature gives the best result? recipe b, bottom rack, 300 degrees

In [36]:
# Read the cake scores: oven temperatures are spread across columns, and
# recipe and rack position share a single 'recipe:position' column.
cake_csv = 'untidy-data/cake_recipes.csv'
df = pd.read_csv(cake_csv)
df.head()

Unnamed: 0,recipe:position,225,250,275,300
0,a:bottom,61.738655,53.912627,74.41473,98.786784
1,a:top,51.709751,52.009735,68.576858,50.22847
2,b:bottom,57.09532,61.904369,61.19698,99.248541
3,b:top,82.455004,95.224151,98.594881,58.169349
4,c:bottom,96.470207,52.001358,92.893227,65.473084


In [40]:
# Break labels such as 'a:bottom' into two columns at the colon.
combined_label = df['recipe:position']
recipe = combined_label.str.split(':', expand=True)
recipe.head()

Unnamed: 0,0,1
0,a,bottom
1,a,top
2,b,bottom
3,b,top
4,c,bottom


In [41]:
# Replace the default 0/1 column labels of the split result with real names.
split_labels = ['recipe', 'position']
recipe.columns = split_labels
recipe

Unnamed: 0,recipe,position
0,a,bottom
1,a,top
2,b,bottom
3,b,top
4,c,bottom
5,c,top
6,d,bottom
7,d,top


In [43]:
# Attach the split columns side by side, then discard the combined original.
with_parts = pd.concat([df, recipe], axis=1)
df = with_parts.drop(columns='recipe:position')
df

Unnamed: 0,225,250,275,300,recipe,position
0,61.738655,53.912627,74.41473,98.786784,a,bottom
1,51.709751,52.009735,68.576858,50.22847,a,top
2,57.09532,61.904369,61.19698,99.248541,b,bottom
3,82.455004,95.224151,98.594881,58.169349,b,top
4,96.470207,52.001358,92.893227,65.473084,c,bottom
5,71.306308,82.795477,92.098049,53.960273,c,top
6,52.799753,58.670419,51.747686,56.18311,d,bottom
7,96.873178,76.101363,59.57162,50.971626,d,top


In [46]:
# Melt: gather the temperature columns into (temperature, score) rows,
# keeping recipe and position as identifiers.
df = df.melt(
    id_vars=['recipe', 'position'],
    var_name='temperature',
    value_name='score',
)
df.head()

Unnamed: 0,recipe,position,temperature,score
0,a,bottom,225,61.738655
1,a,top,225,51.709751
2,b,bottom,225,57.09532
3,b,top,225,82.455004
4,c,bottom,225,96.470207


In [47]:
# Which recipe, on average, is the best? recipe b
by_recipe = df.groupby('recipe')
by_recipe['score'].mean()
# recipe b has the highest mean score

recipe
a    63.922201
b    76.736074
c    75.874748
d    62.864844
Name: score, dtype: float64

In [48]:
# Which oven temperature, on average, produces the best results? 275
by_temperature = df.groupby('temperature')
by_temperature['score'].mean()
# 275 degrees has the best mean score

temperature
225    71.306022
250    66.577437
275    74.886754
300    66.627655
Name: score, dtype: float64

In [52]:
# Which combination of recipe, rack position, and temperature gives the best result?
# recipe b, bottom rack, 300 degrees
combo_scores = df.groupby(['recipe', 'position', 'temperature'])['score'].mean()
combo_scores.nlargest()
# top entry: b, bottom, 300 -> 99.248541

recipe  position  temperature
b       bottom    300            99.248541
a       bottom    300            98.786784
b       top       275            98.594881
d       top       225            96.873178
c       bottom    225            96.470207
Name: score, dtype: float64