In [1]:
import pandas as pd

### 1. Attendance Data

### 1-a. Load the attendance.csv file

In [72]:
# Read the attendance sheet using the first CSV column as the index, label
# that index 'name', then move it back into an ordinary column.
attendance = (
    pd.read_csv('untidy-data/attendance.csv', index_col=0)
    .rename_axis('name')
    .reset_index()
)
# Print column names, dtypes and non-null counts.
attendance.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4 entries, 0 to 3
Data columns (total 9 columns):
 #   Column      Non-Null Count  Dtype 
---  ------      --------------  ----- 
 0   Unnamed: 0  4 non-null      object
 1   2018-01-01  4 non-null      object
 2   2018-01-02  4 non-null      object
 3   2018-01-03  4 non-null      object
 4   2018-01-04  4 non-null      object
 5   2018-01-05  4 non-null      object
 6   2018-01-06  4 non-null      object
 7   2018-01-07  4 non-null      object
 8   2018-01-08  4 non-null      object
dtypes: object(9)
memory usage: 416.0+ bytes


In [73]:
# Peek at the first two rows of the loaded frame.
attendance.head(n=2)

Unnamed: 0.1,Unnamed: 0,2018-01-01,2018-01-02,2018-01-03,2018-01-04,2018-01-05,2018-01-06,2018-01-07,2018-01-08
0,Sally,P,T,T,H,P,A,T,T
1,Jane,A,P,T,T,T,T,A,T


### 1-b. Calculate an attendance percentage for each student

**Notes**:
* P: present --> 1
* A: Absent --> 0
* H: Half day --> 0.5
* T: Tardy --> 0.9

In [66]:
# Go from wide to long: every non-'name' column is stacked into a
# ('variable', 'value') pair, and the 'variable' label is then discarded so
# only 'name' and 'value' remain.
attendance = (
    attendance
    .melt(id_vars='name')
    .drop('variable', axis=1)
)

In [67]:
# Credit awarded for each attendance code (P, A, H, T — see the notes above).
# Codes missing from this mapping become NaN under Series.map.
credit_by_code = {'P': 1, 'A': 0, 'H': 0.5, 'T': 0.9}
attendance['value'] = attendance['value'].map(credit_by_code)

In [68]:
# Collapse to one number per student: the mean of their 'value' entries.
# After this cell `attendance` is a Series indexed by 'name'.
attendance = attendance.groupby('name')['value'].mean()

In [69]:
# Give the Series the name 'grade' (a scalar passed to Series.rename sets the
# Series name rather than relabelling the index).
attendance = attendance.rename('grade')

In [70]:
# Show the final 'grade' Series as the cell output.
attendance

name
Billy    0.5250
Jane     0.6875
John     0.9125
Sally    0.7625
Name: grade, dtype: float64

### 2. Coffee Levels

### 2-a. Read the `coffee_levels.csv` file.

In [99]:
# Load the coffee-level readings.
coffee_levels = pd.read_csv('untidy-data/coffee_levels.csv')
# Preview ten randomly chosen rows. The fixed seed keeps the preview identical
# on every Restart & Run All; an unseeded sample shows different rows each run.
coffee_levels.sample(10, random_state=42)

Unnamed: 0,hour,coffee_carafe,coffee_amount
14,12,y,0.017009
26,14,z,0.864464
25,13,z,0.39852
21,9,z,0.91599
10,8,y,0.189297
1,9,x,0.451018
9,17,x,0.39156
8,16,x,0.183891
24,12,z,0.771947
4,12,x,0.898291


### 2-b. Transform the data so that each carafe is in its own column.

In [100]:
# Count how many rows belong to each carafe label.
coffee_levels['coffee_carafe'].value_counts()

y    10
x    10
z    10
Name: coffee_carafe, dtype: int64

In [104]:
# Spread the carafes into columns: one row per hour, one column per carafe,
# cells filled from 'coffee_amount'. pivot_table aggregates with its default
# (mean) if an hour/carafe pair occurs more than once.
coffee_levels = coffee_levels.pivot_table(
    values='coffee_amount',
    index='hour',
    columns='coffee_carafe',
)
coffee_levels

coffee_carafe,x,y,z
hour,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
8,0.816164,0.189297,0.999264
9,0.451018,0.521502,0.91599
10,0.843279,0.023163,0.144928
11,0.335533,0.235529,0.311495
12,0.898291,0.017009,0.771947
13,0.310711,0.997464,0.39852
14,0.507288,0.058361,0.864464
15,0.215043,0.144644,0.436364
16,0.183891,0.544676,0.280621
17,0.39156,0.594126,0.436677


### 2-c. Is this the best shape for the data?

No:
1. Two variables are in one column: coffee_carafe and coffee_amount
2. Each row has three observations

### 3. Cake Recipes
### 3-a. Read the `cake_recipes.csv` data. This data set contains cake tastiness scores for combinations of different recipes, oven rack positions, and oven temperatures.

In [114]:
# Load the cake tastiness scores and show the full (small) table.
cake_recipes_path = 'untidy-data/cake_recipes.csv'
cake_recipes = pd.read_csv(cake_recipes_path)
cake_recipes

Unnamed: 0,recipe:position,225,250,275,300
0,a:bottom,61.738655,53.912627,74.41473,98.786784
1,a:top,51.709751,52.009735,68.576858,50.22847
2,b:bottom,57.09532,61.904369,61.19698,99.248541
3,b:top,82.455004,95.224151,98.594881,58.169349
4,c:bottom,96.470207,52.001358,92.893227,65.473084
5,c:top,71.306308,82.795477,92.098049,53.960273
6,d:bottom,52.799753,58.670419,51.747686,56.18311
7,d:top,96.873178,76.101363,59.57162,50.971626


In [120]:
# Break the combined 'recipe:position' label at the colon into two columns
# and give them proper names.
recipes_and_position = (
    cake_recipes['recipe:position']
    .str.split(':', expand=True)
    .set_axis(['recipes', 'position'], axis=1)
)

# Swap the combined column for the two new ones, appended on the right.
cake_recipes = pd.concat(
    [cake_recipes.drop(columns='recipe:position'), recipes_and_position],
    axis=1,
)

In [125]:
# Stack every column other than 'recipes' and 'position' into long form:
# the former column header goes to 'temp', the cell value goes to 'grade'.
cake_recipes = cake_recipes.melt(
    id_vars=['recipes', 'position'],
    var_name='temp',
    value_name='grade',
)

In [126]:
# Check the reshaped frame by viewing its first five rows.
cake_recipes.head(n=5)

Unnamed: 0,recipes,position,temp,grade
0,a,bottom,225,61.738655
1,a,top,225,51.709751
2,b,bottom,225,57.09532
3,b,top,225,82.455004
4,c,bottom,225,96.470207


### 3-c. Which recipe, on average, is the best?

In [139]:
# Mean grade per recipe, highest first; keep only the top entry.
(
    cake_recipes
    .groupby('recipes')['grade']
    .mean()
    .sort_values(ascending=False)
    .head(1)
)

recipes
b    76.736074
Name: grade, dtype: float64

### 3-d. Which oven temperature, on average, produces the best results?

In [138]:
# Mean grade per oven temperature, highest first; keep only the top entry.
(
    cake_recipes
    .groupby('temp')['grade']
    .mean()
    .sort_values(ascending=False)
    .head(1)
)

temp
275    74.886754
Name: grade, dtype: float64

### 3-e. Which combination of recipe, rack position, and temperature gives the best result? recipe b, bottom rack, 300 degrees

In [137]:
# Mean grade for every (recipe, rack position, temperature) combination,
# highest first; keep only the top entry.
(
    cake_recipes
    .groupby(['recipes', 'position', 'temp'])['grade']
    .mean()
    .sort_values(ascending=False)
    .head(1)
)

recipes  position  temp
b        bottom    300     99.248541
Name: grade, dtype: float64