In [2]:
import env
import pandas as pd
import pydataset as data
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

# import splitting and imputing functions
from sklearn.model_selection import train_test_split
from sklearn.impute import SimpleImputer

# turn off pink boxes for demo
import warnings
warnings.filterwarnings("ignore")

# import our own acquire module
import acquire

### 1. Attendance Data

#### Load the attendance.csv file and calculate an attendance percentage for each student. One half day is worth 50% of a full day, and 10 tardies is equal to one absence.

In [3]:
# Load the raw attendance sheet (path is relative to the notebook's working directory).
attendance = pd.read_csv('untidy-data/attendance.csv')

In [4]:
# Preview the raw table: one row per student, one column per date.
attendance

Unnamed: 0.1,Unnamed: 0,2018-01-01,2018-01-02,2018-01-03,2018-01-04,2018-01-05,2018-01-06,2018-01-07,2018-01-08
0,Sally,P,T,T,H,P,A,T,T
1,Jane,A,P,T,T,T,T,A,T
2,Billy,A,T,A,A,H,T,P,T
3,John,P,T,H,P,P,T,P,P


In [5]:
# Check column names, dtypes and non-null counts before reshaping.
attendance.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4 entries, 0 to 3
Data columns (total 9 columns):
 #   Column      Non-Null Count  Dtype 
---  ------      --------------  ----- 
 0   Unnamed: 0  4 non-null      object
 1   2018-01-01  4 non-null      object
 2   2018-01-02  4 non-null      object
 3   2018-01-03  4 non-null      object
 4   2018-01-04  4 non-null      object
 5   2018-01-05  4 non-null      object
 6   2018-01-06  4 non-null      object
 7   2018-01-07  4 non-null      object
 8   2018-01-08  4 non-null      object
dtypes: object(9)
memory usage: 416.0+ bytes


In [6]:
# pandas labelled the column holding the student names 'Unnamed: 0'; give it a
# meaningful name. Reassign instead of using inplace=True so the cell produces
# a new frame rather than mutating one that earlier cells already displayed.
attendance = attendance.rename(columns={'Unnamed: 0': 'Student'})

In [7]:
# Confirm the name column is now labelled 'Student'.
attendance

Unnamed: 0,Student,2018-01-01,2018-01-02,2018-01-03,2018-01-04,2018-01-05,2018-01-06,2018-01-07,2018-01-08
0,Sally,P,T,T,H,P,A,T,T
1,Jane,A,P,T,T,T,T,A,T
2,Billy,A,T,A,A,H,T,P,T
3,John,P,T,H,P,P,T,P,P


In [8]:
# Convert each attendance code to the fraction of a day it is worth:
#   P = 1 (full credit), T = 0.9 (10 tardies equal one absence),
#   H = 0.5 (half day),  A = 0.
# 'Student' is moved into the index first so the replacement only touches the
# date columns (a student name that happens to equal a code is left alone).
# The explicit astype(float) guarantees numeric columns instead of relying on
# replace() to infer the dtype, and fails loudly if an unknown code is present.
attendance_melt = (
    attendance
    .set_index('Student')
    .replace({'P': 1, 'T': .9, 'H': .5, 'A': 0})
    .astype(float)
    .reset_index()
)

In [10]:
# Reshape wide -> long: every date column becomes a (Date, Attendance) pair,
# giving one row per student per date.
attendance_melt = attendance_melt.melt(
    id_vars='Student',
    var_name='Date',
    value_name='Attendance',
)

In [11]:
# Inspect the long-format result: one row per student per date.
attendance_melt

Unnamed: 0,Student,Date,Attendance
0,Sally,2018-01-01,1.0
1,Jane,2018-01-01,0.0
2,Billy,2018-01-01,0.0
3,John,2018-01-01,1.0
4,Sally,2018-01-02,0.9
5,Jane,2018-01-02,1.0
6,Billy,2018-01-02,0.9
7,John,2018-01-02,0.9
8,Sally,2018-01-03,0.9
9,Jane,2018-01-03,0.9


In [12]:
# Average the daily scores per student to get each attendance percentage.
mean_attendance = attendance_melt.groupby('Student')['Attendance'].mean()

In [13]:
# Attendance percentage per student (as a fraction between 0 and 1).
mean_attendance

Student
Billy    0.5250
Jane     0.6875
John     0.9125
Sally    0.7625
Name: Attendance, dtype: float64

### 2. Coffee Levels

#### Read the coffee_levels.csv file.
#### Transform the data so that each carafe is in its own column.
#### Is this the best shape for the data?

In [14]:
# Load the coffee level readings (path is relative to the notebook's working directory).
coffee_data = pd.read_csv('untidy-data/coffee_levels.csv')

In [16]:
# Check column names, dtypes and non-null counts.
coffee_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 30 entries, 0 to 29
Data columns (total 3 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   hour           30 non-null     int64  
 1   coffee_carafe  30 non-null     object 
 2   coffee_amount  30 non-null     float64
dtypes: float64(1), int64(1), object(1)
memory usage: 848.0+ bytes


In [21]:
# Peek at the first rows: the data is long, with one row per hour per carafe.
coffee_data.head()

Unnamed: 0,hour,coffee_carafe,coffee_amount
0,8,x,0.816164
1,9,x,0.451018
2,10,x,0.843279
3,11,x,0.335533
4,12,x,0.898291


In [None]:
# NOTE: the pivot_table example that used to live in this cell was copied from
# the lesson and operated on `weather_long`, which is never defined in this
# notebook, so the cell raised NameError on Restart & Run All. The same
# pattern (pivot_table followed by reset_index to flatten the result) is
# applied to coffee_data in the next cell.

In [27]:
# Spread the carafes into their own columns: one row per hour, one column per
# carafe. reset_index() turns 'hour' back into a regular column.
coffee_tidy = (
    coffee_data
    .pivot_table(
        values='coffee_amount',
        index=['hour'],
        columns='coffee_carafe',
        aggfunc='mean',
    )
    .reset_index()
)

In [28]:
# Wide result: one row per hour with a column for each carafe (x, y, z).
coffee_tidy

coffee_carafe,hour,x,y,z
0,8,0.816164,0.189297,0.999264
1,9,0.451018,0.521502,0.91599
2,10,0.843279,0.023163,0.144928
3,11,0.335533,0.235529,0.311495
4,12,0.898291,0.017009,0.771947
5,13,0.310711,0.997464,0.39852
6,14,0.507288,0.058361,0.864464
7,15,0.215043,0.144644,0.436364
8,16,0.183891,0.544676,0.280621
9,17,0.39156,0.594126,0.436677


### 3. Cake Recipes

#### Read the cake_recipes.csv data. This data set contains cake tastiness scores for combinations of different recipes, oven rack positions, and oven temperatures.
#### Tidy the data as necessary.
#### Which recipe, on average, is the best?
**Answer:** recipe b
#### Which oven temperature, on average, produces the best results?
**Answer:** 275
#### Which combination of recipe, rack position, and temperature gives the best result?
**Answer:** recipe b, bottom rack, 300 degrees

In [30]:
# Load the cake tastiness scores (path is relative to the notebook's working directory).
cake_recipes = pd.read_csv('untidy-data/cake_recipes.csv')

In [31]:
# Raw shape: recipe and rack position share one 'recipe:position' column,
# and each oven temperature is its own column.
cake_recipes

Unnamed: 0,recipe:position,225,250,275,300
0,a:bottom,61.738655,53.912627,74.41473,98.786784
1,a:top,51.709751,52.009735,68.576858,50.22847
2,b:bottom,57.09532,61.904369,61.19698,99.248541
3,b:top,82.455004,95.224151,98.594881,58.169349
4,c:bottom,96.470207,52.001358,92.893227,65.473084
5,c:top,71.306308,82.795477,92.098049,53.960273
6,d:bottom,52.799753,58.670419,51.747686,56.18311
7,d:top,96.873178,76.101363,59.57162,50.971626


In [34]:
# Try the split first: expand=True returns the two parts as separate columns (0 and 1).
cake_recipes['recipe:position'].str.split(':', expand=True)

Unnamed: 0,0,1
0,a,bottom
1,a,top
2,b,bottom
3,b,top
4,c,bottom
5,c,top
6,d,bottom
7,d,top


In [39]:
# Store the two halves of 'recipe:position' as their own columns.
cake_recipes[['recipe', 'position']] = (
    cake_recipes['recipe:position']
    .str.split(pat=':', expand=True)
)

Unnamed: 0,recipe:position,225,250,275,300,recipe,position
0,a:bottom,61.738655,53.912627,74.41473,98.786784,a,bottom
1,a:top,51.709751,52.009735,68.576858,50.22847,a,top
2,b:bottom,57.09532,61.904369,61.19698,99.248541,b,bottom
3,b:top,82.455004,95.224151,98.594881,58.169349,b,top
4,c:bottom,96.470207,52.001358,92.893227,65.473084,c,bottom
5,c:top,71.306308,82.795477,92.098049,53.960273,c,top
6,d:bottom,52.799753,58.670419,51.747686,56.18311,d,bottom
7,d:top,96.873178,76.101363,59.57162,50.971626,d,top


In [40]:
# The combined key is redundant now that 'recipe' and 'position' are separate
# columns. Reassign instead of using inplace=True, and ignore a missing column
# so re-running this cell does not raise KeyError.
cake_recipes = cake_recipes.drop(columns='recipe:position', errors='ignore')

In [41]:
# Confirm the combined column is gone and recipe / position remain.
cake_recipes

Unnamed: 0,225,250,275,300,recipe,position
0,61.738655,53.912627,74.41473,98.786784,a,bottom
1,51.709751,52.009735,68.576858,50.22847,a,top
2,57.09532,61.904369,61.19698,99.248541,b,bottom
3,82.455004,95.224151,98.594881,58.169349,b,top
4,96.470207,52.001358,92.893227,65.473084,c,bottom
5,71.306308,82.795477,92.098049,53.960273,c,top
6,52.799753,58.670419,51.747686,56.18311,d,bottom
7,96.873178,76.101363,59.57162,50.971626,d,top


In [42]:
# Reshape wide -> long: each temperature column becomes a (temperature, score)
# pair, giving one row per recipe / position / temperature.
cake_recipes_melt = cake_recipes.melt(
    id_vars=['recipe', 'position'],
    var_name='temperature',
    value_name='score',
)

In [43]:
# Peek at the long-format result.
cake_recipes_melt.head()

Unnamed: 0,recipe,position,temperature,score
0,a,bottom,225,61.738655
1,a,top,225,51.709751
2,b,bottom,225,57.09532
3,b,top,225,82.455004
4,c,bottom,225,96.470207


In [45]:
# Which recipe, on average, is the best?
# Compare the mean tastiness score of each recipe across all positions and temperatures.
cake_recipes_melt.groupby('recipe')['score'].mean()

recipe
a    63.922201
b    76.736074
c    75.874748
d    62.864844
Name: score, dtype: float64

In [48]:
# Recipe with the highest average score.
best_recipe = cake_recipes_melt.groupby(['recipe']).score.mean().idxmax()

# Average score per oven temperature. idxmax() gives the winning temperature
# label, while max() gives that temperature's average score.
mean_score_by_temp = cake_recipes_melt.groupby(['temperature']).score.mean()
best_temp = mean_score_by_temp.idxmax()
best_score = mean_score_by_temp.max()

print(f" The best temperature is '{best_temp}'")

In [None]:
# Each row of the long frame is one recipe / position / temperature combination,
# so the best combination is the row with the highest score. nlargest keeps the
# result a DataFrame with the same columns.
best_comb = cake_recipes_melt.nlargest(1, 'score')