In [111]:
# for vectorized operations
import numpy as np

# for dataframe manipulation
import pandas as pd

# for visualizations
import matplotlib.pyplot as plt
import seaborn as sns

# for statistical calculations
import scipy.stats as stats

# for obtaining stock datasets
from pydataset import data

# for manipulation of time data
from datetime import date

from sklearn.model_selection import train_test_split
from sklearn.impute import SimpleImputer

# filter out warnings
import warnings
warnings.filterwarnings('ignore')

# our own functions for accessing our sql database
from env import get_db_url, user, password, host

# our own acquire script
import acquire

# never truncate columns when a frame is rendered (e.g. by df.head());
# the row limit stays at the pandas default (lift it via display.max_rows)
pd.options.display.max_columns = None


# Tidy Data

### 1. Attendance Data

In [112]:
# read every row of the attendance table, connecting with whatever
# get_db_url('tidy_data') returns
sql = '''
SELECT *
FROM attendance
'''
url = get_db_url('tidy_data')

attendance = pd.read_sql(sql=sql, con=url)

In [153]:
# Work on the attendance table under the short name `df`. This binds a second
# name to the same DataFrame object -- it is not a copy.
df = attendance
df

Unnamed: 0.1,Unnamed: 0,2018-01-01,2018-01-02,2018-01-03,2018-01-04,2018-01-05,2018-01-06,2018-01-07,2018-01-08
0,Sally,P,T,T,H,P,A,T,T
1,Jane,A,P,T,T,T,T,A,T
2,Billy,A,T,A,A,H,T,P,T
3,John,P,T,H,P,P,T,P,P


#### Calculate an attendance percentage for each student. One half day is worth 50% of a full day, and 10 tardies is equal to one absence.

In [114]:
# one observation = one student on a given day

In [115]:
# Reshape wide -> long: one row per (student, day). The student names live in
# the 'Unnamed: 0' column; every other column is a date.
# Built from the raw `attendance` frame rather than from `df`, so the cell
# gives the same result no matter how many times it is run.
df = attendance.melt(id_vars='Unnamed: 0')

In [116]:
df.head(3)

Unnamed: 0.1,Unnamed: 0,variable,value
0,Sally,2018-01-01,P
1,Jane,2018-01-01,A
2,Billy,2018-01-01,A


In [117]:
# Give the melted columns meaningful names. Mapping by name (instead of
# overwriting df.columns positionally) cannot mislabel a column if the column
# order ever changes, and is a harmless no-op when the cell is re-run.
df = df.rename(columns={
    'Unnamed: 0': 'student',
    'variable': 'date',
    'value': 'attendance',
})

In [118]:
df.head(3)

Unnamed: 0,student,date,attendance
0,Sally,2018-01-01,P
1,Jane,2018-01-01,A
2,Billy,2018-01-01,A


In [119]:
# Credit earned toward a full day for each attendance code:
#   P (present) = 1, A (absent) = 0, H (half day) = 0.5,
#   T (tardy)   = 0.9, because ten tardies add up to one absence.
ATTENDANCE_CREDIT = {'P': 1, 'A': 0, 'H': .5, 'T': .9}

credit = df['attendance'].map(ATTENDANCE_CREDIT)

# .map() turns anything outside the table into NaN, and mean() would then
# quietly skip it and skew the percentages. Fail loudly instead -- this also
# catches an accidental second run of this cell on already-converted values.
unmapped = df.loc[credit.isna(), 'attendance'].unique()
if unmapped.size:
    raise ValueError(f'unrecognised attendance codes: {list(unmapped)}')

df = df.assign(attendance=credit)

In [120]:
df.head(3)

Unnamed: 0,student,date,attendance
0,Sally,2018-01-01,1.0
1,Jane,2018-01-01,0.0
2,Billy,2018-01-01,0.0


In [121]:
# attendance percentage per student = average daily credit
df.groupby('student')['attendance'].mean()

student
Billy    0.5250
Jane     0.6875
John     0.9125
Sally    0.7625
Name: attendance, dtype: float64

### 2. Coffee Levels

#### 2a. Read the coffee_levels table

In [122]:
# read every row of the coffee_levels table, connecting with whatever
# get_db_url('tidy_data') returns
sql = '''
SELECT *
FROM coffee_levels
'''
url = get_db_url('tidy_data')
coffee_levels = pd.read_sql(sql=sql, con=url)

In [123]:
# Short working name for the coffee table; same object as coffee_levels, not a copy.
df = coffee_levels

In [124]:
df.head(3)

Unnamed: 0,hour,coffee_carafe,coffee_amount
0,8,x,0.816164
1,9,x,0.451018
2,10,x,0.843279


#### 2b. Transform the data so that each carafe is in its own column

In [125]:
# Spread the carafes into columns: one row per hour, one column per carafe.
# Built from the raw `coffee_levels` frame rather than from `df`, so the cell
# can be re-run safely (after the first run `df` no longer has an 'hour' column).
# No `values=` is given, so the remaining column (coffee_amount) becomes the
# outer level of a two-level column index.
df = coffee_levels.pivot(index='hour', columns='coffee_carafe')
df.head(3)

Unnamed: 0_level_0,coffee_amount,coffee_amount,coffee_amount
coffee_carafe,x,y,z
hour,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2
8,0.816164,0.189297,0.999264
9,0.451018,0.521502,0.91599
10,0.843279,0.023163,0.144928


In [126]:
# Move `hour` out of the index and back into an ordinary column; the rows get
# a fresh default integer index.
df = df.reset_index()

In [127]:
df.head(3)

Unnamed: 0_level_0,hour,coffee_amount,coffee_amount,coffee_amount
coffee_carafe,Unnamed: 1_level_1,x,y,z
0,8,0.816164,0.189297,0.999264
1,9,0.451018,0.521502,0.91599
2,10,0.843279,0.023163,0.144928


In [128]:
# blank out the names of both column levels (drops the 'coffee_carafe' label)
df.columns = df.columns.set_names([None, None])

In [129]:
df.head(3)

Unnamed: 0_level_0,hour,coffee_amount,coffee_amount,coffee_amount
Unnamed: 0_level_1,Unnamed: 1_level_1,x,y,z
0,8,0.816164,0.189297,0.999264
1,9,0.451018,0.521502,0.91599
2,10,0.843279,0.023163,0.144928


#### 2c. Is this the best shape for this data?

Whether this is the "best" shape for the data is subjective, but I would argue that this does not constitute "tidy" data. A good definition of a single observation for this data would be: the amount of coffee in a given carafe at a given hour. By this definition, we should have columns: `['hour', 'carafe', 'coffee_level']`

### 3. Cake Recipes

#### 3a. Read the cake_recipes table. This data set contains cake tastiness scores for combinations of different recipes, oven rack positions, and oven temperatures.

In [131]:
# read every row of the cake_recipes table, connecting with whatever
# get_db_url('tidy_data') returns
sql = '''
SELECT *
  FROM cake_recipes
'''
url = get_db_url('tidy_data')
cake_recipes = pd.read_sql(sql=sql, con=url)

In [132]:
# Short working name for the cake table. This is the same object as
# cake_recipes, not a copy: column assignments on `df` also modify cake_recipes.
df = cake_recipes

In [133]:
df.head(3)

Unnamed: 0,recipe:position,225,250,275,300
0,a:bottom,61.738655,53.912627,74.41473,98.786784
1,a:top,51.709751,52.009735,68.576858,50.22847
2,b:bottom,57.09532,61.904369,61.19698,99.248541


In [134]:
# The 'recipe:position' column packs two variables into one string, e.g.
# 'a:bottom'. Split it once and give each piece its own column.
# The result is a new frame built from the raw `cake_recipes` table, so the
# raw table is left untouched and this cell can be re-run safely.
label_parts = cake_recipes['recipe:position'].str.split(':')
df = (
    cake_recipes
    .assign(recipe=label_parts.str[0], position=label_parts.str[1])
    .drop(columns='recipe:position')
)

In [135]:
# go long: one row per (recipe, rack position, oven temperature) with its score
df = df.melt(
    id_vars=['recipe', 'position'],
    value_name='score',
    var_name='oven_temp',
)

In [139]:
# spell out what "position" refers to
df = df.rename({'position': 'oven_rack_position'}, axis='columns')

In [140]:
df.head(3)

Unnamed: 0,recipe,oven_rack_position,oven_temp,score
0,a,bottom,225,61.738655
1,a,top,225,51.709751
2,b,bottom,225,57.09532


#### 3c. Which recipe, on average, is the best?

In [142]:
# recipe label with the highest mean score
df.groupby('recipe')['score'].mean().idxmax()

'b'

#### 3d. Which oven temperature, on average, produces the best results?

In [144]:
# oven temperature with the highest mean score (the label is a string,
# since the temperatures started out as column names)
df.groupby('oven_temp')['score'].mean().idxmax()

'275'

#### 3e. Which combination of recipe, rack position, and temperature gives the best result?


In [152]:
# row(s) holding the single highest score across every combination
df.loc[df['score'].eq(df['score'].max())]

Unnamed: 0,recipe,oven_rack_position,oven_temp,score
26,b,bottom,300,99.248541
