In [2]:
import pandas as pd

## Tidy Data
- tabular data
- one value per cell
- one variable per column
- one observation per row

## Melt

In [3]:
# Load the raw treatments table; as the preview below shows, each treatment's
# responses sit in their own column (wide layout).
df = pd.read_csv('./untidy_data/treatments.csv')

In [4]:
df

Unnamed: 0,Unnamed: 1,treatmenta,treatmentb,treatmentc
0,John Smith,,2,0
1,Jane Doe,16.0,11,3
2,Mary Johnson,3.0,1,4


In [5]:
# Give the unnamed subject column a real name and separate the treatment
# letter with an underscore so the shared prefix is easy to strip later.
df.columns = [
    'subject_name',
    'treatment_a',
    'treatment_b',
    'treatment_c',
]

In [6]:
df

Unnamed: 0,subject_name,treatment_a,treatment_b,treatment_c
0,John Smith,,2,0
1,Jane Doe,16.0,11,3
2,Mary Johnson,3.0,1,4


In [7]:
# Reshape from wide to long: one row per (subject, treatment) pair.
#   var_name   -> label for the new column that holds the former column headers
#   value_name -> label for the new column that holds the former cell values
df = pd.melt(
    df,
    id_vars='subject_name',
    var_name='treatment',
    value_name='response',
)

In [8]:
df

Unnamed: 0,subject_name,treatment,response
0,John Smith,treatment_a,
1,Jane Doe,treatment_a,16.0
2,Mary Johnson,treatment_a,3.0
3,John Smith,treatment_b,2.0
4,Jane Doe,treatment_b,11.0
5,Mary Johnson,treatment_b,1.0
6,John Smith,treatment_c,0.0
7,Jane Doe,treatment_c,3.0
8,Mary Johnson,treatment_c,4.0


In [9]:
# Strip the shared 'treatment_' prefix so only the treatment letter remains.
# regex=False makes this a literal substring replacement regardless of the
# installed pandas version's default for `regex`; bracket indexing is the
# unambiguous way to assign to a column.
df['treatment'] = df['treatment'].str.replace('treatment_', '', regex=False)

In [10]:
df

Unnamed: 0,subject_name,treatment,response
0,John Smith,a,
1,Jane Doe,a,16.0
2,Mary Johnson,a,3.0
3,John Smith,b,2.0
4,Jane Doe,b,11.0
5,Mary Johnson,b,1.0
6,John Smith,c,0.0
7,Jane Doe,c,3.0
8,Mary Johnson,c,4.0


## Pivot Table

In [11]:
# Untidy: the `var` column names several different variables and `val` holds
# all of their values, so unrelated measurements share a single column.
# Values in mismatched units within one column are a good tip-off that the
# data is untidy.
df = pd.read_csv('./untidy_data/students.csv')
df.head()

Unnamed: 0,date,var,val
0,2019-02-04,n_late_from_break,4.02812
1,2019-02-04,coffee_consumption,5255.40974
2,2019-02-04,classroom_temp,67.0
3,2019-02-05,n_late_from_break,2.101998
4,2019-02-05,coffee_consumption,8603.704719


In [12]:
# Spread `var` into columns: one row per date, one column per variable,
# with the cells filled from `val`.
# The result is tidy: one variable per column. Mixed units in a single
# column were the hint that several variables needed to be pulled apart.
df.pivot_table(values='val', index='date', columns='var').head()

var,classroom_temp,coffee_consumption,n_late_from_break
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
2019-02-04,67.0,5255.40974,4.02812
2019-02-05,73.0,8603.704719,2.101998
2019-02-06,81.0,1801.49805,4.941244
2019-02-07,62.0,9282.959741,1.419342
2019-02-08,72.0,7558.270659,1.808919


In [13]:
# Untidy: every header after Product fuses a year with a variable name
# (e.g. "2016 Sales", "2016 PPU"), so two variables are hidden in the
# column labels.
df = pd.read_csv('./untidy_data/sales.csv')
df

Unnamed: 0,Product,2016 Sales,2016 PPU,2017 Sales,2017 PPU,2018 Sales,2018 PPU
0,A,673,5,231,7,173,9
1,B,259,3,748,5,186,8
2,C,644,3,863,5,632,5
3,D,508,9,356,11,347,14


In [14]:
# Plan: melt to long form, split the combined year/variable label, then
# spread the variables back into columns to reach a tidy layout.
# The default output column names are spelled out because later cells
# refer to them.
df = df.melt(id_vars=["Product"], var_name="variable", value_name="value")
df.head(7)

Unnamed: 0,Product,variable,value
0,A,2016 Sales,673
1,B,2016 Sales,259
2,C,2016 Sales,644
3,D,2016 Sales,508
4,A,2016 PPU,5
5,B,2016 PPU,3
6,C,2016 PPU,3


In [15]:
# Pull the leading run of digits (the year) out of the combined label.
# expand=False returns a single Series rather than a one-column DataFrame.
df['year'] = df['variable'].str.extract(r'^(\d+)', expand=False)

In [16]:
df.head()

Unnamed: 0,Product,variable,value,year
0,A,2016 Sales,673,2016
1,B,2016 Sales,259,2016
2,C,2016 Sales,644,2016
3,D,2016 Sales,508,2016
4,A,2016 PPU,5,2016


In [17]:
# Keep everything after the leading digits and the single whitespace
# character that follows them.
df['var_name'] = df['variable'].str.extract(r'^\d+\s(.*$)', expand=False)

In [18]:
df.head()

Unnamed: 0,Product,variable,value,year,var_name
0,A,2016 Sales,673,2016,Sales
1,B,2016 Sales,259,2016,Sales
2,C,2016 Sales,644,2016,Sales
3,D,2016 Sales,508,2016,Sales
4,A,2016 PPU,5,2016,PPU


In [19]:
# The combined label is now redundant: `year` and `var_name` carry its two parts.
df = df.drop(columns=['variable'])

In [20]:
df.head()

Unnamed: 0,Product,value,year,var_name
0,A,673,2016,Sales
1,B,259,2016,Sales
2,C,644,2016,Sales
3,D,508,2016,Sales
4,A,5,2016,PPU


In [None]:
# Spread `var_name` back out into columns: one row per (Product, year), one
# column per measure.
# pivot_table accepts values/index/columns and a list of index keys.
# DataFrame.pivot takes its arguments in a different order (index, columns,
# values), so the positional call previously commented out here would not
# have produced this layout.
# Each (Product, year, var_name) combination occurs once in the sales data
# shown above, so the default mean aggregation returns the original numbers
# (as floats).
# The result is only displayed; `df` itself is left in long form.
df.pivot_table(values='value', index=['Product', 'year'], columns='var_name')

In [None]:
# NOTE(review): disabled. Would set the column-axis name to an empty string;
# presumably meant to clear the 'var_name' label left on a pivoted frame —
# confirm before enabling.
#df.columns.name = ''

In [None]:
#df.head()