# Pandas IO (a subset)
* read_*
* to_*

In [1]:
import pandas as pd

In [2]:
pd.read_

AttributeError: module 'pandas' has no attribute 'read_'

In [None]:
pd.to_

In [None]:
super_tiny_df = pd.DataFrame([1,2,3])

In [None]:
super_tiny_df.to_

# CSVs
Lots of data files will be in csv format.
This is one of the most common data formats out there,
and thankfully, pandas can read them very well.

## CSVs in the terminal


Before exploring CSVs with pandas, check them out
on the command line.

You can open a terminal through jupyter (maybe not in Windows).

Useful simple commands:
* head
* tail
* less

## CSVs in pandas

Most of the read_* functions in pandas have tons of 
optional arguments for handling all sorts of different
formats you might run into.

In [None]:
pd.read_csv()  

The docstring shown inside Jupyter is too crowded;
go here instead:
https://pandas.pydata.org/docs/reference/api/pandas.read_csv.html

### Useful arguments:
* sep and delimiter
* header
* names
* index_col

There are so many. If you want to open a CSV in pandas and
the formatting is presenting problems, definitely check the
built-in functionality before reformatting stuff on your own.

In [None]:
pd.read_csv("../data.csv").head()

In [None]:
pd.read_csv("../data.csv", header=None).head()

In [None]:
# Rename every column, skip the file's own header row, and force the "pop"
# column to be read as text rather than as a number. Only the first five
# rows are kept.
really_awful_dataframe = pd.read_csv(
    "../data.csv",
    names=["locs", "ages", "sexes", "years", "mort", "pop"],
    skiprows=1,
    dtype={"pop": "str"},
).head()

first_pop = really_awful_dataframe.loc[0, "pop"]
# Jupyter only echoes the last expression of a cell, so the value itself has
# to be printed explicitly; repr() keeps the quotes that show it is a string.
print(repr(first_pop))
type(first_pop)

In [None]:
pd.read_csv(
    "../data.csv",
    header=None,
    skiprows=1,
    names=["location_id", "age_group_id", "sex_id", "year_id", "a", "b"],
    dtype="float").head()

In [None]:
pd.read_csv(
    "../data.csv",
    index_col=["location_id", "age_group_id", "sex_id", "year_id"],
    dtype="float").head()

In [None]:
# Another order for the index.
pd.read_csv(
    "../data.csv",
    index_col=["year_id", "location_id", "age_group_id", "sex_id"],
    dtype="float").head()

### Date time formatting!
Dates and date formatting are so prevalent and
annoying that pandas helps you parse them.

Dates in a CSV are strings because everything
in a CSV is a string. Sometimes we have a
"DD-MM-YYYY" format, or sometimes "DDMMYYYY",
or "YYYYMMDD", or sometimes it is really awful like
"YYYY----MM----!!!@@@DD"

In [None]:
pd.read_csv("05_datetime.csv")["date"]

In [None]:
pd.read_csv("05_datetime.csv", parse_dates=["date"])["date"][0]

In [None]:
crazy_date_data = pd.read_csv(
    "05_datetime.csv", 
    parse_dates=["crazy_date"])
crazy_date_data["crazy_date"]

In [None]:
# Parse the oddly formatted date strings by spelling out their exact layout
# with an explicit format string.
pd.to_datetime(
    crazy_date_data["crazy_date"],
    format="%Y----%M----!!!@@@%d"
)
# There's a typo here! %M parses *minutes*; the month directive is the
# lowercase %m, so this format does not read the middle field as a month.

### Floating point precision problems
Most problems related to CSVs come down
to everything being stored as strings.

e.g. instead of storing dates as Date objects,
they are stored as strings, which we need to parse.

Similarly, floating point numbers are stored
as strings with a limited number of characters.
This introduces inefficiencies in memory and a loss
of precision.

#### Memory inefficiency
We can store a single digit as a string in a single byte.
A floating point number is 32 or 64 bits, which is
4 or 8 bytes.


One ascii character is 1 byte, while more complicated
character encodings take several bytes.

In [None]:
# Held as a Python float, this value occupies 64 bits (8 bytes).
floating_point = 123 ** 2.3 / 1230001280001.128313
# Its text form needs one character per digit, sign, and decimal point.
str_floating_point = str(floating_point)
n_chars = len(str_floating_point)
print("The string is {} bytes.".format(n_chars))
print("A floating point is 4 or 8 bytes.")

#### Loss of precision
Floating points are weird. They're great for really big numbers
or really tiny numbers. They're awful if we want to represent
a large number with really high precision, e.g.
1000000000.0000000000000001.

In [None]:
# We're going to lose precision.
1000000000.000000000011111111

Strings are even worse.

In [None]:
import numpy as np

N = 1000000

# N uniform draws from [0, 1), scaled down by 1e8 so they are tiny.
small_value = np.random.rand(N) / 100000000.

# "big_value" is the same tiny number added to 100, so most of its
# significant digits are spent on the leading "100".
df = pd.DataFrame({
    "big_value": 100 + small_value,
    "small_value": small_value,
})

In [None]:
print(df["small_value"].min())
print("%1.30f" % df["big_value"].min())

In [None]:
df.to_csv("05_data.csv")
csv = pd.read_csv("05_data.csv")

print(csv["small_value"].min())
print("%1.30f" % csv["big_value"].min())

Introducing: HDF. 
* No further loss of precision! 
* Smaller file sizes! 
* Kaloo-kalay

In [None]:
# Round-trip the same frame through HDF and print the same two minima as the
# CSV cell for comparison.
# `key` is passed by keyword: positional use is deprecated since pandas 2.1
# and every to_hdf argument after the path is keyword-only in pandas 3.0.
df.to_hdf("05_data.hdf", key="data")
hdf = pd.read_hdf("05_data.hdf")
print(hdf["small_value"].min())
print("%1.30f" % hdf["big_value"].min())

In [None]:
ls -l  # look at how small it is on disk!

# HDFs

HDF stands for Hierarchical Data Format.
You can do stuff with the h5py package to directly manipulate 
HDF files, but pandas has built-in functions to read
and write HDF.

https://pandas.pydata.org/docs/reference/api/pandas.read_hdf.html

https://pandas.pydata.org/docs/reference/api/pandas.DataFrame.to_hdf.html

There are two formats:
1. fixed
2. table (to be used with ``data_columns``)

They are useful for different scenarios.

# Code used to generate exercise data

In [None]:
import pandas as pd
import numpy as np
import math

# Absolute tolerance used by test_RMSE when comparing against expected values.
EPSILON = 1e-4

# File names for the exercise data: two forecast files and two observed files.
HDF_FILE_1 = "05_forecast_1.hdf"
HDF_FILE_2 = "05_forecast_2.hdf"
CSV_FILE_1 = "05_observed_1.csv"
# NOTE(review): this name is later passed to DataFrame.to_csv, yet it ends in
# ".hdf" -- looks like a typo for ".csv". Confirm nothing outside this notebook
# relies on the current name before renaming it.
CSV_FILE_2 = "05_observed_2.hdf"

# 18 rows: every combination of 3 persons x 3 dates x 2 feet, each with a
# random value drawn from [0, 1).
# NOTE(review): no random seed is set in this cell, so every run draws new
# values; the expected numbers hard-coded in test_RMSE presumably correspond
# to one specific earlier draw -- verify before regenerating the files.
observed = pd.DataFrame(dict(
    person=[1, 2, 3] * 6,
    date=([1] * 3 + [2] * 3 + [3] * 3) * 2,
    foot=["L"] * 9 + ["R"] * 9,
    val=np.random.rand(18)
    ))
# Written with the default index=True, so the CSV also contains the row index
# as an extra, unnamed first column.
observed.to_csv(CSV_FILE_1)

In [None]:
# Second observed data set: same 18-row person/date/foot layout, but with the
# random values shifted up by 0.9.
observed2 = pd.DataFrame({
    "person": [1, 2, 3] * 6,
    "date": ([1] * 3 + [2] * 3 + [3] * 3) * 2,
    "foot": ["L"] * 9 + ["R"] * 9,
    "val": np.random.rand(18) + 0.9,
})
# Unlike the first observed file, this one is written without the row index.
observed2.to_csv(CSV_FILE_2, index=False)

In [None]:
# Forecast data: 18 rows covering every person/date/foot combination, each
# with a random value drawn from [0, 1).
predicted = pd.DataFrame(dict(
    person=[1, 2, 3] * 6,
    date=([1] * 3 + [2] * 3 + [3] * 3) * 2,
    foot=["L"] * 9 + ["R"] * 9,
    val=np.random.rand(18)
    ))

# Save the same forecast twice with different index layouts, so that a reader
# of the files has to normalise the index before comparing with observed data.
# `key` is passed by keyword: positional use is deprecated since pandas 2.1
# and every to_hdf argument after the path is keyword-only in pandas 3.0.
predicted.set_index(["date", "foot"]).sort_index().to_hdf(
    HDF_FILE_1, key="data")
predicted.set_index(["person", "foot", "date"]).sort_index().to_hdf(
    HDF_FILE_2, key="data")

In [None]:
# Reference answers: RMSE of each observed frame against the forecast,
# computed directly from the in-memory frames (observed2 first, then observed).
index = ["date", "foot", "person"]
forecast = predicted.set_index(index)
for observations in (observed2, observed):
    squared_error = (observations.set_index(index) - forecast) ** 2
    # Both observed frames have the same number of rows, so len(observed)
    # is used as the divisor in both cases.
    print((squared_error.sum() / len(observed)) ** 0.5)

In [None]:
# My solution

def RMSE(hdf_file, csv_file):
    r"""Compute the RMSE of two timeseries stored in different formats.

    There is a model which writes its output as a HDF,
    and the observed data is saved as a CSV. Compute
    the root mean squared error (RMSE) of the model
    compared to the observed data.

    RMSE equation in LaTeX, with n the number of observed rows:
        \sqrt{\frac{1}{n} \sum_{i} (y_i - \hat{y}_i)^2}

    I'll write this on the board if someone needs it.

    The data looks something like this, but use Jupyter
    to explore the different files and their formats.

         date    foot    person    val
    0    1       L       1         0.356222
    1    1       L       2         0.395130
    2    1       L       3         0.180450
    3    2       L       1         0.192698
    4    2       L       2         0.363159

    Args:
        hdf_file: predicted data in an hdf file. Whatever part of
            (date, foot, person) is stored as the index is moved back
            into columns before re-indexing.
        csv_file: observed data in a csv file with "date", "foot",
            "person" and "val" columns; any other column is ignored.
    Returns:
        float: the RMSE.
    """
    index = ["date", "foot", "person"]

    observed = pd.read_csv(csv_file, index_col=index)["val"]
    # The HDF files differ in which columns were used as the index, so undo
    # whatever index is stored and rebuild the common one.
    predicted = pd.read_hdf(hdf_file).reset_index().set_index(index)["val"]

    # Subtraction aligns rows on the (date, foot, person) index, so the row
    # order of the two files does not matter.
    squared_error = (observed - predicted) ** 2

    # Return a plain float, as documented, rather than a one-element Series.
    return float((squared_error.sum() / len(observed)) ** 0.5)

In [None]:
# Run this to test your code.

def test_RMSE():
    """Check RMSE against the expected value for each forecast/observed pair.

    Both forecast files are compared with both observed files; each result
    must be within EPSILON of the expected value.
    """
    cases = [
        (HDF_FILE_1, CSV_FILE_1, 0.416369),
        (HDF_FILE_2, CSV_FILE_1, 0.416369),
        (HDF_FILE_1, CSV_FILE_2, 0.970608),
        (HDF_FILE_2, CSV_FILE_2, 0.970608),
    ]
    for hdf_path, csv_path, expected in cases:
        res = RMSE(hdf_path, csv_path)
        # On failure the computed value is reported as the assertion message.
        assert math.fabs(res - expected) < EPSILON, res


test_RMSE()