# Old Faithful

In [None]:
# Data file in this notebook is from https://www.stat.cmu.edu/~larry/all-of-statistics/=data/faithful.dat
# The original paper is available as https://tommasorigon.github.io/StatI/approfondimenti/Azzalini1990.pdf

In [None]:
# Standard definitions and options
import pandas as pd
from pandas import DataFrame
import numpy as np
%matplotlib inline
import matplotlib.pyplot as plt
# Default every figure in this notebook to 10 x 5 inches.
plt.rc('figure', figsize=(10.0, 5.0))

In [None]:
# Read the eruption records from the CSV file that accompanies this notebook.
data = pd.read_csv("oldfaithful.csv")

# Fail early, with a readable message, if the file lacks any of the columns
# that later cells index by name.
missing_columns = {"N", "Duration", "Interval"} - set(data.columns)
if missing_columns:
    raise KeyError(
        f"oldfaithful.csv is missing expected columns: {sorted(missing_columns)}"
    )

In [None]:
# Preview the first ten rows to see the columns and some typical values.
data.iloc[:10]

In [None]:
# Old Faithful is famous for its repeatability - let's check some statistics,
# starting with the average Interval.
data.loc[:, 'Interval'].mean()

In [None]:
# Spread of the Interval column (sample standard deviation).
data.loc[:, 'Interval'].std()

In [None]:
# Smallest Interval in the data set.
data.loc[:, 'Interval'].min()

In [None]:
# Largest Interval in the data set.
data.loc[:, 'Interval'].max()

In [None]:
# While we're here, summarise the other measurement too: (mean, std) of Duration.
(
    data.loc[:, 'Duration'].mean(),
    data.loc[:, 'Duration'].std(),
)

In [None]:
# Let's take a look at what the Interval distribution looks like.
# The summary statistics are printed on the figure; figtext positions are
# fractions of the figure width and height.
plt.hist(data['Interval'], bins=40)
plt.figtext(0.75, 0.5, data['Interval'].describe().to_string())
plt.xlabel("Interval (minutes)")
plt.ylabel("Count")
plt.title("Interval");

In [None]:
# Maybe there are two peaks there. But that still doesn't give us a better
# way to predict the eruption.
# Look at the other information we have: the Duration distribution, with its
# summary statistics printed on the figure.
plt.hist(data['Duration'], bins=40)
plt.figtext(0.3, 0.5, data['Duration'].describe().to_string())
plt.xlabel("Duration")
plt.ylabel("Count")
plt.title("Duration");

In [None]:
# Is Interval correlated with Duration?  The off-diagonal entries of the
# 2x2 matrix are the correlation coefficient between the two columns.
np.corrcoef(
    data['Duration'].to_numpy(),
    data['Interval'].to_numpy(),
)

In [None]:
# That's pretty strong, let's look at it.
# Drawn with the default connected-line style, in row order.
plt.plot(data['Duration'], data['Interval'])
plt.xlabel("Duration")
plt.ylabel("Interval (minutes)");

In [None]:
# Same data drawn as unconnected blue circles ("ob") instead of a line.
plt.plot(data['Duration'], data['Interval'], "ob")
plt.xlabel("Duration")
plt.ylabel("Interval (minutes)");

In [None]:
# There seem to be two populations in that scatter plot!

# Select just one of them - the rows whose Duration is above 3.2 - and
# histogram its Duration, with the summary statistics printed on the figure.
long_data = data.loc[data['Duration'].gt(3.2)]
plt.hist(long_data['Duration'], bins=20)
plt.title("Duration")
plt.figtext(0.2, 0.5, long_data['Duration'].describe().to_string());

In [None]:
# But of course Duration is more compact, because we selected a narrower
# range. How about Interval for the same selection?
plt.hist(long_data['Interval'], bins=20)
plt.figtext(0.75, 0.5, long_data['Interval'].describe().to_string())
plt.xlabel("Interval (minutes)")
plt.ylabel("Count")
# Title uses a decimal point so it matches the 3.2 cut that built long_data.
plt.title("Interval with Duration > 3.2");

In [None]:
# We're down to 50% of the intervals within 8 minutes, and an RMS of 6 minutes on a mean of 80 - better than 10%!
#
# The shorter duration blob is left for the reader...

In [None]:
# Try fitting a straight line (degree-1 polynomial) to Interval vs Duration.
d = np.polyfit(data['Duration'], data['Interval'], 1)  # coefficients, highest power first
f = np.poly1d(d)  # callable polynomial built from those coefficients
data['trendline'] = f(data['Duration'])  # fitted Interval for every row

# Show the measurements as blue circles and the fit as a black line.
plt.plot(data['Duration'], data['Interval'], "ob", label="data")
plt.plot(data['Duration'], data['trendline'], "k", label="linear fit")
plt.xlabel("Duration")
plt.ylabel("Interval (minutes)")
plt.legend();

In [None]:
# See how wide the difference from the linear fit is.
# Compute the residual once and reuse it for both the histogram and the
# summary statistics printed on the figure.
residuals = data['Interval'] - data['trendline']
plt.hist(residuals, 20)
plt.figtext(0.75, 0.5, residuals.describe().to_string())
plt.xlabel("Interval - trendline (minutes)")
plt.ylabel("Count")
plt.title("Difference from Fit");

In [None]:
# Performance is about the same. Is there a reason to prefer one method over the other here?


In [None]:
# try plotting duration and wait time vs the event number. Is there a pattern there?

In [None]:
# Interval against the event number N, drawn as a connected blue line.
plt.plot(data['N'], data['Interval'], "b")
plt.xlabel("Event number (N)")
plt.ylabel("Interval (minutes)");

In [None]:
# Duration against the event number N, drawn as a connected blue line.
plt.plot(data['N'], data['Duration'], "b")
plt.xlabel("Event number (N)")
plt.ylabel("Duration");

In [None]:
# Did points or lines make the display clearer?
# You can also compute each row's change from the previous row and store it
# as a new column; the first row has no predecessor, so its value is NaN.
data['deltaD'] = data['Duration'].diff(periods=1)
data.loc[:, 'deltaD']

In [None]:
# Same row-to-row change for Interval, stored as a new column
# (the first row is NaN).
data['deltaI'] = data['Interval'].diff(periods=1)
data.loc[:, 'deltaI']

In [None]:
# Plot deltaD vs deltaI and see if there's any grouping

In [None]:
# Change in Duration against change in Interval, one blue circle per row.
plt.plot(data['deltaD'], data['deltaI'], "ob")
plt.xlabel("deltaD (change in Duration from previous row)")
plt.ylabel("deltaI (change in Interval from previous row)");

In [None]:
# does that look like three groups?  Is there a way to use this?

In [None]:
# Histogram of the row-to-row change in Duration.
# diff() leaves NaN in the first row; drop it explicitly instead of relying
# on how the plotting library happens to treat NaN when choosing bin edges.
plt.hist(data['deltaD'].dropna(), 20)
plt.xlabel("deltaD (change in Duration from previous row)")
plt.ylabel("Count");

In [None]:
# Change in Duration against the Interval itself, one blue circle per row.
plt.plot(data['deltaD'], data['Interval'], "ob")
plt.xlabel("deltaD (change in Duration from previous row)")
plt.ylabel("Interval (minutes)");

In [None]:
# Two of those groups look tightly clustered. But something goes wrong when Duration doesn't alternate...
# What data selection and plots would help you look at that?