# Water physical properties

## Settings
### Notebook configuration

In [None]:
%load_ext autoreload
%autoreload 2
%matplotlib inline

### Packages

In [None]:
import os  # manage path and os interaction
import numpy as np  # manage array and math operation
import pandas as pd  # manage table

import matplotlib as mpl  # plot data
import matplotlib.pyplot as plt  # plot data

## Import data

In [None]:
# Location of the raw water-properties table, relative to this notebook.
data_path = '../data/water/water.csv'
# First pass: read with the pandas defaults (first line of the file is the header).
df_0 = pd.read_csv(filepath_or_buffer=data_path)

## Explore data

In [None]:
# Peek at the first three rows to check how the file was parsed.
df_0.head(3)

## Correct import

The second line of the header was read as a data row and should be removed.

In [None]:
# Second pass: use the file's second line (row index 1) as the header,
# so everything above it is discarded.
df_1 = pd.read_csv(filepath_or_buffer=data_path, header=1)

df_1.head(n=5)

## Replace column names

In [None]:
# Column labels as parsed by the first read (header taken from the first line).
df_0.columns

In [None]:
# Give the re-read table the column labels parsed by the first read.
df_1.columns = df_0.columns

df_1.head(n=5)

## Set index

In [None]:
# Preview only: set_index returns a new frame here, df_1 itself is untouched.
df_1.set_index(keys='Temperature').head(5)

The operation returned a modified copy; to modify the DataFrame directly, use `inplace=True`.

In [None]:
# Make 'Temperature' the index of df_1 itself (no copy is returned).
df_1.set_index(keys='Temperature', inplace=True)

## Plot Density

In [None]:
# Quick look: line plot of the 'Density' column against the index.
df_1['Density'].plot.line()

Add axis labels and a title to make a clean plot

In [None]:
# Axis names come from the data so the labels always match what is plotted.
x_var = df_1.index.name
y_var = 'Density'

# Open a dedicated figure: Series.plot draws on the current axes when one
# exists, so without this the curve could land on a previous plot.
plt.figure()
df_1[y_var].plot(kind='line')
plt.xlabel(x_var)
plt.ylabel(y_var)
plt.title(f'{y_var} v {x_var}')
plt.show()

## View the curve between 45 and 55 degrees

In [None]:
# Bounds of the window on the index; both ends are included.
t_min = 45
t_max = 55

# Build the boolean mask once, then select rows and the y_var column together.
in_window = (df_1.index >= t_min) & (df_1.index <= t_max)
series_45_55 = df_1.loc[in_window, y_var]
series_45_55

In [None]:
# Start from a fresh figure so the markers are not added to an earlier plot.
plt.figure()
# A Series is plotted as index vs. values; the x/y keywords only apply to
# DataFrame plots, so they are not passed here.
series_45_55.plot(style='o')
plt.xlabel(x_var)
plt.ylabel(y_var)
plt.title(f'{y_var} v {x_var}')
plt.show()

## How can we make a prediction of the density at 50.5 °C?

### Interpolation

In [None]:
# Fit: anchor the line at 50 and take the slope over the unit step up to 51
y50 = series_45_55[50]
step_50_51 = 1.
b_50_5 = (series_45_55[51] - series_45_55[50]) / step_50_51
delta_x_50_5 = 50.5 - 50

# Predict: move from the anchor along the fitted slope
yhat50_5 = y50 + delta_x_50_5 * b_50_5

In [None]:
# Start from a fresh figure so nothing is drawn onto an earlier plot.
plt.figure()
# A Series is plotted as index vs. values; the x/y keywords only apply to
# DataFrame plots, so they are not passed here.
series_45_55.plot(style='o')

# Overlay the interpolated point as a red cross.
plt.plot([50.5], [yhat50_5], 'rx')

plt.xlabel(x_var)
plt.ylabel(y_var)
plt.title(f'{y_var} v {x_var}')
plt.show()

print('Interpolated value:')
print(yhat50_5)

### Test method on known values

Test and train values should follow the same general behavior.

Let us try to predict the value at 51 using the values at 50 and 52.

In [None]:
# Fit
b_51 = (series_45_55[52] - series_45_55[50]) / 2.
delta_x_51 = 51 - 50

# Predict
yhat51 = y50 + b_51 * delta_x_51

In [None]:
# Compare the interpolated value at 51 with the tabulated one.
y_true_51 = series_45_55[51]
error_51 = yhat51 - y_true_51

print("Prediction on test")
print(yhat51)
print("")
print("True value")
print(y_true_51)
print("")
print("Error")
print(error_51)
print("")
# sqrt(mean(error ** 2)) is the root-mean-square error, hence the "RMSE" label.
print("RMSE")
print(np.sqrt(np.mean(error_51 ** 2)))

## Using a framework

### Split data

In [None]:
from sklearn.model_selection import train_test_split

# Input is the index of df_1, output is the 'Density' column (both as arrays)
X = df_1.index.values
y = df_1['Density'].values

# Hold out 20 % of the samples for testing; the seed is fixed so the split
# is the same on every run
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

In [None]:
# Open a fresh figure so the two scatters are not drawn onto an earlier plot.
plt.figure()
plt.scatter(X_train, y_train, label='train')
plt.scatter(X_test, y_test, label='test')
plt.legend()
plt.show()

### Train algo

In [None]:
from catboost import CatBoostRegressor

In [None]:
# Set to True to also build the three extra l2_leaf_reg variants below.
more_algos = False
if more_algos:
    clf_2 = CatBoostRegressor(l2_leaf_reg=1e-2)
    clf_5 = CatBoostRegressor(l2_leaf_reg=1e-5)
    clf_7 = CatBoostRegressor(l2_leaf_reg=1e-7)

# Always built: one model with default arguments and two l2_leaf_reg variants
# (the digit in the name is the exponent of the value passed).
clf_0 = CatBoostRegressor()
clf_1 = CatBoostRegressor(l2_leaf_reg=1e-1)
clf_3 = CatBoostRegressor(l2_leaf_reg=1e-3)

In [None]:
# Reshape the 1-D inputs into one row per example with a single feature column
features_train = X_train.reshape(-1, 1)
fit_options = {'plot': True, 'logging_level': 'Silent'}

if more_algos:
    clf_2.fit(features_train, y_train, **fit_options)
    clf_5.fit(features_train, y_train, **fit_options)
    clf_7.fit(features_train, y_train, **fit_options)

clf_0.fit(features_train, y_train, **fit_options)
clf_1.fit(features_train, y_train, **fit_options)
clf_3.fit(features_train, y_train, **fit_options)

### View prediction curve

In [None]:
# Overlay the clf_3 predictions on the reference data over all inputs X.
plt.figure()

plt.plot(X, y, label='Reference')
plt.plot(X, clf_3.predict(X.reshape(-1, 1)), label='Predictions')

# Without this call the labels set above are never displayed.
plt.legend()
plt.show()

In [None]:
# Residual curve: clf_3 prediction minus the reference value, over all inputs X.
plt.figure()
plt.plot(X, clf_3.predict(X.reshape(-1, 1)) - y, label='Prediction error')
# Without this call the label set above is never displayed.
plt.legend()
plt.show()

### Test algo on unseen data

### Using numpy

In [None]:
# Signed residuals on the test split: prediction minus target.
np.subtract(clf_3.predict(X_test.reshape(-1, 1)), y_test)

In [None]:
# Mean of the squared residuals on the test split, computed by hand.
np.mean((clf_3.predict(X_test.reshape(-1, 1)) - y_test) ** 2)

### Using sklearn

In [None]:
from sklearn.metrics import mean_squared_error

# Same quantity through scikit-learn: targets and clf_3 predictions on the test split
y_pred = clf_3.predict(X_test.reshape(-1, 1))
y_true = y_test

mean_squared_error(y_true=y_true, y_pred=y_pred)

# End of script