In [1]:
from __future__ import unicode_literals

import pandas as pd
import numpy as np
from sklearn import preprocessing
from matplotlib import pyplot as plt
from sklearn.preprocessing import MinMaxScaler
from sklearn.preprocessing import LabelEncoder, OneHotEncoder
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import Imputer

# 1. Preparing data - latest 12 weeks of life of the banana plant 
---

We load the phreatic level datasets which compose the  time series from May 4th to August 03rd 

In [3]:
phreatic_level_l12w = pd.read_csv('../../../../data/raw/FincaPorvenir/Drenajes/DatosNivelFreatico/Latest_12-weeks_May-4_August-03_2018/' \
                                 'From_2018-05-04_00-00-00_To_2018-08-03_23-59-59.csv', )

In [5]:
print("THE DIMENSIONALITY IS: " +'\n')
print(phreatic_level_l12w.shape)
# Numero de columnas
print("Columns number", len(phreatic_level_l12w.columns))
# Numero de registros de la columna PORVL10N1
print("PORVL10N1 column rows number",len(phreatic_level_l12w.PORVL10N1))
phreatic_level_l12w.head()

THE DIMENSIONALITY IS: 

(2226, 76)
Columns number 76
PORVL10N1 column rows number 2226


Unnamed: 0,Fecha,Hora,PORVL2N1,Fecha.1,Hora.1,PORVL2N2,Fecha.2,Hora.2,PORVL4N1,Fecha.3,...,PORVL21N4,Fecha.22,Hora.22,PORVL21N5,Fecha.23,Hora.23,PORVL24N1,Fecha.24,Hora.24,PORVL24N2
0,2018-05-04,01:01:35,0.65,2018-05-04,01:59:54,0.57,2018-05-04,01:22:55,0.76,2018-05-04,...,0.73,2018-05-04,01:04:23,0.88,2018-05-04,01:16:08,0.64,2018-05-04,01:19:34,0.94
1,2018-05-04,02:01:34,0.66,2018-05-04,02:59:55,0.59,2018-05-04,02:22:55,0.78,2018-05-04,...,0.74,2018-05-04,02:04:16,0.89,2018-05-04,02:17:17,0.54,2018-05-04,02:19:34,0.95
2,2018-05-04,03:01:36,0.68,2018-05-04,03:59:55,0.6,2018-05-04,03:22:54,0.79,2018-05-04,...,0.75,2018-05-04,03:04:20,0.9,2018-05-04,03:15:58,0.54,2018-05-04,03:20:13,0.97
3,2018-05-04,04:02:05,0.69,2018-05-04,04:59:55,0.61,2018-05-04,04:22:55,0.8,2018-05-04,...,0.76,2018-05-04,04:04:19,0.91,2018-05-04,04:16:00,0.55,2018-05-04,04:19:35,0.98
4,2018-05-04,05:01:35,0.7,2018-05-04,05:59:55,0.62,2018-05-04,05:22:55,0.81,2018-05-04,...,0.77,2018-05-04,05:04:18,0.92,2018-05-04,05:15:58,0.55,2018-05-04,05:19:36,0.99


## 1.1. Selecting  relevant index columns features
---

In [6]:
# We capture just the columns of the phreatic_level_l12w dataframe 
# cols = list(phreatic_level_l12w.columns)
# print("The columns are: " + "\n\n", cols)

# We iterate by the columns list cols to get only of them that start with the prefix
# PORVL. We enumerate them and assing to pos_col variable
# pos_cols = [i for i, word  in enumerate(cols) if word.startswith('PORVL')]
# print(pos_cols)

Since the dataset has a columns called **`Fecha:`** and **`Hora:`** to each lot, which is not a numerical 

values, it will be removed so that it does not interfere **with our subsequent scaling**, 

so we are only going to reference the phreatic level values or samples of the ** `PORVLxNx` ** columns and 

assigning them to the dataframe `phreatic_level_l12w` created such as follow:

- We use the [filter](https://pandas.pydata.org/pandas-docs/stable/generated/pandas.DataFrame.filter.html) method to extract only the colums subset which names start with the ** `PORVLN` ** criteria 
 

In [8]:
phreatic_level_lots_nodes_l12w = phreatic_level_l12w.filter(regex=('PORVL.*'))
phreatic_level_lots_nodes_l12w

Unnamed: 0,PORVL2N1,PORVL2N2,PORVL4N1,PORVL5N1,PORVL6N1,PORVL7N1,PORVL8N1,PORVL9N1,PORVL10N1,PORVL13N1,...,PORVL18N2,PORVL18N3,PORVL18N4,PORVL21N1,PORVL21N2,PORVL21N3,PORVL21N4,PORVL21N5,PORVL24N1,PORVL24N2
0,0.65,0.57,0.76,0.46,1.07,0.75,0.99,0.90,0.79,0.79,...,0.54,0.80,1.00,1.11,0.76,0.74,0.73,0.88,0.64,0.94
1,0.66,0.59,0.78,0.47,1.06,0.77,1.00,0.91,0.80,0.80,...,0.55,0.81,1.00,1.09,0.77,0.74,0.74,0.89,0.54,0.95
2,0.68,0.60,0.79,0.47,1.05,0.78,1.01,0.91,0.80,0.80,...,0.57,0.82,1.01,1.09,0.78,0.74,0.75,0.90,0.54,0.97
3,0.69,0.61,0.80,0.47,1.05,0.80,1.02,0.92,0.81,0.81,...,0.58,0.83,1.01,1.10,0.79,0.74,0.76,0.91,0.55,0.98
4,0.70,0.62,0.81,0.47,1.05,0.81,1.02,0.93,0.82,0.82,...,0.59,0.84,1.02,1.10,0.79,0.74,0.77,0.92,0.55,0.99
5,0.71,0.63,0.83,0.49,1.07,0.82,1.03,0.93,0.82,0.83,...,0.60,0.85,1.02,1.11,0.80,0.74,0.78,0.93,0.57,1.01
6,0.72,0.64,0.84,0.50,1.08,0.86,1.04,0.94,0.83,0.84,...,0.61,0.86,1.03,1.12,0.81,0.75,0.79,0.94,0.59,1.01
7,0.73,0.66,0.84,0.50,1.10,0.91,1.05,0.95,0.83,0.85,...,0.63,0.87,1.03,1.12,0.81,0.64,0.80,0.95,0.59,1.02
8,0.74,0.67,0.86,0.50,1.11,0.96,1.06,0.95,0.84,0.86,...,0.64,0.88,1.04,1.13,0.82,0.64,0.80,0.96,0.59,1.11
9,0.75,0.68,0.86,0.50,1.13,1.01,1.06,0.96,0.84,0.87,...,0.65,0.89,1.04,1.13,0.83,0.64,0.81,0.97,0.59,1.12


## 1.2 Filling **`NaN:`** values
---

We evaluate if **`phreatic_level_lots_nodes_l12w`** dataframe has null type `NaN`

Have null values.

In [9]:
print(phreatic_level_lots_nodes_l12w.isnull().any())
phreatic_level_lots_nodes_l12w.isnull().values.any()

PORVL2N1      True
PORVL2N2      True
PORVL4N1      True
PORVL5N1      True
PORVL6N1      True
PORVL7N1      True
PORVL8N1      True
PORVL9N1      True
PORVL10N1    False
PORVL13N1     True
PORVL14N1     True
PORVL15N1     True
PORVL16N1     True
PORVL16N2     True
PORVL18N1     True
PORVL18N2     True
PORVL18N3     True
PORVL18N4     True
PORVL21N1     True
PORVL21N2     True
PORVL21N3     True
PORVL21N4     True
PORVL21N5     True
PORVL24N1     True
PORVL24N2    False
dtype: bool


True

We will use as a strategy, fill in the missing data with the average of the other values in the column where data is missing.

`axis` is the axis along which we are going to do the operation of` Imputer`, by default it is equal to 0, which means that this operation will be along the columns

In [10]:
imputer = Imputer(missing_values = 'NaN', strategy = 'mean', axis = 0)

# We adjust the values transforming the columns where you need data 
# with the fit() method. 
# We pass the dataframe phreatic_level_Jan12_Apr12_lots_nodes and 
# with the variables where I can find the empty values, in this case 
# all variables [::] and fit 
imputer = imputer.fit(phreatic_level_lots_nodes_l12w[::])
phreatic_level_lots_nodes_l12w = imputer.transform(phreatic_level_lots_nodes_l12w[::])

The previous step generate the ** `phreatic_level_lots_nodes_l12w` ** array which don't have null values

In [11]:
phreatic_level_lots_nodes_l12w

array([[0.65      , 0.57      , 0.76      , ..., 0.88      , 0.64      ,
        0.94      ],
       [0.66      , 0.59      , 0.78      , ..., 0.89      , 0.54      ,
        0.95      ],
       [0.68      , 0.6       , 0.79      , ..., 0.9       , 0.54      ,
        0.97      ],
       ...,
       [0.76175645, 0.76317833, 0.89632699, ..., 1.12865523, 0.81245797,
        1.15      ],
       [0.76175645, 0.76317833, 0.89632699, ..., 1.12865523, 0.81245797,
        1.16      ],
       [0.76175645, 0.76317833, 0.89632699, ..., 1.12865523, 0.81245797,
        1.17      ]])

In [12]:
print(phreatic_level_lots_nodes_l12w.shape)
np.isnan(phreatic_level_lots_nodes_l12w)

(2226, 25)


array([[False, False, False, ..., False, False, False],
       [False, False, False, ..., False, False, False],
       [False, False, False, ..., False, False, False],
       ...,
       [False, False, False, ..., False, False, False],
       [False, False, False, ..., False, False, False],
       [False, False, False, ..., False, False, False]])

- Turn the ** `phreatic_level_lots_nodes_l12w` ** array to dataframe

In [14]:
# We check the column names dataset 
phreatic_level_l12w.columns.get_values()[:]

array(['Fecha', 'Hora', 'PORVL2N1', 'Fecha.1', 'Hora.1', 'PORVL2N2',
       'Fecha.2', 'Hora.2', 'PORVL4N1', 'Fecha.3', 'Hora.3', 'PORVL5N1',
       'Fecha.4', 'Hora.4', 'PORVL6N1', 'Fecha.5', 'Hora.5', 'PORVL7N1',
       'Fecha.6', 'Hora.6', 'PORVL8N1', 'Fecha.7', 'Hora.7', 'PORVL9N1',
       'Fecha.8', 'Hora.8', 'PORVL10N1', 'Fecha.9', 'Hora.9', 'PORVL13N1',
       'Fecha.10', 'Hora.10', 'PORVL14N1', 'Fecha.11', 'Hora.11',
       'PORVL15N1', 'Fecha.12', 'Hora.12', 'PORVL16N1', 'Fecha.13',
       'Hora.13', 'PORVL16N2', 'Fecha.14', 'Hora.14', 'PORVL18N1',
       'Fecha.15', 'Hora.15', 'PORVL18N2', 'Fecha.16', 'Hora.16',
       'PORVL18N3', 'Unnamed: 51', 'Fecha.17', 'Hora.17', 'PORVL18N4',
       'Fecha.18', 'Hora.18', 'PORVL21N1', 'Fecha.19', 'Hora.19',
       'PORVL21N2', 'Fecha.20', 'Hora.20', 'PORVL21N3', 'Fecha.21',
       'Hora.21', 'PORVL21N4', 'Fecha.22', 'Hora.22', 'PORVL21N5',
       'Fecha.23', 'Hora.23', 'PORVL24N1', 'Fecha.24', 'Hora.24',
       'PORVL24N2'], dtype=objec

In [15]:
# We refer the columns of interest into col list, to apply the new dataframe
# which are in where there are the phreatic level values
cols =  ['PORVL2N1', 'PORVL2N2', 'PORVL4N1', 'PORVL5N1', 'PORVL6N1', 'PORVL7N1', 'PORVL8N1', 'PORVL9N1', 'PORVL10N1', 
         'PORVL13N1', 'PORVL14N1', 'PORVL15N1', 'PORVL16N1', 'PORVL16N2', 'PORVL18N1', 'PORVL18N2', 'PORVL18N3', 
         'PORVL18N4', 'PORVL21N1', 'PORVL21N2', 'PORVL21N3', 'PORVL21N4', 'PORVL21N5', 'PORVL24N1', 'PORVL24N2']
# index = ['Row'+ str(i) for i in range(1, len(numpy_nivel_freatico)+1)]
# El automaticamente toma el indice para los registros que es cada numero de registro
phreatic_level_lots_nodes_l12w_df = pd.DataFrame(data=phreatic_level_lots_nodes_l12w, columns=cols)

In [16]:
phreatic_level_lots_nodes_l12w_df.head()

print("Have null values? \ ", phreatic_level_lots_nodes_l12w_df.isnull().values.any())
phreatic_level_lots_nodes_l12w_df.isnull().values.any()

Don't have null values \  False


False

## 1.3 Generating descriptive data to dataset
---

In [18]:
phreatic_level_lots_nodes_l12w_describe = phreatic_level_lots_nodes_l12w_df.describe(include='all')

In [19]:
phreatic_level_lots_nodes_l12w_describe

Unnamed: 0,PORVL2N1,PORVL2N2,PORVL4N1,PORVL5N1,PORVL6N1,PORVL7N1,PORVL8N1,PORVL9N1,PORVL10N1,PORVL13N1,...,PORVL18N2,PORVL18N3,PORVL18N4,PORVL21N1,PORVL21N2,PORVL21N3,PORVL21N4,PORVL21N5,PORVL24N1,PORVL24N2
count,2226.0,2226.0,2226.0,2226.0,2226.0,2226.0,2226.0,2226.0,2226.0,2226.0,...,2226.0,2226.0,2226.0,2226.0,2226.0,2226.0,2226.0,2226.0,2226.0,2226.0
mean,0.761756,0.763178,0.896327,0.641683,1.410667,1.074166,1.091391,0.939778,0.973414,0.968387,...,0.871736,1.200351,1.224299,1.082222,0.938562,0.867892,1.01063,1.128655,0.812458,1.19624
std,0.166723,0.186391,0.162937,0.151788,0.281124,0.245141,0.170596,0.16434,0.156073,0.156285,...,0.219623,0.208706,0.137803,0.109536,0.154268,0.173257,0.232429,0.203111,0.258961,0.192348
min,0.12,0.06,0.09,0.06,0.19,0.12,0.12,0.06,0.1,0.08,...,0.1,0.29,0.62,0.36,0.09,0.14,0.22,0.15,0.14,0.37
25%,0.71,0.7,0.85,0.53,1.32,0.95,1.05,0.83,0.89,0.9,...,0.76,1.1,1.13,1.03,0.86,0.79,0.9,1.03,0.63,1.12
50%,0.81,0.78,0.96,0.66,1.42,1.15,1.13,0.95,0.95,0.98,...,0.9,1.24,1.224299,1.07,0.94,0.88,1.0,1.17,0.83,1.23
75%,0.87,0.87,1.0,0.75,1.67,1.24,1.19,1.07,1.06,1.07,...,1.04,1.33,1.32,1.16,1.06,1.0,1.2,1.25,1.0,1.33
max,1.02,1.23,1.09,0.94,1.68,1.54,1.31,1.22,1.32,1.22,...,1.36,1.58,1.49,1.47,1.18,1.22,1.47,1.46,1.3,1.49


In [20]:
# Export this descriptive data to comma separated values and java script object notation
phreatic_level_lots_nodes_l12w_describe.to_csv('../../../../data/interim/PhreaticLevel/Latest_12-weeks_May-4_August-03_2018/' +'\n' 
                              'Phreatic_Level_Describe_May-4_August-03_2018.cvs', sep=',', header=True, index=True)
phreatic_level_lots_nodes_l12w_describe.to_json('../../../../data/interim/PhreaticLevel/Latest_12-weeks_May-4_August-03_2018/' +'\n'
                                     'Phreatic_Level_Describe_May-4_August-03_2018.json')

## 2. Creating Phreatic Level Training and Testing datasets

We have a **`phreatic_level_lots_nodes_l12w_df`** dataset with 2226 samples rows.


In [21]:
print(phreatic_level_lots_nodes_l12w_df.shape)
phreatic_level_lots_nodes_l12w_df.head()

(2226, 25)


Unnamed: 0,PORVL2N1,PORVL2N2,PORVL4N1,PORVL5N1,PORVL6N1,PORVL7N1,PORVL8N1,PORVL9N1,PORVL10N1,PORVL13N1,...,PORVL18N2,PORVL18N3,PORVL18N4,PORVL21N1,PORVL21N2,PORVL21N3,PORVL21N4,PORVL21N5,PORVL24N1,PORVL24N2
0,0.65,0.57,0.76,0.46,1.07,0.75,0.99,0.9,0.79,0.79,...,0.54,0.8,1.0,1.11,0.76,0.74,0.73,0.88,0.64,0.94
1,0.66,0.59,0.78,0.47,1.06,0.77,1.0,0.91,0.8,0.8,...,0.55,0.81,1.0,1.09,0.77,0.74,0.74,0.89,0.54,0.95
2,0.68,0.6,0.79,0.47,1.05,0.78,1.01,0.91,0.8,0.8,...,0.57,0.82,1.01,1.09,0.78,0.74,0.75,0.9,0.54,0.97
3,0.69,0.61,0.8,0.47,1.05,0.8,1.02,0.92,0.81,0.81,...,0.58,0.83,1.01,1.1,0.79,0.74,0.76,0.91,0.55,0.98
4,0.7,0.62,0.81,0.47,1.05,0.81,1.02,0.93,0.82,0.82,...,0.59,0.84,1.02,1.1,0.79,0.74,0.77,0.92,0.55,0.99


We'll divide it into two differents datasets:

- Training dataset
- Testing dataset

This is executed through [train_test_split](http://scikit-learn.org/stable/modules/generated/sklearn.model_selection.train_test_split.html  "sklearn.model_selection.train_test_split") function of this way:

`train_test_split` receive as a data parameter a numpy array, we have to turn the 

**`phreatic_level_lots_nodes_l12w_df`** dataframe to numpy array such as follow:

In [22]:
# numpy_phreatic_level_lots_nodes_l12w = phreatic_level_lots_nodes_l12w_df.reset_index().values
numpy_phreatic_level_lots_nodes_l12w = phreatic_level_lots_nodes_l12w_df.values

In [23]:
# My numpy_phreatic_level_lots_nodes variable now is a numpy array
print(numpy_phreatic_level_lots_nodes_l12w.shape)
print(type(numpy_phreatic_level_lots_nodes_l12w))
numpy_phreatic_level_lots_nodes_l12w

(2226, 25)
<class 'numpy.ndarray'>


array([[0.65      , 0.57      , 0.76      , ..., 0.88      , 0.64      ,
        0.94      ],
       [0.66      , 0.59      , 0.78      , ..., 0.89      , 0.54      ,
        0.95      ],
       [0.68      , 0.6       , 0.79      , ..., 0.9       , 0.54      ,
        0.97      ],
       ...,
       [0.76175645, 0.76317833, 0.89632699, ..., 1.12865523, 0.81245797,
        1.15      ],
       [0.76175645, 0.76317833, 0.89632699, ..., 1.12865523, 0.81245797,
        1.16      ],
       [0.76175645, 0.76317833, 0.89632699, ..., 1.12865523, 0.81245797,
        1.17      ]])

We compose the following datasets from **`numpy_phreatic_level_lots_nodes_l12w`** array :

- `numpy_phreatic_level_lots_nodes_l12w_train`, which is the training matrix
- `numpy_phreatic_level_lots_nodes_l12w_test`, the testing matrix

In [27]:
numpy_phreatic_level_lots_nodes_l12w_train, numpy_phreatic_level_lots_nodes_l12w_test = train_test_split(numpy_phreatic_level_lots_nodes_l12w, test_size = 0.2, random_state = 0)

In [28]:
# We have 3904 rows to luminosity_luxes_train
print(type(numpy_phreatic_level_lots_nodes_l12w_train))
print("The dimensionality of wind_direction training dataset is: " +'\n' , numpy_phreatic_level_lots_nodes_l12w_train.shape)
print('\n')

# And we have 976 rows to luminosity_luxes_test
print(type(numpy_phreatic_level_lots_nodes_l12w_test))
print("The dimensionality of wind_direction testing dataset is: " +'\n' , numpy_phreatic_level_lots_nodes_l12w_test.shape)

<class 'numpy.ndarray'>
The dimensionality of wind_direction training dataset is: 
 (1780, 25)


<class 'numpy.ndarray'>
The dimensionality of wind_direction testing dataset is: 
 (446, 25)


---
#  3. Feature Scaling Luminosity training and testing dataset  
---

[This article post](http://benalexkeen.com/feature-scaling-with-scikit-learn/  "Feature Scaling with scikit-learn") it's a great reference to explore the features scaling methods
on scikit learn

- `StandardScaler` assume that data is normally distributed at the level of each characteristic or variable. If the data is not normally distributed, it is not the best alternative to use for scaling.  

- `Min-Max Scaler` it is probably the most famous scaling algorithm and what it does is resize the range to leave it in a dimension of 0 to 1 or -1 and 1 (in case there are negative values in the original dataset of input data)

This scale of maximums and minimums works best for cases where standard scaling may not work properly. If the distribution is not Gaussian or the standard deviation is very small the escalation of maximums and minimums is the best idea.
However, it is sensitive to outliers or outliers, so if there are outliers in the data it is better to consider robust scaling.

- `Robust Scaler` it is similar to the previous one of maximums and minimums, only that it uses interquartile ranges instead of maximums and minimums, which makes it robust for the outliers

- `Normalizer` which scales each value, dividing each value by its magnitude in n dimensional spaces for n number of characteristics.


---

We will use the scaling of maximums and minimums to scale the phreatic level data, because the standard deviation is very small, 

it does not have atypical values and it does not follow a normal distribution (you have to check this)

We apply the maximum and minimum scaling. We provide a rank or base scale that **will be between 0 and 1** using an object,

[MinMaxScaler](http://scikit-learn.org/stable/modules/generated/sklearn.preprocessing.MinMaxScaler.html "sklearn.preprocessing.MinMaxScaler") which transforms each characteristic, (in this 

case it will be the columns feature of `PORVL.*`) individually according to a given range.

Product of its applicability, generates these attributes in the dataset, already transformed: 

![alt text](https://cldup.com/lTIv4HXgTk-3000x3000.png "sklearn.preprocessing.MinMaxScaler")

## 3.1 We apply maximium and minimum feature scaling to Phreatic level training dataset

In [29]:
# We provide a base scale range
scaler = MinMaxScaler(feature_range=(0, 1))

print("Remember our phreatic level training data " + '\n', numpy_phreatic_level_lots_nodes_l12w_train)

Remember our phreatic level training data 
 [[0.53 0.39 0.59 ... 1.21 0.45 1.29]
 [0.52 0.29 0.59 ... 1.21 0.43 1.29]
 [0.56 0.44 0.77 ... 0.71 0.58 1.23]
 ...
 [0.95 0.83 1.04 ... 1.46 1.17 1.42]
 [1.01 0.87 0.38 ... 1.34 0.56 1.46]
 [0.68 0.72 0.77 ... 0.68 0.61 0.55]]


With the [fit](http://scikit-learn.org/stable/modules/generated/sklearn.preprocessing.MinMaxScaler.html#sklearn.preprocessing.MinMaxScaler.fit  "MinMaxScaler.fit")
we compute the maximum and minimum value of  `numpy_phreatic_level_lots_nodes_l12w_train` dataset to be used in the subsequent scaling 

We assing these values to `minmax_scale_training` variable.

In [30]:
minmax_scale_training = scaler.fit(numpy_phreatic_level_lots_nodes_l12w_train.astype(float))
# print(minmax_scale_training.data_max_)
# http://terrapinssky.blogspot.com/2017/10/pythonresolved-dataconversionwarning.html

Then, we apply the [transform](http://scikit-learn.org/stable/modules/generated/sklearn.preprocessing.MinMaxScaler.html#sklearn.preprocessing.MinMaxScaler.transform "MinMaxScaler.transform") method to transform these data to maximum and mínimum scale value. 

Here, with this process, the `numpy_phreatic_level_lots_nodes_l12w_train` data are scaled between **0 to 1**  selected range 

In [31]:
# transform luminosity_luxes_train data to maximum and mínimum scale value. 
phreatic_level_lots_nodes_l12w_minmax_training = minmax_scale_training.transform(numpy_phreatic_level_lots_nodes_l12w_train)

In [32]:
print ("And, these are our scaled data: " + '\n')
phreatic_level_lots_nodes_l12w_minmax_training

And, these are our scaled data: 



array([[0.45555556, 0.27826087, 0.5       , ..., 0.80916031, 0.26956522,
        0.82142857],
       [0.44444444, 0.19130435, 0.5       , ..., 0.80916031, 0.25217391,
        0.82142857],
       [0.48888889, 0.32173913, 0.68      , ..., 0.42748092, 0.3826087 ,
        0.76785714],
       ...,
       [0.92222222, 0.66086957, 0.95      , ..., 1.        , 0.89565217,
        0.9375    ],
       [0.98888889, 0.69565217, 0.29      , ..., 0.90839695, 0.36521739,
        0.97321429],
       [0.62222222, 0.56521739, 0.68      , ..., 0.40458015, 0.40869565,
        0.16071429]])

In [33]:
print('Phreatic Level Training dataset. Minimum value after MaxMinScaler:\nLuminosity={:.1f}'
      .format(phreatic_level_lots_nodes_l12w_minmax_training[:,0].min()))

print('Phreatic Level Training dataset. Maximum value after MaxMinScaler:\nLuminosity={:.1f}'
      .format(phreatic_level_lots_nodes_l12w_minmax_training[:,0].max()))

Phreatic Level Training dataset. Minimum value after MaxMinScaler:
Luminosity=0.0
Phreatic Level Training dataset. Maximum value after MaxMinScaler:
Luminosity=1.0


Then, our  MinMaxScaler normalized training dataset is **`phreatic_level_lots_nodes_l12w_minmax_training`** numpy array

- We export this array to comma separated values 

In [34]:
phreatic_level_lots_nodes_l12w_train_df = pd.DataFrame(phreatic_level_lots_nodes_l12w_minmax_training, columns=cols)

In [35]:
print(phreatic_level_lots_nodes_l12w_train_df.shape)
phreatic_level_lots_nodes_l12w_train_df.head()

(1780, 25)


Unnamed: 0,PORVL2N1,PORVL2N2,PORVL4N1,PORVL5N1,PORVL6N1,PORVL7N1,PORVL8N1,PORVL9N1,PORVL10N1,PORVL13N1,...,PORVL18N2,PORVL18N3,PORVL18N4,PORVL21N1,PORVL21N2,PORVL21N3,PORVL21N4,PORVL21N5,PORVL24N1,PORVL24N2
0,0.455556,0.278261,0.5,0.693182,0.818792,0.753521,0.907563,0.508621,0.827869,0.709091,...,0.433333,0.674419,0.758621,0.495495,0.981308,0.712963,0.632,0.80916,0.269565,0.821429
1,0.444444,0.191304,0.5,0.693182,0.818792,0.760563,0.907563,0.517241,0.827869,0.718182,...,0.425,0.682171,0.747126,0.495495,0.981308,0.712963,0.632,0.80916,0.252174,0.821429
2,0.488889,0.321739,0.68,0.045455,0.657718,0.514085,0.756303,0.517241,0.942623,0.590909,...,0.408333,0.674419,0.574713,0.531532,0.774357,0.37037,0.32,0.427481,0.382609,0.767857
3,0.244444,0.608696,0.69,0.579545,0.771812,0.302817,0.361345,0.62069,0.770492,0.636364,...,0.075,0.589147,0.666667,0.540541,0.682243,0.675926,0.448,0.656489,0.078261,0.857143
4,0.8,0.86087,0.89,0.818182,0.818792,0.809859,0.890756,0.741379,0.786885,0.836364,...,0.741667,0.75969,0.770115,0.738739,0.607477,0.703704,0.704,0.763359,0.391304,0.830357


In this way we have the dataset `phreatic_level_lots_nodes_l12w_train_df` standardized and training, and export it to a .csv file

In [36]:
phreatic_level_lots_nodes_l12w_train_df.to_csv('../../../../data/processed/PhreaticLevel/Latest_12-weeks_May-4_August-03_2018/' +'\n' 
                                 'Phreatic-Level_Normalized_TRAINING_May-4_August-03_2018.csv', sep=',', header=True, index=False)

## 3.2 We apply maximium and minimum feature scaling to phreatic level testing dataset

In [37]:
print("Remember our phreatic level testing data " + '\n', numpy_phreatic_level_lots_nodes_l12w_test)

Remember our phreatic level testing data 
 [[0.68 0.66 0.78 ... 1.09 0.54 0.39]
 [0.9  1.   1.04 ... 1.3  1.19 1.4 ]
 [0.83 0.89 0.99 ... 1.16 0.87 1.24]
 ...
 [0.82 0.73 0.96 ... 1.4  1.04 1.36]
 [0.84 1.03 0.99 ... 1.15 0.57 1.3 ]
 [0.86 0.93 1.   ... 1.18 0.92 1.28]]


In [38]:
minmax_scale_test = scaler.fit(numpy_phreatic_level_lots_nodes_l12w_test.astype(float))
# transform phreatic level data to maximum and mínimum scale value. 
phreatic_level_lots_nodes_l12w_minmax_test = minmax_scale_test.transform(numpy_phreatic_level_lots_nodes_l12w_test)

In [39]:
print ("And, these are our testing scaled data: " + '\n')
phreatic_level_lots_nodes_l12w_minmax_test

And, these are our testing scaled data: 



array([[0.60465116, 0.51282051, 0.63953488, ..., 0.64423077, 0.32142857,
        0.        ],
       [0.86046512, 0.8034188 , 0.94186047, ..., 0.84615385, 0.90178571,
        0.93518519],
       [0.77906977, 0.70940171, 0.88372093, ..., 0.71153846, 0.61607143,
        0.78703704],
       ...,
       [0.76744186, 0.57264957, 0.84883721, ..., 0.94230769, 0.76785714,
        0.89814815],
       [0.79069767, 0.82905983, 0.88372093, ..., 0.70192308, 0.34821429,
        0.84259259],
       [0.81395349, 0.74358974, 0.89534884, ..., 0.73076923, 0.66071429,
        0.82407407]])

In [40]:
print('Phreatic Level Testing dataset. Minimum value after MaxMinScaler:\nLuminosity={:.1f}'
      .format(phreatic_level_lots_nodes_l12w_minmax_test[:,0].min()))

print('Phreatic Level Testing dataset. Maximum value after MaxMinScaler:\nLuminosity={:.1f}'
      .format(phreatic_level_lots_nodes_l12w_minmax_test[:,0].max()))

Phreatic Level Testing dataset. Minimum value after MaxMinScaler:
Luminosity=0.0
Phreatic Level Testing dataset. Maximum value after MaxMinScaler:
Luminosity=1.0


Then, our  MinMaxScaler normalized training dataset is `phreatic_level_lots_nodes_l12w_minmax_test` numpy array

- We export this array to comma separated values

In [43]:
phreatic_level_lots_nodes_l12w_test_df = pd.DataFrame(phreatic_level_lots_nodes_l12w_minmax_test, columns=cols)

In [44]:
print(phreatic_level_lots_nodes_l12w_test_df.shape)
phreatic_level_lots_nodes_l12w_test_df.head()

(446, 25)


Unnamed: 0,PORVL2N1,PORVL2N2,PORVL4N1,PORVL5N1,PORVL6N1,PORVL7N1,PORVL8N1,PORVL9N1,PORVL10N1,PORVL13N1,...,PORVL18N2,PORVL18N3,PORVL18N4,PORVL21N1,PORVL21N2,PORVL21N3,PORVL21N4,PORVL21N5,PORVL24N1,PORVL24N2
0,0.604651,0.512821,0.639535,0.488636,0.80597,0.281481,0.591304,0.442105,0.377778,0.640351,...,0.487805,0.361905,0.670588,0.447059,0.719626,0.685714,0.540984,0.644231,0.321429,0.0
1,0.860465,0.803419,0.94186,0.511364,0.223881,0.866667,0.973913,0.8,0.988889,0.95614,...,0.804878,0.771429,0.552941,0.764706,0.682243,0.704762,0.598361,0.846154,0.901786,0.935185
2,0.77907,0.709402,0.883721,0.727273,0.977612,0.718519,0.834783,0.642105,0.611111,0.842105,...,0.674797,0.685714,0.823529,0.576471,0.906542,0.771429,0.803279,0.711538,0.616071,0.787037
3,0.790698,0.615385,0.883721,0.727273,1.0,0.740741,0.886957,0.863158,0.644444,0.798246,...,0.642276,0.771429,0.835294,0.623529,0.906542,0.828571,0.754098,0.807692,0.660714,0.805556
4,0.790698,0.991453,0.872093,0.806818,0.791045,0.733333,0.852174,0.673684,0.622222,0.833333,...,0.707317,0.87619,0.717647,0.717647,0.570093,0.638095,0.786885,0.721154,0.767857,0.787037


In [45]:
phreatic_level_lots_nodes_l12w_test_df.to_csv('../../../../data/processed/PhreaticLevel/Latest_12-weeks_May-4_August-03_2018/' +'\n' 
                                 'Phreatic-Level_Normalized_TESTING_May-4_August-03.csv', sep=',', header=True, index=False)