In [1]:
from __future__ import unicode_literals

import pandas as pd
import numpy as np
from sklearn import preprocessing
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import train_test_split

# 1. Preparing data - Latest 12 weeks of life of the banana plant
---

In [2]:
precipitations_l12w_df = pd.read_csv('../../../../data/raw/FincaPorvenir/Metereologico/Latest_12-weeks_May-4_August-03_2018/' \
                                 'Precipitacion_May-4_August-03_2018.csv', )

In [3]:
print(precipitations_l12w_df.shape)
precipitations_l12w_df.head()

(4397, 2)


Unnamed: 0,Fecha:,Precipitación (ml)
0,2018-05-04 00:07:38,0.0
1,2018-05-04 00:37:39,0.0
2,2018-05-04 01:07:38,0.0
3,2018-05-04 01:37:38,0.0
4,2018-05-04 02:07:38,0.0


## 1.1. We evaluate if this dataset has null type `NaN`
---

In [4]:
print(precipitations_l12w_df.isnull().any())
precipitations_l12w_df.isnull().values.any()

Fecha:                False
Precipitación (ml)    False
dtype: bool


False

Don't have null values

## 1.2. Selecting  relevant index columns features
---

Since the dataset has a column called **`Fecha:`**, which is not a numerical value,

it will be removed so that it does not interfere **with our subsequent scaling**, 

so we are only going to reference the values or samples of the column

** `Precipitación (ml)` ** and assigning them to the matrix 

`precipitations_l12w_array` created such as follow:

In [5]:
precipitations_l12w_array = precipitations_l12w_df.iloc[:,1].values

In [6]:
# Rehape the luminosity_luxes array 
precipitations_l12w_array = precipitations_l12w_array.reshape(-1, 1)
precipitations_l12w_array

array([[0.],
       [0.],
       [0.],
       ...,
       [0.],
       [0.],
       [0.]])

## 1.3 Generating descriptive data to dataset
---

In [7]:
col = ['Precipitación (ml)']
precipitations_l12w_df = pd.DataFrame(precipitations_l12w_array, columns=col)
precipitations_l12w_df_describe = precipitations_l12w_df.describe()

In [8]:
precipitations_l12w_df_describe

Unnamed: 0,Precipitación (ml)
count,4397.0
mean,0.169777
std,1.566048
min,0.0
25%,0.0
50%,0.0
75%,0.0
max,44.0


In [9]:
# Export this descriptive data to comma separated values and java script object notation
precipitations_l12w_df_describe.to_csv('../../../../data/interim/Precipitations/Latest_12-weeks_May-4_August-03_2018/' +'\n' 
                              'Precipitations_Describe_May-4_August-03_2018.cvs', sep=',', header=True, index=True)
precipitations_l12w_df_describe.to_json('../../../../data/interim/Precipitations/Latest_12-weeks_May-4_August-03_2018/' +'\n'
                                     'Precipitations_Describe_May-4_August-03_2018.json')

# 2. Creating Precipitations Training and Testing datasets

We have a **`precipitations_l12w_df`** dataset with 9750 samples rows.

- `precipitations_l12w_train`, which is the training matrix
- `precipitations_l12w_test`, the testing matrix


In [10]:
print(precipitations_l12w_df.shape)
precipitations_l12w_df.head()

(4397, 1)


Unnamed: 0,Precipitación (ml)
0,0.0
1,0.0
2,0.0
3,0.0
4,0.0


In [11]:
precipitations_l12w_train, precipitations_l12w_test = train_test_split(precipitations_l12w_array, test_size = 0.2)

In [12]:
# We have 7800 rows to luminosity_luxes_train
print(type(precipitations_l12w_train))
print("The dimensionality of wind_direction training dataset is: " +'\n' , precipitations_l12w_train.shape)
print('\n')

# And we have 1950 rows to luminosity_luxes_test
print(type(precipitations_l12w_test))
print("The dimensionality of wind_direction testing dataset is: " +'\n' , precipitations_l12w_test.shape)

<class 'numpy.ndarray'>
The dimensionality of wind_direction training dataset is: 
 (3517, 1)


<class 'numpy.ndarray'>
The dimensionality of wind_direction testing dataset is: 
 (880, 1)


---
##  3. Feature Scaling Precipitations training and testing dataset  
---

[This article post](http://benalexkeen.com/feature-scaling-with-scikit-learn/  "Feature Scaling with scikit-learn") it's a great reference to explore the features scaling methods
on scikit learn

- `StandardScaler` assume that data is normally distributed at the level of each characteristic or variable. If the data is not normally distributed, it is not the best alternative to use for scaling.  

- `Min-Max Scaler` it is probably the most famous scaling algorithm and what it does is resize the range to leave it in a dimension of 0 to 1 or -1 and 1 (in case there are negative values in the original dataset of input data)

This scale of maximums and minimums works best for cases where standard scaling may not work properly. If the distribution is not Gaussian or the standard deviation is very small the escalation of maximums and minimums is the best idea.
However, it is sensitive to outliers or outliers, so if there are outliers in the data it is better to consider robust scaling.

- `Robust Scaler` it is similar to the previous one of maximums and minimums, only that it uses interquartile ranges instead of maximums and minimums, which makes it robust for the outliers

- `Normalizer` which scales each value, dividing each value by its magnitude in n dimensional spaces for n number of characteristics.


---

### 3.1 We apply maximium and minimum feature scaling to Precipitations training dataset

In [13]:
# We provide a base scale range
scaler = MinMaxScaler(feature_range=(0, 1))

print("Remember our precipitations training data " + '\n', precipitations_l12w_train)

Remember our precipitations training data 
 [[0.]
 [0.]
 [0.]
 ...
 [0.]
 [0.]
 [0.]]


In [14]:
minmax_scale_training = scaler.fit(precipitations_l12w_train.astype(float))

# transform precipitations_train data to maximum and mínimum scale value. 
precipitations_l12w_minmax_training = minmax_scale_training.transform(precipitations_l12w_train)


In [15]:
print ("And, these are our scaled data: " + '\n')
precipitations_l12w_minmax_training

And, these are our scaled data: 



array([[0.],
       [0.],
       [0.],
       ...,
       [0.],
       [0.],
       [0.]])

In [16]:
print('Precipitations Training dataset. Minimum value after MaxMinScaler:\nLuminosity={:.1f}'
      .format(precipitations_l12w_minmax_training[:,0].min()))

print('Precipitations Training dataset. Maximum value after MaxMinScaler:\nLuminosity={:.1f}'
      .format(precipitations_l12w_minmax_training[:,0].max()))

Precipitations Training dataset. Minimum value after MaxMinScaler:
Luminosity=0.0
Precipitations Training dataset. Maximum value after MaxMinScaler:
Luminosity=1.0


Then, our  MinMaxScaler normalized training dataset is `precipitations_l12w_minmax_training` numpy array

- We export this array to comma separated values 


In [17]:
precipitations_l12w_train_df = pd.DataFrame(precipitations_l12w_minmax_training, columns=col)

In [18]:
print(precipitations_l12w_train_df.shape)
precipitations_l12w_train_df.head()

(3517, 1)


Unnamed: 0,Precipitación (ml)
0,0.0
1,0.0
2,0.0
3,0.0
4,0.0


In this way we have the dataset `precipitations_l12w_train_df` standardized and training, and export it to a .csv file

In [19]:
precipitations_l12w_train_df.to_csv('../../../../data/processed/Precipitations/Latest_12-weeks_May-4_August-03_2018/' +'\n' 
                                 'Precipitations_Normalized_TRAINING_May-4_August-03.csv', sep=',', header=True, index=False)

### 3.2 We apply maximium and minimum feature scaling to Precipitations testing dataset

In [20]:
# We provide a base scale range
scaler = MinMaxScaler(feature_range=(0, 1))

# print("Remember our precipitations testing data " + '\n', precipitations_l12w_test)

In [21]:
minmax_scale_test = scaler.fit(precipitations_l12w_test.astype(float))
# transform luminosity_luxes_test data to maximum and mínimum scale value. 
precipitations_l12w_minmax_test = minmax_scale_test.transform(precipitations_l12w_test)

In [22]:
print ("And, these are our testing scaled data: " + '\n')
# precipitations_l12w_minmax_test

And, these are our testing scaled data: 



In [23]:
print('Precipitations Testing dataset. Minimum value after MaxMinScaler:\nLuminosity={:.1f}'
      .format(precipitations_l12w_minmax_test[:,0].min()))

print('Precipitations Testing dataset. Maximum value after MaxMinScaler:\nLuminosity={:.1f}'
      .format(precipitations_l12w_minmax_test[:,0].max()))

Precipitations Testing dataset. Minimum value after MaxMinScaler:
Luminosity=0.0
Precipitations Testing dataset. Maximum value after MaxMinScaler:
Luminosity=1.0


Then, our  MinMaxScaler normalized training dataset is `precipitations_l12w_minmax_test` numpy array

- We export this array to comma separated values

In [24]:
precipitations_l12w_test_df =pd.DataFrame(precipitations_l12w_minmax_test, columns=col)
precipitations_l12w_test_df.head()

Unnamed: 0,Precipitación (ml)
0,0.0
1,0.0
2,0.0
3,0.0
4,0.0


In [25]:
precipitations_l12w_test_df.to_csv('../../../../data/processed/Precipitations/Latest_12-weeks_May-4_August-03_2018/' +'\n' 
                                 'Precipitations_Normalized_TESTING_JMay-4_August-03_2018.csv', sep=',', header=True, index=False)