In [46]:
from __future__ import unicode_literals

import pandas as pd
import numpy as np
from sklearn import preprocessing
from matplotlib import pyplot as plt
from sklearn.preprocessing import MinMaxScaler
from sklearn.preprocessing import LabelEncoder, OneHotEncoder
from sklearn.model_selection import train_test_split

# 1. Preparing data - Latest 12 weeks of life of the banana plant
---

In [47]:
# Raw light-sensor export for the 12-week window; the path is relative to this notebook.
raw_luminosity_path = ('../../../../data/raw/FincaPorvenir/Metereologico/Latest_12-weeks_May-4_August-03_2018/'
                       'Luz_May-4_August-03_2018.csv')
luminosity_lw12 = pd.read_csv(raw_luminosity_path)

In [48]:
# Dimensions first (rows, columns), then a peek at the first five readings.
n_rows, n_columns = luminosity_lw12.shape
print((n_rows, n_columns))
luminosity_lw12.iloc[:5]

(4398, 2)


Unnamed: 0,Fecha:,Luz (lux)
0,2018-05-04 00:07:38,1
1,2018-05-04 00:37:39,1
2,2018-05-04 01:07:38,1
3,2018-05-04 01:37:38,1
4,2018-05-04 02:07:38,1


## 1.1. We evaluate if this dataset has null type `NaN`
---

In [49]:
# Boolean mask of missing cells, computed once and reused for both checks.
missing_mask = luminosity_lw12.isnull()
# Per-column answer: does this column contain at least one missing value?
print(missing_mask.any())
# Overall answer for the whole frame.
missing_mask.values.any()

Fecha:       False
Luz (lux)    False
dtype: bool


False

The dataset has no null values.

## 1.2. Selecting  relevant index columns features
---

Since the dataset has a column called **`Fecha:`**, which is not a numerical value,

it will be removed so that it does not interfere **with our subsequent scaling**.

We therefore reference only the values (samples) of the column

**`Luz (lux)`** and assign them to the array

`luminosity_luxes_l12w`, created as follows:


In [50]:
# Keep only the column at position 1 (the lux readings); the `Fecha:` column is left behind.
lux_column = luminosity_lw12.iloc[:, 1]
# `.values` (not `iloc`) is what turns the pandas Series into a numpy array.
luminosity_luxes_l12w = lux_column.values

print(type(luminosity_luxes_l12w))

luminosity_luxes_l12w

<class 'numpy.ndarray'>


array([1, 1, 1, ..., 1, 1, 1])

In [51]:
# Reshape the lux readings into a single-column 2-D array: one row per sample.
luminosity_luxes_l12w = np.reshape(luminosity_luxes_l12w, (-1, 1))

luminosity_luxes_l12w

array([[1],
       [1],
       [1],
       ...,
       [1],
       [1],
       [1]])

## 1.3 Generating descriptive data to dataset
---

In [52]:
# Column label reused by every DataFrame built from the lux array in this notebook.
col = ['Luz (lux)']
# Wrap the single-column array in a labelled DataFrame ...
luminosity_luxes_l12w_df = pd.DataFrame(data=luminosity_luxes_l12w, columns=col)
# ... and compute its summary statistics.
luminosity_luxes_l12w_df_describe = luminosity_luxes_l12w_df.describe()

In [53]:
# Display the summary statistics produced by describe() for the `Luz (lux)` column.
luminosity_luxes_l12w_df_describe

Unnamed: 0,Luz (lux)
count,4398.0
mean,10973.295134
std,14654.471959
min,1.0
25%,1.0
50%,990.0
75%,21192.0
max,53280.0


In [54]:
# https://pandas.pydata.org/pandas-docs/stable/generated/pandas.DataFrame.to_csv.html
# Export the descriptive statistics as comma-separated values and as JSON.
# The directory and the file name are joined directly: any extra character placed
# between them (such as a newline) would become part of the file name on disk.
describe_export_dir = '../../../../data/interim/Luminosity/Latest_12-weeks_May-4_August-03_2018/'
luminosity_luxes_l12w_df_describe.to_csv(describe_export_dir + 'Luminosity_Describe_May-4_August-03.csv',
                                         sep=',', header=True, index=True)
luminosity_luxes_l12w_df_describe.to_json(describe_export_dir + 'Luminosity_Describe_May-4_August-03.json')

## 2. Creating luminosity Training and Testing datasets

We have a **`luminosity_luxes_l12w_df`** dataset with 4398 sample rows.

In [55]:
# Dimensions first (rows, columns), then the first five samples.
n_samples, n_features = luminosity_luxes_l12w_df.shape
print((n_samples, n_features))
luminosity_luxes_l12w_df.iloc[:5]

(4398, 1)


Unnamed: 0,Luz (lux)
0,1
1,1
2,1
3,1
4,1


We'll divide it into two different datasets:

- Training dataset
- Testing dataset

This is done with the [train_test_split](http://scikit-learn.org/stable/modules/generated/sklearn.model_selection.train_test_split.html  "sklearn.model_selection.train_test_split") function.

We will pass `train_test_split` a numpy array, so we first convert the 

**`luminosity_luxes_l12w_df`** dataframe to a numpy array as follows:

In [56]:
# `.values` exposes the DataFrame contents as a plain numpy array.
numpy_luminosity_luxes_l12w = luminosity_luxes_l12w_df.values

# Confirm the conversion: print the shape and the type, then show the array itself.
for array_fact in (numpy_luminosity_luxes_l12w.shape, type(numpy_luminosity_luxes_l12w)):
    print(array_fact)
numpy_luminosity_luxes_l12w

(4398, 1)
<class 'numpy.ndarray'>


array([[1],
       [1],
       [1],
       ...,
       [1],
       [1],
       [1]])

We build the following datasets from the **`numpy_luminosity_luxes_l12w`** array:

- `luminosity_luxes_l12w_train`, which is the training matrix
- `luminosity_luxes_l12w_test`, the testing matrix

We use the [train_test_split](http://scikit-learn.org/stable/modules/generated/sklearn.model_selection.train_test_split.html  "sklearn.model_selection.train_test_split") function to create the training and testing dataset.

### 2.1  Creating Training and testing luminosity datasets
---

- `luminosity_luxes_l12w_train`, which is the training matrix
- `luminosity_luxes_l12w_test`, the testing matrix

In [57]:
# Fixed seed for the shuffle performed by train_test_split, so that a
# "Restart kernel -> Run all" reproduces the same split (and the same exported files).
SPLIT_RANDOM_STATE = 42

# Hold out 20% of the samples for testing; the remaining 80% are used for training.
luminosity_luxes_l12w_train, luminosity_luxes_l12w_test = train_test_split(
    numpy_luminosity_luxes_l12w, test_size=0.2, random_state=SPLIT_RANDOM_STATE)

# Training split: type and dimensionality
print(type(luminosity_luxes_l12w_train))
print("The dimensionality of luminosity training dataset is: " +'\n' , luminosity_luxes_l12w_train.shape)
print('\n')

# Testing split: type and dimensionality
print(type(luminosity_luxes_l12w_test))
print("The dimensionality of luminosity testing dataset is: " +'\n' , luminosity_luxes_l12w_test.shape)

<class 'numpy.ndarray'>
The dimensionality of luminosity training dataset is: 
 (3518, 1)


<class 'numpy.ndarray'>
The dimensionality of luminosity testing dataset is: 
 (880, 1)


---
##  3. Feature Scaling Luminosity training and testing dataset  
---

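We will use min-max scaling to scale the luminosity data. The stated rationale is that the standard deviation is very small, 

that the data do not have atypical values, and that they do not follow a normal distribution (these assumptions still have to be checked: the summary in section 1.3 reports a standard deviation of about 14654 lux against a mean of about 10973 lux, so the first one does not appear to hold).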

We apply min-max scaling. We provide a target range that **will be between 0 and 1** using a

[MinMaxScaler](http://scikit-learn.org/stable/modules/generated/sklearn.preprocessing.MinMaxScaler.html "sklearn.preprocessing.MinMaxScaler") object, which transforms each feature (in this 

case, the single feature column `Luz (lux)`) individually according to the given range.

Once fitted, the scaler exposes the following attributes: 

![alt text](https://cldup.com/lTIv4HXgTk-3000x3000.png "sklearn.preprocessing.MinMaxScaler")

### 3.1 We apply maximum and minimum feature scaling to Luminosity training dataset

In [58]:
# Min-max scaler configured to map the feature onto the range 0 to 1.
scaler = MinMaxScaler(feature_range=(0, 1))

# Show the unscaled training samples for reference before scaling them.
print("Remember our luminosity training data \n", luminosity_luxes_l12w_train)

Remember our luminosity training data 
 [[    1]
 [ 2930]
 [14788]
 ...
 [  485]
 [    1]
 [43032]]


With the [fit](http://scikit-learn.org/stable/modules/generated/sklearn.preprocessing.MinMaxScaler.html#sklearn.preprocessing.MinMaxScaler.fit  "MinMaxScaler.fit")
method we compute the maximum and minimum values of the `luminosity_luxes_l12w_train` dataset, to be used in the subsequent scaling. 

We assign the fitted scaler to the `minmax_scale_training` variable.

In [59]:
# Cast the integer lux readings to float once and use the same array for both
# fit and transform; passing the integer array to transform would still trigger
# the dtype-conversion warning that the cast is meant to avoid.
# http://terrapinssky.blogspot.com/2017/10/pythonresolved-dataconversionwarning.html
luminosity_luxes_l12w_train_float = luminosity_luxes_l12w_train.astype(float)

# Learn the minimum and maximum of the training data (fit returns the scaler itself).
minmax_scale_training = scaler.fit(luminosity_luxes_l12w_train_float)
# print(minmax_scale_training.data_max_)

# Transform the training data to the min-max scale.
luminosity_luxes_l12w_minmax_training = minmax_scale_training.transform(luminosity_luxes_l12w_train_float)

print ("And, these are our scaled data: " + '\n')
luminosity_luxes_l12w_minmax_training

And, these are our scaled data: 



array([[0.        ],
       [0.05497476],
       [0.27753899],
       ...,
       [0.00908425],
       [0.        ],
       [0.80765405]])

In [60]:
# Report the smallest and largest values of the scaled training column.
scaled_training_column = luminosity_luxes_l12w_minmax_training[:, 0]
training_minimum = scaled_training_column.min()
training_maximum = scaled_training_column.max()

print('Luminosity Training dataset. Minimum value after MaxMinScaler:\nLuminosity={:.1f}'
      .format(training_minimum))

print('Luminosity Training dataset. Maximum value after MaxMinScaler:\nLuminosity={:.1f}'
      .format(training_maximum))

Luminosity Training dataset. Minimum value after MaxMinScaler:
Luminosity=0.0
Luminosity Training dataset. Maximum value after MaxMinScaler:
Luminosity=1.0


Then, our  MinMaxScaler normalized training dataset is `luminosity_luxes_l12w_minmax_training` numpy array

- We export this array to comma separated values

In [61]:
# Put the scaled training array back into a DataFrame under the `Luz (lux)` label.
luminosity_luxes_l12w_train_df = pd.DataFrame(
    data=luminosity_luxes_l12w_minmax_training,
    columns=col,
)

In [62]:
# Dimensions of the scaled training frame, then its first five rows.
train_rows, train_columns = luminosity_luxes_l12w_train_df.shape
print((train_rows, train_columns))
luminosity_luxes_l12w_train_df.iloc[:5]

(3518, 1)


Unnamed: 0,Luz (lux)
0,0.0
1,0.054975
2,0.277539
3,0.100884
4,0.445072


In [63]:
# Sanity check: list any scaled training value above 1 (an empty frame means there are none).
training_above_one = luminosity_luxes_l12w_train_df['Luz (lux)'] > 1
luminosity_luxes_l12w_train_df[training_above_one]

Unnamed: 0,Luz (lux)


In this way we have the normalized training dataset `luminosity_luxes_l12w_train_df`, which we export to a .csv file.

In [64]:
# Write the scaled training set to CSV (header row, no index column).
# The directory and file name are adjacent string literals, so they are joined
# with nothing in between; an inserted '\n' would end up inside the file name.
luminosity_luxes_l12w_train_df.to_csv('../../../../data/processed/Luminosity/Latest_12-weeks_May-4_August-03_2018/'
                                      'Luminosity_Normalized_TRAINING_May-4_August-03.csv',
                                      sep=',', header=True, index=False)

### 3.2 We apply maximum and minimum feature scaling to Luminosity testing dataset

In [65]:
# Scale the testing data with the scaler that was fitted on the TRAINING data.
# Calling scaler.fit() on the test set would leak test-set statistics into the
# preprocessing, would put train and test on different scales, and - because fit()
# returns the same scaler object - would overwrite the parameters held by
# `minmax_scale_training`.
minmax_scale_test = minmax_scale_training  # name kept for the cells that follow

# Transform only; scaled test values may fall slightly outside [0, 1] when the
# test split contains readings beyond the training minimum or maximum.
luminosity_luxes_l12w_minmax_test = minmax_scale_test.transform(luminosity_luxes_l12w_test.astype(float))



In [66]:
# Announce the scaled testing data and report its dimensions (rows, columns).
print("And, these are our testing scaled data: \n")
scaled_test_shape = luminosity_luxes_l12w_minmax_test.shape
print(scaled_test_shape)

And, these are our testing scaled data: 

(880, 1)


In [67]:
# Report the smallest and largest values of the scaled testing column.
scaled_testing_column = luminosity_luxes_l12w_minmax_test[:, 0]
testing_minimum = scaled_testing_column.min()
testing_maximum = scaled_testing_column.max()

print('Luminosity Testing dataset. Minimum value after MaxMinScaler:\nLuminosity={:.1f}'
      .format(testing_minimum))

print('Luminosity Testing dataset. Maximum value after MaxMinScaler:\nLuminosity={:.1f}'
      .format(testing_maximum))

Luminosity Testing dataset. Minimum value after MaxMinScaler:
Luminosity=0.0
Luminosity Testing dataset. Maximum value after MaxMinScaler:
Luminosity=1.0


Then, our MinMaxScaler-normalized testing dataset is the `luminosity_luxes_l12w_minmax_test` numpy array

- We export this array to comma separated values

In [68]:
# Put the scaled testing array back into a DataFrame under the `Luz (lux)` label.
luminosity_luxes_l12w_test_df = pd.DataFrame(
    data=luminosity_luxes_l12w_minmax_test,
    columns=col,
)

In [69]:
# Dimensions of the scaled testing frame, then its first five rows.
test_rows, test_columns = luminosity_luxes_l12w_test_df.shape
print((test_rows, test_columns))
luminosity_luxes_l12w_test_df.iloc[:5]

(880, 1)


Unnamed: 0,Luz (lux)
0,0.807654
1,0.0
2,0.0
3,0.175041
4,0.0


In [70]:
# Sanity check: list any scaled testing value above 1 (an empty frame means there are none).
testing_above_one = luminosity_luxes_l12w_test_df['Luz (lux)'] > 1
luminosity_luxes_l12w_test_df[testing_above_one]

Unnamed: 0,Luz (lux)


In this way we have the normalized testing dataset `luminosity_luxes_l12w_test_df`, which we export to a .csv file.

In [71]:
# Write the scaled testing set to CSV (header row, no index column).
# The directory and file name are adjacent string literals, so they are joined
# with nothing in between; an inserted '\n' would end up inside the file name.
luminosity_luxes_l12w_test_df.to_csv('../../../../data/processed/Luminosity/Latest_12-weeks_May-4_August-03_2018/'
                                     'Luminosity_Normalized_TESTING_May-4_August-03.csv',
                                     sep=',', header=True, index=False)