# Generación del clasificador para explotación 
## Grado en Ingeniería Informática. Universidad de Burgos
**Autor:** Alicia Olivares Gil

In [4]:
%matplotlib inline 
#para dibujar en el propio notebook
import pandas as pd # se importa pandas como pd
import numpy as np  #numpy como np
import matplotlib.pyplot as plt #matplotlib.pyplot como plot
import pickle as pk
import loadData as ld
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix
from sklearn import metrics as mtr
import seaborn as sns
import tsfresh as tf

### Cargar días de la crisis 1 y 2: 

In [5]:
with open('diacrisis1.pdd','rb') as f: 
    diacrisis1 = pk.load(f) 
print(len(diacrisis1))
diacrisis1.head()

121979


Unnamed: 0,DateTime,P1,P2,P3,P4,P5,P6,target
3396348,2018-11-09 21:02:53,0.0,18.852459,22.131148,22.95082,13.934426,6.557377,False
3396346,2018-11-09 21:02:53,0.0,19.672131,22.95082,23.770492,13.934426,7.377049,False
3396345,2018-11-09 21:02:53,0.0,18.852459,22.131148,22.95082,13.934426,6.557377,False
3396347,2018-11-09 21:02:53,0.0,19.672131,22.95082,22.95082,13.934426,6.557377,False
3396349,2018-11-09 21:02:53,0.0,18.852459,22.131148,22.95082,13.934426,6.557377,False


In [6]:
with open('diacrisis2.pdd','rb') as f: 
    diacrisis2 = pk.load(f) 
print(len(diacrisis2))

123525


### Función de extracción de características por ventana: 

In [14]:
import tsfresh as tf

def rolling_extract_features(dataFrame, window, fc_parameters): 
    """
    Calcula las características especificadas por fc_parameters del dataFrame según una ventana. 
    
    Parámetros: 
    dataFrame -- Datos incluyendo las columnas 'DateTime' y presiones 
    window -- ventana para el cálculo de las características 
    fc_parameters -- diccionario con las características que se quieren calcular 
    
    Retorno: 
    features -- DataFrame con las características incluyendo 'DateTime' y características
    """
    
    if len(dataFrame)<window: 
        raise Exception("La ventana debe ser menor o igual a la longitud del DataFrame.")
    
    #preparar el formato de los datos para pasárselos a la función de extracción de características 
    X,datetime = dataFrame, dataFrame['DateTime']
    X['id'] = 1
    X = X.reset_index(drop=True)
    
    #extracción de características por ventana 
    features = pd.DataFrame()
    for i in range(len(X)-window+1): 
        X_rolling = X.iloc[i:i+window]
        features_rolling = tf.extract_features(X_rolling, default_fc_parameters=fc_parameters, column_id='id', column_sort='DateTime')
        features = pd.concat([features,features_rolling],axis=0)
        
    #volver a añadir DateTime
    features.reset_index(drop=True, inplace=True)
    datetime = datetime[window-1:].reset_index(drop=True)
    features = pd.concat([datetime,features],axis=1)
    
    return features

### Extracción de características de los días de la crisis 1 y 2: 

In [15]:
#características a calcular 
#para más info: https://tsfresh.readthedocs.io/en/latest/text/list_of_features.html 
# y https://tsfresh.readthedocs.io/en/latest/text/feature_extraction_settings.html
fc_parameters = {
    "agg_linear_trend": [{"attr":"intercept", "chunk_len":5, "f_agg":"var"},{"attr":"stderr", "chunk_len":5, "f_agg":"min"}],
    "symmetry_looking": [{"r":0.6}],
    "change_quantiles":[{"ql":0.2,"qh":1.0,"isabs":False,"f_agg":"var"},{"ql":0.4,"qh":0.8,"isabs":False,"f_agg":"var"},{"ql":0.4,"qh":0.6,"isabs":True,"f_agg":"var"},{"ql":0.0,"qh":0.6,"isabs":True,"f_agg":"mean"},{"ql":0.4,"qh":1.0,"isabs":True,"f_agg":"mean"}],
    "last_location_of_minimum": None, 
    "number_peaks": [{"n":1}]
}

In [16]:
#ejemplo de cómo obtener las características 

#features_diacrisis1 = rolling_extract_features(X1,90,fc_parameters)
#features_diacrisis2 = rolling_extract_features(X2,90,fc_parameters)
features_prueba = rolling_extract_features(diacrisis1[:90], 90, fc_parameters)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
Feature Extraction: 100%|██████████| 7/7 [00:00<00:00, 503.92it/s]


In [41]:
features_prueba.head()

Unnamed: 0,DateTime,"P1__agg_linear_trend__f_agg_""min""__chunk_len_5__attr_""stderr""","P1__agg_linear_trend__f_agg_""var""__chunk_len_5__attr_""intercept""","P1__change_quantiles__f_agg_""mean""__isabs_True__qh_0.6__ql_0.0","P1__change_quantiles__f_agg_""mean""__isabs_True__qh_1.0__ql_0.4","P1__change_quantiles__f_agg_""var""__isabs_False__qh_0.8__ql_0.4","P1__change_quantiles__f_agg_""var""__isabs_False__qh_1.0__ql_0.2","P1__change_quantiles__f_agg_""var""__isabs_True__qh_0.6__ql_0.4",P1__last_location_of_minimum,P1__number_peaks__n_1,...,"P6__agg_linear_trend__f_agg_""var""__chunk_len_5__attr_""intercept""","P6__change_quantiles__f_agg_""mean""__isabs_True__qh_0.6__ql_0.0","P6__change_quantiles__f_agg_""mean""__isabs_True__qh_1.0__ql_0.4","P6__change_quantiles__f_agg_""var""__isabs_False__qh_0.8__ql_0.4","P6__change_quantiles__f_agg_""var""__isabs_False__qh_1.0__ql_0.2","P6__change_quantiles__f_agg_""var""__isabs_True__qh_0.6__ql_0.4",P6__last_location_of_minimum,P6__number_peaks__n_1,P6__symmetry_looking__r_0.6,target
0,2018-11-09 21:04:00,0.053178,3.726911,1.163048,0.231983,0.0,4.64714,0.0,0.577778,13.0,...,0.025146,0.011228,0.149031,0.0,0.122157,0.0,1.0,8.0,1.0,False


### Guardar características: 

In [None]:
#guardar resultados
with open('seleccion_genetico_diacrisis1_prc.pdd','wb') as f:
    pk.dump(features_diacrisis1,f)
with open('seleccion_genetico_diacrisis2_prc.pdd','wb') as f: 
    pk.dump(features_diacrisis2,f)

### Cargar características: 

In [27]:
features_diacrisis1 = pk.load(open('seleccion_genetico_diacrisis1_prc.pdd','rb'))
features_diacrisis2 = pk.load(open('seleccion_genetico_diacrisis2_prc.pdd','rb'))
#concatenar los dos días 
features = pd.concat([features_diacrisis1,features_diacrisis2], axis = 0, ignore_index = True)
features.head()

Unnamed: 0,DateTime,"P1__agg_linear_trend__f_agg_""var""__chunk_len_5__attr_""intercept""","P2__agg_linear_trend__f_agg_""var""__chunk_len_5__attr_""intercept""","P3__agg_linear_trend__f_agg_""var""__chunk_len_5__attr_""intercept""","P4__agg_linear_trend__f_agg_""var""__chunk_len_5__attr_""intercept""","P5__agg_linear_trend__f_agg_""var""__chunk_len_5__attr_""intercept""","P6__agg_linear_trend__f_agg_""var""__chunk_len_5__attr_""intercept""",P1__symmetry_looking__r_0.6000000000000001,P2__symmetry_looking__r_0.6000000000000001,P3__symmetry_looking__r_0.6000000000000001,...,"P4__change_quantiles__f_agg_""mean""__isabs_True__qh_1.0__ql_0.4","P5__change_quantiles__f_agg_""mean""__isabs_True__qh_1.0__ql_0.4","P6__change_quantiles__f_agg_""mean""__isabs_True__qh_1.0__ql_0.4","P1__agg_linear_trend__f_agg_""min""__chunk_len_5__attr_""stderr""","P2__agg_linear_trend__f_agg_""min""__chunk_len_5__attr_""stderr""","P3__agg_linear_trend__f_agg_""min""__chunk_len_5__attr_""stderr""","P4__agg_linear_trend__f_agg_""min""__chunk_len_5__attr_""stderr""","P5__agg_linear_trend__f_agg_""min""__chunk_len_5__attr_""stderr""","P6__agg_linear_trend__f_agg_""min""__chunk_len_5__attr_""stderr""",target
0,2018-11-09 21:04:01,3.726911,0.14836,0.006601,-0.11127,0.010687,0.025146,1.0,1.0,1.0,...,0.297131,0.219215,0.149031,0.053178,1.711944e-16,0.031777,0.038042,0.025058,0.008302,False
1,2018-11-09 21:04:02,3.56975,0.099326,-0.060664,-0.1273,-0.024831,0.015087,1.0,1.0,1.0,...,0.290517,0.221794,0.141323,0.053178,0.01660484,0.031777,0.038042,0.025058,0.008302,False
2,2018-11-09 21:04:03,3.222425,0.100897,-0.03709,-0.055321,-0.004715,-0.006915,1.0,1.0,1.0,...,0.283733,0.224434,0.133435,0.064143,0.01660484,0.031777,0.038449,0.025058,0.008302,False
3,2018-11-09 21:04:03,3.662475,0.133587,-0.037719,-0.05312,-0.010373,0.008487,1.0,1.0,1.0,...,0.287418,0.227138,0.133435,0.049745,0.01660484,0.030731,0.038473,0.023893,0.008302,False
4,2018-11-09 21:04:04,3.99597,0.143645,0.034261,0.02766,0.036461,0.010373,1.0,1.0,1.0,...,0.291199,0.229908,0.133435,0.052657,0.01660484,0.030731,0.036501,0.024316,0.008302,False


### Entrenar clasificador 

In [32]:
X = features.drop(['DateTime','target'], axis=1)
y = features['target']

In [35]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import cross_val_score
from sklearn import metrics as mtr

rfc = RandomForestClassifier(n_estimators=100, max_depth=2, random_state=0)

rfc.fit(X,y)

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=2, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=100, n_jobs=None,
            oob_score=False, random_state=0, verbose=0, warm_start=False)

### Guardar clasificador

In [36]:
with open('rfc.pkl','wb') as f: 
    pk.dump(rfc,f)