# Generación del clasificador para explotación 
## Grado en Ingeniería Informática. Universidad de Burgos
**Autor:** Alicia Olivares Gil

In [1]:
%matplotlib inline 
#para dibujar en el propio notebook
import pandas as pd # se importa pandas como pd
import numpy as np  #numpy como np
import matplotlib.pyplot as plt #matplotlib.pyplot como plot
import pickle as pk
import loadData as ld
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix
from sklearn import metrics as mtr
import seaborn as sns
import tsfresh as tf

### Cargar días de la crisis 1 y 2: 

In [2]:
# Load the pickled data for crisis day 1 and print its length.
# NOTE(review): pickle can run arbitrary code while loading; only open trusted files.
with open('diacrisis1.pdd', 'rb') as fh:
    diacrisis1 = pk.load(fh)
print(len(diacrisis1))

121979


In [3]:
# Load the pickled data for crisis day 2 and print its length.
# NOTE(review): pickle can run arbitrary code while loading; only open trusted files.
with open('diacrisis2.pdd', 'rb') as fh:
    diacrisis2 = pk.load(fh)
print(len(diacrisis2))

123525


### Función de extracción de características por ventana: 

In [4]:
import tsfresh as tf

def rolling_extract_features(X, window, fc_parameters):
    """Extract tsfresh features over a sliding window of consecutive rows.

    For every start position ``i`` in ``range(len(X) - window)`` the rows
    ``X.iloc[i:i + window]`` are passed to ``tf.extract_features`` and the
    resulting frames are stacked vertically in window order.

    Note that the final possible window (starting at ``len(X) - window``) is
    not produced, so the result has ``len(X) - window`` windows.

    Parameters
    ----------
    X : pandas.DataFrame
        Input rows; must contain the columns ``'id'`` and ``'DateTime'``,
        which are passed to tsfresh as ``column_id`` and ``column_sort``.
    window : int
        Number of consecutive rows in each window.
    fc_parameters : dict
        Passed to tsfresh as ``default_fc_parameters``.

    Returns
    -------
    pandas.DataFrame
        The per-window feature frames concatenated along axis 0, or an empty
        DataFrame when ``len(X) <= window``.
    """
    # Collect the per-window results first and concatenate once: growing a
    # DataFrame with pd.concat inside the loop re-copies all previous rows on
    # every iteration.
    window_features = [
        tf.extract_features(
            X.iloc[start:start + window],
            default_fc_parameters=fc_parameters,
            column_id='id',
            column_sort='DateTime',
        )
        for start in range(len(X) - window)
    ]
    # pd.concat raises on an empty list; keep returning an empty frame instead.
    if not window_features:
        return pd.DataFrame()
    return pd.concat(window_features, axis=0)

### Extracción de características de los días de la crisis 1 y 2: 

In [5]:
# Features to compute.
# For more information: https://tsfresh.readthedocs.io/en/latest/text/list_of_features.html
# and https://tsfresh.readthedocs.io/en/latest/text/feature_extraction_settings.html
fc_parameters = {
    "agg_linear_trend": [
        {"attr": "intercept", "chunk_len": 5, "f_agg": "var"},
        {"attr": "stderr", "chunk_len": 5, "f_agg": "min"},
    ],
    "symmetry_looking": [
        {"r": 0.6},
    ],
    "change_quantiles": [
        {"ql": 0.2, "qh": 1.0, "isabs": False, "f_agg": "var"},
        {"ql": 0.4, "qh": 0.8, "isabs": False, "f_agg": "var"},
        {"ql": 0.4, "qh": 0.6, "isabs": True, "f_agg": "var"},
        {"ql": 0.0, "qh": 0.6, "isabs": True, "f_agg": "mean"},
        {"ql": 0.4, "qh": 1.0, "isabs": True, "f_agg": "mean"},
    ],
    # No parameters for this feature.
    "last_location_of_minimum": None,
    "number_peaks": [
        {"n": 1},
    ],
}

In [None]:
# Split each crisis day into labels (the 'target' column) and predictors
# (everything else). Every predictor row gets a constant 'id' of 1, the column
# that rolling_extract_features hands to tsfresh as column_id, and the row
# index is renumbered from zero.
y1 = diacrisis1['target']
X1 = diacrisis1.drop(columns=['target']).assign(id=1).reset_index(drop=True)

y2 = diacrisis2['target']
X2 = diacrisis2.drop(columns=['target']).assign(id=1).reset_index(drop=True)

X1.head()

In [None]:
# Example of how to obtain the features (left commented out).
# NOTE(review): the next cell reads features_diacrisis1 / features_diacrisis2,
# so on a fresh kernel these two lines must be run first (or the features
# loaded from the saved files further below). Presumably they are disabled
# because the extraction is slow -- TODO confirm.

#features_diacrisis1 = rolling_extract_features(X1,90,fc_parameters)
#features_diacrisis2 = rolling_extract_features(X2,90,fc_parameters)

### Añadir target a las características calculadas: 

In [None]:
# Attach the labels to the per-window features of each day.
# The first 90 labels are skipped, matching the window length of 90 used in the
# (commented-out) extraction example, so feature rows and labels have the same
# count.
#
# reset_index returns a NEW frame, so its result must be assigned back.
# Otherwise the axis=1 concat aligns on the original feature index instead of
# on row position and the labels do not line up with the feature rows.
features_diacrisis1 = features_diacrisis1.reset_index(drop=True)
y = y1[90:].reset_index(drop=True)
features_diacrisis1 = pd.concat([features_diacrisis1, y], axis=1)

features_diacrisis2 = features_diacrisis2.reset_index(drop=True)
y = y2[90:].reset_index(drop=True)
features_diacrisis2 = pd.concat([features_diacrisis2, y], axis=1)

### Guardar características: 

In [None]:
# Save the results: one pickle file per crisis day.
with open('seleccion_genetico_diacrisis1_prc.pdd', 'wb') as fh:
    pk.dump(features_diacrisis1, fh)

with open('seleccion_genetico_diacrisis2_prc.pdd', 'wb') as fh:
    pk.dump(features_diacrisis2, fh)

### Cargar características: 

In [27]:
# Load the saved per-day feature tables. The files are opened with `with` so
# the handles are closed deterministically (a bare pk.load(open(...)) leaves
# them open until garbage collection).
# NOTE(review): pickle can run arbitrary code while loading; only open trusted files.
with open('seleccion_genetico_diacrisis1_prc.pdd', 'rb') as f:
    features_diacrisis1 = pk.load(f)
with open('seleccion_genetico_diacrisis2_prc.pdd', 'rb') as f:
    features_diacrisis2 = pk.load(f)

# Concatenate the two days into a single table with a fresh 0..n-1 index.
features = pd.concat([features_diacrisis1, features_diacrisis2], axis=0, ignore_index=True)
features.head()

Unnamed: 0,DateTime,"P1__agg_linear_trend__f_agg_""var""__chunk_len_5__attr_""intercept""","P2__agg_linear_trend__f_agg_""var""__chunk_len_5__attr_""intercept""","P3__agg_linear_trend__f_agg_""var""__chunk_len_5__attr_""intercept""","P4__agg_linear_trend__f_agg_""var""__chunk_len_5__attr_""intercept""","P5__agg_linear_trend__f_agg_""var""__chunk_len_5__attr_""intercept""","P6__agg_linear_trend__f_agg_""var""__chunk_len_5__attr_""intercept""",P1__symmetry_looking__r_0.6000000000000001,P2__symmetry_looking__r_0.6000000000000001,P3__symmetry_looking__r_0.6000000000000001,...,"P4__change_quantiles__f_agg_""mean""__isabs_True__qh_1.0__ql_0.4","P5__change_quantiles__f_agg_""mean""__isabs_True__qh_1.0__ql_0.4","P6__change_quantiles__f_agg_""mean""__isabs_True__qh_1.0__ql_0.4","P1__agg_linear_trend__f_agg_""min""__chunk_len_5__attr_""stderr""","P2__agg_linear_trend__f_agg_""min""__chunk_len_5__attr_""stderr""","P3__agg_linear_trend__f_agg_""min""__chunk_len_5__attr_""stderr""","P4__agg_linear_trend__f_agg_""min""__chunk_len_5__attr_""stderr""","P5__agg_linear_trend__f_agg_""min""__chunk_len_5__attr_""stderr""","P6__agg_linear_trend__f_agg_""min""__chunk_len_5__attr_""stderr""",target
0,2018-11-09 21:04:01,3.726911,0.14836,0.006601,-0.11127,0.010687,0.025146,1.0,1.0,1.0,...,0.297131,0.219215,0.149031,0.053178,1.711944e-16,0.031777,0.038042,0.025058,0.008302,False
1,2018-11-09 21:04:02,3.56975,0.099326,-0.060664,-0.1273,-0.024831,0.015087,1.0,1.0,1.0,...,0.290517,0.221794,0.141323,0.053178,0.01660484,0.031777,0.038042,0.025058,0.008302,False
2,2018-11-09 21:04:03,3.222425,0.100897,-0.03709,-0.055321,-0.004715,-0.006915,1.0,1.0,1.0,...,0.283733,0.224434,0.133435,0.064143,0.01660484,0.031777,0.038449,0.025058,0.008302,False
3,2018-11-09 21:04:03,3.662475,0.133587,-0.037719,-0.05312,-0.010373,0.008487,1.0,1.0,1.0,...,0.287418,0.227138,0.133435,0.049745,0.01660484,0.030731,0.038473,0.023893,0.008302,False
4,2018-11-09 21:04:04,3.99597,0.143645,0.034261,0.02766,0.036461,0.010373,1.0,1.0,1.0,...,0.291199,0.229908,0.133435,0.052657,0.01660484,0.030731,0.036501,0.024316,0.008302,False


### Entrenar clasificador 

In [32]:
# Labels are the 'target' column; predictors are every other column except
# the 'DateTime' timestamp.
y = features['target']
X = features.drop(columns=['DateTime', 'target'])

In [35]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import cross_val_score
from sklearn import metrics as mtr

# Random forest of 100 trees of depth 2, with a fixed seed so that training
# is repeatable.
rfc = RandomForestClassifier(
    n_estimators=100,
    max_depth=2,
    random_state=0,
)

# Fit on the full feature table; as the last expression of the cell, the
# fitted estimator is displayed.
rfc.fit(X, y)

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=2, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=100, n_jobs=None,
            oob_score=False, random_state=0, verbose=0, warm_start=False)

### Guardar clasificador

In [36]:
# Persist the trained classifier to disk with pickle.
with open('rfc.pkl', 'wb') as fh:
    pk.dump(rfc, fh)