# Random Forest - Filtrado de características en series temporales
## Grado en Ingeniería Informática. Universidad de Burgos
**Autor:** Alicia Olivares Gil

In [1]:
%matplotlib inline 
#para dibujar en el propio notebook
import pandas as pd # se importa pandas como pd
import numpy as np  #numpy como np
import matplotlib.pyplot as plt #matplotlib.pyplot como plot
import pickle as pk
import loadData as ld
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix
from sklearn import metrics as mtr
import seaborn as sns
import tsfresh as tf

## Cargar características: 

In [2]:
features_diacrisis1_p1 = pk.load(open('features_diacrisis1_p1.pdd','rb'))
features_diacrisis1_p2 = pk.load(open('features_diacrisis1_p2.pdd','rb'))
features_diacrisis1 = pd.concat([features_diacrisis1_p1, features_diacrisis1_p2], axis = 0, ignore_index = True) 
features_diacrisis2_p1 = pk.load(open('features_diacrisis2_p1.pdd','rb'))
features_diacrisis2_p2 = pk.load(open('features_diacrisis2_p2.pdd','rb'))
features_diacrisis2 = pd.concat([features_diacrisis2_p1, features_diacrisis2_p2], axis = 0, ignore_index = True)   
print('Filas característcias crisis 1: ',features_diacrisis1.shape[0])
print('Filas característcias crisis 2: ',features_diacrisis2.shape[0])
features = pd.concat([features_diacrisis1,features_diacrisis2], axis = 0, ignore_index = True)
targets = features.target
features = features.drop(['target'],axis=1)
print('Filas característcias total: ',features.shape[0])
print('Número de características:',features.shape[1])
features.head()

Filas característcias crisis 1:  121889
Filas característcias crisis 2:  123435
Filas característcias total:  245324
Número de características: 4764


Unnamed: 0,P1__abs_energy,P1__absolute_sum_of_changes,"P1__agg_autocorrelation__f_agg_""mean""__maxlag_40","P1__agg_autocorrelation__f_agg_""median""__maxlag_40","P1__agg_autocorrelation__f_agg_""var""__maxlag_40","P1__agg_linear_trend__f_agg_""max""__chunk_len_10__attr_""intercept""","P1__agg_linear_trend__f_agg_""max""__chunk_len_10__attr_""rvalue""","P1__agg_linear_trend__f_agg_""max""__chunk_len_10__attr_""slope""","P1__agg_linear_trend__f_agg_""max""__chunk_len_10__attr_""stderr""","P1__agg_linear_trend__f_agg_""max""__chunk_len_50__attr_""intercept""",...,P6__symmetry_looking__r_0.9,P6__symmetry_looking__r_0.9500000000000001,P6__time_reversal_asymmetry_statistic__lag_1,P6__time_reversal_asymmetry_statistic__lag_2,P6__time_reversal_asymmetry_statistic__lag_3,P6__value_count__value_-1,P6__value_count__value_0,P6__value_count__value_1,P6__variance,P6__variance_larger_than_standard_deviation
0,1159.634507,108.196721,0.149281,0.10734,0.01473,4.35337,0.479257,0.095628,0.066192,4.918033,...,1.0,1.0,-1.602056,-0.864482,0.216349,0.0,0.0,0.0,0.063122,0.0
1,1176.431067,109.016393,0.143872,0.116259,0.015316,4.35337,0.479257,0.095628,0.066192,4.918033,...,1.0,1.0,-2.25915,-1.63291,-1.671789,0.0,0.0,0.0,0.071665,0.0
2,1193.227627,109.016393,0.136717,0.126383,0.014767,4.35337,0.479257,0.095628,0.066192,4.918033,...,1.0,1.0,-0.7509639,-1.524049,-0.780168,0.0,0.0,0.0,0.065113,0.0
3,1217.414673,109.836066,0.132745,0.144309,0.01477,4.31694,0.387298,0.081967,0.073749,4.918033,...,1.0,1.0,-3.875688e-15,-1.524049,-0.780168,0.0,0.0,0.0,0.065113,0.0
4,1241.60172,109.836066,0.126637,0.11889,0.017547,4.371585,0.322749,0.068306,0.075711,4.918033,...,1.0,1.0,-3.875688e-15,-0.755621,-1.560336,0.0,0.0,0.0,0.065113,0.0


## Eliminar atributos con NaN: 

In [3]:
aux = features.shape[1]

features = features.dropna(1)

print('Número de características:',features.shape[1])
print('Se han eliminado', aux-features.shape[1],'características')

Número de características: 3200
Se han eliminado 1564 características


## Eliminar atributos que fallan (por alguna razón que no entiendo): 

In [4]:
aux = features.shape[1]

features = features.drop(['P1__sample_entropy','P2__sample_entropy','P3__sample_entropy','P4__sample_entropy','P5__sample_entropy','P6__sample_entropy'], axis=1)

print('Número de características:',features.shape[1])
print('Se han eliminado', aux-features.shape[1],'características')

Número de características: 3194
Se han eliminado 6 características


## Eliminar características con baja varianza: 

In [5]:
aux = features.shape[1]

from sklearn.feature_selection import VarianceThreshold

sel = VarianceThreshold(threshold=(0.01))

sel.fit(features)
features = features[features.columns[sel.get_support(indices=True)]]

print('Número de características:',features.shape[1])
print('Se han eliminado', aux-features.shape[1],'características')

Número de características: 2561
Se han eliminado 633 características


## Select 1000 Best (chi2): 

In [6]:
from sklearn.preprocessing import MinMaxScaler 

scaler = MinMaxScaler()
columns = features.columns
features[columns] = scaler.fit_transform(features)

In [7]:
aux = features.shape[1]

from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import chi2

sel = SelectKBest(chi2, k=1000)
sel.fit(features, targets)
features = features[features.columns[sel.get_support(indices=True)]]

print('Número de características:',features.shape[1])
print('Se han eliminado', aux-features.shape[1],'características')

Número de características: 1000
Se han eliminado 1561 características


## Select from model (Random Forest): 

In [10]:
aux = features.shape[1]

from sklearn.feature_selection import SelectFromModel
from sklearn.ensemble import RandomForestClassifier

rf = RandomForestClassifier()
sfm = SelectFromModel(rf, threshold=0.04)
sfm.fit(features, targets)
features = features[features.columns[sfm.get_support(indices=True)]]

print('Número de características:',features.shape[1])
print('Se han eliminado', aux-features.shape[1],'características')



Número de características: 8
Se han eliminado 24 características


## Entrenar y aplicar Random Forest: 

In [11]:
from sklearn.ensemble import RandomForestClassifier

features_diacrisis1 = features[:121889]
features_diacrisis2 = features[121889:]
targets_diacrisis1 = targets[:121889]
targets_diacrisis2 = targets[121889:]

rfc = RandomForestClassifier(n_estimators=100, max_depth=2, random_state=0)

rfc.fit(features_diacrisis1, targets_diacrisis1)
y_pred = rfc.predict_proba(features_diacrisis2)
y_score = [i[0] for i in y_pred]
roc1 = mtr.roc_auc_score(targets_diacrisis2,y_score)

rfc.fit(features_diacrisis2, targets_diacrisis2)
y_pred = rfc.predict_proba(features_diacrisis1)
y_score = [i[0] for i in y_pred]
roc2 = mtr.roc_auc_score(targets_diacrisis1,y_score)

roc= (roc1+roc2)/2
print('roc:',roc)

roc: 0.4476938225079916
