# Imports generales

In [1]:
!pip install scikit-optimize
import time
from scipy.stats import randint, uniform

from skopt import BayesSearchCV
from skopt.space import Real, Categorical, Integer
from sklearn.preprocessing import MinMaxScaler, StandardScaler, RobustScaler
from sklearn.pipeline import Pipeline
from sklearn.model_selection import RandomizedSearchCV, GridSearchCV, TimeSeriesSplit

import pandas as pd
import numpy as np
import seaborn as sns  # visualisation
import matplotlib.pyplot as plt  # visualisation

Collecting scikit-optimize
  Downloading scikit_optimize-0.10.1-py2.py3-none-any.whl (107 kB)
[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/107.7 kB[0m [31m?[0m eta [36m-:--:--[0m[2K     [91m━━━━━━━━━━━━━━━━━━━━━━[0m[91m╸[0m[90m━━━━━━━━━━━━━━━━━[0m [32m61.4/107.7 kB[0m [31m1.7 MB/s[0m eta [36m0:00:01[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m107.7/107.7 kB[0m [31m1.9 MB/s[0m eta [36m0:00:00[0m
Collecting pyaml>=16.9 (from scikit-optimize)
  Downloading pyaml-23.12.0-py3-none-any.whl (23 kB)
Installing collected packages: pyaml, scikit-optimize
Successfully installed pyaml-23.12.0 scikit-optimize-0.10.1


# Leer archivo

In [2]:
from google.colab import drive

drive.mount('/content/drive')

file_path = '/content/drive/MyDrive/wind_ava.csv'
df = pd.read_csv(file_path)
df.head(5)

Mounted at /content/drive


Unnamed: 0,datetime,energy,p54.162.1,p54.162.2,p54.162.3,p54.162.4,p54.162.5,p54.162.6,p54.162.7,p54.162.8,...,v100.16,v100.17,v100.18,v100.19,v100.20,v100.21,v100.22,v100.23,v100.24,v100.25
0,2005-01-02 18:00:00,402.71,2534970.0,2526864.0,2518754.0,2510648.0,2502537.0,2531111.0,2522721.0,2514330.0,...,-4.683596,-4.545396,-4.407196,-4.268996,-4.131295,-4.669626,-4.528932,-4.388736,-4.24854,-4.107846
1,2005-01-03 00:00:00,696.8,2537369.0,2529277.0,2521184.0,2513088.0,2504995.0,2533465.0,2525088.0,2516716.0,...,-3.397886,-3.257192,-3.115998,-2.975304,-2.834609,-3.39639,-3.254198,-3.112506,-2.970314,-2.828622
2,2005-01-03 06:00:00,1591.15,2533727.0,2525703.0,2517678.0,2509654.0,2501629.0,2529801.0,2521496.0,2513187.0,...,-1.454105,-1.296447,-1.13829,-0.980134,-0.822476,-1.459094,-1.302933,-1.147271,-0.99111,-0.834949
3,2005-01-03 12:00:00,1338.62,2534491.0,2526548.0,2518609.0,2510670.0,2502732.0,2530569.0,2522346.0,2514127.0,...,1.255015,1.370265,1.485515,1.600765,1.716015,1.210612,1.319376,1.42814,1.536405,1.645169
4,2005-01-03 18:00:00,562.5,2529543.0,2521623.0,2513702.0,2505782.0,2497861.0,2525621.0,2517421.0,2509215.0,...,1.939031,2.023847,2.108663,2.193977,2.278793,1.873673,1.953,2.031829,2.111157,2.189986


# EDA

## Selección de columnas relevantes

Se eliminan las columnas que no pertenecen al parque Sotavento

In [3]:
# Eliminación de variables meteorológicas no correspondientes a la localización 13
print("Columnas relevantes:")
relevant_columns = [col for col in df.columns if col.endswith(".13") or col in ['datetime', 'energy']]
df_relevant = df[relevant_columns]
print("Número de columnas relevantes: ", len(df_relevant.columns))

Columnas relevantes:
Número de columnas relevantes:  24


## Preparar datos para entrenamiento



Debemos separar los datos entre 'Train' y 'Test'. Para ello hemos decidido almacenar en 'Train' todos los datos anteriores al "parón" de 2008 y en 'Test' los posteriores.

In [4]:
df_relevant['datetime'] = pd.to_datetime(df_relevant['datetime'])

df_relevant['year'] = df_relevant['datetime'].dt.year

# Filter the DataFrame where 'year' is equal to 2018
filtered_df_2008 = df_relevant[df_relevant['year'] == 2008]

# Find the highest index of the filtered DataFrame
highest_index_2008 = filtered_df_2008.index.max()

print("Highest index of an element with 'year' equal to 2008:", highest_index_2008)

# Filter the DataFrame where 'year' is equal to 2009
filtered_df = df_relevant[df_relevant['year'] == 2009]

# Find the first index of the filtered DataFrame
first_index_2009 = filtered_df.index.min()

print("First index of an element with 'year' equal to 200:", first_index_2009)

Highest index of an element with 'year' equal to 2008: 3826
First index of an element with 'year' equal to 200: 3827


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_relevant['datetime'] = pd.to_datetime(df_relevant['datetime'])
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_relevant['year'] = df_relevant['datetime'].dt.year


In [5]:
from sklearn.model_selection import train_test_split

# Drop unnecessary columns
df_relevant = df_relevant.drop(columns=['datetime', 'year'])

# Splitting indices
train_index_end = 3826
test_index_start = 3827

# Splitting data
X,y = df_relevant.drop(columns=['energy']),df_relevant['energy']
X_train, X_test = X.iloc[:train_index_end + 1], X.iloc[test_index_start:]
y_train, y_test = y.iloc[:train_index_end + 1], y.iloc[test_index_start:]

# Sanity check
print("X_train shape:", X_train.shape)
print("y_train shape:", y_train.shape)
print("X_test shape:", X_test.shape)
print("y_test shape:", y_test.shape)

X_train shape: (3827, 22)
y_train shape: (3827,)
X_test shape: (921, 22)
y_test shape: (921,)


# Problema de clasificación

## Convertir en problema de clasificación

Se busca el tercer cuantil y se clasifican entre energía alta y baja

In [6]:
third_quantile = df_relevant['energy'].quantile(0.75)
print(third_quantile)

# Asignar clases basadas en el tercer cuartil
#predictions_df['predicted_class'] = np.where(predictions_df['predicted_energy'] < third_quantile, 'low', 'high')
df_relevant['predicted_class'] = np.where(df_relevant['energy'] < third_quantile, 'low', 'high')

1089.375


###Preparar datos para clasificación

In [7]:
X_class, y_class = df_relevant.drop(columns=['energy', 'predicted_class']), df_relevant['predicted_class']

from sklearn.preprocessing import LabelEncoder

# Encode the string labels into numerical labels One Hot Encoding
label_encoder = LabelEncoder()
y_encoded = label_encoder.fit_transform(y_class)

# Dividir los datos
X_train, X_test = X_class.loc[:highest_index_2008], X_class.loc[first_index_2009:]
y_train, y_test = y_encoded[:highest_index_2008 + 1], y_encoded[first_index_2009:]

# Verificación
print("X_train shape:", X_train.shape)
print("y_train shape:", y_train.shape)
print("X_test shape:", X_test.shape)
print("y_test shape:", y_test.shape)

X_train shape: (3827, 22)
y_train shape: (3827,)
X_test shape: (921, 22)
y_test shape: (921,)


#Seleccionar método de escalado

In [8]:
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import mean_squared_error
from sklearn.metrics import f1_score

# Initialize dictionaries to store scores
scores = {}

# MinMaxScaler
pipeline_min_max = Pipeline([
    ('scaler', MinMaxScaler()),
    ('knn', KNeighborsClassifier())
])
pipeline_min_max.fit(X_train, y_train)
y_pred_min_max = pipeline_min_max.predict(X_test)
f1_min_max = f1_score(y_test, y_pred_min_max, average='weighted')
scores["MinMaxScaler"] = f1_min_max

# StandardScaler
pipeline_standard = Pipeline([
    ('scaler', StandardScaler()),
    ('knn', KNeighborsClassifier())
])
pipeline_standard.fit(X_train, y_train)
y_pred_standard = pipeline_standard.predict(X_test)
f1_standard = f1_score(y_test, y_pred_standard, average='weighted')
scores["StandardScaler"] = f1_standard

# RobustScaler
pipeline_robust = Pipeline([
    ('scaler', RobustScaler()),
    ('knn', KNeighborsClassifier())
])
pipeline_robust.fit(X_train, y_train)
y_pred_robust = pipeline_robust.predict(X_test)
f1_robust = f1_score(y_test, y_pred_robust, average='weighted')
scores["RobustScaler"] = f1_robust

# Print results
print("F1 Score for MinMaxScaler:", scores["MinMaxScaler"])
print("F1 Score for StandardScaler:", scores["StandardScaler"])
print("F1 Score for RobustScaler:", scores["RobustScaler"])

F1 Score for MinMaxScaler: 0.817014881064433
F1 Score for StandardScaler: 0.8269146902845452
F1 Score for RobustScaler: 0.835129234497837


Para clasificación el mejor escalador es el Robust ya que da el F1-score más alto.

### KNN Classifier

In [9]:
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import f1_score

knn_class = Pipeline([
    ('scaler', RobustScaler()),  # Escalador RobustScaler
    ('knn', KNeighborsClassifier())  # Modelo KNN
])

# Definir el espacio de parámetros para GridSearchCV
param_grid_knn_class = {
    'knn__n_neighbors': [3, 5, 7, 9, 11, 13, 15, 17, 19, 21, 23],
    'knn__weights': ['uniform', 'distance'],
    'knn__p': [1, 2]
}

# Inicializar GridSearchCV para el modelo de clasificación KNN
knn_class_model = GridSearchCV(knn_class,
                               param_grid_knn_class,
                               scoring='f1',  # Usar f1_weighted para problemas multiclase
                               cv=TimeSeriesSplit(n_splits=3),
                               n_jobs=1,
                               verbose=1)

# Ajustar el modelo con los datos de entrenamiento
inicio = time.time()
knn_class_model.fit(X_train, y_train)
fin = time.time()

# Predecir sobre el conjunto de prueba
y_pred_knn_class = knn_class_model.predict(X_test)

# Calcular el F1 score
f1_knn_class = f1_score(y_test, y_pred_knn_class, average='weighted')

print("F1 Score para KNN Classification:", f1_knn_class)
print("Best parameters:", knn_class_model.best_params_)

# Calcular el tiempo transcurrido
elapsed_time = fin - inicio
print("Elapsed time:", elapsed_time, "seconds")

Fitting 3 folds for each of 44 candidates, totalling 132 fits
F1 Score para KNN Classification: 0.8398310675844494
Best parameters: {'knn__n_neighbors': 15, 'knn__p': 2, 'knn__weights': 'distance'}
Elapsed time: 36.0717339515686 seconds


### Random Forest Classifier

In [14]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import f1_score
rf_class = Pipeline([
    ('rf', RandomForestClassifier())
])

param_grid_rf_class = {
    'rf__n_estimators': [50, 100, 150],
    'rf__max_depth': [None, 10, 20, 30],
    'rf__min_samples_split': [2, 5, 10],
    'rf__min_samples_leaf': [1, 2, 4],
    'rf__bootstrap': [True, False],
    'rf__class_weight': ['balanced', None]
}

rf_class_model = BayesSearchCV(rf_class,
                               param_grid_rf_class,
                               scoring='f1',
                               cv=TimeSeriesSplit(n_splits=3),
                               n_jobs=1,
                               verbose=1,
                               n_iter=2)

inicio = time.time()
rf_class_model.fit(X_train, y_train)
fin = time.time()

y_pred_rf_class = rf_class_model.predict(X_test)

f1_rf_class = f1_score(y_test, y_pred_rf_class, average='weighted')

print("F1 Score para RandomForest Classification:", f1_rf_class)
print("Best parameters:", rf_class_model.best_params_)
elapsed_time = fin - inicio
print("Elapsed time:", elapsed_time, "seconds")

Fitting 3 folds for each of 1 candidates, totalling 3 fits
Fitting 3 folds for each of 1 candidates, totalling 3 fits
F1 Score para RandomForest Classification: 0.8588416724105079
Best parameters: OrderedDict([('rf__bootstrap', True), ('rf__class_weight', None), ('rf__max_depth', None), ('rf__min_samples_leaf', 1), ('rf__min_samples_split', 5), ('rf__n_estimators', 50)])
Elapsed time: 8.978439569473267 seconds


### Decission Tree Classifier

In [15]:
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import f1_score

dt_class = Pipeline([
    ('scaler', RobustScaler()),
    ('dt', DecisionTreeClassifier())
])

param_grid_dt_class = {
    'dt__max_depth': [None, 5, 10, 15, 20],
    'dt__min_samples_split': [2, 5, 10],
    'dt__min_samples_leaf': [1, 2, 4],
    'dt__max_features': ['sqrt', 'log2'],
    'dt__class_weight': ['balanced', None]
}

dt_class_model = GridSearchCV(dt_class,
                              param_grid_dt_class,
                              scoring='f1',
                              cv=TimeSeriesSplit(n_splits=3),
                              n_jobs=1,
                              verbose=1)

inicio = time.time()
dt_class_model.fit(X_train, y_train)
fin = time.time()

y_pred_dt_class = dt_class_model.predict(X_test)

f1_dt_class = f1_score(y_test, y_pred_dt_class, average='weighted')

print("F1 Score para Decision Tree Classification:", f1_dt_class)
print("Best parameters:", dt_class_model.best_params_)

elapsed_time = fin - inicio
print("Elapsed time:", elapsed_time, "seconds")

Fitting 3 folds for each of 180 candidates, totalling 540 fits
F1 Score para Decision Tree Classification: 0.8422864065328287
Best parameters: {'dt__class_weight': None, 'dt__max_depth': 5, 'dt__max_features': 'sqrt', 'dt__min_samples_leaf': 4, 'dt__min_samples_split': 2}
Elapsed time: 13.5190110206604 seconds


### SVM Classifier

In [16]:
from sklearn.svm import SVC
from sklearn.metrics import f1_score

svm_class = Pipeline([
    ('scaler', RobustScaler()),  # Escalador RobustScaler
    ('svm', SVC())  # Modelo SVM Classifier
])

param_grid_svm_class = {
    'svm__C': [0.1, 1, 10],
    'svm__kernel': ['linear', 'rbf'],
    'svm__gamma': ['scale', 'auto'],
    'svm__class_weight': [None, 'balanced']
}

svm_class_model = GridSearchCV(svm_class,
                               param_grid_svm_class,
                               scoring='f1',
                               cv=TimeSeriesSplit(n_splits=3),
                               n_jobs=1,
                               verbose=1)

inicio = time.time()
svm_class_model.fit(X_train, y_train)
fin = time.time()
y_pred_svm_class = svm_class_model.predict(X_test)

f1_svm_class = f1_score(y_test, y_pred_svm_class, average='weighted')

print("F1 Score para SVM Classification:", f1_svm_class)
print("Best parameters:", svm_class_model.best_params_)

elapsed_time = fin - inicio
print("Elapsed time:", elapsed_time, "seconds")

Fitting 3 folds for each of 24 candidates, totalling 72 fits
F1 Score para SVM Classification: 0.861371157471188
Best parameters: {'svm__C': 1, 'svm__class_weight': None, 'svm__gamma': 'auto', 'svm__kernel': 'rbf'}
Elapsed time: 45.6844162940979 seconds


## Elegir mejor modelo

El mejor modelo es SVM classifier ya que de todos los modelos es el que más mayor valor de F1-score tiene.