In [None]:
import os
import pandas as pd
import datetime
import glob
from google.colab import files
import numpy as np
import re
#import dask.dataframe as dd
#from dask.diagnostics import ProgressBar
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, r2_score
import matplotlib.pyplot as plt
import seaborn as sns
from tqdm import tqdm

# NO EJECUTAR --> CARGA TOTAL, NO NECESARIA SI YA SE TIENE EL ARCHIVO

In [None]:
import subprocess

# (month number, Catalan month name) pairs, spelled exactly as in the published archive names.
i2m = list(zip(range(1,13), ['Gener','Febrer','Marc','Abril','Maig','Juny','Juliol','Agost','Setembre','Octubre','Novembre','Desembre']))
BICING_BASE_URL = "https://opendata-ajuntament.barcelona.cat/resources/bcn/BicingBCN"

# Fetch every monthly station dump for 2022, 2021 and 2020 (newest first).
for year in range(2022, 2019, -1):
    for month, month_name in i2m:
        archive = f"{year}_{month:02d}_{month_name}_BicingNou_ESTACIONS.7z"

        # Download the archive from the Bicing open-data site.
        # -f makes curl return a non-zero status on HTTP errors instead of
        # saving the server's error page under the .7z name.
        download = subprocess.run(
            ["curl", "-f", "-L", "-o", archive, f"{BICING_BASE_URL}/{archive}"]
        )

        if download.returncode == 0:
            # Extract the CSV from the 7z container.
            extraction = subprocess.run(["7z", "x", archive])
            if extraction.returncode != 0:
                print(f"WARNING: could not extract {archive} (7z exit code {extraction.returncode})")
        else:
            print(f"WARNING: could not download {archive} (curl exit code {download.returncode})")

        # Remove the compressed archive (or whatever partial file curl left behind).
        if os.path.exists(archive):
            os.remove(archive)


In [None]:
import os
import re
# Every entry of the current working directory.
fileList = os.listdir()

# Matches the monthly station dumps, e.g.: 2020_02_Febrer_BicingNou_ESTACIONS.csv
pattern = re.compile(r"^(202[0-3])_\d{2}_[A-Za-zÀ-ÿ]+_BicingNou_ESTACIONS\.csv$")

# Keep only the matching CSV files, in lexicographic (i.e. chronological) order.
listCsv = sorted(
    name
    for name in fileList
    if name.endswith(".csv") and pattern.match(name)
)
listCsv

In [None]:
import dask.dataframe as dd
import os
from dask.diagnostics import ProgressBar

def _most_common_value(values):
    """Return the most frequent value of a pandas Series, or NaN if it has none."""
    modes = values.mode()
    return modes.iloc[0] if len(modes) else float("nan")


def _impute_docks(df, most_frequent_docks):
    """Repair rows of one partition that report zero bikes AND zero docks.

    Parameters
    ----------
    df : pandas.DataFrame
        One partition holding "station_id", "num_bikes_available",
        "num_docks_available" and "total_docks".
    most_frequent_docks : pandas.Series
        Usual dock capacity, indexed by station_id.

    Returns
    -------
    pandas.DataFrame
        A copy of ``df`` where the affected rows get the station's usual
        capacity as "num_docks_available" (there are no bikes, so every dock
        is free) and where "total_docks" is recomputed for all rows.
    """
    df = df.copy()  # never mutate the partition object handed over by Dask

    # Vectorised lookup instead of one boolean mask per station.
    fallback = df["station_id"].map(most_frequent_docks)
    needs_fix = (
        (df["num_bikes_available"] == 0)
        & (df["num_docks_available"] == 0)
        & fallback.notna()
    )
    df["num_docks_available"] = df["num_docks_available"].mask(needs_fix, fallback)

    # Refresh the capacity: without this the repaired rows keep total_docks == 0
    # and the percentage computed afterwards becomes inf.
    df["total_docks"] = df["num_bikes_available"] + df["num_docks_available"]
    return df


def transform_bike_data_dask(listCsv, include_year=False):
    """Build the hourly per-station dock-availability table with lag features.

    All CSV files are loaded with Dask and concatenated, empty 0/0 readings are
    imputed, values are averaged per station and hour, and the four previous
    hourly availability values are attached as context columns.

    Parameters
    ----------
    listCsv : list of str
        Paths of the monthly station-status CSV files.
    include_year : bool, default False
        When True, a "year" column is emitted right after "station_id".
        The default keeps the original column set.

    Returns
    -------
    pandas.DataFrame
        Columns: station_id, [year,] month, day, hour, ctx-4, ctx-3, ctx-2,
        ctx-1, ctx-0, where ctx-0 is the availability ratio of the current
        hour and ctx-i the ratio of the i-th previous row of the same station.

    Raises
    ------
    ValueError
        If ``listCsv`` is empty.
    """
    listCsv = list(listCsv)
    if not listCsv:
        raise ValueError("listCsv is empty: no Bicing CSV files to load")

    dfs = []
    for file in listCsv:
        print(f"Loading {file}...")
        ddf = dd.read_csv(
            file,
            dtype={
                "station_id": str,
                "last_reported": "float64",
                "is_returning": "object"
            },
            assume_missing=True
        )
        ddf["source_file"] = file  # optional, kept for traceability
        dfs.append(ddf)

    # Stack every monthly file into a single Dask DataFrame.
    ddf = dd.concat(dfs, axis=0)

    # Timestamps are epoch seconds; truncate them to the hour ('h', not the deprecated 'H').
    ddf["last_updated"] = dd.to_datetime(ddf["last_updated"], unit="s")
    ddf["last_updated_hour"] = ddf["last_updated"].dt.floor("h")

    ddf["total_docks"] = ddf["num_bikes_available"] + ddf["num_docks_available"]

    # Some rows report 0 bikes and 0 docks, which is not physically possible.
    # Estimate each station's usual capacity as its most frequent total,
    # ignoring the zero totals so the broken rows cannot win the vote.
    valid_totals = ddf[ddf["total_docks"] > 0]
    most_frequent_docks = (
        valid_totals.groupby("station_id")["total_docks"]
        .apply(_most_common_value, meta=("total_docks", "float64"))
        .compute()
    )

    # Impute the broken rows partition by partition (also refreshes total_docks).
    ddf = ddf.map_partitions(_impute_docks, most_frequent_docks=most_frequent_docks)

    ddf["percentage_docks_available"] = ddf["num_docks_available"] / ddf["total_docks"]

    # Hourly mean of every numeric column per station.
    grouped = (
        ddf.groupby(["station_id", "last_updated_hour"])
        .mean(numeric_only=True)
        .reset_index()
    )

    # Calendar components of the hour bucket.
    if include_year:
        grouped["year"] = grouped["last_updated_hour"].dt.year
    grouped["month"] = grouped["last_updated_hour"].dt.month
    grouped["day"] = grouped["last_updated_hour"].dt.day
    grouped["hour"] = grouped["last_updated_hour"].dt.hour

    # Materialise as pandas: the lag computation needs a global ordering.
    with ProgressBar():
        grouped_pd = grouped.compute()

    # Sort so that shift() walks each station's history chronologically.
    grouped_pd = grouped_pd.sort_values(["station_id", "last_updated_hour"]).reset_index(drop=True)

    # ctx-0 is the current value (the prediction target).
    grouped_pd["ctx-0"] = grouped_pd["percentage_docks_available"]

    # ctx-1 .. ctx-4: previous rows of the same station; missing history becomes 0.
    # NOTE(review): shift() counts rows, not hours, so a gap in a station's
    # hourly records shifts the lags by more than one hour - confirm this is acceptable.
    for i in range(1, 5):
        grouped_pd[f"ctx-{i}"] = (
            grouped_pd.groupby("station_id")["percentage_docks_available"]
            .shift(i)
            .fillna(0)
        )

    # Final column selection.
    final_cols = (
        ["station_id"]
        + (["year"] if include_year else [])
        + ["month", "day", "hour", "ctx-4", "ctx-3", "ctx-2", "ctx-1", "ctx-0"]
    )
    final_df = grouped_pd[final_cols]

    return final_df

1. Carregar tots els fitxers CSV en dataframes i, a continuació, calcular la mitjana per hora de les variables (fem la mitjana per a cada estació station_id i hora), intentem usar Dask però no és l'òptim per a nosaltres. A més afegim la columna de mes (month_num) i any (year) per quan unim tots els diferents mesos saber d'on provenien les observacions, i les variables 'ctx-4', 'ctx-3', 'ctx-2', 'ctx-1', 'percentage_docks_available', 'percentage_bikes_mech', 'percentage_bikes_e'.

In [None]:
# Make sure the files are processed in name order before the transformation.
listCsv = sorted(listCsv)

# Run the full Dask pipeline; the result is a pandas DataFrame.
df_result = transform_bike_data_dask(listCsv)

In [None]:
# Persist the transformed table so the download/transform cells need not be re-run.
# NOTE(review): the loading cell further down reads 'df_result_with_year.csv',
# not this file name - confirm which file is the intended training input.
df_result.to_csv("df_result_with_year_no_null_co.csv", index=False)


# EJECUTAR DESDE AQUI

## LOAD BIG DATASET, HANDLE NaNs AND CREATE X, y FOR TRAINING

In [None]:
#https://www.kaggle.com/competitions/2025-bike-availability-prediction/data?select=metadata_sample_submission_2025.csv

In [None]:
import pandas as pd

# Load the pre-computed hourly table (one row per station and hour).
df = pd.read_csv('df_result_with_year.csv')

# Plain-text preview of the loaded frame.
print(df)

          station_id  year  month  day  hour     ctx-4     ctx-3     ctx-2  \
0                  1  2019     12   31    23  0.000000  0.000000  0.000000   
1                  1  2020      1    1     0  0.000000  0.000000  0.000000   
2                  1  2020      1    1     1  0.000000  0.000000  0.509470   
3                  1  2020      1    1     2  0.000000  0.509470  0.469697   
4                  1  2020      1    1     3  0.509470  0.469697  0.407343   
...              ...   ...    ...  ...   ...       ...       ...       ...   
12694770          99  2022     12   31    19  0.468254  0.657738  0.615079   
12694771          99  2022     12   31    20  0.657738  0.615079  0.686508   
12694772          99  2022     12   31    21  0.615079  0.686508  0.603175   
12694773          99  2022     12   31    22  0.686508  0.603175  0.571429   
12694774          99  2022     12   31    23  0.603175  0.571429  0.567460   

             ctx-1     ctx-0  
0         0.000000  0.509470  
1

In [None]:
# Missing values per column.
print("Valores NaN por columna:")
print(df.isna().sum())

# Exact zeros per column (only meaningful for the numeric columns).
print("\nValores 0 por columna:")
print(df.eq(0).sum())

# Keep only the rows without any missing value.
df = df.dropna(axis=0, how="any")

# Confirm that no missing values remain.
print("\nValores NaN por columna después de limpiar:")
print(df.isna().sum())



Valores NaN por columna:
station_id       0
year             0
month            0
day              0
hour             0
ctx-4            0
ctx-3            0
ctx-2            0
ctx-1            0
ctx-0         9012
dtype: int64

Valores 0 por columna:
station_id         0
year               0
month              0
day                0
hour          526148
ctx-4          88626
ctx-3          88112
ctx-2          87601
ctx-1          87089
ctx-0          77575
dtype: int64

Valores NaN por columna después de limpiar:
station_id    0
year          0
month         0
day           0
hour          0
ctx-4         0
ctx-3         0
ctx-2         0
ctx-1         0
ctx-0         0
dtype: int64


In [None]:
# Train only on observations from 2021 onwards.
df=df[df['year']>=2021]

In [None]:
# Features = the first nine columns (selected by position, so column order matters).
# This X_train is replaced later, once the weather data has been merged in.
X_train = df.iloc[:, :9]
X_train.columns

Index(['station_id', 'year', 'month', 'day', 'hour', 'ctx-4', 'ctx-3', 'ctx-2',
       'ctx-1'],
      dtype='object')

In [None]:
# Target = every column from position 9 onwards (selected by position).
y_train = df.iloc[:, 9:]
y_train.columns

Index(['ctx-0'], dtype='object')

## LOAD X TEST

In [None]:
import pandas as pd

# Path to the competition's submission-metadata CSV.
ruta_csv = "metadata_sample_submission_2025.csv"

# Load the CSV into a DataFrame.
X_test = pd.read_csv(ruta_csv)

# Show the first rows.
print(X_test.head())

   index  station_id  month  day  hour     ctx-4     ctx-3     ctx-2     ctx-1
0      0           1      6    1     3  0.490942  0.378623  0.324275  0.311594
1      1           1      6    1     8  0.271739  0.311594  0.346014  0.394928
2      2           1      6    1    13  0.538043  0.650362  0.697464  0.721014
3      3           1      6    1    18  0.789855  0.800725  0.791667  0.807971
4      4           1      6    1    23  0.860507  0.871377  0.817029  0.793478


In [None]:
# The test file has no year column; add one so it can be joined with the weather table.
# NOTE(review): assumes every test row belongs to 2024 - confirm against the competition data description.
X_test['year'] = 2024


In [None]:
# Display the test frame with the added year column.
X_test

Unnamed: 0,index,station_id,month,day,hour,ctx-4,ctx-3,ctx-2,ctx-1,year
0,0,1,6,1,3,0.490942,0.378623,0.324275,0.311594,2024
1,1,1,6,1,8,0.271739,0.311594,0.346014,0.394928,2024
2,2,1,6,1,13,0.538043,0.650362,0.697464,0.721014,2024
3,3,1,6,1,18,0.789855,0.800725,0.791667,0.807971,2024
4,4,1,6,1,23,0.860507,0.871377,0.817029,0.793478,2024
...,...,...,...,...,...,...,...,...,...,...
401506,401506,496,12,31,2,0.865741,0.643519,0.597222,0.921296,2024
401507,401507,496,12,31,7,1.000000,0.388889,0.375000,0.407407,2024
401508,401508,496,12,31,12,0.310185,0.337963,0.402778,0.495370,2024
401509,401509,496,12,31,17,0.462963,0.564815,0.629630,0.583333,2024


## WEATHER DATA


In [None]:
import pandas as pd

# Path to the weather CSV.
meteo_csv = "meteov2.csv"

# Load the CSV into a DataFrame.
meteo = pd.read_csv(meteo_csv)

# Show the first rows.
print(meteo.head())

   tavg  prcp  snow  year  month  day  hour   wind  wind_flag
0   7.9     1     0  2020      1    1     0  21.85          0
1   7.8     1     0  2020      1    2     0  22.15          0
2   7.1     1     0  2020      1    3     0  23.60          0
3   7.9     1     0  2020      1    4     0  26.95          0
4   8.5     1     0  2020      1    5     0  28.25          0


In [None]:
# Reload the raw weather table (same file as above), so the column drop below starts from the full frame.
meteo = pd.read_csv(meteo_csv)


In [None]:
# Remove 'hour' and 'wind_flag'; the remaining columns are joined on (year, month, day) below.
meteo= meteo.drop(columns=['hour','wind_flag'])

In [None]:
# Display the trimmed weather table.
meteo

Unnamed: 0,tavg,prcp,snow,year,month,day,wind
0,7.9,1,0,2020,1,1,21.85
1,7.8,1,0,2020,1,2,22.15
2,7.1,1,0,2020,1,3,23.60
3,7.9,1,0,2020,1,4,26.95
4,8.5,1,0,2020,1,5,28.25
...,...,...,...,...,...,...,...
1905,13.1,0,0,2025,3,20,35.35
1906,13.1,0,0,2025,3,21,30.95
1907,12.2,1,0,2025,3,22,31.40
1908,11.6,1,0,2025,3,23,24.15


In [None]:
# Attach the weather columns to every training row of the same calendar day.
# Inner join: training rows without a weather record are discarded.
weather_keys = ['year', 'month', 'day']
df_merged = df.merge(meteo, how='inner', on=weather_keys)
df_merged

Unnamed: 0,station_id,year,month,day,hour,ctx-4,ctx-3,ctx-2,ctx-1,ctx-0,tavg,prcp,snow,wind
0,1,2021,1,1,0,0.043478,0.045290,0.050725,0.063406,0.119565,7.2,1,0,20.35
1,1,2021,1,1,1,0.045290,0.050725,0.063406,0.119565,0.108696,7.2,1,0,20.35
2,1,2021,1,1,2,0.050725,0.063406,0.119565,0.108696,0.108696,7.2,1,0,20.35
3,1,2021,1,1,3,0.063406,0.119565,0.108696,0.108696,0.108696,7.2,1,0,20.35
4,1,2021,1,1,4,0.119565,0.108696,0.108696,0.108696,0.108696,7.2,1,0,20.35
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
8846076,99,2022,12,31,19,0.468254,0.657738,0.615079,0.686508,0.603175,14.2,1,0,18.90
8846077,99,2022,12,31,20,0.657738,0.615079,0.686508,0.603175,0.571429,14.2,1,0,18.90
8846078,99,2022,12,31,21,0.615079,0.686508,0.603175,0.571429,0.567460,14.2,1,0,18.90
8846079,99,2022,12,31,22,0.686508,0.603175,0.571429,0.567460,0.531746,14.2,1,0,18.90


In [None]:
# Final training matrices: the target is 'ctx-0'; 'year' is dropped from the features.
X_train = df_merged.drop(columns=['ctx-0','year'])
y_train = df_merged['ctx-0']


In [None]:
# Attach the weather columns to every test row of the same calendar day.
# A left join keeps every submission row (an inner join would silently drop
# rows without a weather record), and validate='many_to_one' raises if the
# weather table holds more than one row per day, which would duplicate test rows.
X_test = pd.merge(
    X_test,
    meteo,
    on=['year', 'month', 'day'],
    how='left',
    validate='many_to_one',
    indicator=True,
)

# Fail loudly if any test row found no weather record instead of predicting on NaNs.
unmatched_rows = int((X_test['_merge'] == 'left_only').sum())
if unmatched_rows:
    raise ValueError(f"{unmatched_rows} test rows have no matching weather record")

X_test = X_test.drop(columns=['_merge'])
X_test

Unnamed: 0,index,station_id,month,day,hour,ctx-4,ctx-3,ctx-2,ctx-1,year,tavg,prcp,snow,wind
0,0,1,6,1,3,0.490942,0.378623,0.324275,0.311594,2024,18.7,1,0,24.80
1,1,1,6,1,8,0.271739,0.311594,0.346014,0.394928,2024,18.7,1,0,24.80
2,2,1,6,1,13,0.538043,0.650362,0.697464,0.721014,2024,18.7,1,0,24.80
3,3,1,6,1,18,0.789855,0.800725,0.791667,0.807971,2024,18.7,1,0,24.80
4,4,1,6,1,23,0.860507,0.871377,0.817029,0.793478,2024,18.7,1,0,24.80
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
401506,401506,496,12,31,2,0.865741,0.643519,0.597222,0.921296,2024,7.6,1,0,21.65
401507,401507,496,12,31,7,1.000000,0.388889,0.375000,0.407407,2024,7.6,1,0,21.65
401508,401508,496,12,31,12,0.310185,0.337963,0.402778,0.495370,2024,7.6,1,0,21.65
401509,401509,496,12,31,17,0.462963,0.564815,0.629630,0.583333,2024,7.6,1,0,21.65


In [None]:
# Drop 'year' from the test features (it was only needed as a join key for the weather table).
# Disabled leftover: the equivalent drop on X_train (which also removed 'Unnamed: 0').
#X_train = X_train.drop(columns=['Unnamed: 0','year'])
X_test = X_test.drop(columns=['year'])


# Establecer la columna "index" como índice en X_test


In [None]:
# Use the submission 'index' column as the DataFrame index so it is not treated as a feature.
X_test = X_test.set_index('index')


In [None]:
# Display the final test feature matrix.
X_test


Unnamed: 0_level_0,station_id,month,day,hour,ctx-4,ctx-3,ctx-2,ctx-1,tavg,prcp,snow,wind
index,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1
0,1,6,1,3,0.490942,0.378623,0.324275,0.311594,18.7,1,0,24.80
1,1,6,1,8,0.271739,0.311594,0.346014,0.394928,18.7,1,0,24.80
2,1,6,1,13,0.538043,0.650362,0.697464,0.721014,18.7,1,0,24.80
3,1,6,1,18,0.789855,0.800725,0.791667,0.807971,18.7,1,0,24.80
4,1,6,1,23,0.860507,0.871377,0.817029,0.793478,18.7,1,0,24.80
...,...,...,...,...,...,...,...,...,...,...,...,...
401506,496,12,31,2,0.865741,0.643519,0.597222,0.921296,7.6,1,0,21.65
401507,496,12,31,7,1.000000,0.388889,0.375000,0.407407,7.6,1,0,21.65
401508,496,12,31,12,0.310185,0.337963,0.402778,0.495370,7.6,1,0,21.65
401509,496,12,31,17,0.462963,0.564815,0.629630,0.583333,7.6,1,0,21.65


# MODELS

## LINEAR REGRESSION

In [None]:
# Baseline: ordinary least-squares regression on the raw features.
simple_model = LinearRegression()
simple_model.fit(X_train, y_train)

In [None]:
# Predict the target for every test row with the linear baseline.
y_pred = simple_model.predict(X_test)

In [None]:
# Display the raw predictions.
y_pred

In [None]:
# Flatten the predictions to a 1-D array.
y_pred_end = y_pred.ravel()

# Build the submission frame, pairing each prediction with the index of X_test.
df_output = pd.DataFrame({
    "percentage_docks_available": y_pred_end,
    "index": X_test.index
})


df_output


In [None]:
# Write the linear-regression submission file.
df_output.to_csv("submit_weath_v3.csv", index=False)


## RED NEURONAL

In [None]:
import tensorflow as tf
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler
import os
from google.colab import files
from sklearn.preprocessing import OneHotEncoder

In [None]:
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.losses import MeanSquaredError
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA

In [None]:
from sklearn.preprocessing import MinMaxScaler
from sklearn.decomposition import PCA
import tensorflow as tf
from tensorflow.keras.callbacks import EarlyStopping

# Scale the training features to [0, 1] and project them onto the principal
# components that keep 95% of the explained variance.
scaler = MinMaxScaler()
pca = PCA(n_components=0.95)
X_train_scaled = pca.fit_transform(scaler.fit_transform(X_train))

# Apply the already-fitted scaler and PCA to the test features.
X_test_scaled = scaler.transform(X_test)
X_test_pca = pca.transform(X_test_scaled)

# Fully connected network: four Dense(relu) -> BatchNorm -> Dropout(0.2) stages
# of decreasing width, followed by a single linear output unit.
n_features = X_train_scaled.shape[1]
model = tf.keras.Sequential()
for position, units in enumerate((256, 128, 64, 32)):
    # Only the first layer declares the input shape.
    extra = {"input_shape": (n_features,)} if position == 0 else {}
    model.add(tf.keras.layers.Dense(units, activation='relu', **extra))
    model.add(tf.keras.layers.BatchNormalization())
    model.add(tf.keras.layers.Dropout(0.2))
model.add(tf.keras.layers.Dense(1))

# Adam with an exponentially decaying learning rate (x0.9 every 1000 steps).
lr_schedule = tf.keras.optimizers.schedules.ExponentialDecay(
    initial_learning_rate=0.001,
    decay_steps=1000,
    decay_rate=0.9
)
optimizer = tf.keras.optimizers.Adam(learning_rate=lr_schedule)
model.compile(optimizer=optimizer, loss='mean_squared_error')

# Stop once the training loss has not improved for 2 epochs and restore the best weights.
early_stop = EarlyStopping(monitor='loss', patience=2, restore_best_weights=True)

# Train the network.
model.fit(
    X_train_scaled,
    y_train,
    epochs=50,
    batch_size=64,
    callbacks=[early_stop],
    verbose=1
)

# Predict on the PCA-projected test features.
y_pred = model.predict(X_test_pca)
print(y_pred)


# RANDOM FOREST

In [None]:
from sklearn.ensemble import RandomForestRegressor

# Random forest with 100 trees of depth <= 10 and a fixed seed for reproducibility.
# NOTE(review): no later visible cell predicts with model_RF - confirm whether it is still needed.
model_RF = RandomForestRegressor(n_estimators=100, max_depth=10, random_state=42)
model_RF.fit(X_train, y_train)


# XGRADIENT BOOSTING

# XGRADIENT BOOSTING 1

In [None]:
from sklearn.preprocessing import StandardScaler
from xgboost import XGBRegressor

# Standardisation: fit the scaler on the training features only, then apply it to both sets.
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Train the gradient-boosted trees model.
model_X = XGBRegressor(n_estimators=350, learning_rate=0.05, max_depth=10)
model_X.fit(X_train_scaled, y_train)


In [None]:
# Predict on the standardised test features.
y_pred = model_X.predict(X_test_scaled
                         )

In [None]:
import pickle

# Save the trained model to disk.
# Only unpickle this file from a trusted source: loading a pickle can execute arbitrary code.
with open("model_X.pkl", "wb") as f:
    pickle.dump(model_X, f)


In [None]:
# Flatten the predictions to a 1-D array.
y_pred_end = y_pred.ravel()

# Build the submission frame, pairing each prediction with the index of X_test.
df_output = pd.DataFrame({
    "percentage_docks_available": y_pred_end,
    "index": X_test.index
})


df_output
# Write the submission file for the first XGBoost model.
df_output.to_csv("submit_weath_v1_normaliz_X_reduced.csv", index=False)


# XGRADIENT BOOSTING 2

In [None]:
# Install xgboost into the runtime (shell command; the version is not pinned).
!pip install xgboost



In [None]:
# Remove the 'wind' feature from both matrices for the models below.
# errors='ignore' keeps the cell re-runnable: the frames are overwritten in
# place, so a second execution would otherwise raise KeyError on the missing column.
X_train = X_train.drop(columns=['wind'], errors='ignore')
X_test = X_test.drop(columns=['wind'], errors='ignore')


In [None]:
from sklearn.preprocessing import StandardScaler
from xgboost import XGBRegressor

# Standardisation: fit the scaler on the training features only, then apply it to both sets.
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Train a larger gradient-boosted trees model (more and deeper trees than the first one).
model_X = XGBRegressor(n_estimators=400, learning_rate=0.05, max_depth=12)
model_X.fit(X_train_scaled, y_train)


In [None]:
# Predict on the standardised test features.
y_pred = model_X.predict(X_test_scaled)

In [None]:
# Flatten the predictions to a 1-D array.
y_pred_end = y_pred.ravel()

# Build the submission frame, pairing each prediction with the index of X_test.
df_output = pd.DataFrame({
    "percentage_docks_available": y_pred_end,
    "index": X_test.index
})


df_output
# Write the submission file for the second XGBoost model.
df_output.to_csv("submit_weath_v1_normaliz_neww.csv", index=False)


In [None]:
# Install lightgbm into the runtime (shell command; the version is not pinned).
!pip install lightgbm

Collecting lightgbm
  Downloading lightgbm-4.6.0-py3-none-manylinux_2_28_x86_64.whl.metadata (17 kB)
Downloading lightgbm-4.6.0-py3-none-manylinux_2_28_x86_64.whl (3.6 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m3.6/3.6 MB[0m [31m35.6 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: lightgbm
Successfully installed lightgbm-4.6.0


In [None]:
from lightgbm import LGBMRegressor

# LightGBM regressor with a fixed seed, trained on the standardised features
# produced by the preceding scaling cell.
model = LGBMRegressor(
    n_estimators=500,
    learning_rate=0.05,
    max_depth=15,
    random_state=42
)
model.fit(X_train_scaled, y_train)


[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.085620 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 1817
[LightGBM] [Info] Number of data points in the train set: 8846081, number of used features: 11
[LightGBM] [Info] Start training from score 0.626299


In [None]:
# Predict on the standardised test features.
y_pred = model.predict(X_test_scaled)



In [None]:
# Flatten the predictions to a 1-D array.
y_pred_end = y_pred.ravel()

# Build the submission frame, pairing each prediction with the index of X_test.
df_output = pd.DataFrame({
    "percentage_docks_available": y_pred_end,
    "index": X_test.index
})


df_output
# Write the LightGBM submission file.
df_output.to_csv("submit_v5.csv", index=False)


# XGRADIENT BOOSTING 2

In [None]:
# Install xgboost into the runtime (shell command; the version is not pinned).
!pip install xgboost

Collecting xgboost
  Downloading xgboost-3.0.0-py3-none-manylinux_2_28_x86_64.whl.metadata (2.1 kB)
Collecting nvidia-nccl-cu12 (from xgboost)
  Downloading nvidia_nccl_cu12-2.26.2-py3-none-manylinux2014_x86_64.manylinux_2_17_x86_64.whl.metadata (2.0 kB)
Downloading xgboost-3.0.0-py3-none-manylinux_2_28_x86_64.whl (253.9 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m253.9/253.9 MB[0m [31m4.2 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading nvidia_nccl_cu12-2.26.2-py3-none-manylinux2014_x86_64.manylinux_2_17_x86_64.whl (201.3 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m201.3/201.3 MB[0m [31m5.1 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: nvidia-nccl-cu12, xgboost
Successfully installed nvidia-nccl-cu12-2.26.2 xgboost-3.0.0


In [None]:
from sklearn.model_selection import RandomizedSearchCV
from sklearn.preprocessing import StandardScaler
from xgboost import XGBRegressor
import numpy as np

# Standardisation: fit the scaler on the training features only, then apply it to both sets.
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Base estimator whose hyper-parameters are searched.
xgb = XGBRegressor()

# Hyper-parameter search space.
param_dist = {
    "n_estimators": [100, 200, 300,400],
    "learning_rate": [0.01, 0.05, 0.1, 0.2],
    "max_depth": [5, 7, 10,12],
    "subsample": [0.6, 0.8, 1.0],
}

# Randomised search with 5-fold cross-validation: 50 sampled candidates,
# i.e. 250 model fits, run on all available cores (n_jobs=-1).
random_search = RandomizedSearchCV(
    estimator=xgb,
    param_distributions=param_dist,
    n_iter=50,
    cv=5,
    verbose=1,
    n_jobs=-1,
    scoring="neg_mean_squared_error",
    random_state=42
)

# Run the search.
random_search.fit(X_train_scaled, y_train)

# Best model found.
best_model = random_search.best_estimator_
print("Mejores hiperparámetros:", random_search.best_params_)

# (Optional) Prediction with the best model.
y_pred = best_model.predict(X_test_scaled)


Fitting 5 folds for each of 50 candidates, totalling 250 fits




KeyboardInterrupt: 

In [None]:
# Flatten the predictions into the variable the submission frame reads.
# The original flattened into `y_pred` but built the frame from `y_pred_end`,
# i.e. from the stale predictions of a previously run model.
y_pred_end = y_pred.ravel()

# Build the submission frame, pairing each prediction with the index of X_test.
df_output = pd.DataFrame({
    "percentage_docks_available": y_pred_end,
    "index": X_test.index
})


df_output
# Write the submission file for the tuned XGBoost model.
df_output.to_csv("submit_weath_v1_normaliz_X2.csv", index=False)


NameError: name 'y_pred' is not defined

In [None]:
from google.colab import files

# Download the submission file from the Colab runtime via google.colab's files helper.
files.download("submit_weath_v1_normaliz_X2.csv")


# OTRO

In [None]:
from lightgbm import LGBMRegressor

# Smaller LightGBM model trained on the unscaled features.
model = LGBMRegressor(n_estimators=100, learning_rate=0.1, max_depth=6, random_state=42)
model.fit(X_train, y_train)


In [None]:
# Predict on the unscaled test features (matching how this model was trained).
y_pred = model.predict(X_test)

In [None]:
# Flatten the predictions to a 1-D array.
y_pred_end = y_pred.ravel()

# Build the submission frame, pairing each prediction with the index of X_test.
df_output = pd.DataFrame({
    "percentage_docks_available": y_pred_end,
    "index": X_test.index
})


df_output
# Write the submission file for the small LightGBM model.
df_output.to_csv("submit_weath_v5_LGM.csv", index=False)
