In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import os
import sys

from scipy.cluster.hierarchy import dendrogram, linkage
from scipy.spatial.distance import squareform
from sklearn.base import BaseEstimator, TransformerMixin
#importamos pipeline
from sklearn.pipeline import Pipeline
# importamos one hot encoder
from sklearn.preprocessing import OneHotEncoder
# importamos el imputador de variables
from sklearn.impute import SimpleImputer

from sklearn.preprocessing import OneHotEncoder

import pandas as pd
from sklearn.linear_model import LinearRegression
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.pipeline import Pipeline
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error

In [3]:
data_raw = pd.read_csv('../data/raw_data/data.csv')

# Rellenamos las variables categóricas con Thompson y tube porque son las que más se repiten, con diferencia

In [4]:
# Contamos los distintos valores de Author
data_raw['author'].value_counts()

Thompson        17396
Janssen          2716
Weatherhead      2040
Beus             1604
Peskov           1084
Williams          891
Richenderfer      545
Mortimore         197
Kossolapov        101
Inasaka            46
Name: author, dtype: int64

In [5]:
data_filled = data_raw.copy()

data_filled['author'] = data_filled['author'].fillna('Thompson')
data_filled['geometry'] = data_filled['geometry'].fillna('tube')




In [6]:
data_filled.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 31644 entries, 0 to 31643
Data columns (total 10 columns):
 #   Column               Non-Null Count  Dtype  
---  ------               --------------  -----  
 0   id                   31644 non-null  int64  
 1   author               31644 non-null  object 
 2   geometry             31644 non-null  object 
 3   pressure [MPa]       27192 non-null  float64
 4   mass_flux [kg/m2-s]  26853 non-null  float64
 5   x_e_out [-]          21229 non-null  float64
 6   D_e [mm]             26156 non-null  float64
 7   D_h [mm]             27055 non-null  float64
 8   length [mm]          26885 non-null  float64
 9   chf_exp [MW/m2]      31644 non-null  float64
dtypes: float64(7), int64(1), object(2)
memory usage: 2.4+ MB


# Estas variables numéricas las rellenamos con la media, ya que es más adecuado para su distribución.
### mass_flux, D_h [mm], length [mm]

In [7]:
# Rellenamos data_filled con la media de cada columna para mass_flux [kg/m2-s], D_h[mm], length [mm]
data_filled['mass_flux [kg/m2-s]'] = data_filled['mass_flux [kg/m2-s]'].fillna(data_filled['mass_flux [kg/m2-s]'].mean())
data_filled['D_h [mm]'] = data_filled['D_h [mm]'].fillna(data_filled['D_h [mm]'].mean())
data_filled['length [mm]'] = data_filled['length [mm]'].fillna(data_filled['length [mm]'].mean())


In [8]:
data_filled.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 31644 entries, 0 to 31643
Data columns (total 10 columns):
 #   Column               Non-Null Count  Dtype  
---  ------               --------------  -----  
 0   id                   31644 non-null  int64  
 1   author               31644 non-null  object 
 2   geometry             31644 non-null  object 
 3   pressure [MPa]       27192 non-null  float64
 4   mass_flux [kg/m2-s]  31644 non-null  float64
 5   x_e_out [-]          21229 non-null  float64
 6   D_e [mm]             26156 non-null  float64
 7   D_h [mm]             31644 non-null  float64
 8   length [mm]          31644 non-null  float64
 9   chf_exp [MW/m2]      31644 non-null  float64
dtypes: float64(7), int64(1), object(2)
memory usage: 2.4+ MB


# Estas variables numéricas las rellenamos con la moda, ya que es más adecuado para su distribución.
### D_e [mm],  pressure [MPa]


In [9]:
# Rellenamos data_filled con la media de cada columna para D_h[mm], pressure [MPa]
data_filled['D_e [mm]'] = data_filled['D_e [mm]'].fillna(data_filled['D_e [mm]'].mean())
data_filled['pressure [MPa]'] = data_filled['pressure [MPa]'].fillna(data_filled['pressure [MPa]'].mean())


In [10]:
data_filled.info()


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 31644 entries, 0 to 31643
Data columns (total 10 columns):
 #   Column               Non-Null Count  Dtype  
---  ------               --------------  -----  
 0   id                   31644 non-null  int64  
 1   author               31644 non-null  object 
 2   geometry             31644 non-null  object 
 3   pressure [MPa]       31644 non-null  float64
 4   mass_flux [kg/m2-s]  31644 non-null  float64
 5   x_e_out [-]          21229 non-null  float64
 6   D_e [mm]             31644 non-null  float64
 7   D_h [mm]             31644 non-null  float64
 8   length [mm]          31644 non-null  float64
 9   chf_exp [MW/m2]      31644 non-null  float64
dtypes: float64(7), int64(1), object(2)
memory usage: 2.4+ MB


# Dividimos el dataset, quitando las que tengan null en x_e_out [-]


In [11]:
# Dividimos el dataset, dejando aparte las filas que tengan nan en x_e_out [-]
data_filled_con_nan = data_filled[data_filled['x_e_out [-]'].isna()].copy()
data_filled_sin_nan = data_filled.dropna(subset=['x_e_out [-]']).drop(columns=['id']).copy()

In [12]:
data_filled_con_nan.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 10415 entries, 4 to 31642
Data columns (total 10 columns):
 #   Column               Non-Null Count  Dtype  
---  ------               --------------  -----  
 0   id                   10415 non-null  int64  
 1   author               10415 non-null  object 
 2   geometry             10415 non-null  object 
 3   pressure [MPa]       10415 non-null  float64
 4   mass_flux [kg/m2-s]  10415 non-null  float64
 5   x_e_out [-]          0 non-null      float64
 6   D_e [mm]             10415 non-null  float64
 7   D_h [mm]             10415 non-null  float64
 8   length [mm]          10415 non-null  float64
 9   chf_exp [MW/m2]      10415 non-null  float64
dtypes: float64(7), int64(1), object(2)
memory usage: 895.0+ KB


In [13]:
# Dividimos entre features y target
X = data_filled_sin_nan.drop('x_e_out [-]', axis = 1).drop('author', axis = 1).drop('geometry', axis = 1)
y = data_filled_sin_nan['x_e_out [-]']

# Dividimos entre train y test
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y)



In [14]:
# Creamos un preprocessor
# categorical_columns = ['author', 'geometry']  # Columnas categóricas
# numeric_columns = ['mass_flux [kg/m2-s]', 'D_h [mm]', 'length [mm]', 'D_e [mm]', 'pressure [MPa]', 'chf_exp [MW/m2]']  # Columnas numéricas
# dummy_encoder = OneHotEncoder(drop='first')
# scaler = StandardScaler()

# preprocessor = ColumnTransformer(
#     transformers=[
#         ('dummy', dummy_encoder, categorical_columns),
#         ('scale', scaler, numeric_columns)
#     ],
#     remainder='passthrough'  # Pass through any other columns without transformation
# )



In [15]:
# Hacemos un pipeline con varios modelos de regresión
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor
from sklearn.svm import SVR
from sklearn.linear_model import Ridge
from sklearn.linear_model import Lasso
from sklearn.linear_model import ElasticNet

pipeline = Pipeline([
    # ('preprocessor', preprocessor),
    ('RandomForest', RandomForestRegressor()),
    # ('RandomForestRegressor', RandomForestRegressor())
    # ('SVR', SVR()),
    # ('Ridge', Ridge()),
    # ('Lasso', Lasso()),
    # ('ElasticNet', ElasticNet())
])

# Entrenamos el pipeline
pipeline.fit(X_train, y_train)


Pipeline(steps=[('RandomForest', RandomForestRegressor())])

In [16]:
y_pred = pipeline.predict(X_test)


In [17]:
y_pred

array([ 0.084709  ,  0.075518  ,  0.18069433, ..., -0.110803  ,
        0.021973  ,  0.01807125])

In [18]:
# Evaluamos con el RMSE
from sklearn.metrics import mean_squared_error
rmse = mean_squared_error(y_test, y_pred, squared=False)
print(f'RMSE: {rmse}')


RMSE: 0.07928563522636922


# Generar el CSV para la submission de prueba

In [19]:
# submission = pd.to_csv('../output/submissions/submission_1.csv', index=False)

In [20]:
y_pred

array([ 0.084709  ,  0.075518  ,  0.18069433, ..., -0.110803  ,
        0.021973  ,  0.01807125])