In [1]:
import numpy as np
import pandas as pd
from tqdm.notebook import tqdm

import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.feature_selection import mutual_info_regression
from sklearn.preprocessing import OrdinalEncoder
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import VotingRegressor

from sklearn.model_selection import train_test_split
from sklearn.model_selection import KFold
from sklearn.model_selection import RepeatedKFold
from sklearn.model_selection import cross_val_score
from sklearn.metrics import mean_absolute_error as mae
from sklearn.cluster import KMeans

from sklearn import linear_model
from sklearn import ensemble
from sklearn import svm
import xgboost as xgb
import lightgbm as lgbm
#import catboost as cb

#import optuna

In [50]:
# Load the competition training frame, the original dataset and the sample
# submission from the working directory.
data = pd.read_csv('data.csv')
ori = pd.read_csv('original.csv')
sub = pd.read_csv('sample_submission.csv')
# Disabled: append the original dataset to the training rows.
#data = pd.concat([data, ori])
# Drop the row identifier by reassignment instead of inplace=True, so the
# statement reads as a plain transformation of the freshly loaded frame.
data = data.drop(columns='id')

In [51]:
# Display the freshly loaded frame (after the `id` column was dropped) for a quick visual check.
data

Unnamed: 0,author,geometry,pressure [MPa],mass_flux [kg/m2-s],x_e_out [-],D_e [mm],D_h [mm],length [mm],chf_exp [MW/m2]
0,Thompson,tube,7.00,3770.0,0.1754,,10.8,432.0,3.6
1,Thompson,tube,,6049.0,-0.0416,10.3,10.3,762.0,6.2
2,Thompson,,13.79,2034.0,0.0335,7.7,7.7,457.0,2.5
3,Beus,annulus,13.79,3679.0,-0.0279,5.6,15.2,2134.0,3.0
4,,tube,13.79,686.0,,11.1,11.1,457.0,2.8
...,...,...,...,...,...,...,...,...,...
31639,Thompson,,,1736.0,0.0886,,7.8,591.0,2.3
31640,,,13.79,,,4.7,4.7,,3.9
31641,Thompson,,18.27,658.0,-0.1224,3.0,3.0,150.0,2.3
31642,Thompson,tube,6.89,3825.0,,23.6,23.6,1972.0,3.7


In [52]:
# Replace the unit-suffixed column headers with short names; the units
# (MPa, kg/m2-s, mm, MW/m2) are only recorded in the original headers.
data = data.rename(
    dict([
        ("pressure [MPa]", "pressure"),
        ("mass_flux [kg/m2-s]", "mass_flux"),
        ("x_e_out [-]", "x_e_out"),
        ("D_e [mm]", "D_e"),
        ("D_h [mm]", "D_h"),
        ("length [mm]", "length"),
        ("chf_exp [MW/m2]", "chf_exp"),
    ]),
    axis="columns",
)

In [49]:
# Summary statistics for the numeric columns of the renamed frame.
# NOTE(review): the recorded chf_exp count (33509) is larger than the highest row
# label shown for `data` (31642), so this output looks stale -- possibly produced
# while the commented-out concat with `ori` was active; re-run to confirm.
data.describe()

Unnamed: 0,pressure,mass_flux,x_e_out,D_e,D_h,length,chf_exp
count,29057.0,28718.0,23094.0,28021.0,28920.0,28750.0,33509.0
mean,10.600324,3054.674351,0.00089,8.681699,14.30288,838.070122,3.800194
std,4.333106,1770.14477,0.102455,5.273409,19.93355,676.224288,1.984091
min,0.1,0.0,-0.8667,1.0,1.0,10.0,0.8
25%,6.89,1519.0,-0.0466,5.0,5.6,318.0,2.4
50%,11.03,2729.0,0.0049,7.8,10.3,610.0,3.4
75%,13.79,4069.0,0.0682,10.8,11.5,914.0,4.6
max,20.68,7975.0,0.232,37.5,120.0,3048.0,19.3


In [35]:
# Display the frame again after the column rename.
# NOTE(review): the recorded output below still lists an `id` column although the
# load cell drops it, and the execution counts are out of order -- re-run the
# notebook top to bottom to refresh the outputs.
data

Unnamed: 0,id,author,geometry,pressure,mass_flux,x_e_out,D_e,D_h,length,chf_exp
0,0,Thompson,tube,7.00,3770.0,0.1754,,10.8,432.0,3.6
1,1,Thompson,tube,,6049.0,-0.0416,10.3,10.3,762.0,6.2
2,2,Thompson,,13.79,2034.0,0.0335,7.7,7.7,457.0,2.5
3,3,Beus,annulus,13.79,3679.0,-0.0279,5.6,15.2,2134.0,3.0
4,4,,tube,13.79,686.0,,11.1,11.1,457.0,2.8
...,...,...,...,...,...,...,...,...,...,...
31639,31639,Thompson,,,1736.0,0.0886,,7.8,591.0,2.3
31640,31640,,,13.79,,,4.7,4.7,,3.9
31641,31641,Thompson,,18.27,658.0,-0.1224,3.0,3.0,150.0,2.3
31642,31642,Thompson,tube,6.89,3825.0,,23.6,23.6,1972.0,3.7


In [36]:
# Count / unique / top / freq summary of the author labels.
data.loc[:, 'author'].describe()

count        26620
unique          10
top       Thompson
freq         17396
Name: author, dtype: object

In [37]:
# Rows whose D_e value is missing; isna() already yields a boolean mask, so no
# comparison against True is needed.
data[data['D_e'].isna()]

Unnamed: 0,id,author,geometry,pressure,mass_flux,x_e_out,D_e,D_h,length,chf_exp
0,0,Thompson,tube,7.00,3770.0,0.1754,,10.8,432.0,3.6
5,5,,,17.24,3648.0,-0.0711,,1.9,696.0,3.6
8,8,,tube,12.07,4042.0,-0.0536,,,152.0,5.6
12,12,Thompson,,6.89,7500.0,,,12.8,1930.0,4.8
18,18,Thompson,tube,,1858.0,0.0406,,10.8,432.0,3.4
...,...,...,...,...,...,...,...,...,...,...
31629,31629,Thompson,,13.79,4964.0,,,4.7,318.0,3.9
31632,31632,Thompson,tube,18.27,833.0,,,,150.0,4.1
31636,31636,,,12.07,,-0.0195,,1.9,152.0,5.4
31637,31637,Weatherhead,tube,13.79,688.0,,,11.1,457.0,2.3


In [38]:
# Fill missing categorical labels with fixed defaults. 'Thompson' is the most
# frequent author per the describe() output above; 'tube' is presumably the
# most common geometry -- TODO confirm. Both right-hand sides are evaluated
# before either column is written back.
data['author'], data['geometry'] = (
    data['author'].fillna('Thompson'),
    data['geometry'].fillna('tube'),
)

In [39]:
# Re-display the frame to confirm the author/geometry gaps were filled.
data

Unnamed: 0,id,author,geometry,pressure,mass_flux,x_e_out,D_e,D_h,length,chf_exp
0,0,Thompson,tube,7.00,3770.0,0.1754,,10.8,432.0,3.6
1,1,Thompson,tube,,6049.0,-0.0416,10.3,10.3,762.0,6.2
2,2,Thompson,tube,13.79,2034.0,0.0335,7.7,7.7,457.0,2.5
3,3,Beus,annulus,13.79,3679.0,-0.0279,5.6,15.2,2134.0,3.0
4,4,Thompson,tube,13.79,686.0,,11.1,11.1,457.0,2.8
...,...,...,...,...,...,...,...,...,...,...
31639,31639,Thompson,tube,,1736.0,0.0886,,7.8,591.0,2.3
31640,31640,Thompson,tube,13.79,,,4.7,4.7,,3.9
31641,31641,Thompson,tube,18.27,658.0,-0.1224,3.0,3.0,150.0,2.3
31642,31642,Thompson,tube,6.89,3825.0,,23.6,23.6,1972.0,3.7


In [40]:
# Display the original dataset; it still carries the `id` column and the
# unit-suffixed column names, since only `data` was renamed.
ori

Unnamed: 0,id,author,geometry,pressure [MPa],mass_flux [kg/m2-s],x_e_out [-],D_e [mm],D_h [mm],length [mm],chf_exp [MW/m2]
0,1,Inasaka,tube,0.39,5600,-0.1041,3.0,3.0,100,11.3
1,2,Inasaka,tube,0.31,6700,-0.0596,3.0,3.0,100,10.6
2,3,Inasaka,tube,0.33,4300,-0.0395,3.0,3.0,100,7.3
3,4,Inasaka,tube,0.62,6400,-0.1460,3.0,3.0,100,12.8
4,5,Inasaka,tube,0.64,4700,-0.0849,3.0,3.0,100,11.0
...,...,...,...,...,...,...,...,...,...,...
1860,1861,Richenderfer,plate,1.01,1500,-0.0218,15.0,120.0,10,9.4
1861,1862,Richenderfer,plate,1.01,1500,-0.0434,15.0,120.0,10,10.4
1862,1863,Richenderfer,plate,1.01,2000,-0.0109,15.0,120.0,10,10.8
1863,1864,Richenderfer,plate,1.01,2000,-0.0218,15.0,120.0,10,10.9
