In [5]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
from sklearn.impute import SimpleImputer
import lightgbm as lgb

# Load the dataset
data = pd.read_csv('../data/raw_data/data.csv')

# Separate the 'id' column
id_col = data.pop('id')

# One-Hot Encoding
data = pd.get_dummies(data, drop_first=True)

# Split the dataset into a set with missing values and a set with known values of 'x_e_out [-]'
unknown = data[data['x_e_out [-]'].isna()]
known = data.dropna(subset=['x_e_out [-]'])

# Set 'x_e_out [-]' as the target variable, and the rest of the columns as features
X_known = known.drop('x_e_out [-]', axis=1)
y_known = known['x_e_out [-]']

X_unknown = unknown.drop('x_e_out [-]', axis=1)

# Apply the imputer
imputer = SimpleImputer(strategy='median')  # or use 'mean' as per your preference
X_known_imputed = imputer.fit_transform(X_known)
X_known = pd.DataFrame(X_known_imputed, columns=X_known.columns)

X_unknown_imputed = imputer.transform(X_unknown)
X_unknown = pd.DataFrame(X_unknown_imputed, columns=X_unknown.columns)

# Split the known data into train and test sets
X_train, X_test, y_train, y_test = train_test_split(X_known, y_known, test_size=0.2, random_state=42)

# Ensure valid column names for LightGBM
X_train.columns = ["".join (c if c.isalnum() else "_" for c in str(x)) for x in X_train.columns]
X_test.columns = ["".join (c if c.isalnum() else "_" for c in str(x)) for x in X_test.columns]
X_unknown.columns = ["".join (c if c.isalnum() else "_" for c in str(x)) for x in X_unknown.columns]

# Initialize and train the model
model = lgb.LGBMRegressor(n_estimators=100, random_state=42)
model.fit(X_train, y_train)

# Make predictions on the test set and evaluate
preds_test = model.predict(X_test)
print('Test RMSE:', np.sqrt(mean_squared_error(y_test, preds_test)))

# Predict missing values
preds_unknown = model.predict(X_unknown)

# Combine the original id column with the predicted values for unknown 'x_e_out [-]'
predicted_unknown = pd.DataFrame({'id': id_col[unknown.index], 'x_e_out [-]': preds_unknown})

# Save the predictions to a csv file
predicted_unknown.to_csv('submission_LGBMRegressor.csv', index=False)

Test RMSE: 0.07490830634422771


In [6]:
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
from sklearn.preprocessing import StandardScaler
from tpot import TPOTRegressor

import warnings
warnings.filterwarnings("ignore")

# Load the dataset
data = pd.read_csv('../data/raw_data/data.csv')

# Separate the 'id' column
id_col = data.pop('id')

# One-Hot Encoding
data = pd.get_dummies(data, drop_first=True)

# Split the dataset into a set with missing values and a set with known values of 'x_e_out [-]'
unknown = data[data['x_e_out [-]'].isna()]
known = data.dropna(subset=['x_e_out [-]'])

# Set 'x_e_out [-]' as the target variable, and the rest of the columns as features
X_known = known.drop('x_e_out [-]', axis=1)
y_known = known['x_e_out [-]']

X_unknown = unknown.drop('x_e_out [-]', axis=1)

# Split the known data into train and test sets
X_train, X_test, y_train, y_test = train_test_split(X_known, y_known, test_size=0.2, random_state=42)

# Feature Scaling
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)
X_unknown = scaler.transform(X_unknown)

# Initialize and train the AutoML model using TPOT
tpot = TPOTRegressor(generations=10, population_size=50, random_state=42, verbosity=2)
tpot.fit(X_train, y_train)

# Evaluate the AutoML model on the test set
preds_test = tpot.predict(X_test)
print('Test RMSE:', np.sqrt(mean_squared_error(y_test, preds_test)))

# Predict missing values using the AutoML model
preds_unknown = tpot.predict(X_unknown)

# Combine the original id column with the predicted values for unknown 'x_e_out [-]'
predicted_unknown = pd.DataFrame({'id': id_col[unknown.index], 'x_e_out [-]': preds_unknown})

# Save the predictions to a csv file
predicted_unknown.to_csv('submission_AutoML.csv', index=False)

Imputing missing values in feature set


Optimization Progress:   0%|          | 0/550 [00:00<?, ?pipeline/s]


Generation 1 - Current best internal CV score: -0.00560602011743376

Generation 2 - Current best internal CV score: -0.00560602011743376

Generation 3 - Current best internal CV score: -0.0055671926027358026

Generation 4 - Current best internal CV score: -0.005546412535895953

Generation 5 - Current best internal CV score: -0.005546412535895953

Generation 6 - Current best internal CV score: -0.005546412535895953

Generation 7 - Current best internal CV score: -0.005541459971641045

Generation 8 - Current best internal CV score: -0.0055289150152443515

Generation 9 - Current best internal CV score: -0.0055289150152443515

Generation 10 - Current best internal CV score: -0.0055289150152443515

Best pipeline: RandomForestRegressor(input_matrix, bootstrap=True, max_features=0.4, min_samples_leaf=8, min_samples_split=14, n_estimators=100)
Imputing missing values in feature set
Test RMSE: 0.07487869851087404
Imputing missing values in feature set
