In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.metrics import mean_absolute_error, mean_squared_error
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense

# Load the data
train_data = pd.read_csv('train.csv')
test_data = pd.read_csv('test.csv')

# Separate features and target
X_train = train_data.drop(columns=['metastatic_diagnosis_period'])
y_train = train_data['metastatic_diagnosis_period']
X_test = test_data

# Identify categorical and numerical columns
categorical_cols = X_train.select_dtypes(include=['object']).columns
numeric_cols = X_train.select_dtypes(include=['number']).columns

# Define preprocessing for numeric and categorical data
numeric_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='mean')),
    ('scaler', StandardScaler())])

categorical_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('onehot', OneHotEncoder(handle_unknown='ignore'))])

# Combine preprocessing steps
preprocessor = ColumnTransformer(
    transformers=[
        ('num', numeric_transformer, numeric_cols),
        ('cat', categorical_transformer, categorical_cols)])

# Preprocess the data
X_train_processed = preprocessor.fit_transform(X_train)
X_test_processed = preprocessor.transform(X_test)

# Split the training data for validation
X_train_part, X_val, y_train_part, y_val = train_test_split(X_train_processed, y_train, test_size=0.01, random_state=42)

# Gradient Boosting Regressor
gbr = GradientBoostingRegressor(n_estimators=100, random_state=42)
gbr.fit(X_train_part, y_train_part)

# Evaluation
y_val_pred_gbr = gbr.predict(X_val)
mae_gbr = mean_absolute_error(y_val, y_val_pred_gbr)
mse_gbr = mean_squared_error(y_val, y_val_pred_gbr)
print(f'Gradient Boosting Regressor - MAE: {mae_gbr}, MSE: {mse_gbr}')

# Neural Network using TensorFlow
model = Sequential()
model.add(Dense(64, input_dim=X_train_processed.shape[1], activation='relu'))
model.add(Dense(32, activation='relu'))
model.add(Dense(1, activation='linear'))

model.compile(optimizer='adam', loss='mean_squared_error', metrics=['mae'])

# Training the model
history = model.fit(X_train_part, y_train_part, validation_data=(X_val, y_val), epochs=100, batch_size=32, verbose=1)

# Evaluation
y_val_pred_nn = model.predict(X_val)
mae_nn = mean_absolute_error(y_val, y_val_pred_nn)
mse_nn = mean_squared_error(y_val, y_val_pred_nn)
print(f'Neural Network - MAE: {mae_nn}, MSE: {mse_nn}')

# Predict on test data using both models
y_test_pred_gbr = gbr.predict(X_test_processed)
y_test_pred_nn = model.predict(X_test_processed)

# Save the predictions
test_data['metastatic_diagnosis_period_pred_gbr'] = y_test_pred_gbr
test_data['metastatic_diagnosis_period_pred_nn'] = y_test_pred_nn

test_data.to_csv('test_data_with_predictions.csv', index=False)

df=pd.DataFrame()
data=pd.read_csv("test_data_with_predictions.csv")
df["patient_id"]=data["patient_id"]
df["metastatic_diagnosis_period"]=data["metastatic_diagnosis_period_pred_gbr"]
df.to_csv('test_predictions.csv', index=False)

Gradient Boosting Regressor - MAE: 64.86645567836328, MSE: 6325.89259977429


  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


Epoch 1/100
[1m408/408[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 3ms/step - loss: 15856.6758 - mae: 92.1368 - val_loss: 9019.3672 - val_mae: 82.4935
Epoch 2/100
[1m408/408[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 2ms/step - loss: 8574.7637 - mae: 76.7629 - val_loss: 6700.2729 - val_mae: 68.4711
Epoch 3/100
[1m408/408[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 2ms/step - loss: 6884.9878 - mae: 64.2058 - val_loss: 6432.4082 - val_mae: 65.6057
Epoch 4/100
[1m408/408[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 2ms/step - loss: 6852.2939 - mae: 63.9156 - val_loss: 6331.0718 - val_mae: 64.0100
Epoch 5/100
[1m408/408[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 2ms/step - loss: 6863.8838 - mae: 63.3560 - val_loss: 6646.0571 - val_mae: 66.6480
Epoch 6/100
[1m408/408[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 2ms/step - loss: 6846.6709 - mae: 63.2922 - val_loss: 6380.7964 - val_mae: 64.0199
Epoch 7/100
[1m408/408[0m [32m

[1m408/408[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 2ms/step - loss: 5778.9912 - mae: 57.1664 - val_loss: 7355.0186 - val_mae: 68.9305
Epoch 52/100
[1m408/408[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 2ms/step - loss: 5583.0684 - mae: 55.5477 - val_loss: 6978.8813 - val_mae: 66.2877
Epoch 53/100
[1m408/408[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 2ms/step - loss: 5683.3154 - mae: 56.8756 - val_loss: 6796.2192 - val_mae: 64.7122
Epoch 54/100
[1m408/408[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 2ms/step - loss: 5508.1187 - mae: 55.9369 - val_loss: 7075.6069 - val_mae: 63.8120
Epoch 55/100
[1m408/408[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 2ms/step - loss: 5428.6719 - mae: 55.1839 - val_loss: 6990.5347 - val_mae: 65.9679
Epoch 56/100
[1m408/408[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 2ms/step - loss: 5460.7549 - mae: 55.5136 - val_loss: 7011.6860 - val_mae: 65.1440
Epoch 57/100
[1m408/408[0m [32m━━━━━━━

[1m177/177[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2ms/step
