In [2]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.metrics import mean_absolute_error, mean_squared_error
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense

# Load the data
train_data = pd.read_csv('train.csv')
test_data = pd.read_csv('test.csv')

# Separate features and target
X_train = train_data.drop(columns=['metastatic_diagnosis_period'])
y_train = train_data['metastatic_diagnosis_period']
X_test = test_data

# Identify categorical and numerical columns
categorical_cols = X_train.select_dtypes(include=['object']).columns
numeric_cols = X_train.select_dtypes(include=['number']).columns

# Define preprocessing for numeric and categorical data
numeric_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='mean')),
    ('scaler', StandardScaler())])

categorical_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('onehot', OneHotEncoder(handle_unknown='ignore'))])

# Combine preprocessing steps
preprocessor = ColumnTransformer(
    transformers=[
        ('num', numeric_transformer, numeric_cols),
        ('cat', categorical_transformer, categorical_cols)])

# Preprocess the data
X_train_processed = preprocessor.fit_transform(X_train)
X_test_processed = preprocessor.transform(X_test)

# Split the training data for validation
X_train_part, X_val, y_train_part, y_val = train_test_split(X_train_processed, y_train, test_size=0.2, random_state=42)

# Gradient Boosting Regressor (Optional, just for evaluation)
gbr = GradientBoostingRegressor(n_estimators=100, random_state=42)
gbr.fit(X_train_part, y_train_part)

# Evaluation for Gradient Boosting Regressor
y_val_pred_gbr = gbr.predict(X_val)
mae_gbr = mean_absolute_error(y_val, y_val_pred_gbr)
mse_gbr = mean_squared_error(y_val, y_val_pred_gbr)
print(f'Gradient Boosting Regressor - MAE: {mae_gbr}, MSE: {mse_gbr}')

# Neural Network using TensorFlow
model = Sequential()
model.add(Dense(64, input_dim=X_train_processed.shape[1], activation='relu'))
model.add(Dense(32, activation='relu'))
model.add(Dense(1, activation='linear'))

model.compile(optimizer='adam', loss='mean_squared_error', metrics=['mae'])

# Training the model
history = model.fit(X_train_part, y_train_part, validation_data=(X_val, y_val), epochs=50, batch_size=32, verbose=1)

# Evaluation for Neural Network
y_val_pred_nn = model.predict(X_val)
mae_nn = mean_absolute_error(y_val, y_val_pred_nn)
mse_nn = mean_squared_error(y_val, y_val_pred_nn)
print(f'Neural Network - MAE: {mae_nn}, MSE: {mse_nn}')

# Predict on test data using Neural Network
y_test_pred_nn = model.predict(X_test_processed)

# Create a new DataFrame for the predictions
predictions = pd.DataFrame(y_test_pred_nn, columns=['metastatic_diagnosis_period'])

# Save only the predictions to a CSV file
predictions.to_csv('test_data.csv', index=False)


Gradient Boosting Regressor - MAE: 63.76769888537362, MSE: 6784.192817870625
Epoch 1/50


  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


[1m330/330[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 3ms/step - loss: 15363.1973 - mae: 89.4006 - val_loss: 10567.4424 - val_mae: 88.4817
Epoch 2/50
[1m330/330[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 2ms/step - loss: 9023.1553 - mae: 79.9695 - val_loss: 7356.6821 - val_mae: 67.9392
Epoch 3/50
[1m330/330[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 2ms/step - loss: 7169.5459 - mae: 66.5959 - val_loss: 6983.6821 - val_mae: 65.6881
Epoch 4/50
[1m330/330[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 2ms/step - loss: 6979.1240 - mae: 65.1762 - val_loss: 7003.9785 - val_mae: 64.8070
Epoch 5/50
[1m330/330[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 2ms/step - loss: 6779.8687 - mae: 62.6545 - val_loss: 6872.8379 - val_mae: 64.0819
Epoch 6/50
[1m330/330[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 2ms/step - loss: 6785.0278 - mae: 63.2320 - val_loss: 6871.0952 - val_mae: 62.6976
Epoch 7/50
[1m330/330[0m [32m━━━━━━━━━━━━━━━━━

[1m177/177[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2ms/step


In [3]:
pip install torch torchvision


Collecting torchNote: you may need to restart the kernel to use updated packages.


ERROR: Cannot uninstall 'TBB'. It is a distutils installed project and thus we cannot accurately determine which files belong to it which would lead to only a partial uninstall.



  Obtaining dependency information for torch from https://files.pythonhosted.org/packages/2a/b7/a3cf5fd40334b9785cc83ee0c96b50603026eb3aa70210a33729018e7029/torch-2.3.0-cp311-cp311-win_amd64.whl.metadata
  Downloading torch-2.3.0-cp311-cp311-win_amd64.whl.metadata (26 kB)
Collecting torchvision
  Obtaining dependency information for torchvision from https://files.pythonhosted.org/packages/12/c2/7c89c62f2b0a606070aa7cdb8af8af0c905562942778ebdd77600642c3b9/torchvision-0.18.0-cp311-cp311-win_amd64.whl.metadata
  Downloading torchvision-0.18.0-cp311-cp311-win_amd64.whl.metadata (6.6 kB)
Collecting typing-extensions>=4.8.0 (from torch)
  Obtaining dependency information for typing-extensions>=4.8.0 from https://files.pythonhosted.org/packages/e1/4d/d612de852a0bc64a64418e1cef25fe1914c5b1611e34cc271ed7e36174c8/typing_extensions-4.12.0-py3-none-any.whl.metadata
  Downloading typing_extensions-4.12.0-py3-none-any.whl.metadata (3.0 kB)
Collecting mkl<=2021.4.0,>=2021.1.1 (from torch)
  Obtainin

In [None]:
import pandas as pd
import numpy as np
import torch
import torch.nn as nn
import torch.optim as optim
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from torch.utils.data import DataLoader, TensorDataset

# Load the data
train_data = pd.read_csv('train.csv')
test_data = pd.read_csv('test.csv')

# Preprocess the data
categorical_cols = [cname for cname in train_data.columns if train_data[cname].dtype == "object"]
numerical_cols = [cname for cname in train_data.columns if train_data[cname].dtype in ['int64', 'float64']]

if 'target' in numerical_cols:
    numerical_cols.remove('target')

numerical_transformer = StandardScaler()
categorical_transformer = OneHotEncoder(handle_unknown='ignore')

preprocessor = ColumnTransformer(
    transformers=[
        ('num', numerical_transformer, numerical_cols),
        ('cat', categorical_transformer, categorical_cols)
    ])

X = train_data.drop('target', axis=1)
y = train_data['target']

X_train, X_valid, y_train, y_valid = train_test_split(X, y, train_size=0.8, test_size=0.2, random_state=0)

X_train = preprocessor.fit_transform(X_train).astype(np.float32)
X_valid = preprocessor.transform(X_valid).astype(np.float32)
X_test = preprocessor.transform(test_data).astype(np.float32)
y_train = y_train.values.astype(np.float32)
y_valid = y_valid.values.astype(np.float32)

train_dataset = TensorDataset(torch.tensor(X_train), torch.tensor(y_train))
valid_dataset = TensorDataset(torch.tensor(X_valid), torch.tensor(y_valid))

train_loader = DataLoader(train_dataset, batch_size=32, shuffle=True)
valid_loader = DataLoader(valid_dataset, batch_size=32, shuffle=False)

class Net(nn.Module):
    def __init__(self):
        super(Net, self).__init__()
        self.fc1 = nn.Linear(X_train.shape[1], 128)
        self.dropout1 = nn.Dropout(0.2)
        self.fc2 = nn.Linear(128, 64)
        self.dropout2 = nn.Dropout(0.2)
        self.fc3 = nn.Linear(64, 32)
        self.fc4 = nn.Linear(32, 1)

    def forward(self, x):
        x = torch.relu(self.fc1(x))
        x = self.dropout1(x)
        x = torch.relu(self.fc2(x))
        x = self.dropout2(x)
        x = torch.relu(self.fc3(x))
        x = self.fc4(x)
        return x

model = Net()
criterion = nn.MSELoss()
optimizer = optim.Adam(model.parameters(), lr=0.001)

# Training the model
num_epochs = 50
for epoch in range(num_epochs):
    model.train()
    for inputs, targets in train_loader:
        optimizer.zero_grad()
        outputs = model(inputs)
        loss = criterion(outputs, targets.view(-1, 1))
        loss.backward()
        optimizer.step()

    model.eval()
    valid_loss = 0
    with torch.no_grad():
        for inputs, targets in valid_loader:
            outputs = model(inputs)
            loss = criterion(outputs, targets.view(-1, 1))
            valid_loss += loss.item()

    print(f'Epoch {epoch+1}/{num_epochs}, Validation Loss: {valid_loss/len(valid_loader):.4f}')

# Predict on test data
model.eval()
with torch.no_grad():
    test_tensor = torch.tensor(X_test)
    predictions = model(test_tensor).numpy().flatten()

# Output predictions
test_data['predicted_target'] = predictions
test_data.to_csv('test_pred.csv', index=False)
