In [9]:
pip install pandas scikit-learn tensorflow





In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.metrics import mean_absolute_error, mean_squared_error
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense

# Load the data
train_data = pd.read_csv('train_n.csv')
test_data = pd.read_csv('test_n.csv')

# Separate features and target
X_train = train_data.drop(columns=['metastatic_diagnosis_period'])
y_train = train_data['metastatic_diagnosis_period']
X_test = test_data

# Identify categorical and numerical columns
categorical_cols = X_train.select_dtypes(include=['object']).columns
numeric_cols = X_train.select_dtypes(include=['number']).columns

# Define preprocessing for numeric and categorical data
numeric_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='mean')),
    ('scaler', StandardScaler())])

categorical_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('onehot', OneHotEncoder(handle_unknown='ignore'))])

# Combine preprocessing steps
preprocessor = ColumnTransformer(
    transformers=[
        ('num', numeric_transformer, numeric_cols),
        ('cat', categorical_transformer, categorical_cols)])

# Preprocess the data
X_train_processed = preprocessor.fit_transform(X_train)
X_test_processed = preprocessor.transform(X_test)

# Split the training data for validation
X_train_part, X_val, y_train_part, y_val = train_test_split(X_train_processed, y_train, test_size=0.2, random_state=42)

# Gradient Boosting Regressor
gbr = GradientBoostingRegressor(n_estimators=100, random_state=42)
gbr.fit(X_train_part, y_train_part)

# Evaluation
y_val_pred_gbr = gbr.predict(X_val)
mae_gbr = mean_absolute_error(y_val, y_val_pred_gbr)
mse_gbr = mean_squared_error(y_val, y_val_pred_gbr)
print(f'Gradient Boosting Regressor - MAE: {mae_gbr}, MSE: {mse_gbr}')

# Neural Network using TensorFlow
model = Sequential()
model.add(Dense(64, input_dim=X_train_processed.shape[1], activation='relu'))
model.add(Dense(32, activation='relu'))
model.add(Dense(1, activation='linear'))

model.compile(optimizer='adam', loss='mean_squared_error', metrics=['mae'])

# Training the model
history = model.fit(X_train_part, y_train_part, validation_data=(X_val, y_val), epochs=100, batch_size=32, verbose=1)

# Evaluation
y_val_pred_nn = model.predict(X_val)
mae_nn = mean_absolute_error(y_val, y_val_pred_nn)
mse_nn = mean_squared_error(y_val, y_val_pred_nn)
print(f'Neural Network - MAE: {mae_nn}, MSE: {mse_nn}')

# Predict on test data using both models
y_test_pred_gbr = gbr.predict(X_test_processed)
y_test_pred_nn = model.predict(X_test_processed)

# Save the predictions
test_data['metastatic_diagnosis_period_pred_gbr'] = y_test_pred_gbr
test_data['metastatic_diagnosis_period_pred_nn'] = y_test_pred_nn

test_data.to_csv('test_data_with_predictions.csv', index=False)


Gradient Boosting Regressor - MAE: 90.38569144572486, MSE: 11467.899681353427
Epoch 1/100


  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


[1m330/330[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 1ms/step - loss: 17156.7227 - mae: 93.1732 - val_loss: 12746.3623 - val_mae: 94.1963
Epoch 2/100
[1m330/330[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 928us/step - loss: 12286.5195 - mae: 92.1882 - val_loss: 12421.8428 - val_mae: 93.9616
Epoch 3/100
[1m330/330[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 884us/step - loss: 11760.3633 - mae: 91.3484 - val_loss: 12244.6631 - val_mae: 94.5693
Epoch 4/100
[1m330/330[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 862us/step - loss: 11589.8223 - mae: 90.9004 - val_loss: 12204.7773 - val_mae: 94.5417
Epoch 5/100
[1m330/330[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 872us/step - loss: 11630.9902 - mae: 91.6283 - val_loss: 12169.8242 - val_mae: 94.2711
Epoch 6/100
[1m330/330[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 875us/step - loss: 12107.1895 - mae: 93.9762 - val_loss: 12226.0176 - val_mae: 93.3225
Epoch 7/100
[1m330/330

[1m330/330[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 910us/step - loss: 11306.2256 - mae: 90.2282 - val_loss: 12262.5117 - val_mae: 93.6719
Epoch 51/100
[1m330/330[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 899us/step - loss: 11278.8662 - mae: 90.0329 - val_loss: 12276.8320 - val_mae: 93.5050
Epoch 52/100
[1m330/330[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 900us/step - loss: 11297.5986 - mae: 89.9424 - val_loss: 12237.9688 - val_mae: 93.4990
Epoch 53/100
[1m330/330[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 900us/step - loss: 11388.6533 - mae: 90.4245 - val_loss: 12231.3096 - val_mae: 94.1478
Epoch 54/100
[1m330/330[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 926us/step - loss: 11286.2539 - mae: 90.2260 - val_loss: 12243.7979 - val_mae: 94.4030
Epoch 55/100
[1m330/330[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 900us/step - loss: 11214.5488 - mae: 89.6155 - val_loss: 12301.9912 - val_mae: 94.1199
Epoch 56/100
[1m

[1m330/330[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 877us/step - loss: 10781.4551 - mae: 87.7913 - val_loss: 12364.1807 - val_mae: 93.9418
Epoch 100/100
[1m330/330[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 910us/step - loss: 10656.7188 - mae: 86.9028 - val_loss: 12365.2471 - val_mae: 95.0640
[1m83/83[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 897us/step
Neural Network - MAE: 95.06404699292989, MSE: 12365.246023113938
[1m177/177[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 618us/step


In [18]:
#330/330 ━━━━━━━━━━━━━━━━━━━━ 1s 2ms/step - loss: 5509.1143 - mae: 55.8915 - val_loss: 7232.9844 - val_mae: 62.9635
#330/330 ━━━━━━━━━━━━━━━━━━━━ 1s 2ms/step - loss: 4514.6509 - mae: 50.6066 - val_loss: 8327.8076 - val_mae: 66.9452


test_data.columns

Index(['patient_id', 'patient_race', 'payer_type', 'patient_state',
       'patient_zip3', 'Region', 'Division', 'patient_age', 'patient_gender',
       'bmi',
       ...
       'Average of May-18', 'Average of Jun-18', 'Average of Jul-18',
       'Average of Aug-18', 'Average of Sep-18', 'Average of Oct-18',
       'Average of Nov-18', 'Average of Dec-18',
       'metastatic_diagnosis_period_pred_gbr',
       'metastatic_diagnosis_period_pred_nn'],
      dtype='object', length=153)

In [5]:
columns_to_keep = ['patient_id', 'metastatic_diagnosis_period_pred_nn']
df = test_data[columns_to_keep]

In [6]:
df.to_csv('p9.csv', index=False)


In [4]:
test_data.head()

Unnamed: 0,patient_id,patient_zip3,patient_age,bmi,population,density,age_median,age_under_10,age_10_to_19,age_20s,...,Average of May-18,Average of Jun-18,Average of Jul-18,Average of Aug-18,Average of Sep-18,Average of Oct-18,Average of Nov-18,Average of Dec-18,metastatic_diagnosis_period_pred_gbr,metastatic_diagnosis_period_pred_nn
0,730681,713,55,25.45,4639.07,72.66,41.5,11.4,13.44,11.42,...,78.34,81.96,83.58,82.22,80.2,69.73,53.14,51.34,96.639889,123.042168
1,334212,283,60,40.0,10875.3,217.91,39.64,11.23,13.72,15.01,...,77.26,80.05,82.88,82.09,78.85,64.6,50.57,48.1,100.454789,98.088669
2,571362,794,54,32.33,18716.73,1019.51,30.37,11.0,18.86,23.11,...,77.79,82.45,82.44,80.77,72.16,59.31,48.25,42.13,108.008583,98.946609
3,907331,373,63,27.07,7804.79,140.05,44.32,10.19,12.66,11.7,...,71.31,75.2,76.96,75.78,74.87,61.06,44.31,42.83,98.980895,95.319977
4,208382,980,62,25.45,28628.29,1091.83,39.68,12.14,12.46,11.32,...,56.92,57.88,66.16,65.21,57.52,49.53,43.75,38.33,75.876455,66.358253


In [1]:
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader, TensorDataset

# Convert data to PyTorch tensors
X_train_tensor = torch.tensor(X_train_part, dtype=torch.float32)
y_train_tensor = torch.tensor(y_train_part.values, dtype=torch.float32)
X_val_tensor = torch.tensor(X_val, dtype=torch.float32)
y_val_tensor = torch.tensor(y_val.values, dtype=torch.float32)
X_test_tensor = torch.tensor(X_test_processed, dtype=torch.float32)

# Define the neural network architecture using PyTorch
class PyTorchModel(nn.Module):
    def __init__(self, input_size):
        super(PyTorchModel, self).__init__()
        self.fc1 = nn.Linear(input_size, 64)
        self.fc2 = nn.Linear(64, 32)
        self.fc3 = nn.Linear(32, 1)

    def forward(self, x):
        x = torch.relu(self.fc1(x))
        x = torch.relu(self.fc2(x))
        x = self.fc3(x)
        return x

# Create an instance of the model
pytorch_model = PyTorchModel(X_train_tensor.shape[1])

# Define loss function and optimizer
criterion = nn.MSELoss()
optimizer = optim.Adam(pytorch_model.parameters(), lr=0.001)

# Training the PyTorch model
def train_model(model, criterion, optimizer, X_train, y_train, X_val, y_val, epochs=100, batch_size=32):
    train_dataset = TensorDataset(X_train, y_train)
    train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
    for epoch in range(epochs):
        model.train()
        running_loss = 0.0
        for inputs, labels in train_loader:
            optimizer.zero_grad()
            outputs = model(inputs)
            loss = criterion(outputs, labels.unsqueeze(1))
            loss.backward()
            optimizer.step()
            running_loss += loss.item()
        # Validation loss
        model.eval()
        with torch.no_grad():
            val_outputs = model(X_val)
            val_loss = criterion(val_outputs, y_val.unsqueeze(1))
        print(f"Epoch {epoch+1}/{epochs}, Loss: {running_loss/len(train_loader):.4f}, Val Loss: {val_loss.item():.4f}")

train_model(pytorch_model, criterion, optimizer, X_train_tensor, y_train_tensor, X_val_tensor, y_val_tensor)

# Prediction using PyTorch
pytorch_model.eval()
with torch.no_grad():
    y_val_pred_pytorch = pytorch_model(X_val_tensor).numpy()
    y_test_pred_pytorch = pytorch_model(X_test_tensor).numpy()

# Evaluation
mae_pytorch = mean_absolute_error(y_val, y_val_pred_pytorch)
mse_pytorch = mean_squared_error(y_val, y_val_pred_pytorch)
print(f'PyTorch Model - MAE: {mae_pytorch}, MSE: {mse_pytorch}')

# Predict on test data using PyTorch
test_data['metastatic_diagnosis_period_pred_pytorch'] = y_test_pred_pytorch

test_data.to_csv('test_data_with_predictions.csv', index=False)


ModuleNotFoundError: No module named 'torch'

In [2]:
pip install pytorch

Note: you may need to restart the kernel to use updated packages.Collecting pytorch
  Using cached pytorch-1.0.2.tar.gz (689 bytes)
  Preparing metadata (setup.py): started
  Preparing metadata (setup.py): finished with status 'done'
Building wheels for collected packages: pytorch
  Building wheel for pytorch (setup.py): started
  Building wheel for pytorch (setup.py): finished with status 'error'
  Running setup.py clean for pytorch
Failed to build pytorch



  error: subprocess-exited-with-error
  
  python setup.py bdist_wheel did not run successfully.
  exit code: 1
  
  [6 lines of output]
  Traceback (most recent call last):
    File "<string>", line 2, in <module>
    File "<pip-setuptools-caller>", line 34, in <module>
    File "C:\Users\avina\AppData\Local\Temp\pip-install-fsch3afl\pytorch_75c244ed2a10405cb99cfe8bb7587f16\setup.py", line 15, in <module>
      raise Exception(message)
  Exception: You tried to install "pytorch". The package named for PyTorch is "torch"
  [end of output]
  
  note: This error originates from a subprocess, and is likely not a problem with pip.
  ERROR: Failed building wheel for pytorch
ERROR: Could not build wheels for pytorch, which is required to install pyproject.toml-based projects
