In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error
import joblib

In [14]:
# Load the data
data = pd.read_csv('train.csv')
data = data.head(500)

In [15]:
data.shape

(500, 152)

In [16]:
# Display basic information about the dataset
print(data.info())
print(data.describe())
print(data.head())

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 500 entries, 0 to 499
Columns: 152 entries, patient_id to metastatic_diagnosis_period
dtypes: float64(137), int64(4), object(11)
memory usage: 593.9+ KB
None
          patient_id  patient_zip3  patient_age         bmi    population  \
count     500.000000     500.00000    500.00000  165.000000    500.000000   
mean   536778.472000     564.82400     60.35400   29.573273  20673.456120   
std    265504.569705     272.43729     13.80238    5.573784  14363.340726   
min    101332.000000     100.00000     20.00000   15.000000   1464.560000   
25%    293435.250000     334.00000     51.00000   25.110000   9009.570000   
50%    538105.000000     533.00000     59.50000   29.290000  18433.210000   
75%    770412.250000     804.25000     69.00000   33.000000  28471.850000   
max    993246.000000     995.00000     91.00000   40.350000  71374.130000   

            density  age_median  age_under_10  age_10_to_19     age_20s  ...  \
count    500.00000

In [17]:
# Separate features and target variable
X = data.drop(columns=['metastatic_diagnosis_period'])
y = data['metastatic_diagnosis_period']

In [18]:
# Identify categorical and numerical columns
categorical_cols = X.select_dtypes(include=['object']).columns
numerical_cols = X.select_dtypes(include=['number']).columns

# Preprocessing for numerical data
numerical_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='median')),
    ('scaler', StandardScaler())
])

In [19]:
# Preprocessing for categorical data
categorical_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('onehot', OneHotEncoder(handle_unknown='ignore'))
])

In [20]:
# Combine preprocessing steps
preprocessor = ColumnTransformer(
    transformers=[
        ('num', numerical_transformer, numerical_cols),
        ('cat', categorical_transformer, categorical_cols)
    ])

In [21]:
# Split the data
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Define the model
model = RandomForestRegressor(random_state=42)

# Create the pipeline
clf = Pipeline(steps=[('preprocessor', preprocessor),
                      ('model', model)
                     ])

# Define hyperparameter grid
param_grid = {
    'model__n_estimators': [100, 200, 300],
    'model__max_depth': [10, 20, 30],
    'model__min_samples_split': [2, 5, 10]
}

In [22]:
# Perform grid search
grid_search = GridSearchCV(clf, param_grid, cv=5, n_jobs=-1, scoring='neg_mean_squared_error')
grid_search.fit(X_train, y_train)

 'metastatic_first_novel_treatment_type']. At least one non-missing value is needed for imputation with strategy='most_frequent'.


In [23]:
# Get the best model
best_model = grid_search.best_estimator_

# Predict on test set
y_pred = best_model.predict(X_test)

 'metastatic_first_novel_treatment_type']. At least one non-missing value is needed for imputation with strategy='most_frequent'.


In [24]:
# Calculate RMSE
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)
print(f'Root Mean Squared Error: {rmse}')

Root Mean Squared Error: 91.78401941434493


In [None]:
# Load the test data
test_df = pd.read_csv('test.csv')

In [None]:
# Ensure all specified columns exist in the DataFrame
existing_categorical_cols_test = [col for col in categorical_cols if col in test_df.columns]
existing_text_col_test = text_col if text_col in test_df.columns else None

In [None]:
# Drop columns in test data that were not used in training if any
X_test_final = test_df[existing_categorical_cols_test + X.select_dtypes(include=['int64', 'float64']).columns.tolist()]
if existing_text_col_test:
    X_test_final[existing_text_col_test] = test_df[existing_text_col_test]

test_pred = best_model.predict(X_test_final)

In [None]:
# Create a DataFrame to store the predictions along with patient_id
output_df = pd.DataFrame({
    'patient_id': test_df['patient_id'],  # Ensure 'patient_id' exists in your test.csv
    'metastatic_diagnosis_period': test_pred
})

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 13173 entries, 0 to 13172
Columns: 152 entries, patient_id to metastatic_diagnosis_period
dtypes: float64(137), int64(4), object(11)
memory usage: 15.3+ MB
None
          patient_id  patient_zip3   patient_age          bmi    population  \
count   13173.000000  13173.000000  13173.000000  4102.000000  13173.000000   
mean   555441.784939    568.530859     59.271313    29.168808  20651.373928   
std    259476.503094    275.758485     13.218883     5.752820  13840.379638   
min    100043.000000    100.000000     18.000000    15.000000    635.550000   
25%    335100.000000    330.000000     50.000000    24.825000   9160.340000   
50%    555769.000000    557.000000     59.000000    28.580000  18952.780000   
75%    780967.000000    832.000000     67.000000    33.000000  30021.280000   
max    999982.000000    995.000000     91.000000    97.000000  71374.130000   

            density    age_median  age_under_10  age_10_to_19       age_20s  