In [7]:
import pandas as pd

data = pd.read_csv('train.csv')
print(data.head())
print(data.info())


   patient_id patient_race  payer_type patient_state  patient_zip3   Region  \
0      268700          NaN  COMMERCIAL            AR           724    South   
1      484983        White         NaN            IL           629  Midwest   
2      277055          NaN  COMMERCIAL            CA           925     West   
3      320055     Hispanic    MEDICAID            CA           900     West   
4      190386          NaN  COMMERCIAL            CA           934     West   

             Division  patient_age patient_gender    bmi  ...  \
0  West South Central           39              F    NaN  ...   
1  East North Central           55              F  35.36  ...   
2             Pacific           59              F    NaN  ...   
3             Pacific           59              F    NaN  ...   
4             Pacific           71              F    NaN  ...   

  Average of Apr-18 Average of May-18 Average of Jun-18 Average of Jul-18  \
0             52.55             74.77             79.96  

In [8]:
# Checking for missing values
print(data.isnull().sum())

# Drop rows where the target variable is NaN
data = data.dropna(subset=['metastatic_diagnosis_period'])

# Handle missing values in features (if any)
data = data.fillna(method='ffill')  # Forward fill as an example


patient_id                        0
patient_race                   6657
payer_type                     1765
patient_state                     0
patient_zip3                      0
                               ... 
Average of Sep-18                 7
Average of Oct-18                 7
Average of Nov-18                12
Average of Dec-18                33
metastatic_diagnosis_period       0
Length: 152, dtype: int64


In [18]:
# Example: One-Hot Encoding for categorical variables
data = pd.get_dummies(data, drop_first=True)


In [19]:
from sklearn.preprocessing import StandardScaler

scaler = StandardScaler()
scaled_features = scaler.fit_transform(data.drop(columns=['metastatic_diagnosis_period']))
data_scaled = pd.DataFrame(scaled_features, columns=data.columns.drop('metastatic_diagnosis_period'))
data_scaled['metastatic_diagnosis_period'] = data['metastatic_diagnosis_period']


In [24]:
X = data_scaled.drop(columns=['metastatic_diagnosis_period'])
y = data_scaled['metastatic_diagnosis_period']
y.dropna(inplace=True)
y

1         33.0
2        157.0
3        146.0
4        286.0
5         73.0
         ...  
13167    179.0
13168    106.0
13169     92.0
13170      0.0
13171    330.0
Name: metastatic_diagnosis_period, Length: 13171, dtype: float64

In [21]:
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from sklearn.model_selection import cross_val_score

# Define models
rf_model = RandomForestRegressor(random_state=42)
gb_model = GradientBoostingRegressor(random_state=42)

# Evaluate models using cross-validation
rf_cv_scores = cross_val_score(rf_model, X, y, cv=5, scoring='neg_mean_squared_error')
gb_cv_scores = cross_val_score(gb_model, X, y, cv=5, scoring='neg_mean_squared_error')

# Convert to positive RMSE for readability
rf_rmse = (-rf_cv_scores.mean()) ** 0.5
gb_rmse = (-gb_cv_scores.mean()) ** 0.5

print(f'Random Forest RMSE: {rf_rmse}')
print(f'Gradient Boosting RMSE: {gb_rmse}')


Traceback (most recent call last):
  File "C:\Users\avina\anaconda3\Lib\site-packages\sklearn\metrics\_scorer.py", line 136, in __call__
    score = scorer._score(
            ^^^^^^^^^^^^^^
  File "C:\Users\avina\anaconda3\Lib\site-packages\sklearn\metrics\_scorer.py", line 355, in _score
    return self._sign * self._score_func(y_true, y_pred, **scoring_kwargs)
                        ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "C:\Users\avina\anaconda3\Lib\site-packages\sklearn\utils\_param_validation.py", line 211, in wrapper
    return func(*args, **kwargs)
           ^^^^^^^^^^^^^^^^^^^^^
  File "C:\Users\avina\anaconda3\Lib\site-packages\sklearn\metrics\_regression.py", line 474, in mean_squared_error
    y_type, y_true, y_pred, multioutput = _check_reg_targets(
                                          ^^^^^^^^^^^^^^^^^^^
  File "C:\Users\avina\anaconda3\Lib\site-packages\sklearn\metrics\_regression.py", line 100, in _check_reg_targets
    y_true = check_array(y_t

Random Forest RMSE: nan
Gradient Boosting RMSE: nan


Traceback (most recent call last):
  File "C:\Users\avina\anaconda3\Lib\site-packages\sklearn\metrics\_scorer.py", line 136, in __call__
    score = scorer._score(
            ^^^^^^^^^^^^^^
  File "C:\Users\avina\anaconda3\Lib\site-packages\sklearn\metrics\_scorer.py", line 355, in _score
    return self._sign * self._score_func(y_true, y_pred, **scoring_kwargs)
                        ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "C:\Users\avina\anaconda3\Lib\site-packages\sklearn\utils\_param_validation.py", line 211, in wrapper
    return func(*args, **kwargs)
           ^^^^^^^^^^^^^^^^^^^^^
  File "C:\Users\avina\anaconda3\Lib\site-packages\sklearn\metrics\_regression.py", line 474, in mean_squared_error
    y_type, y_true, y_pred, multioutput = _check_reg_targets(
                                          ^^^^^^^^^^^^^^^^^^^
  File "C:\Users\avina\anaconda3\Lib\site-packages\sklearn\metrics\_regression.py", line 100, in _check_reg_targets
    y_true = check_array(y_t

In [None]:
if rf_rmse < gb_rmse:
    best_model = rf_model
    print("Selected Random Forest Regressor")
else:
    best_model = gb_model
    print("Selected Gradient Boosting Regressor")


In [None]:
from sklearn.model_selection import GridSearchCV

param_grid_rf = {
    'n_estimators': [100, 200, 300],
    'max_depth': [None, 10, 20, 30],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4]
}
param_grid_gb = {
    'n_estimators': [100, 200, 300],
    'learning_rate': [0.01, 0.1, 0.2],
    'max_depth': [3, 5, 7],
    'subsample': [0.8, 0.9, 1.0]
}

if best_model == rf_model:
    grid_search = GridSearchCV(estimator=rf_model, param_grid=param_grid_rf, cv=5, scoring='neg_mean_squared_error')
else:
    grid_search = GridSearchCV(estimator=gb_model, param_grid=param_grid_gb, cv=5, scoring='neg_mean_squared_error')

grid_search.fit(X, y)
best_model = grid_search.best_estimator_
print(f'Best Model: {best_model}')


In [None]:
final_cv_scores = cross_val_score(best_model, X, y, cv=5, scoring='neg_mean_squared_error')
final_rmse = (-final_cv_scores.mean()) ** 0.5
print(f'Final Model RMSE: {final_rmse}')


In [None]:
# For regression, RMSE or R^2 is commonly used. Accuracy is not typically a regression metric.

In [None]:
import joblib

joblib.dump(best_model, 'best_model.pkl')

In [11]:
data.isnull().sum().sum()

1

In [13]:
data.shape

(13173, 335)

In [17]:
data.dropna(inplace=True)
data

Unnamed: 0,patient_id,patient_zip3,patient_age,bmi,population,density,age_median,age_under_10,age_10_to_19,age_20s,...,metastatic_cancer_diagnosis_code_C7962,metastatic_cancer_diagnosis_code_C7970,metastatic_cancer_diagnosis_code_C7971,metastatic_cancer_diagnosis_code_C7972,metastatic_cancer_diagnosis_code_C798,metastatic_cancer_diagnosis_code_C7981,metastatic_cancer_diagnosis_code_C7982,metastatic_cancer_diagnosis_code_C7989,metastatic_cancer_diagnosis_code_C799,metastatic_first_novel_treatment_PEMBROLIZUMAB
1,484983,629,55,35.36,2745.39,51.79,43.54,11.22,12.19,11.45,...,False,False,False,False,False,False,False,False,False,False
2,277055,925,59,35.36,38343.18,700.34,36.28,13.27,15.66,13.49,...,False,False,False,False,False,False,False,False,False,False
3,320055,900,59,35.36,36054.12,5294.33,36.65,9.76,11.27,17.23,...,False,False,False,False,False,False,False,False,False,False
4,190386,934,71,35.36,13700.37,400.48,41.78,10.03,16.43,12.97,...,False,False,False,False,False,False,False,False,False,False
5,559027,461,63,35.36,9322.89,274.74,40.12,12.23,13.88,11.53,...,False,False,False,False,False,False,False,False,False,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
13168,588544,191,59,21.28,31948.46,5512.17,35.72,10.85,10.95,18.16,...,False,False,False,False,False,False,False,False,False,False
13169,393047,757,73,30.67,9309.38,204.69,40.87,11.27,14.64,12.11,...,False,False,False,False,False,False,False,False,False,False
13170,790904,928,19,30.67,39121.88,2295.94,38.20,11.88,13.35,14.23,...,False,False,False,False,False,False,False,False,True,False
13171,455518,481,52,30.67,23266.06,743.56,41.47,10.94,13.59,12.67,...,False,False,False,False,False,False,False,False,False,False
