In [None]:
import pandas as pd

# Load the training table from the working directory.
data = pd.read_csv('train.csv')
# Preview the first rows.
print(data.head())
# DataFrame.info() writes its report to stdout itself and returns None, so
# wrapping it in print() would append a stray "None" line to the output.
data.info()


In [None]:
# Show how many missing entries each column holds before cleaning.
print(data.isna().sum())
# Discard every row that has at least one missing value; imputing the gaps
# would be the alternative if too many rows are lost here.
data = data.dropna(axis=0, how='any')


In [None]:
# One-hot encode the categorical columns. drop_first=True leaves out the first
# level of each encoded column, so k categories yield k-1 indicator columns.
data = pd.get_dummies(data=data, drop_first=True)


In [None]:
from sklearn.preprocessing import StandardScaler

# Standardise every feature column; the target column is left on its original scale.
feature_frame = data.drop(columns=['metastatic_diagnosis_period'])
scaler = StandardScaler()
# NOTE(review): the scaler is fit on every row of `data`, including rows that
# are later held out as the test set - consider fitting on the training split only.
scaled_features = scaler.fit_transform(feature_frame)
# Reuse data's index for the scaled frame. `data` has gaps in its index after
# dropna(); a fresh 0..n-1 index here would make the target assignment below
# align on mismatched labels, pairing targets with the wrong rows and leaving
# NaN where a label is missing.
data_scaled = pd.DataFrame(
    scaled_features,
    columns=feature_frame.columns,
    index=data.index,
)
data_scaled['metastatic_diagnosis_period'] = data['metastatic_diagnosis_period']


In [None]:
import matplotlib.pyplot as plt
import seaborn as sns

# Histogram of the target column.
sns.histplot(data.loc[:, 'metastatic_diagnosis_period'])
plt.show()

# Grid of pairwise relationships between the columns of `data`.
sns.pairplot(data=data)
plt.show()


In [None]:
# Pairwise correlations between all columns, drawn as an annotated heatmap.
correlation_matrix = data.corr(method='pearson')
sns.heatmap(data=correlation_matrix, annot=True)
plt.show()


In [None]:
from sklearn.model_selection import train_test_split

# Separate the target column from the feature columns.
y = data_scaled.loc[:, 'metastatic_diagnosis_period']
X = data_scaled.drop('metastatic_diagnosis_period', axis=1)
# Hold out 20% of the rows for evaluation; the fixed seed makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)


In [None]:
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error

# Baseline 1: linear regression fitted on the training split.
lr_model = LinearRegression()
lr_model.fit(X_train, y_train)
y_pred_lr = lr_model.predict(X_test)
# RMSE is taken as the square root of the MSE instead of passing squared=False:
# that keyword was deprecated in scikit-learn 1.4 and removed in 1.6, while
# this form works on every version.
lr_rmse = mean_squared_error(y_test, y_pred_lr) ** 0.5
print(f'Linear Regression RMSE: {lr_rmse}')

# Baseline 2: random forest with default hyper-parameters; the fixed seed
# keeps the fitted forest reproducible between runs.
rf_model = RandomForestRegressor(random_state=42)
rf_model.fit(X_train, y_train)
y_pred_rf = rf_model.predict(X_test)
rf_rmse = mean_squared_error(y_test, y_pred_rf) ** 0.5
print(f'Random Forest RMSE: {rf_rmse}')


In [None]:
from sklearn.model_selection import GridSearchCV

# Hyper-parameter grid for the random forest: 3 x 4 = 12 candidate settings.
param_grid = {
    'n_estimators': [100, 200, 300],
    'max_depth': [None, 10, 20, 30]
}
# 5-fold cross-validated search on the training split, scored by negative MSE
# (scikit-learn maximises scores, hence the sign flip).
grid_search = GridSearchCV(estimator=rf_model, param_grid=param_grid, cv=5, scoring='neg_mean_squared_error')
grid_search.fit(X_train, y_train)
# Best candidate found by the search.
best_rf_model = grid_search.best_estimator_
y_pred_best_rf = best_rf_model.predict(X_test)
# RMSE is taken as the square root of the MSE instead of passing squared=False:
# that keyword was deprecated in scikit-learn 1.4 and removed in 1.6.
best_rf_rmse = mean_squared_error(y_test, y_pred_best_rf) ** 0.5
print(f'Best Random Forest RMSE: {best_rf_rmse}')


In [None]:
import joblib

# Serialise the tuned random forest to 'best_model.pkl' in the working directory.
joblib.dump(value=best_rf_model, filename='best_model.pkl')
