In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_absolute_error
import joblib

# Load the data
data = pd.read_csv('train.csv')

# Preprocess the data
# Separate features and target variable
X = data.drop(columns=['metastatic_diagnosis_period'])
y = data['metastatic_diagnosis_period']

# Define preprocessing for numeric columns (impute missing values and scale)
numeric_features = X.select_dtypes(include=['int64', 'float64']).columns
numeric_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='median')),
    ('scaler', StandardScaler())])

# Define preprocessing for categorical columns (impute missing values and one-hot encode)
categorical_features = X.select_dtypes(include=['object']).columns
categorical_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('onehot', OneHotEncoder(handle_unknown='ignore'))])

# Combine preprocessing steps
preprocessor = ColumnTransformer(
    transformers=[
        ('num', numeric_transformer, numeric_features),
        ('cat', categorical_transformer, categorical_features)])

# Define the model
model = Pipeline(steps=[('preprocessor', preprocessor),
                        ('regressor', RandomForestRegressor(n_estimators=100, random_state=42))])

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Train the model
model.fit(X_train, y_train)

# Evaluate the model
y_pred = model.predict(X_test)
mae = mean_absolute_error(y_test, y_pred)
print(f'Mean Absolute Error: {mae}')

# Save the model
joblib.dump(model, 'breast_cancer_model1.pkl')




In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_absolute_error, mean_squared_error
import joblib
import numpy as np

# Load the data
data = pd.read_csv('train.csv')

# Preprocess the data
# Separate features and target variable
X = data.drop(columns=['metastatic_diagnosis_period'])
y = data['metastatic_diagnosis_period']

# Define preprocessing for numeric columns (impute missing values and scale)
numeric_features = X.select_dtypes(include=['int64', 'float64']).columns
numeric_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='median')),
    ('scaler', StandardScaler())])

# Define preprocessing for categorical columns (impute missing values and one-hot encode)
categorical_features = X.select_dtypes(include=['object']).columns
categorical_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('onehot', OneHotEncoder(handle_unknown='ignore'))])

# Combine preprocessing steps
preprocessor = ColumnTransformer(
    transformers=[
        ('num', numeric_transformer, numeric_features),
        ('cat', categorical_transformer, categorical_features)])

# Define the model pipeline
pipeline = Pipeline(steps=[('preprocessor', preprocessor),
                           ('regressor', RandomForestRegressor(random_state=42))])

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Hyperparameter tuning using GridSearchCV
param_grid = {
    'regressor__n_estimators': [100, 200, 300],
    'regressor__max_depth': [None, 10, 20, 30],
    'regressor__min_samples_split': [2, 5, 10],
    'regressor__min_samples_leaf': [1, 2, 4],
    'regressor__bootstrap': [True, False]
}

grid_search = GridSearchCV(pipeline, param_grid, cv=5, scoring='neg_mean_absolute_error', n_jobs=-1)
grid_search.fit(X_train, y_train)

# Get the best model
best_model = grid_search.best_estimator_

# Evaluate the model
y_pred = best_model.predict(X_test)
mae = mean_absolute_error(y_test, y_pred)
rmse = np.sqrt(mean_squared_error(y_test, y_pred))
print(f'Mean Absolute Error: {mae}')
print(f'Root Mean Squared Error: {rmse}')

# Save the best model
joblib.dump(best_model, 'breast_cancer_model_1.pkl')


In [5]:
import pandas as pd
import joblib

# Load the trained model
model = joblib.load('breast_cancer_model.pkl')

# Load the test data
test_data = pd.read_csv('test.csv')

# Preprocess the test data
# Ensure the test data is preprocessed in the same way as the training data
# (The model pipeline already includes preprocessing steps)

# Make predictions
test_predictions = model.predict(test_data)

# Save the predictions to a new CSV file
predictions_df = pd.DataFrame({
    'patient_id': test_data['patient_id'],
    'metastatic_diagnosis_period': test_predictions
})

predictions_df.to_csv('test_12.csv', index=False)

print("Predictions saved to 'test_1.csv'")


Predictions saved to 'test_predictions.csv'


In [7]:
import pandas as pd
import joblib
from sklearn.metrics import mean_squared_error
import numpy as np

# Load the trained model
model = joblib.load('breast_cancer_model1.pkl')

# Load the test data
test_data = pd.read_csv('test.csv')

# Preprocess the test data
# Ensure the test data is preprocessed in the same way as the training data
# (The model pipeline already includes preprocessing steps)

# Make predictions
test_predictions = model.predict(test_data)

# Save the predictions to a new CSV file
predictions_df = pd.DataFrame({
    'patient_id': test_data['patient_id'],
    'metastatic_diagnosis_period': test_predictions
})

predictions_df.to_csv('test_1.csv', index=False)

print("Predictions saved to 'test_predictions.csv'")

# Assuming you have a validation dataset with true values
# Load the validation dataset
validation_data = pd.read_csv('train.csv')

# Preprocess the validation data (same as training data)
# Ensure the validation data is preprocessed in the same way as the training data
# (The model pipeline already includes preprocessing steps)

# Extract true values
true_values = validation_data['metastatic_diagnosis_period']

# Make predictions on the validation data
validation_predictions = model.predict(validation_data)

# Calculate RMSE
rmse = np.sqrt(mean_squared_error(true_values, validation_predictions))
print(f'Root Mean Squared Error: {rmse}')


KeyError: "['Unnamed: 44'] not in index"

In [4]:
import pandas as pd
import joblib
from sklearn.metrics import mean_squared_error
import numpy as np

# Load the trained model
model = joblib.load('breast_cancer_model1.pkl')

# Load the test data
test_data = pd.read_csv('test.csv')

# Ensure the test data is preprocessed in the same way as the training data
# (The model pipeline already includes preprocessing steps)
# Make predictions on the test data
test_predictions = model.predict(test_data)

# Save the predictions to a new CSV file
predictions_df = pd.DataFrame({
    'patient_id': test_data['patient_id'],
    'metastatic_diagnosis_period': test_predictions
})

predictions_df.to_csv('test_1.csv', index=False)
print("Predictions saved to 'test_1.csv'")

# Assuming you have a validation dataset with true values
# Load the validation dataset
validation_data = pd.read_csv('train.csv')

# Separate features and target variable for validation data
X_validation = validation_data.drop(columns=['metastatic_diagnosis_period'])
true_values = validation_data['metastatic_diagnosis_period']

# Make predictions on the validation data
validation_predictions = model.predict(X_validation)

# Calculate RMSE
rmse = np.sqrt(mean_squared_error(true_values, validation_predictions))
print(f'Root Mean Squared Error: {rmse}')


KeyError: "['Unnamed: 44'] not in index"