# In [None]:
import pandas as pd
import numpy as np
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression

# Load the test features and the submission template from the working directory.
test_features = pd.read_csv('test_set_features.csv')
# NOTE(review): submission_format is loaded but not referenced again in this
# section; presumably it describes the expected output layout -- confirm.
submission_format = pd.read_csv('submission_format.csv')

# Split the columns by dtype: object columns are treated as categorical, all
# others as numerical. 'respondent_id' is removed from the numerical set
# (presumably because it is an identifier, not a feature); Index.drop raises
# KeyError if that column is missing or was read as an object column.
categorical_cols = test_features.select_dtypes(include=['object']).columns
numerical_cols = test_features.select_dtypes(exclude=['object']).columns.drop('respondent_id')

# Missing values: median for numerical columns, most frequent value for
# categorical columns.
imputer_num = SimpleImputer(strategy='median')
imputer_cat = SimpleImputer(strategy='most_frequent')

# Dense one-hot encoding that ignores categories unseen during fit.
# scikit-learn 1.2 renamed `sparse` to `sparse_output` and 1.4 removed the old
# keyword, so try the current spelling first and fall back for older releases.
try:
    onehot = OneHotEncoder(handle_unknown='ignore', sparse_output=False)
except TypeError:  # scikit-learn < 1.2 does not know `sparse_output`
    onehot = OneHotEncoder(handle_unknown='ignore', sparse=False)

# Pipeline for numerical features: impute, then standardize.
num_pipeline = Pipeline([
    ('imputer', imputer_num),
    ('scaler', StandardScaler())
])

# Pipeline for categorical features: impute, then one-hot encode.
cat_pipeline = Pipeline([
    ('imputer', imputer_cat),
    ('onehot', onehot)
])

# Route each column group through its own pipeline.
preprocessor = ColumnTransformer([
    ('num', num_pipeline, numerical_cols),
    ('cat', cat_pipeline, categorical_cols)
])

# Preprocess the data.
# NOTE(review): the preprocessor is *fit* on the test features here. For real
# predictions it should be fit on the training features and only
# `transform` applied to the test set, otherwise the imputation values,
# scaling statistics and one-hot columns will not match what a trained model
# expects. No training data is visible in this section, so this is left as is.
test_features_preprocessed = preprocessor.fit_transform(test_features)

# Assuming trained models exist, they would be loaded here.
# For demonstration, mock predictions are generated instead.
# Replace this part with loading your actual trained models, e.g.:
# model_xyz = joblib.load('model_xyz.pkl')
# model_seasonal = joblib.load('model_seasonal.pkl')

# Mock predictions: one uniform value in [0, 1) per preprocessed row and per
# target. A dedicated, seeded generator makes submission.csv identical across
# runs without modifying NumPy's global random state.
mock_rng = np.random.default_rng(42)
n_rows = test_features_preprocessed.shape[0]
mock_predictions_xyz = mock_rng.random(n_rows)
mock_predictions_seasonal = mock_rng.random(n_rows)

# Assemble the submission: respondent ids from the test set next to the two
# prediction columns.
submission = pd.DataFrame({
    'respondent_id': test_features['respondent_id'],
    'xyz_vaccine': mock_predictions_xyz,
    'seasonal_vaccine': mock_predictions_seasonal
})

# Save the submission file (without the DataFrame index column).
submission.to_csv('submission.csv', index=False)

print("Submission file created:")
print(submission.head())

# Sample output generation code: a five-row example of the submission layout.
respondent_ids = [1, 2, 3, 4, 5]

# Seed the global generator so the sample is reproducible, then draw both
# probability columns in a single call. The 2 x n array is filled row by row,
# so row 0 holds the first n draws (xyz_vaccine) and row 1 the next n
# (seasonal_vaccine).
np.random.seed(42)
xyz_vaccine_probs, seasonal_vaccine_probs = np.random.rand(2, len(respondent_ids))

# Start from the id column and append the two probability columns in order.
sample_submission = pd.DataFrame({'respondent_id': respondent_ids}).assign(
    xyz_vaccine=xyz_vaccine_probs,
    seasonal_vaccine=seasonal_vaccine_probs,
)

# Write the sample file without the DataFrame index, then echo it.
sample_submission.to_csv('sample_submission.csv', index=False)

print("Sample submission file created:", sample_submission, sep="\n")
