# In [None]:
import numpy as np
import pandas as pd
from sklearn.ensemble import VotingClassifier
from xgboost import XGBClassifier
from catboost import CatBoostClassifier
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.model_selection import RandomizedSearchCV, train_test_split

# Input spreadsheets for the competition; both live in the same folder,
# so the folder is spelled once and the file names are appended to it.
_data_dir = 'C:/Users/DELL/Desktop/IML/Kaggle Competition 1/Data/New Format'
train_file_path = _data_dir + '/train.xlsx'
test_file_path = _data_dir + '/test.xlsx'

# Function to preprocess data


def _make_one_hot_encoder():
    """Return a dense, drop-first OneHotEncoder that works across scikit-learn versions."""
    try:
        # scikit-learn >= 1.2 names the dense-output flag `sparse_output`; the
        # old `sparse` keyword was removed in 1.4.  `handle_unknown='ignore'`
        # makes categories that appear only at transform time encode as all
        # zeros instead of raising, which matters once the fitted encoder is
        # reused on the test set.
        return OneHotEncoder(drop='first', sparse_output=False,
                             handle_unknown='ignore')
    except TypeError:
        # Older scikit-learn does not know `sparse_output`: fall back to the
        # original configuration unchanged.
        return OneHotEncoder(drop='first', sparse=False)


def _build_preprocessor(X):
    """Create an unfitted ColumnTransformer for the columns present in ``X``."""
    numerical_columns = X.select_dtypes(include=np.number).columns.tolist()
    categorical_columns = X.select_dtypes(include='object').columns.tolist()

    # Numeric features: fill gaps with the column median.
    numerical_transformer = Pipeline([
        ('imputer', SimpleImputer(strategy='median')),
    ])

    # Categorical features: fill gaps with the mode, then one-hot encode.
    categorical_transformer = Pipeline([
        ('imputer', SimpleImputer(strategy='most_frequent')),
        ('onehot', _make_one_hot_encoder()),
    ])

    # NOTE: columns that are neither numeric nor object dtype are not listed
    # here, so ColumnTransformer's default remainder='drop' discards them.
    return ColumnTransformer(
        transformers=[
            ('num', numerical_transformer, numerical_columns),
            ('cat', categorical_transformer, categorical_columns),
        ])


def preprocess_data(data, is_train=True, *, preprocessor=None,
                    return_preprocessor=False):
    """Impute and one-hot encode a raw data frame into a feature matrix.

    Args:
        data: DataFrame of raw records. When ``is_train`` is true it must
            contain the ``hospital_death`` target column.
        is_train: If true, split ``hospital_death`` off as the target;
            otherwise every column of ``data`` is treated as a feature.
        preprocessor: Optional, already fitted ColumnTransformer. When given,
            it is only applied (``transform``) so that the data is encoded
            exactly like the data it was fitted on. When ``None`` (default),
            a new preprocessor is built from ``data`` and fitted on it.
        return_preprocessor: If true, also return the preprocessor that was
            used, so it can be passed back in for another data set.

    Returns:
        ``(X, y)`` or, when ``return_preprocessor`` is true,
        ``(X, y, preprocessor)``. ``X`` is the transformed feature matrix and
        ``y`` is the target column (``None`` when ``is_train`` is false).
    """
    # Separate the target variable (hospital_death) from features
    if is_train:
        X = data.drop(columns=['hospital_death'])
        y = data['hospital_death']
    else:
        X = data.copy()
        y = None

    if preprocessor is None:
        # No fitted preprocessor supplied: learn imputation values and
        # category sets from this data.
        preprocessor = _build_preprocessor(X)
        X = preprocessor.fit_transform(X)
    else:
        # Reuse statistics learned elsewhere (normally the training set) so
        # the output columns line up with what the models were trained on.
        X = preprocessor.transform(X)

    if return_preprocessor:
        return X, y, preprocessor
    return X, y


# Read the training and test data
train_data = pd.read_excel(train_file_path)
test_data = pd.read_excel(test_file_path)

# RecordID is only used below as the submission key, so it is kept out of the
# feature matrix (assumes it is a pure row identifier).  errors='ignore'
# tolerates a file that does not carry the column.
ID_COLUMN = 'RecordID'

# Fit the preprocessing on the training data only, then apply that same fitted
# preprocessor to the test data.  Fitting a second preprocessor on the test
# set would impute with different statistics and could produce a different
# set or order of one-hot columns than the models were trained on.
X_train, y_train, fitted_preprocessor = preprocess_data(
    train_data.drop(columns=[ID_COLUMN], errors='ignore'),
    is_train=True, return_preprocessor=True)
X_test, _ = preprocess_data(
    test_data.drop(columns=[ID_COLUMN], errors='ignore'),
    is_train=False, preprocessor=fitted_preprocessor)

# Define XGBoost and CatBoost classifiers with hyperparameter tuning using RandomizedSearchCV
xgb_classifier = XGBClassifier(
    n_estimators=100,
    learning_rate=0.1,
    random_state=42
)

catboost_classifier = CatBoostClassifier(
    verbose=100,
    random_seed=42
)

# Define hyperparameter grids for tuning (you can adjust these as needed)
param_grid_xgb = {
    'n_estimators': [100, 200, 300],
    'max_depth': [3, 4, 5],
    'learning_rate': [0.01, 0.1, 0.2]
}

param_grid_catboost = {
    'iterations': [500, 1000],
    'depth': [6, 8, 10],
    'learning_rate': [0.05, 0.1, 0.2]
}

# Perform hyperparameter tuning for each classifier
# (10 sampled configurations each, 5-fold CV, scored by ROC AUC)
randomized_search_xgb = RandomizedSearchCV(xgb_classifier, param_distributions=param_grid_xgb,
                                           cv=5, scoring='roc_auc', n_jobs=-1, n_iter=10, random_state=42)
randomized_search_xgb.fit(X_train, y_train)

randomized_search_catboost = RandomizedSearchCV(catboost_classifier, param_distributions=param_grid_catboost,
                                                cv=5, scoring='roc_auc', n_jobs=-1, n_iter=10, random_state=42)
randomized_search_catboost.fit(X_train, y_train)

# Get the best estimators
best_xgb_classifier = randomized_search_xgb.best_estimator_
best_catboost_classifier = randomized_search_catboost.best_estimator_

# Create a VotingClassifier
voting_classifier = VotingClassifier(estimators=[
    ('xgb', best_xgb_classifier),
    ('catboost', best_catboost_classifier)
], voting='soft')  # 'soft' voting averages the models' predicted class probabilities

# Fit the VotingClassifier to the training data
voting_classifier.fit(X_train, y_train)

# Make predictions on the test data (probability of the positive class)
test_probabilities = voting_classifier.predict_proba(X_test)[:, 1]

# Create a DataFrame with the test predictions and RecordID
test_predictions_df = pd.DataFrame(
    {"RecordID": test_data[ID_COLUMN], "hospital_death": test_probabilities})

# Save the predictions to a CSV file
test_predictions_df.to_csv("entry 85.csv", index=False)
