In [None]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.neighbors import KNeighborsClassifier
from sklearn.preprocessing import RobustScaler, OneHotEncoder
from sklearn.impute import KNNImputer
from sklearn.metrics import roc_auc_score
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer

# Read the training and test data
df = pd.read_csv('train.csv')

# Separate the target variable (hospital_death) from features
X = df.drop(columns=['hospital_death'])
y = df['hospital_death']

In [None]:


# Define numerical and categorical columns
numerical_columns = X.select_dtypes(include=np.number).columns
categorical_columns = X.select_dtypes(include='object').columns

# Create transformers for preprocessing
numerical_transformer = Pipeline([
    ('imputer', KNNImputer(n_neighbors=5)),
    ('scaler', RobustScaler())
])

categorical_transformer = Pipeline([
    ('onehot', OneHotEncoder(drop='first', sparse=False))
])

# Use ColumnTransformer to apply transformations to respective columns
preprocessor = ColumnTransformer(
    transformers=[
        ('num', numerical_transformer, numerical_columns),
        ('cat', categorical_transformer, categorical_columns)
    ])

# Create a K-Nearest Neighbors (KNN) classifier
knn_classifier = KNeighborsClassifier()

# Create a pipeline that includes preprocessing and the KNN classifier
pipeline = Pipeline([
    ('preprocessor', preprocessor),
    ('classifier', knn_classifier)
])

# Define a grid of hyperparameters to search
param_grid = {
    'classifier__n_neighbors': [1050],  # Example values, adjust as needed
    'classifier__weights': ['distance'],
    
    # Add more hyperparameters to search here
}

# Create GridSearchCV instance
grid_search = GridSearchCV(pipeline, param_grid, cv=5, scoring='roc_auc', n_jobs=2)

# Split the data into training (80%) and validation (20%) sets
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=10)


In [None]:

# Fit the grid search to the training data
grid_search.fit(X_train, y_train)

# Get the best estimator and its parameters
best_pipeline = grid_search.best_estimator_
best_params = grid_search.best_params_

# Fit the best estimator on the training data
best_pipeline.fit(X_train, y_train)


In [None]:
md_probs = best_pipeline.predict_proba(X_val)
md_probs = md_probs[:,1]
md_auc = roc_auc_score(y_val, md_probs)
md_auc

In [None]:
df1 = pd.read_csv('test.csv')
df1.columns

In [None]:

# Make predictions on the validation set
# y_pred = best_pipeline.predict(X_val)

# Now you can use the best pipeline to make predictions on the test data
test_predictions = best_pipeline.predict(df1)

# Get probability estimates for the positive class (hospital death)
test_probabilities = best_pipeline.predict_proba(df1)[:, 1]

# Create a DataFrame with the test predictions and RecordID
test_predictions_df = pd.DataFrame({"RecordID": df1["RecordID"], "hospital_death": test_probabilities})

# Save the predictions to a CSV file
test_predictions_df.to_csv("garbar.csv", index=False)

In [None]:
import numpy as np
import pandas as pd
from sklearn.ensemble import VotingClassifier, RandomForestClassifier, GradientBoostingClassifier, ExtraTreesClassifier
from xgboost import XGBClassifier
from catboost import CatBoostClassifier
from sklearn.preprocessing import OneHotEncoder, RobustScaler, LabelEncoder,StandardScaler
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer, KNNImputer
from sklearn.model_selection import RandomizedSearchCV, train_test_split, RepeatedKFold, cross_val_score
from numpy import mean
from sklearn.metrics import roc_curve, roc_auc_score

def roc_auc_cv(model, X, y, njobs=2, verbose=2):
    cv = RepeatedKFold(n_splits=10, n_repeats=2, random_state=1)
    predicted_probabilities = cross_val_score(model, X, y, scoring="roc_auc", cv=cv, n_jobs=njobs, verbose=verbose)

    return mean(predicted_probabilities)


def calculate_roc_auc(model, X_test, y_test):
    md_probs = model.predict_proba(X_test)
    md_probs = md_probs[:, 1]
    md_auc = roc_auc_score(y_test, md_probs)
    return md_auc

# Function to preprocess data
def preprocess_data(data, is_train=True):
    # Separate the target variable (hospital_death) from features
    if is_train:
        X = data.drop(columns=['hospital_death'])
        y = data['hospital_death']
    else:
        X = data.copy()
        y = None

    # Create a list of numerical and categorical columns
    numerical_columns = X.select_dtypes(include=np.number).columns.tolist()
    categorical_columns = X.select_dtypes(include='object').columns.tolist()

    # Create transformers for preprocessing
    numerical_transformer = Pipeline([
        ('imputer', SimpleImputer(strategy='drop')),
        # ('imputer', KNNImputer(n_neighbors=5)),
        ('scaler', RobustScaler()) ,
        # ('scaler1', StandardScaler()) ,
        

    ])

    categorical_transformer = Pipeline([
        # ('imputer', SimpleImputer(strategy='most_frequent')),
        ('onehot', OneHotEncoder(drop='first', sparse=False))
        # ('onehot', LabelEncoder())
        # ('imputer', SimpleImputer(strategy='most_frequent')),
    ])

    # Use ColumnTransformer to apply transformations to respective columns
    preprocessor = ColumnTransformer(
        transformers=[
            ('num', numerical_transformer, numerical_columns),
            ('cat', categorical_transformer, categorical_columns)
        ])

    # Apply preprocessing to the data
    X = preprocessor.fit_transform(X)

    return X, y



# Read the training and test data
train_data = pd.read_csv("train.csv")
test_data = pd.read_csv("test.csv")

# Preprocess the training and test data
X, y = preprocess_data(train_data, is_train=True)
X_test, _ = preprocess_data(test_data, is_train=False)
from sklearn.model_selection import train_test_split

X_train, X_valid, y_train, y_valid = train_test_split(X, y, test_size=0.2, random_state=42)


random_forest = RandomForestClassifier(
    n_estimators=700,
    max_depth=7, 
    random_state=0,
    n_jobs=3, min_samples_leaf=2, 
    min_samples_split=2, 
    verbose=2
)

gradient_boosting = GradientBoostingClassifier(
    random_state=42, 
    max_depth=1, 
    n_estimators=500, 
    learning_rate=0.3, 
    verbose=2
    )

xgb_classifier = XGBClassifier(
    n_estimators=800,
    max_depth=4,
    learning_rate=0.038,
    min_child_weight=1,
    colsample_bytree=0.8,
    alpha=0.1,
    random_state=42,
    gamma=0.1,
)
xgb_classifier2 = XGBClassifier(
    n_estimators=800,
    max_depth=5,
    learning_rate=0.038,
    min_child_weight=1,
    colsample_bytree=0.8,
    alpha=0.1,
    random_state=42,
    gamma=0.1,
)

catboost_classifier = CatBoostClassifier(
    iterations=500,      
    depth=5,            
    learning_rate=0.0950,   
    loss_function='Logloss',  
    eval_metric='AUC',  
    random_seed=42,
    verbose=20,
    l2_leaf_reg=2,
    border_count=80,
    leaf_estimation_iterations=30,
)
catboost_classifier2 = CatBoostClassifier(
    iterations=500,      
    depth=5,            
    learning_rate=0.0950,   
    loss_function='Logloss',  
    eval_metric='AUC',  
    random_seed=43,
    verbose=20,
    l2_leaf_reg=2,
    border_count=80,
    leaf_estimation_iterations=25,
)


# Define hyperparameter grids for tuning (you can adjust these as needed)
param_grid_xgb = {
    'n_estimators': [100, 200, 300],
    'max_depth': [3, 4, 5],
    'learning_rate': [0.01, 0.1, 0.2]
}

param_grid_catboost = {
    'iterations': [500, 1000],
    'depth': [6, 8, 10],
    'learning_rate': [0.05, 0.1, 0.2]
}

# # Perform hyperparameter tuning for each classifier
# randomized_search_xgb = RandomizedSearchCV(xgb_classifier, param_distributions=param_grid_xgb, 
#                                        cv=5, scoring='roc_auc', n_jobs=-1, n_iter=10, random_state=42)
# randomized_search_xgb.fit(X_train, y_train)

# randomized_search_catboost = RandomizedSearchCV(catboost_classifier, param_distributions=param_grid_catboost, 
#                                        cv=5, scoring='roc_auc', n_jobs=-1, n_iter=10, random_state=42)
# randomized_search_catboost.fit(X_train, y_train)

# # Get the best estimators
# best_xgb_classifier = randomized_search_xgb.best_estimator_
# best_catboost_classifier = randomized_search_catboost.best_estimator_
best_xgb_classifier = xgb_classifier
best_catboost_classifier = catboost_classifier

# Create a VotingClassifier
voting_classifier = VotingClassifier(estimators=[
    ('xgb', best_xgb_classifier),
    ('catboost', best_catboost_classifier),
    
    # ('gradient', gradient_boosting),
], voting='soft')
    
    
from sklearn.ensemble import StackingClassifier

# Define the base estimators
estimators = [
    ('xgb', best_xgb_classifier),
    # ('catboost', best_catboost_classifier),
    # ('catboost2', catboost_classifier2),
    # ('xgb2', xgb_classifier2),
    # ('randomforest', random_forest)
]

# Define the stacking classifier
# stacking_classifier = StackingClassifier(
#     estimators=estimators,
#     # final_estimator=GradientBoostingClassifier(random_state=42, max_depth=1, n_estimators=500, learning_rate=0.3, verbose=2),
#     final_estimator=catboost_classifier,
#     stack_method='predict_proba',
#     passthrough=True,
#     n_jobs=3,
#     verbose=2
# )


extratrees = ExtraTreesClassifier(
        n_estimators=1000,
        max_depth=35,
        min_samples_split=2,
        min_samples_leaf=1,
        random_state=40,
        n_jobs=3,
        verbose=2,
        # max_samples=0.9,
        # bootstrap=True
    )

extratrees.fit(X_train, y_train)
calculate_roc_auc(extratrees, X_valid, y_valid)

# stacking_classifier.fit(X_train, y_train)
# calculate_roc_auc(stacking_classifier, X_valid, y_valid)

# voting_classifier.fit(X_train, y_train)
# calculate_roc_auc(voting_classifier, X_valid, y_valid)

# roc_auc_cv(stacking_classifier, X_train, y_train, njobs=2, verbose=2)



In [None]:

random_forest = RandomForestClassifier(
    n_estimators=700,
    max_depth=7, 
    random_state=0,
    n_jobs=3, min_samples_leaf=2, 
    min_samples_split=2, 
    verbose=2
)

gradient_boosting = GradientBoostingClassifier(
    random_state=42, 
    max_depth=1, 
    n_estimators=500, 
    learning_rate=0.3, 
    verbose=2
    )

xgb_classifier = XGBClassifier(
    n_estimators=800,
    max_depth=4,
    learning_rate=0.038,
    min_child_weight=1,
    colsample_bytree=0.8,
    alpha=0.1,
    random_state=42,
    gamma=0.1,
)
xgb_classifier2 = XGBClassifier(
    n_estimators=800,
    max_depth=5,
    learning_rate=0.038,
    min_child_weight=1,
    colsample_bytree=0.8,
    alpha=0.1,
    random_state=42,
    gamma=0.1,
)

catboost_classifier = CatBoostClassifier(
    iterations=500,      
    depth=5,            
    learning_rate=0.0950,   
    loss_function='Logloss',  
    eval_metric='AUC',  
    random_seed=42,
    verbose=20,
    l2_leaf_reg=2,
    border_count=80,
    leaf_estimation_iterations=30,
)
catboost_classifier2 = CatBoostClassifier(
    iterations=500,      
    depth=5,            
    learning_rate=0.0950,   
    loss_function='Logloss',  
    eval_metric='AUC',  
    random_seed=43,
    verbose=20,
    l2_leaf_reg=2,
    border_count=80,
    leaf_estimation_iterations=25,
)


# Define hyperparameter grids for tuning (you can adjust these as needed)
param_grid_xgb = {
    'n_estimators': [100, 200, 300],
    'max_depth': [3, 4, 5],
    'learning_rate': [0.01, 0.1, 0.2]
}

param_grid_catboost = {
    'iterations': [500, 1000],
    'depth': [6, 8, 10],
    'learning_rate': [0.05, 0.1, 0.2]
}

# # Perform hyperparameter tuning for each classifier
# randomized_search_xgb = RandomizedSearchCV(xgb_classifier, param_distributions=param_grid_xgb, 
#                                        cv=5, scoring='roc_auc', n_jobs=-1, n_iter=10, random_state=42)
# randomized_search_xgb.fit(X_train, y_train)

# randomized_search_catboost = RandomizedSearchCV(catboost_classifier, param_distributions=param_grid_catboost, 
#                                        cv=5, scoring='roc_auc', n_jobs=-1, n_iter=10, random_state=42)
# randomized_search_catboost.fit(X_train, y_train)

# # Get the best estimators
# best_xgb_classifier = randomized_search_xgb.best_estimator_
# best_catboost_classifier = randomized_search_catboost.best_estimator_
best_xgb_classifier = xgb_classifier
best_catboost_classifier = catboost_classifier

# Create a VotingClassifier
voting_classifier = VotingClassifier(estimators=[
    ('xgb', best_xgb_classifier),
    ('catboost', best_catboost_classifier),
    
    # ('gradient', gradient_boosting),
], voting='soft')
    
    
from sklearn.ensemble import StackingClassifier

# Define the base estimators
estimators = [
    ('xgb', best_xgb_classifier),
    # ('catboost', best_catboost_classifier),
    # ('catboost2', catboost_classifier2),
    # ('xgb2', xgb_classifier2),
    # ('randomforest', random_forest)
]

# Define the stacking classifier
# stacking_classifier = StackingClassifier(
#     estimators=estimators,
#     # final_estimator=GradientBoostingClassifier(random_state=42, max_depth=1, n_estimators=500, learning_rate=0.3, verbose=2),
#     final_estimator=catboost_classifier,
#     stack_method='predict_proba',
#     passthrough=True,
#     n_jobs=3,
#     verbose=2
# )


extratrees = ExtraTreesClassifier(
        n_estimators=1000,
        max_depth=35,
        min_samples_split=2,
        min_samples_leaf=1,
        random_state=40,
        n_jobs=3,
        verbose=2,
        # max_samples=0.9,
        # bootstrap=True
    )

catboost_classifier.fit(X_train, y_train)
calculate_roc_auc(catboost_classifier, X_valid, y_valid)

# stacking_classifier.fit(X_train, y_train)
# calculate_roc_auc(stacking_classifier, X_valid, y_valid)

# voting_classifier.fit(X_train, y_train)
# calculate_roc_auc(voting_classifier, X_valid, y_valid)

# roc_auc_cv(stacking_classifier, X_train, y_train, njobs=2, verbose=2)


In [108]:
import numpy as np
import pandas as pd
from sklearn.ensemble import VotingClassifier, RandomForestClassifier, ExtraTreesClassifier
from xgboost import XGBClassifier
from catboost import CatBoostClassifier
from sklearn.preprocessing import OneHotEncoder, RobustScaler
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.decomposition import PCA  # Import PCA
from sklearn.model_selection import train_test_split, RepeatedKFold, cross_val_score
from numpy import mean
from sklearn.metrics import roc_curve, roc_auc_score

def roc_auc_cv(model, X, y, njobs=2, verbose=2):
    cv = RepeatedKFold(n_splits=10, n_repeats=2, random_state=1)
    predicted_probabilities = cross_val_score(model, X, y, scoring="roc_auc", cv=cv, n_jobs=njobs, verbose=verbose)
    return mean(predicted_probabilities)

def calculate_roc_auc(model, X_test, y_test):
    md_probs = model.predict_proba(X_test)
    md_probs = md_probs[:, 1]
    md_auc = roc_auc_score(y_test, md_probs)
    return md_auc

# Function to preprocess data with PCA
def preprocess_data(data, is_train=True, num_pca_components=None):
    # Separate the target variable (hospital_death) from features
    if is_train:
        X = data.drop(columns=['hospital_death'])
        y = data['hospital_death']
    else:
        X = data.copy()
        y = None

    # Create a list of numerical and categorical columns
    numerical_columns = X.select_dtypes(include=np.number).columns.tolist()
    categorical_columns = X.select_dtypes(include='object').columns.tolist()

    # Create transformers for preprocessing
    numerical_transformer = Pipeline([
        ('imputer', SimpleImputer(strategy='median')),
    ])

    categorical_transformer = Pipeline([
        ('imputer', SimpleImputer(strategy='most_frequent')),
        ('onehot', OneHotEncoder(drop='first', sparse=False))
    ])

    # Use ColumnTransformer to apply transformations to respective columns
    preprocessor = ColumnTransformer(
        transformers=[
            ('num', numerical_transformer, numerical_columns),
            ('cat', categorical_transformer, categorical_columns)
        ])

    # Apply preprocessing to the data
    X = preprocessor.fit_transform(X)

    # Apply PCA for dimensionality reduction (if num_pca_components is specified)
    if num_pca_components:
        pca = PCA(n_components=num_pca_components)
        X = pca.fit_transform(X)

    return X, y

# Read the training and test data
train_data = pd.read_csv("train.csv")
test_data = pd.read_csv("test.csv")

num_pca  = 35
# Preprocess the training and test data with PCA
X, y = preprocess_data(train_data, is_train=True) 
X_test, _ = preprocess_data(test_data, is_train=False)

# Split the data into training and validation sets
X_train, X_valid, y_train, y_valid = train_test_split(X, y, test_size=0.2, random_state=42)


In [182]:
from sklearn.ensemble import AdaBoostClassifier

model = CatBoostClassifier(
    iterations=450,      
    depth=5,            
    learning_rate=0.090,   
    loss_function='Logloss',  
    eval_metric='AUC',  
    random_seed=42,
    verbose=20,
    l2_leaf_reg=1,
    border_count=80,
    leaf_estimation_iterations=1,
)

model.fit(X_train, y_train)
calculate_roc_auc(model, X_valid, y_valid)
roc_auc_cv(model, X, y, njobs=2, verbose=2)


0:	total: 6.94ms	remaining: 3.12s
20:	total: 91.9ms	remaining: 1.88s
40:	total: 162ms	remaining: 1.61s
60:	total: 232ms	remaining: 1.48s
80:	total: 300ms	remaining: 1.37s
100:	total: 367ms	remaining: 1.27s
120:	total: 431ms	remaining: 1.17s
140:	total: 497ms	remaining: 1.09s
160:	total: 563ms	remaining: 1.01s
180:	total: 630ms	remaining: 937ms
200:	total: 701ms	remaining: 868ms
220:	total: 770ms	remaining: 798ms
240:	total: 837ms	remaining: 726ms
260:	total: 906ms	remaining: 656ms
280:	total: 974ms	remaining: 586ms
300:	total: 1.04s	remaining: 513ms
320:	total: 1.1s	remaining: 442ms
340:	total: 1.17s	remaining: 373ms
360:	total: 1.24s	remaining: 305ms
380:	total: 1.3s	remaining: 236ms
400:	total: 1.37s	remaining: 167ms
420:	total: 1.43s	remaining: 98.8ms
440:	total: 1.5s	remaining: 30.6ms
449:	total: 1.53s	remaining: 0us


[Parallel(n_jobs=2)]: Using backend LokyBackend with 2 concurrent workers.


0:	total: 8.87ms	remaining: 3.98s
0:	total: 6.96ms	remaining: 3.13s
20:	total: 115ms	remaining: 2.35s
20:	total: 113ms	remaining: 2.32s
40:	total: 216ms	remaining: 2.15s
40:	total: 222ms	remaining: 2.21s
60:	total: 317ms	remaining: 2.02s
60:	total: 327ms	remaining: 2.09s
80:	total: 425ms	remaining: 1.94s
80:	total: 420ms	remaining: 1.91s
100:	total: 528ms	remaining: 1.82s
100:	total: 522ms	remaining: 1.8s
120:	total: 620ms	remaining: 1.69s
120:	total: 626ms	remaining: 1.7s
140:	total: 726ms	remaining: 1.59s
140:	total: 720ms	remaining: 1.58s
160:	total: 817ms	remaining: 1.47s
160:	total: 816ms	remaining: 1.46s
180:	total: 913ms	remaining: 1.36s
180:	total: 913ms	remaining: 1.36s
200:	total: 1.01s	remaining: 1.25s
200:	total: 1.01s	remaining: 1.25s
220:	total: 1.1s	remaining: 1.14s
220:	total: 1.11s	remaining: 1.15s
240:	total: 1.2s	remaining: 1.04s
240:	total: 1.21s	remaining: 1.05s
260:	total: 1.3s	remaining: 942ms
260:	total: 1.3s	remaining: 945ms
280:	total: 1.4s	remaining: 842ms
28

[Parallel(n_jobs=2)]: Done  20 out of  20 | elapsed:   31.1s finished


0.8825966226456682

In [192]:
X.shape

(50000, 95)

In [None]:
from sklearn.ensemble import BaggingClassifier
from catboost import CatBoostClassifier

# Define the base estimator
base_estimator = CatBoostClassifier(
    iterations=450,      
    depth=5,            
    learning_rate=0.090,   
    loss_function='Logloss',  
    eval_metric='AUC',  
    random_seed=42,
    verbose=20,
    l2_leaf_reg=1,
    border_count=80,
    leaf_estimation_iterations=1,
)

# Define the bagging classifier
bagging_classifier = BaggingClassifier(
    base_estimator=base_estimator,
    n_estimators=10,
    max_samples=0.8,
    max_features=0.8,
    bootstrap=True,
    bootstrap_features=True,
    random_state=42,
    n_jobs=-1,
    verbose=2
)

# Fit the bagging classifier to the training data
bagging_classifier.fit(X_train, y_train)

# Evaluate the bagging classifier on the validation data
calculate_roc_auc(bagging_classifier, X_valid, y_valid)


In [203]:
import numpy as np
import pandas as pd
from sklearn.ensemble import (
    RandomForestClassifier, 
    GradientBoostingClassifier, 
    AdaBoostClassifier, 
    ExtraTreesClassifier, 
    StackingClassifier
)
# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Define base models
random_forest = RandomForestClassifier(
    random_state=42, 
    n_estimators=1000, 
    max_depth=11, 
    n_jobs=3, 
    verbose=2,
)

gradient_boosting = GradientBoostingClassifier(
    random_state=42,
    n_estimators=700,
    max_depth=1,
    learning_rate=0.2,
    verbose=2,
    
)

adaboost = AdaBoostClassifier(
    n_estimators=1000,
    random_state=42,
    learning_rate=0.1,
)

extra_trees = ExtraTreesClassifier(
    n_estimators=1000,
    max_depth=35,
    min_samples_split=2,
    min_samples_leaf=1,
    random_state=40,
    n_jobs=3,
    verbose=2,
)

xgb = XGBClassifier(
    n_estimators=800,
    max_depth=4,
    learning_rate=0.038,
    min_child_weight=1,
    colsample_bytree=0.8,
    alpha=0.1,
    random_state=42,
    gamma=0.5,
)

catboost = CatBoostClassifier(
    iterations=450,      
    depth=5,            
    learning_rate=0.090,   
    loss_function='Logloss',  
    eval_metric='AUC',  
    random_seed=42,
    verbose=20,
    l2_leaf_reg=1,
    border_count=80,
    leaf_estimation_iterations=1,
)


# Define the stacking classifier
estimators = [
    ('gbm', gradient_boosting),
    # ('adaboost', adaboost),
    # ('extra_trees', extra_trees),
    ('xgb', xgb),
    ('catboost', catboost),
    
]

stacking_classifier = StackingClassifier(
    estimators=estimators,
    final_estimator=random_forest  # Final estimator
)

voting_classifier = VotingClassifier(
    estimators=estimators,
    voting='soft',
    verbose=True,
)

# Fit the stacking classifier on the training data
# voting_classifier.fit(X_train, y_train)
# calculate_roc_auc(voting_classifier, X_test, y_test)
roc_auc_cv(voting_classifier, X, y, njobs=2, verbose=2)

[Parallel(n_jobs=2)]: Using backend LokyBackend with 2 concurrent workers.


      Iter       Train Loss   Remaining Time       Iter       Train Loss   Remaining Time 

         1           0.5366            1.03m
         1           0.5363            1.03m
         2           0.5181            1.01m
         2           0.5180            1.02m
         3           0.4997            1.00m
         3           0.4991            1.01m
         4           0.4872            1.00m
         4           0.4928            1.01m
         5           0.4828           59.92s
         5           0.4820            1.00m
         6           0.4745            1.01m
         6           0.4741            1.01m
         7           0.4689            1.00m
         7           0.4682            1.01m
         8           0.4643           59.73s
         8           0.4638            1.00m
         9           0.4590            1.00m
         9           0.4588            1.00m
        10           0.4552            1.00m
        10           0.4561            1.00m
        

In [197]:
def generate_predictions_for_model(model, test_file, output_file):
    
    df = pd.read_csv(test_file)
    record_ids = df["RecordID"]
    
    # Preprocess the test data
    X_test, _ = preprocess_data(df, is_train=False)
        
    probs = model.predict_proba(X_test)
    probs = probs[:, 1]
    
    # Create a DataFrame for the results
    result = pd.DataFrame({'RecordID': record_ids, 'hospital_death': probs})
    
    # Save the results to a CSV file
    result.to_csv(output_file, index=False, header=["RecordID", "hospital_death"])


In [193]:
X.shape

(50000, 95)

In [201]:
generate_predictions_for_model(voting_classifier, "test.csv", "results105.csv")

[Parallel(n_jobs=3)]: Using backend ThreadingBackend with 3 concurrent workers.
[Parallel(n_jobs=3)]: Done  35 tasks      | elapsed:    0.1s
[Parallel(n_jobs=3)]: Done 156 tasks      | elapsed:    0.2s
[Parallel(n_jobs=3)]: Done 359 tasks      | elapsed:    0.6s
[Parallel(n_jobs=3)]: Done 642 tasks      | elapsed:    1.0s
[Parallel(n_jobs=3)]: Done 1000 out of 1000 | elapsed:    1.5s finished
