In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.neighbors import KNeighborsClassifier
from sklearn.preprocessing import RobustScaler, OneHotEncoder
from sklearn.impute import KNNImputer
from sklearn.metrics import roc_auc_score
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer

# Read the training and test data
df = pd.read_csv('train.csv')

# Separate the target variable (hospital_death) from features
X = df.drop(columns=['hospital_death'])
y = df['hospital_death']

In [13]:


# Define numerical and categorical columns
numerical_columns = X.select_dtypes(include=np.number).columns
categorical_columns = X.select_dtypes(include='object').columns

# Create transformers for preprocessing
numerical_transformer = Pipeline([
    ('imputer', KNNImputer(n_neighbors=5)),
    ('scaler', RobustScaler())
])

categorical_transformer = Pipeline([
    ('onehot', OneHotEncoder(drop='first', sparse=False))
])

# Use ColumnTransformer to apply transformations to respective columns
preprocessor = ColumnTransformer(
    transformers=[
        ('num', numerical_transformer, numerical_columns),
        ('cat', categorical_transformer, categorical_columns)
    ])

# Create a K-Nearest Neighbors (KNN) classifier
knn_classifier = KNeighborsClassifier()

# Create a pipeline that includes preprocessing and the KNN classifier
pipeline = Pipeline([
    ('preprocessor', preprocessor),
    ('classifier', knn_classifier)
])

# Define a grid of hyperparameters to search
param_grid = {
    'classifier__n_neighbors': [1050],  # Example values, adjust as needed
    'classifier__weights': ['distance'],
    
    # Add more hyperparameters to search here
}

# Create GridSearchCV instance
grid_search = GridSearchCV(pipeline, param_grid, cv=5, scoring='roc_auc', n_jobs=2)

# Split the data into training (80%) and validation (20%) sets
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=10)


In [14]:

# Fit the grid search to the training data
grid_search.fit(X_train, y_train)

# Get the best estimator and its parameters
best_pipeline = grid_search.best_estimator_
best_params = grid_search.best_params_

# Fit the best estimator on the training data
best_pipeline.fit(X_train, y_train)




In [15]:
md_probs = best_pipeline.predict_proba(X_val)
md_probs = md_probs[:,1]
md_auc = roc_auc_score(y_val, md_probs)
md_auc

0.848242454605771

In [9]:
df1 = pd.read_csv('test.csv')

In [10]:

# Make predictions on the validation set
# y_pred = best_pipeline.predict(X_val)

# Now you can use the best pipeline to make predictions on the test data
test_predictions = best_pipeline.predict(df1)

# Get probability estimates for the positive class (hospital death)
test_probabilities = best_pipeline.predict_proba(df1)[:, 1]

# Create a DataFrame with the test predictions and RecordID
test_predictions_df = pd.DataFrame({"RecordID": df1["RecordID"], "hospital_death": test_probabilities})

# Save the predictions to a CSV file
test_predictions_df.to_csv("garbar.csv", index=False)