In [9]:
import sklearn
import time
import pandas as pd   
from matplotlib import pyplot
from numpy import mean
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier

from sklearn.model_selection import train_test_split, RepeatedKFold, GridSearchCV, cross_val_score
from sklearn.metrics import roc_curve, roc_auc_score
from sklearn.preprocessing import MinMaxScaler, RobustScaler, StandardScaler
from sklearn.impute import KNNImputer, SimpleImputer
from sklearn.preprocessing import LabelEncoder
from sklearn.naive_bayes import GaussianNB, CategoricalNB
import numpy as np



In [10]:

def missing_value_counts(dataframe):
    """Tabulate how many missing values each column of ``dataframe`` holds.

    Parameters
    ----------
    dataframe : pandas.DataFrame
        Frame to inspect.

    Returns
    -------
    pandas.DataFrame
        One row per input column, with the column label under ``'Column'``
        and its null count under ``'Missing_Values_Count'``.
    """
    null_totals = dataframe.isna().sum()
    return pd.DataFrame(
        {
            'Column': null_totals.index,
            'Missing_Values_Count': null_totals.values,
        }
    )

In [11]:
def robust_scale(df):
    """Scale every numeric column of ``df`` with a freshly fitted ``RobustScaler``.

    The scaler is fitted on ``df`` itself, so each call derives its scaling
    statistics from the frame it is given.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame whose numeric columns (``select_dtypes(include=['number'])``)
        are scaled. Non-numeric columns are left untouched.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object that was passed in. The numeric columns are
        overwritten on the caller's frame (no copy is made), so callers that
        still need the unscaled values must copy the frame before calling.
    """
    numerical_columns = df.select_dtypes(include=['number']).columns

    # Nothing to scale: fitting a scaler on a selection with zero columns or
    # zero rows raises, so hand the frame back unchanged instead.
    if len(numerical_columns) == 0 or len(df) == 0:
        return df

    scaler = RobustScaler()
    df[numerical_columns] = scaler.fit_transform(df[numerical_columns])

    return df

In [12]:
def knn_impute_numerical_columns(df, n_neighbors=5):
    """Return a copy of ``df`` with missing numeric values filled by ``KNNImputer``.

    Parameters
    ----------
    df : pandas.DataFrame
        Input frame; it is copied first and never modified.
    n_neighbors : int, default 5
        Forwarded to ``KNNImputer(n_neighbors=...)``.

    Returns
    -------
    pandas.DataFrame
        A new frame in which the numeric columns that contain at least one
        observed value have been imputed. Non-numeric columns, and numeric
        columns that are entirely missing, are returned as they were.
    """
    df_imputed = df.copy()
    numerical_cols = df_imputed.select_dtypes(include=['number']).columns

    # No numeric columns at all: nothing for the imputer to work on.
    if len(numerical_cols) == 0:
        return df_imputed

    # By default KNNImputer drops columns that have no observed value, which
    # would make its output narrower than the selection and break the column
    # assignment below. Impute only the columns that carry some data.
    has_observed = df_imputed[numerical_cols].notna().any(axis=0).to_numpy(dtype=bool)
    imputable_cols = numerical_cols[has_observed]
    if len(imputable_cols) == 0:
        return df_imputed

    imputer = KNNImputer(n_neighbors=n_neighbors)
    df_imputed[imputable_cols] = imputer.fit_transform(df_imputed[imputable_cols])
    return df_imputed


In [13]:
# Read the training table from the working directory.
df = pd.read_csv("train.csv")

In [15]:
# robust_scale(df)

In [16]:
# Fill missing numeric values by KNN imputation with 15 neighbours. The helper
# works on a copy, so `df` itself keeps its missing values; non-numeric columns
# are not imputed.
df_imputed = knn_impute_numerical_columns(df, n_neighbors=15)

In [17]:
# missing_value_counts(df_imputed)
# Keep an independent snapshot of the imputed frame.
# NOTE(review): df_imputed_copy is not referenced again in the visible cells.
df_imputed_copy = df_imputed.copy()

In [None]:
# missing_value_counts(df_imputed)

In [18]:
# Discard every row that still has a missing value in any column, then show
# the shape of what is left. dropna already returns a new frame, so
# df_imputed is not modified.
df_dropped = df_imputed.dropna(axis="index", how="any")
df_dropped.shape

(49103, 58)

In [19]:
# Display (rows, columns) of the frame left after dropping rows with missing values.
df_dropped.shape

(49103, 58)

In [20]:
# One-hot encode the remaining frame with pandas' default column selection,
# then show the widened shape.
df_onehot = pd.get_dummies(data=df_dropped)
df_onehot.shape

(49103, 96)

In [21]:
# Separate the target from the features. y is deliberately kept as a
# single-column DataFrame (double brackets); X is every other column.
y = df_onehot[["hospital_death"]]
X = df_onehot.drop(columns=["hospital_death"])

X.shape

(49103, 95)

In [None]:
# Fit a shallow decision tree on the full data and rank the features by the
# importance the tree assigns to them.
model = DecisionTreeClassifier(max_depth=5, min_samples_split=3, min_samples_leaf=3)
model.fit(X, y)

feature_importances = model.feature_importances_

# One row per feature, most important first.
feature_importance_df = pd.DataFrame(
    {
        'Feature': X.columns,
        'Importance': feature_importances,
    }
).sort_values(by='Importance', ascending=False)

# Keep the names of (at most) the 500 highest-ranked features.
selected_features = feature_importance_df['Feature'][:500]

In [22]:
# Hold out 20% of the rows for evaluation; the fixed seed keeps the split
# identical across runs.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=10,
)


In [23]:
def roc_auc_cv(model, X, y):
    """Average ROC AUC of ``model`` under repeated K-fold cross-validation.

    Uses 10 folds repeated twice (20 fits) with a fixed ``random_state=1`` for
    the splitter and runs the fits with ``n_jobs=-1``.

    Returns
    -------
    float
        Mean of the per-fold ``roc_auc`` scores.
    """
    splitter = RepeatedKFold(n_splits=10, n_repeats=2, random_state=1)
    fold_scores = cross_val_score(
        model,
        X,
        y,
        scoring="roc_auc",
        cv=splitter,
        n_jobs=-1,
    )
    return mean(fold_scores)

In [24]:
def calculate_roc_auc(model, X_test, y_test):
    """Score an already fitted ``model`` on ``X_test`` / ``y_test`` by ROC AUC.

    The score is computed from the second column (index 1) of
    ``model.predict_proba(X_test)``.

    Returns
    -------
    float
        ``roc_auc_score(y_test, <column 1 of predict_proba>)``.
    """
    class_one_probs = model.predict_proba(X_test)[:, 1]
    return roc_auc_score(y_test, class_one_probs)

# roc_auc = calculate_roc_auc(model, X_test, y_test)
# print(roc_auc)


In [None]:
# Fit a 300-neighbour KNN classifier on the training split and display its
# ROC AUC on the held-out split.
model = KNeighborsClassifier(n_neighbors=300).fit(X_train, y_train)
calculate_roc_auc(model, X_test, y_test)


In [None]:
# Refit the same estimator on the complete X / y rather than only the training
# split. After this cell X_test is data the model was fitted on, so scoring the
# model on X_test no longer measures held-out performance.
model.fit(X, y)


In [None]:
# Hyperparameter search over the neighbourhood size of a KNN classifier,
# scored by 5-fold cross-validated ROC AUC.
knn_model = KNeighborsClassifier()

param_grid = {
    'n_neighbors': range(200, 321),  # candidate neighbour counts, 200..320 inclusive
    # 'weights': ['uniform', 'distance'],  # Weighting scheme ('uniform' or 'distance')
}

# Create the Grid Search object
grid_search = GridSearchCV(knn_model, param_grid, cv=5, scoring='roc_auc', n_jobs=-1)

# The search fits 121 candidates x 5 folds, so it is opt-in. best_params_ and
# best_score_ only exist after fit(), so the report is tied to the same switch
# instead of raising AttributeError on an unfitted search.
RUN_GRID_SEARCH = False

if RUN_GRID_SEARCH:
    # Fit the Grid Search to your data
    grid_search.fit(X, y)

    # Print the best hyperparameters and corresponding ROC AUC score
    print("Best Parameters:", grid_search.best_params_)
    print("Best ROC AUC Score:", grid_search.best_score_)
else:
    print("Grid search skipped; set RUN_GRID_SEARCH = True to run it.")


In [None]:
def find_best_n_neighbors(model, X, y, n_neighbors_values=range(200, 321)):
    """Pick the ``n_neighbors`` value with the highest cross-validated ROC AUC.

    For every candidate value an unfitted clone of ``model`` is configured with
    that ``n_neighbors`` and scored with ``roc_auc_cv``. The estimator passed
    in is never modified, so a fitted ``model`` keeps its own setting.

    Parameters
    ----------
    model : estimator
        Template estimator; it must accept an ``n_neighbors`` parameter via
        ``set_params``.
    X, y
        Features and target forwarded unchanged to ``roc_auc_cv``.
    n_neighbors_values : iterable of int, default ``range(200, 321)``
        Candidate neighbour counts to evaluate, in order.

    Returns
    -------
    tuple
        ``(best_n_neighbors, best_roc_auc)``. Ties keep the earliest
        candidate; if no candidate scores above 0 the result is ``(0, 0)``.
    """
    # Local import: the notebook's import cell does not bring `clone` into scope.
    from sklearn.base import clone

    best_n_neighbors = 0
    best_roc_auc = 0

    for n_neighbors in n_neighbors_values:
        # Apply the candidate to a fresh copy; without this every iteration
        # would score the exact same estimator.
        candidate = clone(model).set_params(n_neighbors=n_neighbors)
        roc_auc = roc_auc_cv(candidate, X, y)

        if roc_auc > best_roc_auc:
            best_roc_auc = roc_auc
            best_n_neighbors = n_neighbors

    return best_n_neighbors, best_roc_auc

# Example usage: run the search with the `model`, `X` and `y` bound by earlier
# cells, then report the chosen n_neighbors and its cross-validated ROC AUC.
best_n, best_roc_auc = find_best_n_neighbors(model, X, y)
print(f"Best n_neighbors: {best_n}")
print(f"Best ROC AUC CV score: {best_roc_auc}")


In [None]:
def generate_predictions_for_model(model, test_file, output_file, scale=True, n_neighbors=10):
    """Predict ``hospital_death`` probabilities for a test CSV and save them.

    The test file is read, optionally robust-scaled, KNN-imputed and one-hot
    encoded. The second column of ``model.predict_proba`` is then written next
    to each ``RecordID``.

    Parameters
    ----------
    model : fitted estimator
        Must provide ``predict_proba``.
    test_file : str or path-like
        CSV to read; it must contain a ``RecordID`` column.
    output_file : str or path-like
        Destination CSV with columns ``RecordID`` and ``hospital_death``.
    scale : bool, default True
        Whether to apply ``robust_scale`` before imputing.
        NOTE(review): the training cell of this notebook has its
        ``robust_scale`` call commented out, so a model fitted there saw
        unscaled features. Pass ``scale=False`` for such a model, and confirm
        which preprocessing the model was actually trained with.
    n_neighbors : int, default 10
        Neighbour count for the KNN imputation of the test frame.
        NOTE(review): the training cell imputes with 15 neighbours.

    Returns
    -------
    None
        The result is written to ``output_file``.
    """
    df_test = pd.read_csv(test_file)

    # Copy the ids before any preprocessing: robust_scale overwrites numeric
    # columns on the frame it is given, and the ids must be written out as read.
    record_ids = df_test["RecordID"].copy()

    df_prepared = robust_scale(df_test) if scale else df_test
    df_test_imputed = knn_impute_numerical_columns(df_prepared, n_neighbors=n_neighbors)
    # df_test_imputed = df_test_imputed.dropna(axis=0)

    df_test_onehot = pd.get_dummies(df_test_imputed)

    X_test = df_test_onehot.loc[:, df_test_onehot.columns != "hospital_death"]

    # get_dummies only creates columns for categories present in this file, so
    # the test columns can differ from the ones the model was fitted on. When
    # the model recorded its training columns, match them by name and order:
    # columns missing here are zero-filled, unknown extra columns are dropped.
    trained_columns = getattr(model, "feature_names_in_", None)
    if trained_columns is not None:
        X_test = X_test.reindex(columns=list(trained_columns), fill_value=0)

    probs = model.predict_proba(X_test)
    probs = probs[:, 1]

    # Create a DataFrame for the results
    result = pd.DataFrame({'RecordID': record_ids, 'hospital_death': probs})

    # Save the results to a CSV file
    result.to_csv(output_file, index=False, header=["RecordID", "hospital_death"])


In [None]:
# Score test.csv with whatever estimator `model` is currently bound to and
# write the RecordID / hospital_death probabilities to results34.csv.
generate_predictions_for_model(model, "test.csv", "results34.csv")
