In [None]:
import time

import numpy as np
import pandas as pd
import sklearn
from matplotlib import pyplot
from numpy import mean
from sklearn.base import clone
from sklearn.impute import KNNImputer, SimpleImputer
from sklearn.metrics import roc_curve, roc_auc_score
from sklearn.model_selection import train_test_split, RepeatedKFold, GridSearchCV, cross_val_score
from sklearn.model_selection import KFold
from sklearn.naive_bayes import GaussianNB, CategoricalNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import MinMaxScaler, RobustScaler, StandardScaler
from sklearn.tree import DecisionTreeClassifier



In [None]:

def missing_value_counts(dataframe):
    """Summarise how many null entries each column of ``dataframe`` holds.

    Parameters
    ----------
    dataframe : pandas.DataFrame
        Frame to inspect; it is not modified.

    Returns
    -------
    pandas.DataFrame
        One row per input column with two columns: ``Column`` (the column
        label) and ``Missing_Values_Count`` (number of null cells in it).
    """
    null_totals = dataframe.isna().sum()
    summary = pd.DataFrame(
        {
            'Column': null_totals.index,
            'Missing_Values_Count': null_totals.values,
        }
    )
    return summary

In [None]:
def custom_impute(df):
    """Fill missing values: column mean for numeric columns, mode for the rest.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to impute. It is modified IN PLACE; pass ``df.copy()`` if the
        original values must be kept.

    Returns
    -------
    pandas.DataFrame
        The same object that was passed in, after imputation.
    """
    # Separate columns into numerical and categorical
    numerical_columns = df.select_dtypes(include=['number']).columns
    categorical_columns = df.select_dtypes(exclude=['number']).columns

    # An imputer cannot be fitted on a selection with zero columns, so each
    # step is skipped when its column group is empty (e.g. an all-numeric frame).

    # Step 1: Impute Numerical Columns with Mean
    if len(numerical_columns) > 0:
        num_imputer = SimpleImputer(strategy='mean')
        df[numerical_columns] = num_imputer.fit_transform(df[numerical_columns])

    # Step 2: Impute Categorical Columns with Mode
    if len(categorical_columns) > 0:
        cat_imputer = SimpleImputer(strategy='most_frequent')
        df[categorical_columns] = cat_imputer.fit_transform(df[categorical_columns])

    return df

In [None]:
def min_max_scale_dataframe(df):
    """Min-max scale every numeric column of ``df``.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to scale. It is modified IN PLACE; non-numeric columns are left
        untouched.

    Returns
    -------
    pandas.DataFrame
        The same object that was passed in.
    """
    numerical_columns = df.select_dtypes(include=['number']).columns

    # A scaler cannot be fitted on zero columns; with nothing numeric there is
    # nothing to scale, so hand the frame back unchanged.
    if len(numerical_columns) == 0:
        return df

    scaler = MinMaxScaler()
    df[numerical_columns] = scaler.fit_transform(df[numerical_columns])

    return df

In [None]:
import pandas as pd
from sklearn.preprocessing import OneHotEncoder

def convert_numerical_to_categorical(df, num_bins=9, target_col="hospital_death"):
    """Bin numeric features into equal-width categories and append dummy columns.

    Parameters
    ----------
    df : pandas.DataFrame
        Input frame; it is copied, never modified.
    num_bins : int, default 9
        Number of equal-width bins ``pd.cut`` creates per numeric column.
    target_col : str, default "hospital_death"
        Numeric column to leave as-is (not binned, not encoded).

    Returns
    -------
    pandas.DataFrame
        The non-binned columns of ``df`` followed by ``drop_first`` dummy
        columns for every non-numeric column (binned or pre-existing).

    Notes
    -----
    Only the binned numeric columns are dropped. Columns that were already
    non-numeric stay in the result in raw form next to their dummy columns,
    so running ``pd.get_dummies`` on the result encodes them a second time.
    """
    binned = df.copy()

    # Every numeric column except the target gets discretised.
    numerical_cols = [
        col
        for col in binned.select_dtypes(include=['number']).columns
        if col != target_col
    ]

    for col in numerical_cols:
        bin_labels = [f"{col}_bin_{i}" for i in range(num_bins)]
        binned[col] = pd.cut(binned[col], bins=num_bins, labels=bin_labels)

    # After binning, "non-numeric" covers both the new bins and any columns
    # that were categorical/text to begin with.
    categorical_cols = binned.select_dtypes(exclude=['number']).columns
    remaining = binned.drop(columns=numerical_cols)

    if len(categorical_cols) == 0:
        # Nothing to encode (e.g. the frame only holds the target column).
        return remaining

    dummies = pd.get_dummies(binned[categorical_cols], drop_first=True)
    return pd.concat([remaining, dummies], axis=1)



In [None]:
# Load the raw training data from train.csv into `df`.
df = pd.read_csv('train.csv')

In [None]:
# Re-read the raw data so this cell gives the same result every time it runs.
df = pd.read_csv('train.csv')
# custom_impute() writes into the frame it is given; impute a copy so `df`
# keeps the raw values and `imputed_df` is a separate object.
imputed_df = custom_impute(df.copy())
imputed_df



In [None]:
# Min-max scale the numeric columns. The helper edits the frame in place and
# returns that same object, so rebinding the name only makes the flow explicit.
imputed_df = min_max_scale_dataframe(imputed_df)
imputed_df

In [None]:
# Per-column null counts after imputation; every count is expected to be 0.
missing_value_counts(imputed_df)

In [None]:
# Bin each numeric feature (default: 9 equal-width bins) and append dummy
# columns; the "hospital_death" target is left numeric.
cat_df = convert_numerical_to_categorical(imputed_df)

In [None]:
# Inspect the binned / dummy-encoded frame.
cat_df

In [None]:
df_onehot = pd.get_dummies(cat_df)
# cat_df still carries the raw non-numeric columns next to the drop_first
# dummies already built from them, so encoding them again here produces
# labels that already exist ("<column>_<value>"). Keep one copy of each label.
df_onehot = df_onehot.loc[:, ~df_onehot.columns.duplicated()]
df_onehot

In [None]:
# List the encoded column names as a one-column table (only the last
# expression of a cell is displayed, so a bare `df_onehot.columns` before it
# had no effect).
pd.DataFrame(df_onehot.columns)

In [None]:
# Features: every column except the target.
X = df_onehot.loc[:, df_onehot.columns != "hospital_death"]
# Target as a 1-D Series. A one-column DataFrame is a column vector, which
# scikit-learn estimators either warn about or treat as multi-output.
y = df_onehot["hospital_death"]

X.shape

In [None]:
# Shallow decision tree used only to rank features by impurity-based importance.
model = DecisionTreeClassifier(max_depth=6, min_samples_leaf=3, min_samples_split=3)

# Fit on the full feature matrix and target
model.fit(X, y)  # supervised fit: y is the hospital_death target

# Get feature importances
feature_importances = model.feature_importances_

# Create a DataFrame to store feature names and their importance scores
feature_importance_df = pd.DataFrame({
    'Feature': X.columns,
    'Importance': feature_importances
})

# Sort features by importance in descending order
feature_importance_df = feature_importance_df.sort_values(by='Importance', ascending=False)

# Select the top 40 features
selected_features = feature_importance_df['Feature'][:40]

# Create a new DataFrame with only the selected features
# NOTE(review): X_selected is not referenced again in the cells visible here.
X_selected = X[selected_features]


In [None]:
# Hold out 20% of the rows; the fixed random_state keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=69)

In [None]:
# Rank features again, this time with a depth-5 tree fitted on all rows.
model = DecisionTreeClassifier(max_depth=5, min_samples_leaf=3, min_samples_split=3)
model.fit(X, y)

feature_importances = model.feature_importances_

# Feature/importance table, most important first.
feature_importance_df = (
    pd.DataFrame({'Feature': X.columns, 'Importance': feature_importances})
    .sort_values(by='Importance', ascending=False)
)

# Names of the (up to) 500 highest-ranked features.
selected_features = feature_importance_df['Feature'][:500]

# X = X[selected_features]

In [None]:
# Feature-matrix shape; unchanged here because the subsetting line above is commented out.
X.shape

In [None]:

def roc_auc_cv(model, X, y):
    """Return the mean ROC AUC of ``model`` under repeated 10-fold CV.

    Runs 10-fold cross-validation twice (fixed seed 1), scoring each fold
    with ROC AUC across all available cores, and averages the fold scores.

    Parameters
    ----------
    model : estimator
        Unfitted scikit-learn compatible classifier.
    X, y : array-like
        Feature matrix and target.

    Returns
    -------
    float
        Mean of the per-fold ROC AUC values.
    """
    splitter = RepeatedKFold(n_splits=10, n_repeats=2, random_state=1)
    fold_aucs = cross_val_score(
        model,
        X,
        y,
        scoring="roc_auc",
        cv=splitter,
        n_jobs=-1,
    )
    return mean(fold_aucs)


In [None]:
def find_best_parameters(X, y):
    """Grid-search ``var_smoothing`` for a Gaussian naive Bayes classifier.

    Each candidate is scored by ROC AUC under 10-fold cross-validation.

    Parameters
    ----------
    X, y : array-like
        Feature matrix and target.

    Returns
    -------
    tuple
        ``(best_params, best_score)``: the winning parameter dict and its
        mean cross-validated ROC AUC.
    """
    # Coarse sweep over orders of magnitude, then a fine sweep around 0.1.
    smoothing_candidates = [
        1e-13, 1e-10, 1e-9, 1e-8, 1e-7, 1e-6, 1e-5, 1e-4, 1e-3, 1e-2,
        0.09, 0.095, 0.097, 0.099, 0.1, 0.101, 0.103, 0.105, 0.11,
    ]

    search = GridSearchCV(
        estimator=GaussianNB(),
        param_grid={'var_smoothing': smoothing_candidates},
        cv=10,
        scoring='roc_auc',
    )
    search.fit(X, y)

    return search.best_params_, search.best_score_

# find_best_parameters(X, y)


In [None]:
# find_best_parameters(X, y)

In [None]:
# Re-check the feature-matrix shape before cross-validating.
X.shape

In [None]:
from sklearn.model_selection import cross_val_predict
from sklearn.model_selection import RepeatedKFold
from sklearn.metrics import roc_auc_score

def roc_auc_cv_cnb(model, X, y):
    """Return the ROC AUC of out-of-fold probabilities, averaged over 2 repeats.

    ``cross_val_predict`` needs a splitter in which every row is predicted
    exactly once, so a repeated splitter cannot be handed to it directly.
    Each repeat therefore runs its own shuffled 10-fold partition, the
    out-of-fold positive-class probabilities are scored once per repeat, and
    the per-repeat scores are averaged.

    Parameters
    ----------
    model : estimator
        Unfitted classifier exposing ``predict_proba``.
    X : array-like
        Feature matrix.
    y : array-like
        Binary target; a one-column frame is flattened to 1-D.

    Returns
    -------
    float
        Mean ROC AUC across the repeats.
    """
    n_repeats = 2

    # Flatten so a one-column DataFrame target is handled as a single label
    # vector rather than as a 2-D (multi-output style) target.
    y_true = np.asarray(y).ravel()

    repeat_aucs = []
    for repeat in range(n_repeats):
        # A different seed per repeat gives a different shuffle of the rows.
        cv = KFold(n_splits=10, shuffle=True, random_state=1 + repeat)
        y_probabilities = cross_val_predict(
            model, X, y_true, cv=cv, method='predict_proba', n_jobs=-1
        )

        # Binary classification: column 1 holds the positive-class probability.
        y_scores = y_probabilities[:, 1]
        repeat_aucs.append(roc_auc_score(y_true, y_scores))

    return float(np.mean(repeat_aucs))


In [None]:
# Alternative estimators tried earlier. Only the last assignment to `model`
# ever took effect, so the others are kept as comments.
# model = GaussianNB(var_smoothing=1e-09)
# model = CategoricalNB()
# random_state pins the tree's internal feature shuffling so repeated runs
# build the same tree.
model = DecisionTreeClassifier(max_depth=50, min_samples_leaf=400, min_samples_split=500, random_state=1)

In [None]:
# Mean ROC AUC of the chosen model over repeated 10-fold CV on all rows.
roc_auc_cv(model, X, y)

In [None]:
# Final fit on every labelled row (this includes the rows held in X_test).
model.fit(X, y)

In [None]:
# Feature subsetting is disabled, so the hold-out set keeps every column of X.
# X_test = X_test[selected_features]
X_test.shape

In [None]:
# `model` was fitted on all of X, and X_test is a slice of X, so scoring
# `model` itself here would grade it on rows it has already seen. Fit an
# unfitted copy on the training split only to get an honest hold-out ROC AUC.
holdout_model = clone(model).fit(X_train, y_train)
md_probs = holdout_model.predict_proba(X_test)[:, 1]
md_auc = roc_auc_score(y_test, md_probs)
md_auc

In [None]:
def generate_predictions_for_model(model, test_file, output_file):
    """Score a test CSV with a fitted classifier and write the predictions.

    The test data goes through the same steps as the training data
    (imputation, binning, one-hot encoding). The resulting columns are then
    aligned with the features the model was fitted on, because dummy encoding
    a different file can yield a different set or order of columns.

    Parameters
    ----------
    model : fitted estimator
        Classifier exposing ``predict_proba``.
    test_file : str
        Path of the CSV to score; it must contain a ``RecordID`` column.
    output_file : str
        Path of the CSV to write, with columns ``RecordID`` and
        ``hospital_death`` (positive-class probability).
    """
    df_test = pd.read_csv(test_file)
    # Grab the identifiers before preprocessing rewrites the numeric columns.
    record_ids = df_test["RecordID"].copy()

    df_test_imputed = custom_impute(df_test)
    cat_df = convert_numerical_to_categorical(df_test_imputed)
    df_test_onehot = pd.get_dummies(cat_df)

    X_test = df_test_onehot.loc[:, df_test_onehot.columns != "hospital_death"]

    # Estimators fitted on a DataFrame with string column names record those
    # names; use them to give the test matrix exactly the training layout.
    expected_columns = getattr(model, "feature_names_in_", None)
    if expected_columns is not None:
        # reindex needs unique source labels; re-encoding can repeat a label.
        X_test = X_test.loc[:, ~X_test.columns.duplicated()]
        # Dummy columns absent from the test file are all-zero; extra ones are dropped.
        X_test = X_test.reindex(columns=expected_columns, fill_value=0)

    # Generate predictions using the model (column 1 = positive class)
    probs = model.predict_proba(X_test)[:, 1]

    # Create a DataFrame for the results
    result = pd.DataFrame({'RecordID': record_ids, 'hospital_death': probs})

    # Save the results to a CSV file
    result.to_csv(output_file, index=False, header=["RecordID", "hospital_death"])
    
# Score test.csv with the fitted model and write the result to results33.csv.
generate_predictions_for_model(model, "test.csv", "results33.csv")
