## Read the README for instructions

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import xgboost as xgboost
import re
from sentence_transformers import SentenceTransformer
from datetime import datetime
# Lift the column display limit so wide DataFrames are shown without truncated columns.
pd.set_option('display.max_columns', None)

In [2]:
# Load the two raw tables used throughout the notebook: accepted loans and
# rejected loan applications.
# NOTE(review): the saved output of this cell shows a warning pointing at the
# accepted-file read -- presumably pandas' mixed-type DtypeWarning (confirm).
# low_memory=False makes pandas infer each column's dtype from the whole file
# rather than chunk by chunk, which avoids mixed-type columns. It is applied
# only to the accepted file, since only that read was flagged.
accepted_df = pd.read_csv("datasets/accepted_dataset.csv", low_memory=False)
rejected_df = pd.read_csv("datasets/rejected_dataset.csv")

  accepted_df = pd.read_csv("datasets/accepted_dataset.csv")


## Formatting

In [41]:
# Harmonise the accepted and rejected loan tables into a shared schema, clean
# the shared columns, drop incomplete rows, and split each table into a
# recession-era (2007-2012) and a post-recession (2013-2018) subset.

# --- Step 1: Add Target Status ---
# Binary target used by the models: 1 = accepted, 0 = rejected.
accepted_df['status'] = 1
rejected_df['status'] = 0

# --- Step 2 & 3: Rename and Select Columns (Rejected) ---
print("Preprocessing Rejected Loans...")
# Rename columns in rejected_df to match accepted_df conventions
rename_map_rejected = {
    'Amount Requested': 'loan_amnt',
    'Application Date': 'application_date_str', # Keep temporary string name
    'Loan Title': 'purpose',
    'Risk_Score': 'fico_score',          # Assuming Risk_Score maps to FICO
    'Debt-To-Income Ratio': 'dti_str',  # Keep temporary string name
    'State': 'addr_state',
    'Employment Length': 'emp_length_str', # Keep temporary string name
    # Add 'Zip Code': 'zip_code' if needed
    # Add 'Policy Code': 'policy_code' if needed
}
# Select only the columns we intend to map plus the status
cols_to_keep_rejected = list(rename_map_rejected.keys()) + ['status']
# Selecting with a list already yields a new frame, so the raw table is not touched.
# Columns are sorted alphabetically so both tables end up in the same order.
rejected_df_processed = (
    rejected_df[cols_to_keep_rejected]
    .rename(columns=rename_map_rejected)
    .sort_index(axis=1)
)

print("Preprocessing Accepted Loans...")
# Select corresponding columns plus the status
# We need 'issue_d' for date, 'fico_range_low' for score, raw 'dti', raw 'emp_length'
cols_to_keep_accepted = [
    'loan_amnt',
    'issue_d',          # Use as proxy for application date
    'purpose',
    'dti',              # Raw DTI
    'addr_state',
    'emp_length',       # Raw emp_length
    'fico_range_low',   # Use low end of FICO range
    'status'
    # Add 'zip_code' if needed
    # Add 'policy_code' if needed
]
# Filter out potential columns not present in older datasets if necessary
available_cols_accepted = [col for col in cols_to_keep_accepted if col in accepted_df.columns]

# Rename accepted columns to the same temporary names used for the rejected table
accepted_df_processed = (
    accepted_df[available_cols_accepted]
    .rename(columns={
        'issue_d': 'application_date_str',
        'fico_range_low': 'fico_score',
        'emp_length': 'emp_length_str', # Keep temp name before cleaning
        'dti': 'dti_str'             # Keep temp name before cleaning
    })
    .sort_index(axis=1)
)

print("Standardizing data types...")
# Parse the accepted dates once, directly from their '%b-%Y' form. The earlier
# parse -> strftime -> parse round trip produced the same timestamps at much
# higher cost. Unparseable values become NaT and are dropped below, matching
# how the rejected dates are handled.
accepted_df_processed['application_date_str'] = pd.to_datetime(
    accepted_df_processed['application_date_str'], format='%b-%Y', errors='coerce'
)
rejected_df_processed['application_date_str'] = pd.to_datetime(
    rejected_df_processed['application_date_str'], errors='coerce'
)

# The rejected DTI column is a string carrying a '%' sign; strip it and convert to float.
rejected_df_processed['dti_str'] = rejected_df_processed['dti_str'].str.replace('%', '').astype(float)

# Employment length labels -> number of years. Labels missing from this
# mapping become NaN and are removed by the dropna step below.
emp_length_mapping = {
    '< 1 year': 0,
    '1 year': 1,
    '2 years': 2,
    '3 years': 3,
    '4 years': 4,
    '5 years': 5,
    '6 years': 6,
    '7 years': 7,
    '8 years': 8,
    '9 years': 9,
    '10+ years': 10
}

accepted_df_processed['emp_length_str'] = (
    accepted_df_processed['emp_length_str'].map(emp_length_mapping).astype(float)
)
rejected_df_processed['emp_length_str'] = (
    rejected_df_processed['emp_length_str'].map(emp_length_mapping).astype(float)
)

# Normalise the free-text purpose: underscores -> spaces, then lower-case.
accepted_df_processed['purpose'] = accepted_df_processed['purpose'].str.replace('_', ' ').str.lower()
rejected_df_processed['purpose'] = rejected_df_processed['purpose'].str.replace('_', ' ').str.lower()

# Replace the temporary '*_str' names with the final column names.
# rename() returns a new frame, so the *_processed frames stay intact.
final_column_names = {
    'application_date_str': 'application_date',
    'emp_length_str': 'emp_length',
    'dti_str': 'dti',
}
accepted_final = accepted_df_processed.rename(columns=final_column_names)
rejected_final = rejected_df_processed.rename(columns=final_column_names)

# Drop rows where the application date is missing, as it's crucial for temporal split
accepted_final = accepted_final.dropna(subset=['application_date'])
rejected_final = rejected_final.dropna(subset=['application_date'])

# --- Step 7: Handle Missing Values ---
# No imputation is performed: every row with any remaining null is dropped.
print("Handling missing values...")
print("Missing values before imputation:")
print(accepted_final.isnull().sum())
print(rejected_final.isnull().sum())
print("Dropping null")
accepted_final = accepted_final.dropna()
rejected_final = rejected_final.dropna()

print("Splitting data by time period...")
accepted_final['year'] = accepted_final['application_date'].dt.year
rejected_final['year'] = rejected_final['application_date'].dt.year

recession_years = range(2007, 2013) # 2007-2012 inclusive
post_recession_years = range(2013, 2019) # 2013-2018 inclusive

# Only the date column is dropped here. 'year' is kept on purpose because the
# later temporal-split cell filters on it after the frames are recombined.
accepted_final_recession_df = (
    accepted_final[accepted_final['year'].isin(recession_years)]
    .drop(columns=['application_date'])
)
accepted_final_post_recession_df = (
    accepted_final[accepted_final['year'].isin(post_recession_years)]
    .drop(columns=['application_date'])
)

rejected_final_recession_df = (
    rejected_final[rejected_final['year'].isin(recession_years)]
    .drop(columns=['application_date'])
)
rejected_final_post_recession_df = (
    rejected_final[rejected_final['year'].isin(post_recession_years)]
    .drop(columns=['application_date'])
)

print(f"\nRecession Era (2007-2012) dataset shape:\n -> Accepted: {len(accepted_final_recession_df)}\n -> Rejected: {len(rejected_final_recession_df)}")
print(f"\nPost-Recession Era (2013-2018) dataset shape:\n -> Accepted: {len(accepted_final_post_recession_df)}\n -> Rejected: {len(rejected_final_post_recession_df)}")

Preprocessing Rejected Loans...
Preprocessing Accepted Loans...
Standardizing data types...
Handling missing values...
Missing values before imputation:
addr_state               0
application_date         0
dti                   1711
emp_length          146907
fico_score               0
loan_amnt                0
purpose                  0
status                   0
dtype: int64
addr_state                22
application_date           0
dti                        0
emp_length            951355
fico_score          18497630
loan_amnt                  0
purpose                 1305
status                     0
dtype: int64
Dropping null
Splitting data by time period...

Recession Era (2007-2012) dataset shape:
 -> Accepted: 92865
 -> Rejected: 723523

Post-Recession Era (2013-2018) dataset shape:
 -> Accepted: 2020783
 -> Rejected: 8269041


In [42]:
# Balance the four (status x era) groups: shuffle each one with a fixed seed
# and keep only as many rows as the smallest group contains.
max_rows = min(
    map(
        len,
        (
            accepted_final_recession_df,
            accepted_final_post_recession_df,
            rejected_final_recession_df,
            rejected_final_post_recession_df,
        ),
    )
)
print(f"Cutting all dataframes to length '{max_rows}'")
random_shuffle_seed = 42


def _shuffle_and_truncate(frame):
    """Return `frame` fully shuffled with the shared seed, cut to the first `max_rows` rows."""
    shuffled = frame.sample(frac=1, random_state=random_shuffle_seed)
    return shuffled.head(max_rows)


accepted_final_recession_df = _shuffle_and_truncate(accepted_final_recession_df)
accepted_final_post_recession_df = _shuffle_and_truncate(accepted_final_post_recession_df)
rejected_final_recession_df = _shuffle_and_truncate(rejected_final_recession_df)
rejected_final_post_recession_df = _shuffle_and_truncate(rejected_final_post_recession_df)

print(f"\nRecession Era (2007-2012) dataset shape:\n -> Accepted: {len(accepted_final_recession_df)}\n -> Rejected: {len(rejected_final_recession_df)}")
print(f"\nPost-Recession Era (2013-2018) dataset shape:\n -> Accepted: {len(accepted_final_post_recession_df)}\n -> Rejected: {len(rejected_final_post_recession_df)}")

Cutting all dataframes to length '92865'

Recession Era (2007-2012) dataset shape:
 -> Accepted: 92865
 -> Rejected: 92865

Post-Recession Era (2013-2018) dataset shape:
 -> Accepted: 92865
 -> Rejected: 92865


In [43]:
# Combine the four balanced frames, replace the free-text 'purpose' column by
# sentence-embedding features, and one-hot encode 'addr_state'.

# --- Step 6: Combine DataFrames ---
print("Combining datasets...")
# ignore_index=True already gives the result a fresh 0..n-1 index, so no
# separate reset_index() is needed afterwards.
combined_df = pd.concat([accepted_final_recession_df, accepted_final_post_recession_df, rejected_final_recession_df, rejected_final_post_recession_df], ignore_index=True)
print(f"Combined dataset shape: {combined_df.shape}")

# --- 1. Preprocessing ---
print(f"[{datetime.now().strftime('%H:%M:%S')}] Starting preprocessing...")
# Encode each distinct purpose string only once, then map the vectors back to the rows.
unique_purposes = combined_df['purpose'].unique()
print(f"[{datetime.now().strftime('%H:%M:%S')}] Found {len(unique_purposes)} unique purposes")

# --- 2. Generate Embeddings ---
print(f"[{datetime.now().strftime('%H:%M:%S')}] Loading model...")
model = SentenceTransformer('paraphrase-MiniLM-L3-v2')

print(f"[{datetime.now().strftime('%H:%M:%S')}] Encoding {len(unique_purposes)} purposes (batch_size=512)...")
unique_embeddings = model.encode(unique_purposes,
                               batch_size=512,
                               show_progress_bar=True,
                               convert_to_numpy=True)
print(f"[{datetime.now().strftime('%H:%M:%S')}] Embeddings generated: {unique_embeddings.shape}")

# --- 3. Create Embedding Dictionary ---
print(f"[{datetime.now().strftime('%H:%M:%S')}] Creating embedding dictionary...")
embedding_dict = dict(zip(unique_purposes, unique_embeddings))

# --- 4. Map Embeddings to DataFrame ---
print(f"[{datetime.now().strftime('%H:%M:%S')}] Mapping embeddings to {len(combined_df)} rows...")
combined_df['embedding_array'] = combined_df['purpose'].map(embedding_dict)

# Check for missing embeddings
missing = combined_df['embedding_array'].isna().sum()
print(f"[{datetime.now().strftime('%H:%M:%S')}] Missing embeddings: {missing} rows")
if missing > 0:
    # Series.map leaves NaN (a float), not None, for keys absent from the
    # dictionary, so an `is None` test would never match. Anything that is not
    # an embedding array is replaced by a zero vector of the same width and
    # dtype, so the vstack below neither fails nor upcasts.
    zero_embedding = np.zeros(unique_embeddings.shape[1], dtype=unique_embeddings.dtype)
    combined_df['embedding_array'] = combined_df['embedding_array'].apply(
        lambda x: x if isinstance(x, np.ndarray) else zero_embedding
    )

# --- 5. Convert to Columns ---
print(f"[{datetime.now().strftime('%H:%M:%S')}] Creating embedding columns...")
# One column per embedding dimension, aligned to combined_df by index.
embedding_columns = pd.DataFrame(
    np.vstack(combined_df['embedding_array']),
    columns=[f'purpose_embed_{i}' for i in range(unique_embeddings.shape[1])],
    index=combined_df.index
)
print(f"[{datetime.now().strftime('%H:%M:%S')}] Embedding columns created: {embedding_columns.shape}")
combined_df = combined_df.drop(['purpose', 'embedding_array'], axis=1)
# Both frames share the same index, so a column-wise concat attaches the
# embedding columns row for row (same result as the index-on-index merge).
combined_df = pd.concat([combined_df, embedding_columns], axis=1)

# One-hot encode 'addr_state'
combined_df = pd.get_dummies(
    combined_df,
    columns=['addr_state'],
    prefix='state',
    dtype=np.int8  # Small integer dtype keeps the indicator columns compact
)

Combining datasets...
Combined dataset shape: (371460, 8)
[21:34:25] Starting preprocessing...
[21:34:25] Found 7880 unique purposes
[21:34:25] Loading model...
[21:34:34] Encoding 7880 purposes (batch_size=512)...


Batches: 100%|██████████| 16/16 [00:01<00:00,  9.50it/s]


[21:34:35] Embeddings generated: (7880, 384)
[21:34:35] Creating embedding dictionary...
[21:34:35] Mapping embeddings to 371460 rows...
[21:34:36] Missing embeddings: 0 rows
[21:34:36] Creating embedding columns...
[21:34:36] Embedding columns created: (371460, 384)


In [46]:
# --- Step 9: Extract Year and Temporal Split ---
# Era boundaries (range() excludes its upper bound).
recession_years = range(2007, 2013) # 2007-2012 inclusive
post_recession_years = range(2013, 2019) # 2013-2018 inclusive

# Row masks selecting each era by the 'year' column.
is_recession_row = combined_df['year'].isin(recession_years)
is_post_recession_row = combined_df['year'].isin(post_recession_years)

# 'year' was only needed to route rows to an era; remove it from the model inputs.
recession_df = combined_df[is_recession_row].drop(columns=['year'])
post_recession_df = combined_df[is_post_recession_row].drop(columns=['year'])

print(f"Recession Era (2007-2012) dataset shape: {recession_df.shape}")
print(f"Post-Recession Era (2013-2018) dataset shape: {post_recession_df.shape}")

# --- Final Check ---
print("\nPreprocessing Complete.")
for info_heading, era_frame in (
    ("\nRecession Data Info:", recession_df),
    ("\nPost-Recession Data Info:", post_recession_df),
):
    print(info_heading)
    era_frame.info()

Recession Era (2007-2012) dataset shape: (185730, 440)
Post-Recession Era (2013-2018) dataset shape: (185730, 440)

Preprocessing Complete.

Recession Data Info:
<class 'pandas.core.frame.DataFrame'>
Index: 185730 entries, 0 to 278594
Columns: 440 entries, dti to state_WY
dtypes: float32(384), float64(4), int64(1), int8(51)
memory usage: 289.6 MB

Post-Recession Data Info:
<class 'pandas.core.frame.DataFrame'>
Index: 185730 entries, 92865 to 371459
Columns: 440 entries, dti to state_WY
dtypes: float32(384), float64(4), int64(1), int8(51)
memory usage: 289.6 MB


## Models

In [76]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.ensemble import RandomForestClassifier, BaggingClassifier
from xgboost import XGBClassifier
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score, precision_score, recall_score, f1_score
import numpy as np # Used for handling potential NaN/inf after scaling

# Train Random Forest, Bagging and XGBoost classifiers on the recession-era
# data, evaluate them on a held-out 30% test split, and report the model with
# the best F1 score together with the top feature importances.

# --- 1. Load and Prepare Data ---
# `recession_df` comes from the temporal-split cell earlier in the notebook.
print("Original DataFrame Info:")
recession_df.info()
print(f"\nTarget variable 'status' unique values: {recession_df['status'].unique()}")

# Define features (X) and target (y)
target_column = 'status'
X = recession_df.drop(target_column, axis=1)
y = recession_df[target_column]

# --- 2. Encode Target Variable ---
# LabelEncoder maps the distinct target values to 0..k-1; le.classes_ keeps
# the original values in encoded order.
le = LabelEncoder()
y_encoded = le.fit_transform(y)
print(f"\nEncoded target classes: {le.classes_}") # Shows mapping: 0 -> class1, 1 -> class2, etc.

# --- 3. Split Data into Training and Testing Sets ---
# stratify=y_encoded keeps the class proportions the same in both splits.
X_train, X_test, y_train, y_test = train_test_split(
    X, y_encoded,
    test_size=0.3,        # 30% for testing, 70% for training
    random_state=42,      # For reproducibility
    stratify=y_encoded    # Keep class distribution consistent
)

print(f"\nTraining set shape: X={X_train.shape}, y={y_train.shape}")
print(f"Testing set shape: X={X_test.shape}, y={y_test.shape}")


# --- 4. Scale Features ---
# Tree-based models do not need scaling, but it is kept so all three models
# see identical inputs. The scaler is fit on the training split only, to
# avoid leaking test-set statistics.
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Defensive: replace any NaN/Inf left after scaling with finite numbers.
X_train_scaled = np.nan_to_num(X_train_scaled)
X_test_scaled = np.nan_to_num(X_test_scaled)

# Convert scaled arrays back to DataFrames so column names and row index are preserved.
X_train_scaled = pd.DataFrame(X_train_scaled, columns=X.columns, index=X_train.index)
X_test_scaled = pd.DataFrame(X_test_scaled, columns=X.columns, index=X_test.index)


# --- 5. Define and Train Models ---

# --- Random Forest ---
print("\n--- Training Random Forest ---")
rf_model = RandomForestClassifier(
    n_estimators=100,     # Number of trees in the forest (common starting point)
    random_state=42,      # For reproducibility
    n_jobs=-1,            # Use all available CPU cores
    class_weight='balanced' # Useful if classes are imbalanced
)
rf_model.fit(X_train_scaled, y_train)
print("Random Forest training complete.")

# --- Bagging (Bootstrap Aggregating) ---
# No base estimator is passed, so scikit-learn's default base estimator is used.
print("\n--- Training Bagging Classifier ---")
bagging_model = BaggingClassifier(
    n_estimators=50,      # Number of base estimators
    random_state=42,      # For reproducibility
    n_jobs=-1             # Use all available CPU cores
)
bagging_model.fit(X_train_scaled, y_train)
print("Bagging Classifier training complete.")


# --- XGBoost ---
print("\n--- Training XGBoost Classifier ---")
# `use_label_encoder` is intentionally not passed: the installed XGBoost
# reports it as an unused parameter, so it only produced a warning.
xgb_model = XGBClassifier(
    n_estimators=100,     # Number of boosting rounds (trees)
    learning_rate=0.1,    # Step size shrinkage to prevent overfitting
    max_depth=3,          # Maximum depth of a tree (common starting point)
    random_state=42,      # For reproducibility
    n_jobs=-1,            # Use all available CPU cores
    eval_metric='logloss' # Common evaluation metric for binary/multi-class classification
    # Add 'scale_pos_weight' here if dealing with imbalanced classes, e.g.,
    # scale_pos_weight = sum(y_train == 0) / sum(y_train == 1) # For binary
)
xgb_model.fit(X_train_scaled, y_train)
print("XGBoost training complete.")


# --- 6. Make Predictions on Test Set ---

rf_preds = rf_model.predict(X_test_scaled)
bagging_preds = bagging_model.predict(X_test_scaled)
xgb_preds = xgb_model.predict(X_test_scaled)

# --- 7. Evaluate Models ---

# Maps a display name to that model's test-set predictions.
models = {
    "Random Forest": rf_preds,
    "Bagging": bagging_preds,
    "XGBoost": xgb_preds
}

# Original class values detected by the encoder (numeric here, e.g. array([0, 1])).
class_labels = le.classes_

# classification_report expects string names in target_names.
class_names_str = [str(c) for c in class_labels]
print(f"\nUsing string class names for report: {class_names_str}") # Debug print

# The averaging strategy depends only on the label set, so it is decided once
# up front instead of being recomputed inside the loop and reused after it.
num_classes = len(class_labels)
avg_strategy = 'weighted' if num_classes > 2 else 'binary'
# A positive label is only meaningful for the binary case (label 1, if present).
pos_label = 1 if avg_strategy == 'binary' and 1 in y_encoded else None

print("\n--- Model Evaluation ---")

# F1 per model, collected during evaluation and used to pick the best model below.
f1_scores = {}

for model_name, y_pred in models.items():
    print(f"\n--- {model_name} Results ---")

    # Calculate Metrics
    accuracy = accuracy_score(y_test, y_pred)
    precision = precision_score(y_test, y_pred, average=avg_strategy, pos_label=pos_label, zero_division=0)
    recall = recall_score(y_test, y_pred, average=avg_strategy, pos_label=pos_label, zero_division=0)
    f1 = f1_score(y_test, y_pred, average=avg_strategy, pos_label=pos_label, zero_division=0)
    f1_scores[model_name] = f1

    print(f"Accuracy: {accuracy:.4f}")
    print(f"Precision ({avg_strategy} avg): {precision:.4f}")
    print(f"Recall ({avg_strategy} avg): {recall:.4f}")
    print(f"F1-Score ({avg_strategy} avg): {f1:.4f}")

    print("\nConfusion Matrix:")
    # Explicit labels fix the row/column order of the matrix.
    print(confusion_matrix(y_test, y_pred, labels=np.unique(y_encoded)))

    print("\nClassification Report:")
    print(classification_report(y_test, y_pred, target_names=class_names_str, zero_division=0))

# --- 8. Determine Best Performance ---
# The best model is the one with the highest F1 score collected above.
best_model_name = max(f1_scores, key=f1_scores.get)
print(f"\n--- Best Performing Model (based on {avg_strategy} F1-Score) ---")
print(f"Model: {best_model_name}")
print(f"F1-Score: {f1_scores[best_model_name]:.4f}")

# --- Optional: Feature Importance (for tree-based models) ---
# Helps understand which features are most influential
if hasattr(rf_model, 'feature_importances_'):
    print("\n--- Random Forest Feature Importances (Top 10) ---")
    rf_importances = pd.Series(rf_model.feature_importances_, index=X.columns)
    print(rf_importances.nlargest(10))

if hasattr(xgb_model, 'feature_importances_'):
    print("\n--- XGBoost Feature Importances (Top 10) ---")
    xgb_importances = pd.Series(xgb_model.feature_importances_, index=X.columns)
    print(xgb_importances.nlargest(10))

Original DataFrame Info:
<class 'pandas.core.frame.DataFrame'>
Index: 185730 entries, 0 to 278594
Columns: 440 entries, dti to state_WY
dtypes: float32(384), float64(4), int64(1), int8(51)
memory usage: 289.6 MB

Target variable 'status' unique values: [1 0]

Encoded target classes: [0 1]

Training set shape: X=(130011, 439), y=(130011,)
Testing set shape: X=(55719, 439), y=(55719,)

--- Training Random Forest ---
Random Forest training complete.

--- Training Bagging Classifier ---


huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Av

Bagging Classifier training complete.

--- Training XGBoost Classifier ---


Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)


XGBoost training complete.

Using string class names for report: ['0', '1']

--- Model Evaluation ---

--- Random Forest Results ---
Accuracy: 0.9427
Precision (binary avg): 0.9341
Recall (binary avg): 0.9527
F1-Score (binary avg): 0.9433

Confusion Matrix:
[[25987  1873]
 [ 1317 26542]]

Classification Report:
              precision    recall  f1-score   support

           0       0.95      0.93      0.94     27860
           1       0.93      0.95      0.94     27859

    accuracy                           0.94     55719
   macro avg       0.94      0.94      0.94     55719
weighted avg       0.94      0.94      0.94     55719


--- Bagging Results ---
Accuracy: 0.9583
Precision (binary avg): 0.9481
Recall (binary avg): 0.9697
F1-Score (binary avg): 0.9588

Confusion Matrix:
[[26382  1478]
 [  844 27015]]

Classification Report:
              precision    recall  f1-score   support

           0       0.97      0.95      0.96     27860
           1       0.95      0.97      0.96   