In [None]:
# Import libraries here
import gzip
import json
import pickle

import ipywidgets as widgets
import pandas as pd
import matplotlib.pyplot as plt

from imblearn.over_sampling import RandomOverSampler
from ipywidgets import interact
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import (
    ConfusionMatrixDisplay,
    classification_report,
    confusion_matrix,
)
from sklearn.model_selection import train_test_split,cross_val_score,GridSearchCV


In [None]:
# Read the dataset from the working directory and preview the first rows.
df = pd.read_csv(filepath_or_buffer="data.csv")
df.head(5)

In [None]:
# Report the dimensions of the loaded frame.
print("df shape:", df.shape)


In [None]:
# Count the missing values in every column, then preview the first few counts.
nans_by_col = df.isna().sum(axis=0)
print("nans_by_col shape:", nans_by_col.shape)
nans_by_col.head(5)

In [None]:
# Plot class balance: relative frequency of each value in the target column.
class_shares = df["Bankrupt?"].value_counts(normalize=True)
class_shares.plot(kind="bar")
plt.ylabel("Frequency")
plt.xlabel("Bankrupt")
plt.title("Class Balance")

In [None]:
# Separate the label column from the feature columns.
target = "Bankrupt?"
y = df[target]
X = df.drop(columns=[target])
for caption, frame in (("X shape:", X), ("y shape:", y)):
    print(caption, frame.shape)

In [None]:
# Hold out 20% of the rows for testing; the fixed seed keeps the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)
for caption, subset in (
    ("X_train shape:", X_train),
    ("y_train shape:", y_train),
    ("X_test shape:", X_test),
    ("y_test shape:", y_test),
):
    print(caption, subset.shape)

In [None]:

# Resample the training split only; the test split is left as drawn.
over_sampler = RandomOverSampler(random_state=42)
X_train_over, y_train_over = over_sampler.fit_resample(
    X_train,
    y_train,
)
print("X_train_over shape:", X_train_over.shape)
X_train_over.head(5)

In [None]:
# Seeded random forest fitted on the over-sampled training data.
# NOTE(review): later cells hand `clf` to GridSearchCV / cross_val_score, which
# presumably refit their own copies, so this fit looks unused downstream — confirm.
clf = RandomForestClassifier(random_state=42).fit(X_train_over, y_train_over)
clf

In [None]:
# Hyperparameter grid for the search: ten forest sizes (1, 6, ..., 46)
# crossed with two tree depths (2 and 5).
params = dict(
    n_estimators=range(1, 50, 5),
    max_depth=(2, 5),
)

In [None]:
# Exhaustive 5-fold search over `params`, using every available core.
# NOTE(review): the folds are cut from the already over-sampled data, so copies
# of the same row may land in both the training and validation part of a fold;
# the cross-validated scores may therefore be optimistic — confirm.
model = GridSearchCV(
    estimator=clf,
    param_grid=params,
    cv=5,
    n_jobs=-1,
    verbose=1,
)
model.fit(X_train_over, y_train_over)

In [None]:
# 5-fold cross-validation scores for the untuned `clf` on the over-sampled data.
cv_scores = cross_val_score(
    estimator=clf,
    X=X_train_over,
    y=y_train_over,
    cv=5,
    n_jobs=-1,
)
print(cv_scores)

In [None]:
# Tabulate the grid-search results and preview the first rows.
cv_results = pd.DataFrame(data=model.cv_results_)
cv_results.head()

In [None]:
# Hyperparameter combination selected by the grid search.
best_params = model.best_params_
print(best_params)

In [None]:
# Score the tuned search object on the original (not over-sampled) training
# rows and on the held-out test rows.
acc_train, acc_test = (
    model.score(X_train, y_train),
    model.score(X_test, y_test),
)

print("Model Training Accuracy:", round(acc_train, 4))
print("Model Test Accuracy:", round(acc_test, 4))

In [None]:
# Confusion matrix of the tuned model on the held-out test rows.
ConfusionMatrixDisplay.from_estimator(
    estimator=model,
    X=X_test,
    y=y_test,
)

In [None]:
# Classification report comparing test labels with the model's test predictions.
class_report = classification_report(
    y_true=y_test,
    y_pred=model.predict(X_test),
)
print(class_report)

In [None]:
# Grab the best estimator found by the grid search and display it.
# NOTE(review): despite its name, `importance` holds the estimator object itself,
# not feature importances.
importance = model.best_estimator_
importance

In [None]:
# Pair each training column with its importance from the best estimator,
# sort ascending, and chart the ten largest values as horizontal bars.
features = X_train.columns
importances = model.best_estimator_.feature_importances_
feature_importance = pd.Series(
    data=importances,
    index=features,
).sort_values(ascending=True)
feature_importance.tail(10).plot(kind="barh")

In [None]:
# Raw feature-importance array of the best estimator (the same expression is
# already evaluated in the feature-importance plotting cell).
importances = model.best_estimator_.feature_importances_
importances

In [None]:
# Add make_predictions function from lesson 5.3
def make_predictions(data_filepath, model_filepath):
    """Predict with a pickled model on the data stored at ``data_filepath``.

    Parameters
    ----------
    data_filepath
        Path handed to ``wrangle`` to build the feature matrix.
    model_filepath
        Path of the pickle file that holds the fitted model.

    Returns
    -------
    pd.Series
        Predictions named "bankrupt", sharing the index of the wrangled features.

    Notes
    -----
    ``wrangle`` is not defined inside this function; it has to be available in
    the surrounding namespace when the function is called.
    ``pickle.load`` can execute arbitrary code, so only pass trusted files.
    """
    # Build the feature matrix from the data file
    features_df = wrangle(data_filepath)

    # Restore the fitted estimator from disk
    with open(model_filepath, "rb") as model_file:
        estimator = pickle.load(model_file)

    # Predict and wrap the result in a Series aligned with the feature index
    return pd.Series(
        estimator.predict(features_df),
        index=features_df.index,
        name="bankrupt",
    )

In [None]:
# Import your module
# (kept in this cell on purpose: the import replaces the notebook-level
# make_predictions defined above with the module's version)
from my_predictor_assignment import make_predictions

# No earlier cell writes the model file, so on a fresh kernel the call below
# would depend on a pickle left over from a previous session. Save the tuned
# `model` here, using exclusive-create mode ("xb") so that an already existing
# file is never overwritten.
model_path = "model-5-5.pkl"
try:
    with open(model_path, "xb") as f:
        pickle.dump(model, f)
except FileExistsError:
    # A saved model is already present; keep using it unchanged.
    pass

# Generate predictions
y_test_pred = make_predictions(
    data_filepath="data/taiwan-bankruptcy-data-test-features.json.gz",
    model_filepath=model_path,
)

print("predictions shape:", y_test_pred.shape)
y_test_pred.head()