In [1]:
from sklearn import tree
import pandas as pd
import os

# Permutation importance support, included per Dan
from sklearn.inspection import permutation_importance


In [2]:
pip install pyxll-jupyter

Note: you may need to restart the kernel to use updated packages.


In [None]:
# Load the county-level table from the notebook's working directory and
# preview the first rows.
df = pd.read_csv("data_county_ml_v2_best.csv")
df.head()

In [None]:
# Human-readable class names kept alongside the label column.
# NOTE(review): presumably "no"/"yes" correspond to label values 0/1 -- confirm.
target_names = ["no", "yes"]

# The "BestList" column is the label the classifiers are trained to predict.
target = df["BestList"]

In [None]:
# Every column except the label is used as a feature.
data = df.drop(columns=["BestList"])

# Column labels are kept so importances can be reported by feature name later.
feature_names = data.columns

data.head()

In [None]:
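# Disabled attempt at turning infinite values into NaN before imputation:
# data.replace([data.inf, -data.inf], data.nan, inplace=True)
# As written it would fail unless the frame has columns named "inf"/"nan";
# the constants live on NumPy, so a working form (after `import numpy as np`) is:
# data = data.replace([np.inf, -np.inf], np.nan)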

In [None]:
# Impute missing values with the per-column mean.
# - Rebinding `data` instead of using inplace=True keeps the step a plain
#   expression and avoids mutating a frame that earlier cells displayed.
# - numeric_only=True makes the mean explicit about skipping non-numeric
#   columns; a bare mean() raises on them in pandas 2.
data = data.fillna(data.mean(numeric_only=True))

In [None]:
from sklearn.model_selection import train_test_split

# Hold out part of the rows for evaluation; the fixed seed keeps the split
# the same on every run.
X_train, X_test, y_train, y_test = train_test_split(
    data,
    target,
    random_state=42,
)

In [None]:
# Baseline: a single decision tree scored on the held-out split.
# random_state is fixed so the reported accuracy is the same after
# Restart & Run All (the tree otherwise breaks ties between equally good
# splits at random).
clf = tree.DecisionTreeClassifier(random_state=42)
clf = clf.fit(X_train, y_train)
clf.score(X_test, y_test)

In [None]:
from sklearn.ensemble import RandomForestClassifier

# Random forest of 200 trees, scored on the held-out split.
# random_state is fixed so the accuracy, the feature importances and the
# predictions derived from `rf` in later cells are reproducible.
rf = RandomForestClassifier(n_estimators=200, random_state=42)
rf = rf.fit(X_train, y_train)
rf.score(X_test, y_test)

In [None]:
# Pair the forest's built-in importance of each feature with its name and
# list the pairs from most to least important.
ranked_importances = list(zip(rf.feature_importances_, feature_names))
ranked_importances.sort(reverse=True)
ranked_importances

## Comparison of Calibration of Classifiers

In [None]:
# Calibration comparison of four classifiers on the notebook's own split.
# The code appears to be adapted from a standalone example script; its
# original attribution is kept below.
#
# Author: Jan Hendrik Metzen <jhm@informatik.uni-bremen.de>
# License: BSD Style.

import numpy as np
# The estimators below are created without random_state, so they draw from
# NumPy's global RNG; seeding it here keeps this cell reproducible.
np.random.seed(0)

import matplotlib.pyplot as plt

from sklearn import datasets
from sklearn.naive_bayes import GaussianNB
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import LinearSVC
from sklearn.calibration import calibration_curve

# The synthetic make_classification data from the original script is not
# used: the models are fit on X_train / y_train and evaluated on
# X_test / y_test, which must already exist from the train/test split cell.

# Create classifiers
lr = LogisticRegression()
gnb = GaussianNB()
svc = LinearSVC(C=1.0)
rfc = RandomForestClassifier()


# #############################################################################
# Plot calibration plots

plt.figure(figsize=(10, 10))
# Top two thirds: reliability curves; bottom third: histogram of the scores.
ax1 = plt.subplot2grid((3, 1), (0, 0), rowspan=2)
ax2 = plt.subplot2grid((3, 1), (2, 0))

ax1.plot([0, 1], [0, 1], "k:", label="Perfectly calibrated")
# The loop variable is deliberately not called `clf`, so the decision tree
# fitted earlier under that name is not overwritten by this cell.
for model, name in [(lr, 'Logistic'),
                    (gnb, 'Naive Bayes'),
                    (svc, 'Support Vector Classification'),
                    (rfc, 'Random Forest')]:
    model.fit(X_train, y_train)
    if hasattr(model, "predict_proba"):
        # Probability of the positive class (second column).
        prob_pos = model.predict_proba(X_test)[:, 1]
    else:
        # No probabilities available: min-max scale the decision function
        # into [0, 1] so it can be binned like a probability.
        scores = model.decision_function(X_test)
        score_span = scores.max() - scores.min()
        if score_span > 0:
            prob_pos = (scores - scores.min()) / score_span
        else:
            # Constant decision function: scaling would divide by zero and
            # yield NaN, so place every sample at the midpoint instead.
            prob_pos = np.full_like(scores, 0.5, dtype=float)
    fraction_of_positives, mean_predicted_value = \
        calibration_curve(y_test, prob_pos, n_bins=10)

    ax1.plot(mean_predicted_value, fraction_of_positives, "s-",
             label=name)

    ax2.hist(prob_pos, range=(0, 1), bins=10, label=name,
             histtype="step", lw=2)

ax1.set_ylabel("Fraction of positives")
ax1.set_ylim([-0.05, 1.05])
ax1.legend(loc="lower right")
ax1.set_title('Calibration plots  (reliability curve)')

ax2.set_xlabel("Mean predicted value")
ax2.set_ylabel("Count")
ax2.legend(loc="upper center", ncol=2)

plt.tight_layout()
plt.show()

# Permutation Importance Model


In [None]:
# Permutation importance of the fitted random forest `rf`, measured here on
# the training split (X_train / y_train); the full X / y could be passed
# instead. random_state fixes the shuffles so the result is repeatable.
result = permutation_importance(
    estimator=rf,
    X=X_train,
    y=y_train,
    random_state=0,
)

In [None]:
# importances_mean plays the same role as the model's own importances:
# pair each value with its feature name and list them from largest to smallest.
permutation_ranking = list(zip(result.importances_mean, feature_names))
permutation_ranking.sort(reverse=True)
permutation_ranking

## Prediction Output

In [None]:
# Display the feature table as a NumPy array. The array is only shown, not
# stored: `data` itself stays a DataFrame.
data.to_numpy()

In [None]:
# Predict a class for every row of the full feature table; this includes the
# rows the forest was trained on, not only the held-out ones.
predictions = rf.predict(data)
predictions

In [None]:
# TODO: call rf.predict on the array we build for the observations to be predicted.
