In [1]:
import numpy as np
import seaborn as sns
from matplotlib import pyplot as plt
from numpy import ravel
from pandas import read_csv, DataFrame, unique

from sklearn.dummy import DummyClassifier
from sklearn.ensemble import RandomForestClassifier # binary predictor if continuous use RandomForestRegressor
from sklearn.feature_selection import VarianceThreshold
from sklearn.impute import SimpleImputer
from sklearn.inspection import permutation_importance
from sklearn.metrics import confusion_matrix, mean_absolute_error, roc_auc_score, roc_curve
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import Normalizer, StandardScaler, MinMaxScaler, PowerTransformer, MaxAbsScaler, LabelEncoder, OneHotEncoder

In [None]:
# Useful code snippets for Random Forest modelling (and possibly other models).

# Keep the predictor column names so they can be used as labels later.
ewr_X_names = list(ewr_predictors)

# Hold out 20% of the rows for testing. Stratify on the response so that a
# skewed class balance (e.g. 24% vs 76%) is kept in both splits.
ewr_X_train, ewr_X_test, ewr_y_train, ewr_y_test = train_test_split(
    ewr_predictors,
    ewr_response,
    test_size=0.2,
    random_state=42,
    stratify=ewr_response,
)

# Baseline model: a dummy classifier (use a dummy regressor for a continuous response).
clf_dummy = DummyClassifier(random_state=42)
clf_dummy.fit(ewr_X_train, ewr_y_train)

# Baseline metrics.
# TODO: wrap in a function and send the output to a log instead of printing.
baseline_accuracy = clf_dummy.score(ewr_X_test, ewr_y_test)
print("Accuracy", baseline_accuracy * 100)

ewr_y_pred_test = clf_dummy.predict(ewr_X_test)
ewr_y_pred_train = clf_dummy.predict(ewr_X_train)

# Labels are cast to int before computing the mean absolute error.
for mae_label, y_true, y_hat in (
    ("MAE train", ewr_y_train, ewr_y_pred_train),
    ("MAE test", ewr_y_test, ewr_y_pred_test),
):
    print(mae_label, mean_absolute_error(y_true.astype('int'), y_hat.astype('int')))

# Confusion matrix for the baseline.
# TODO: wrap in a function and save the figure to a folder.
# The matrix is transposed so that predicted labels run down the y-axis
# and true labels run along the x-axis.
mat = confusion_matrix(ewr_y_test, ewr_y_pred_test)
ax = sns.heatmap(mat.T, square=True, annot=True, fmt='d', cbar=False)
ax.set_xlabel('true label')
ax.set_ylabel('predicted label');

# Which features drive the model? Two complementary views are plotted below.


def plot_top_importances(importances, feature_names, title, max_features=15):
    """Draw a horizontal bar chart of the largest feature importances.

    Parameters
    ----------
    importances : numpy.ndarray
        One importance value per feature.
    feature_names : sequence of str
        Labels aligned position-by-position with ``importances``.
    title : str
        Title shown above the chart.
    max_features : int, default 15
        Upper bound on the number of bars. It is capped at the number of
        available features, so the bar positions and the bar values always
        have the same length.

    Returns
    -------
    numpy.ndarray
        Indices that sort ``importances`` in ascending order (the full
        argsort, not only the plotted slice).
    """
    order = np.argsort(importances)
    n_shown = max(0, min(max_features, len(order)))
    # Slice from an explicit start index: order[-0:] would select everything.
    top = order[len(order) - n_shown:]
    positions = range(n_shown)

    plt.figure(figsize=(5, 4))
    plt.title(title)
    plt.barh(positions, importances[top], color='#49006a', align='center')
    plt.yticks(positions, [feature_names[i] for i in top])
    plt.xlabel('Relative Importance')
    plt.show()
    return order


features = ewr_X_names
num_features = 15  # maximum number of bars per plot

# View 1: importances exposed by the fitted model's feature_importances_.
importances = ewr_model_final.feature_importances_
indices = plot_top_importances(importances, features, 'Feature Importances', num_features)

# View 2: permutation importance on the held-out test set. It measures how much
# the score drops when a single feature is shuffled, which makes it a useful
# cross-check on the model's built-in importances.
# NOTE: strongly correlated predictors can still share or mask each other's
# importance under permutation.
result = permutation_importance(ewr_model_final, ewr_X_test, ewr_y_test,
                                n_repeats=10, random_state=42, n_jobs=2)

# List, from most to least important, the features whose mean importance is
# more than two standard deviations above zero.
for i in result.importances_mean.argsort()[::-1]:
    if result.importances_mean[i] - 2 * result.importances_std[i] > 0:
        # The explicit space keeps names of 8+ characters from running into the number.
        print(f"{ewr_X_names[i]:<8} "
              f"{result.importances_mean[i]:.3f}"
              f" +/- {result.importances_std[i]:.3f}")

importances = result.importances_mean
indices = plot_top_importances(importances, features, 'Permutation Feature Importances', num_features)

In [None]:
# TODO: replace the print statements below with logging calls; otherwise this function is ready to use.
def model_metrics(input_pipe):
    """Print accuracy, MAE and ROC AUC for a fitted classifier on both splits.

    The data is read from the notebook-level variables ``X_train``,
    ``y_train``, ``X_test`` and ``y_test``, which must exist before the call.

    Parameters
    ----------
    input_pipe : fitted estimator or pipeline
        Must provide ``score`` and ``predict``. When it also provides
        ``predict_proba``, the AUC is computed from the probability in
        column 1 (assumes a binary target with the positive class in that
        column); otherwise the hard predictions are used.

    Returns
    -------
    None
        All metrics are printed.
    """
    print("Training Accuracy", (input_pipe.score(X_train, y_train)) * 100)
    print("Test Accuracy", (input_pipe.score(X_test, y_test)) * 100)

    y_pred_test = input_pipe.predict(X_test)
    y_pred_train = input_pipe.predict(X_train)
    # Labels are cast to int before computing the mean absolute error.
    print("MAE train", mean_absolute_error(y_train.astype('int'),
                                           y_pred_train.astype('int')))
    print("MAE test", mean_absolute_error(y_test.astype('int'),
                                          y_pred_test.astype('int')))

    # roc_auc_score is meant to receive scores. Hard 0/1 predictions reduce
    # the ROC curve to a single operating point and misstate the AUC.
    if hasattr(input_pipe, "predict_proba"):
        y_score_train = input_pipe.predict_proba(X_train)[:, 1]
        y_score_test = input_pipe.predict_proba(X_test)[:, 1]
    else:
        y_score_train, y_score_test = y_pred_train, y_pred_test
    print("AUC train", roc_auc_score(y_train, y_score_train))
    print("AUC test", roc_auc_score(y_test, y_score_test))

In [None]:
# Combined ROC plot: one curve per fitted pipeline, all evaluated on X_test / y_test.

# Start from an empty figure 0.
plt.figure(0).clf()

# Pipelines to compare. Each one must be fitted and expose predict_proba.
pipes = [
    {
        'label': 'Dummy Classifier',
        'pipe': dummy_pipe,
    },
    {
        'label': 'Random Forest',
        'pipe': rf_pipe,
    },
    {
        'label': 'XGBoost',
        'pipe': xgb_pipe,
    },
    {
        'label': 'XGBoost + parameters',
        'pipe': xgb_param_pipe,
    }
]

for p in pipes:
    pipe = p['pipe']
    # Score for the class in column 1 (assumes a binary target with the
    # positive class in that column).
    y_score = pipe.predict_proba(X_test)[:, 1]
    fpr, tpr, _ = roc_curve(y_test, y_score)
    # Compute the AUC from the same scores as the curve, so the legend reports
    # the area under the line that is actually drawn. Hard predictions would
    # give a different number.
    auc = roc_auc_score(y_test, y_score)
    plt.plot(fpr, tpr, label='%s ROC (area = %0.2f)' % (p['label'], auc))

# Diagonal reference line for a classifier with no discriminative power.
plt.plot([0, 1], [0, 1], 'k--')
plt.xlim([0.0, 1.0])
plt.ylim([0.0, 1.05])
plt.xlabel('1-Specificity(False Positive Rate)')
plt.ylabel('Sensitivity(True Positive Rate)')
plt.title('Receiver Operating Characteristic')
plt.legend(loc="lower right")
plt.show()

In [None]:
# Load the shipping dataset into a DataFrame.
# NOTE(review): the path passed to read_csv is an empty-string placeholder;
# supply the real CSV location before running this cell.
shipping_data = read_csv("")