In [None]:
%matplotlib notebook
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from matplotlib import ticker

In [None]:
# Path of the input CSV that the rest of the notebook reads.
fn = "CRISISAFAR_Prediction_data.csv"

In [None]:
# Load the raw table from the CSV path stored in `fn`.
df = pd.read_csv(filepath_or_buffer=fn)

In [None]:
# Every column after the first is treated as a predictor.
# The target column is dropped explicitly so it can never end up in the feature
# matrix if it is not the first column of the file; when 'cluster_label' IS the
# first column this is a no-op (errors='ignore').
# NOTE(review): presumably column 0 is either the label or a row id - confirm.
feature_names = df.columns[1:].drop('cluster_label', errors='ignore')

In [None]:
# Display the selected feature columns as the cell output for a quick visual check.
feature_names

In [None]:
# Feature-only view of the data, used as input for the correlation matrix.
corr_df = df.loc[:, feature_names]

In [None]:
# Target vector: the 'cluster_label' column as a plain array.
labels = df.loc[:, 'cluster_label'].values

In [None]:
# Design matrix: the feature columns as a 2-D array (rows = samples).
X_mat = df.loc[:, feature_names].values

In [None]:
# Pairwise correlation matrix of the feature columns, drawn as a heatmap.
corr = corr_df.corr()

fig = plt.figure(figsize=(10, 10))
ax = fig.add_subplot(111)

# Fix the colour scale to the full correlation range so 0 sits at the
# midpoint of the diverging colormap.
cax = ax.matshow(corr, cmap='coolwarm', vmin=-1, vmax=1)
fig.colorbar(cax)

# Tick positions and labels are taken from `corr` itself so they always match
# the rows/columns actually present in the plotted matrix.
ticks = np.arange(len(corr.columns))
ax.set_xticks(ticks)
ax.set_yticks(ticks)
ax.set_xticklabels(corr.columns, fontsize=11, rotation=90)
ax.set_yticklabels(corr.index, fontsize=11)

fig.tight_layout()

In [None]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import StratifiedKFold
from sklearn.utils import resample
from sklearn import metrics
import scipy
import rfpimp

In [None]:
# Bootstrap evaluation of a random forest.
# Each loop draws a bootstrap sample of the data, takes the first stratified
# train/test split of that sample, fits the forest on the training part and
# records, on the held-out part: accuracy, predictions, and rfpimp feature
# importances.

# Seed NumPy's global RNG once so the run is repeatable: scikit-learn objects
# left at random_state=None draw from that global generator.
# NOTE(review): rfpimp presumably also permutes via np.random - confirm.
SEED = 42
np.random.seed(SEED)

rf = RandomForestClassifier(n_estimators=300, oob_score=True)
number_of_folds = 3
skf = StratifiedKFold(n_splits=number_of_folds, shuffle=True)

num_of_loops = 4000
# Only one fold is evaluated per loop, so the number of runs equals the number
# of loops (not num_of_loops * number_of_folds).
run_total = num_of_loops
progress_every = 100  # print progress every this many runs

# One column of importances per run; rows follow the order of feature_names.
importance_mat = np.zeros((len(feature_names), run_total))

accuracy_list = np.zeros(run_total)
# Allocated for parity with accuracy_list but not filled in this cell.
precision_list = np.zeros(run_total)
recall_list = np.zeros(run_total)

ytestlist = []
ypredlist = []

n = 0
for loop_ind in range(num_of_loops):

    # NOTE(review): resampling with replacement happens BEFORE the split, so
    # copies of the same original row can land in both the training and the
    # test part; held-out accuracy may therefore be optimistic.
    X_mat_resampled, labels_resampled = resample(
        X_mat, labels, n_samples=len(labels), replace=True
    )

    # Use only the first of the stratified splits for this bootstrap sample.
    train_index, test_index = next(skf.split(X_mat_resampled, labels_resampled))

    X_train = X_mat_resampled[train_index, :]
    X_test = X_mat_resampled[test_index, :]
    y_train = labels_resampled[train_index]
    y_test = labels_resampled[test_index]

    rf.fit(X_train, y_train)
    y_pred = rf.predict(X_test)

    # Importances computed by rfpimp on the held-out part, then re-ordered to
    # the feature_names order before being stored as column n.
    imp = rfpimp.importances(
        rf,
        pd.DataFrame(X_test, columns=feature_names),
        pd.DataFrame(y_test),
    )
    imp = imp.reindex(feature_names).values.flatten()
    importance_mat[:, n] = imp

    ytestlist.append(y_test)
    ypredlist.append(y_pred)

    accuracy_list[n] = metrics.accuracy_score(y_test, y_pred)
    n = n + 1

    # Throttled progress output instead of one line per iteration.
    if n % progress_every == 0 or n == run_total:
        print(n)

print("Accuracy:", np.mean(accuracy_list))

In [None]:
# Flatten the per-run lists of held-out labels / predictions into single 1-D
# arrays. np.concatenate accepts the list of arrays directly; wrapping the list
# in np.array first would fail when the runs have different test-fold sizes
# (ragged input).
# Run this cell once per execution of the loop cell: the names are rebound from
# lists of arrays to flat arrays.
ytestlist = np.concatenate(ytestlist)
ypredlist = np.concatenate(ypredlist)

In [None]:
# Confusion-matrix layout: rows are the true labels, columns are the predicted labels.

In [None]:
# Confusion matrix over all pooled held-out predictions, normalised with
# normalize='true'.
cf_matrix = metrics.confusion_matrix(
    y_true=ytestlist,
    y_pred=ypredlist,
    normalize='true',
)

In [None]:
# Heatmap of the normalised confusion matrix.
# confusion_matrix puts true labels on rows and predicted labels on columns;
# a heatmap draws rows along the y axis and columns along the x axis, so the
# y axis is 'true' and the x axis is 'predicted'.
plt.figure()
sns.heatmap(cf_matrix, annot=True, fmt='.2%', cmap='Blues')
plt.xlabel('predicted')
plt.ylabel('true')
plt.title('confusion matrix')

In [None]:
# Sum of the first column of the confusion matrix (displayed as cell output).
cf_matrix[:, 0].sum()

In [None]:
# Text report of per-class metrics for the pooled held-out predictions.
print(
    metrics.classification_report(
        y_true=ytestlist,
        y_pred=ypredlist,
    )
)

In [None]:
# Convert the feature names to a NumPy array so they can be re-ordered with an
# integer index array further down.
feature_names = np.array(feature_names, copy=True)

In [None]:
# Summarise the per-run importances (one column per run in importance_mat) and
# draw them as a horizontal bar chart, most important feature at the top.
feature_importance_vec = np.mean(importance_mat, axis=1)
feat_imp_sd = np.std(importance_mat, axis=1)
# The spread across runs is used directly as the error estimate (it is not
# divided by sqrt(number of runs)).
feat_imp_se = feat_imp_sd

zerolist = np.zeros(len(feat_imp_se))

# Descending order of mean importance. The same index array must be applied to
# every per-feature quantity so that names, bars and error bars stay aligned.
inds = np.flip(np.argsort(feature_importance_vec))
sorted_feature_vals = feature_importance_vec[inds]
sorted_feature_name = feature_names[inds]
sorted_feat_imp_se = feat_imp_se[inds]
print(sorted_feature_name)

# Set the font size before the figure is created so it applies to this plot.
plt.rcParams.update({'font.size': 12})

fig, ax = plt.subplots(figsize=(8, 6))
x_values = sorted_feature_vals * 100
# Reversed positions so the first (largest) bar is drawn at the top.
y_values = np.flip(np.arange(len(feature_names)))

# One-sided error bars: nothing to the left, the error estimate to the right.
ax.barh(y_values, x_values, xerr=[zerolist, sorted_feat_imp_se * 100])

ax.set_title('Feature Importance')
ax.set_yticks(y_values)
ax.set_yticklabels(sorted_feature_name)
# NOTE(review): the plotted values are the rfpimp importances stored in
# importance_mat; confirm that 'Mean OOBE' is the intended axis label.
ax.set_xlabel('Mean OOBE')
ax.xaxis.set_major_formatter(ticker.PercentFormatter())
ax.axvline(0.0, 0, 1, linestyle='--')

# Lay out the figure before showing it.
fig.tight_layout()
plt.show()

print("sorted oob percentages: ")
print(sorted_feature_vals * 100)
print("sorted SE:")
print(sorted_feat_imp_se * 100)


In [None]:
# Export the importance summary, one row per feature, sorted by mean importance.
# feat_imp_se / feat_imp_sd are stored in the original feature order, so they
# are re-ordered with `inds` here to line up with the sorted names and means.
d = {
    "Feature_names": sorted_feature_name,
    "MeanOOBE": sorted_feature_vals,
    "SE": feat_imp_se[inds],
    "SD": feat_imp_sd[inds],
}
# NOTE(review): this rebinds `df`, which earlier cells use for the input data;
# re-running those cells afterwards requires reloading the CSV first.
df = pd.DataFrame(d)

df.to_csv('RF_output_CRISISAFAR_Prediction_data.csv')