In [None]:
import pandas as pd
import numpy as np
from scipy.signal import argrelextrema
import xgboost as xgb
from sklearn.model_selection import train_test_split
from sklearn.metrics  import balanced_accuracy_score, roc_auc_score,  make_scorer
from sklearn.model_selection import GridSearchCV #cross validation
from sklearn.metrics import confusion_matrix
from sklearn.metrics import plot_confusion_matrix

In [None]:
# Load the H161 export; `utc_time` is parsed as a timestamp and any timezone
# information is then removed so the column holds naive datetimes.
df = pd.read_csv('../../data/H161.csv', parse_dates=['utc_time'])
df['utc_time'] = df['utc_time'].dt.tz_localize(None)

# Positional rename: the file is expected to have exactly these eight columns.
column_names = ['time', 'phase', 'step', 'statement',
                'heat_time', 'weight', 'temp', 'pressure']
df.columns = column_names

# `step` and `statement` are not used anywhere below.
df = df.drop(columns=['step', 'statement'])

In [None]:
# Keep only rows whose phase mentions DRYING or DISCHRG; rows where the match
# cannot be evaluated (missing phase) are treated as non-matching and dropped.
phase_mask = df['phase'].str.contains('DRYING|DISCHRG', na=False)
df = df[phase_mask]

In [None]:
# Readings equal to the '????????' marker are overwritten with 0 in the three
# sensor columns so that they can be cast to float later on.
# NOTE(review): presumably this marker means "no reading available" -- confirm
# that 0 is an acceptable stand-in for a missing weight/pressure/temperature.
placeholder = '????????'
for column in ('weight', 'pressure', 'temp'):
    df.loc[df[column] == placeholder, column] = 0

In [None]:
# Discard every row that still contains a missing value.
# The former trailing `df.fillna(0)` was removed: its result was never assigned
# and, running after dropna, it had nothing left to fill -- its only effect was
# to render the entire frame as the cell output.
df = df.dropna()

In [None]:
# Cast the numeric readings to float and the phase label to a categorical.
column_dtypes = {
    'temp': float,
    'weight': float,
    'pressure': float,
    'heat_time': float,
    'phase': 'category',
}
for column, dtype in column_dtypes.items():
    df[column] = df[column].astype(dtype)
# df.set_index('time', inplace=True)

In [None]:
n = 30  # how many neighbouring points on each side a peak is compared against

# Row positions of the local temperature maxima. np.greater_equal lets ties
# count, so every sample of a flat plateau is reported as a peak.
peak_positions = argrelextrema(df['temp'].values, np.greater_equal, order=n)[0]

# Index-aligned assignment: peak rows receive their own temperature, every
# other row ends up as NaN in `max_temp`.
df['max_temp'] = df.iloc[peak_positions]['temp']

In [None]:
# Target column: start with every row labelled 0 (not ready); the labelling
# step below switches selected rows to 1.
df = df.assign(ready=0)

In [None]:
# Label the rows that follow a qualifying temperature peak as ready (1).
#
# A row "arms" its DRYING run when it is itself in phase 'DRYING', is a local
# temperature peak (max_temp > 0; NaN for non-peak rows compares False) and has
# heat_time > 10. Every row after the arming row is labelled ready up to and
# including the first row whose phase is no longer 'DRYING'.
#
# This reproduces the labelling of the original nested loop, with these fixes:
#   * one linear, vectorised pass instead of re-walking the same run for every
#     flagged row (quadratic on plateaus, where every sample counts as a peak);
#   * no write past the end of the frame when the data ends in DRYING
#     (the original raised IndexError there);
#   * no reliance on `df.ready.values` being a writable view of the frame.
# NOTE(review): the first non-DRYING row after a run is labelled ready as well,
# exactly as before -- confirm that this is intended.
phase_is_drying = (df['phase'] == 'DRYING').to_numpy()
is_trigger = ((df['max_temp'] > 0) & (df['heat_time'] > 10)).to_numpy() & phase_is_drying
is_trigger[:1] = False  # the original loop started at row 1, so row 0 never arms

row_pos = np.arange(len(df))
# Position of the most recent arming row / most recent non-DRYING row seen so
# far (-1 while none has been seen yet).
last_trigger = np.maximum.accumulate(np.where(is_trigger, row_pos, -1))
last_break = np.maximum.accumulate(np.where(phase_is_drying, -1, row_pos))
# A run is armed at a row when an arming row occurred after the last break.
run_is_armed = last_trigger > last_break

# The label applies from the row *after* the arming row onwards.
newly_ready = np.zeros(len(df), dtype=bool)
newly_ready[1:] = run_is_armed[:-1]

# Keep any label that is already set and preserve the column dtype.
ready_dtype = df['ready'].dtype
df['ready'] = (df['ready'].to_numpy().astype(bool) | newly_ready).astype(ready_dtype)

# `max_temp` was only a helper for the labelling and must not become a feature.
df = df.drop(columns=['max_temp'])

KeyboardInterrupt: 

In [None]:
# Target vector and feature matrix, both as independent copies of `df`.
y = df.loc[:, 'ready'].copy()
X = df.loc[:, df.columns != 'ready'].copy()

In [None]:
# One-hot encoding of `phase` is currently disabled:
# X_encoded = pd.get_dummies(X, columns=['phase']).copy()
# Instead the timestamp and the phase label are removed from the features.
X = X.drop(columns=['time', 'phase'])

In [None]:
# Share of rows labelled ready in the full data set. Series.mean() yields the
# same ratio as sum(y)/len(y) without iterating the Series in Python.
# Open question from the author: is probably half of the heating time unnecessary?
y.mean()

In [None]:
# Stratified hold-out split so both parts keep the same share of ready rows.
# test_size=0.25 spells out the default that applies when no size is given.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.25,
    stratify=y,
    random_state=42,
)

In [None]:
# Share of ready rows in the training part (should match the full data set
# because the split is stratified). Series.mean() replaces the Python-level
# sum(...)/len(...) with the equivalent vectorised computation.
y_train.mean()

In [None]:
# Share of ready rows in the test part (should match the full data set because
# the split is stratified). Series.mean() replaces the Python-level
# sum(...)/len(...) with the equivalent vectorised computation.
y_test.mean()

In [None]:
# Binary classifier for the `ready` label; hyper-parameters are collected in
# one mapping so they are easy to spot and tweak.
xgb_params = {
    'objective': 'binary:logistic',
    'gamma': 0.25,
    'max_depth': 4,
    'seed': 42,
}
clf_xgb = xgb.XGBClassifier(**xgb_params)

In [None]:
# Train with early stopping: boosting stops once 'aucpr' on the evaluation set
# has not improved for 10 rounds.
# NOTE(review): the evaluation set is the same hold-out data that is used for
# the confusion matrix below, so that evaluation is not fully independent.
evaluation_sets = [(X_test, y_test)]
clf_xgb.fit(
    X_train,
    y_train,
    eval_set=evaluation_sets,
    eval_metric='aucpr',
    early_stopping_rounds=10,
    verbose=True,
)

In [None]:
# Confusion matrix of the classifier on the hold-out set.
# display_labels are applied in the order of the sorted class values, i.e.
# class 0 first and class 1 second. `ready` is 0 for "not ready" and 1 for
# "ready", so the labels must be given as ['Not Ready', 'Ready'] (they were
# previously swapped, which mislabelled both axes of the plot).
# NOTE(review): plot_confusion_matrix is deprecated in newer scikit-learn
# releases in favour of ConfusionMatrixDisplay.from_estimator.
plot_confusion_matrix(clf_xgb,
                      X_test,
                      y_test,
                      cmap='Blues',
                      display_labels=['Not Ready', 'Ready'])

In [None]:
# Print the booster's feature scores once per supported importance measure.
bst = clf_xgb.get_booster()
importance_kinds = ('weight', 'gain', 'cover', 'total_gain', 'total_cover')
for kind in importance_kinds:
    scores = bst.get_score(importance_type=kind)
    print('%s: ' % kind, scores)

In [None]:
# Graphviz attributes for the tree plot: split (condition) nodes are drawn as
# rounded, filled boxes; leaves as plain filled boxes.
# The split-node fill colour was '#78cbe' (five hex digits), which is not a
# valid #RRGGBB value; it is corrected to '#78bceb'.
node_params = {'shape': 'box',
               'style': 'filled, rounded',
               'fillcolor': '#78bceb'}
leaf_params = {'shape': 'box',
               'style': 'filled',
               'fillcolor': '#e48038'}

In [None]:
# Draw the tree with index 0 of the fitted model, styled with the node and
# leaf attributes defined above; the graph is the cell's displayed result.
graphviz_options = dict(
    num_trees=0,
    size="10,10",
    condition_node_params=node_params,
    leaf_node_params=leaf_params,
)
xgb.to_graphviz(clf_xgb, **graphviz_options)