In [1]:
# to handle  datasets
import pandas as pd
import numpy as np
# for plotting
import matplotlib.pyplot as plt
import xgboost as xgb
from sklearn.model_selection import train_test_split

# Lift pandas' display limits so wide/long frames are rendered in full.
# (None means "no limit" for both options.)
for display_option in ('display.max_columns', 'display.max_rows'):
    pd.set_option(display_option, None)

In [2]:
# Run configuration.
#
# HEATER and DATASET select the input file: './data/<DATASET>/<HEATER>.csv'.
HEATER = 'H160'
DATASET = 'train'
# DATASET='test'

# Process thresholds. In the visible notebook these are only echoed in the
# printed report; the labelling conditions that would use them are commented
# out. Units are not stated anywhere in the notebook — TODO confirm.
MIN_WEIGHT = 1400
MIN_HEAD_TEMPERATURE = 45
MAX_HEAD_PRESSURE = 400

In [3]:
# load the data
# Reads './data/<DATASET>/<HEATER>.csv', parsing the 'utc_time' column as datetimes.
csv_path = './data/' + DATASET + '/' + HEATER + '.csv'
df = pd.read_csv(csv_path, parse_dates=['utc_time'])

# Columns are renamed positionally, so the file must contain exactly these
# eight columns in this order (assumed — TODO confirm against the CSV header).
df.columns = ['time','batch','phase', 'heat_time', 'weight', 'head_temp', 'head_pressure', 'vac_temp']

# Keep only rows whose phase name contains one of the listed keywords.
# na=False treats a missing phase as "no match", which is what comparing the
# result with True achieved before. The explicit .copy() makes the filtered
# frame independent of the full frame, so the column assignments in later
# cells do not raise SettingWithCopyWarning.
phase_mask = df['phase'].str.contains('DRYING|HEATING|FEED|FEEDING', na=False)
df = df[phase_mask].copy()

  df = pd.read_csv('./data/' + DATASET + '/' +HEATER + '.csv', parse_dates=['utc_time'])


In [4]:
# clean the data
# Runs of question marks appear as placeholder values; map each of them to 0,
# then discard every row that still has a missing value.
placeholder_map = {token: 0 for token in ('??????', '????????', '???????')}
df = df.replace(placeholder_map).dropna()

In [5]:
# type conversion
# Continuous measurements are stored as floats.
for float_col in ('head_temp', 'weight', 'head_pressure', 'vac_temp'):
    df[float_col] = df[float_col].astype(float)

# heat_time and batch end up as integers; they are cast to float first and
# then to int (presumably because the raw values are float-formatted text —
# TODO confirm).
for int_col in ('heat_time', 'batch'):
    df[int_col] = df[int_col].astype(float).astype(int)

# phase is a discrete label.
df['phase'] = df['phase'].astype('category')

In [6]:
# label encoding
# Replace each phase name with an integer code. The fitted encoder is kept in
# `le` so the codes can be mapped back to names.
from sklearn.preprocessing import LabelEncoder
le = LabelEncoder().fit(df['phase'])
df['phase'] = le.transform(df['phase'])

In [7]:
# find local maximums
# A row is a peak when its head_temp is >= every head_temp within n rows on
# either side (np.greater_equal, so a flat plateau can yield several peaks).
from scipy.signal import argrelextrema
n = 30 # number of points to be checked before and after

peak_positions = argrelextrema(df['head_temp'].values, np.greater_equal, order=n)[0]

# max_temp holds head_temp at the peak rows and NaN everywhere else
# (the assignment aligns on the index, leaving non-peak rows unset).
df['max_temp'] = df['head_temp'].iloc[peak_positions]

In [None]:
# Build the binary target column 'ready'.
#
# For every head_temp peak (a row where max_temp > 0) the rows that follow it
# are scanned for as long as the batch value stays the same. A scanned row is
# flagged ready (1) when it lies more than DELTA_TIME rows after the peak and
# its head_temp has fallen more than DELTA_TEMP below the peak value.
# The first and the last row of the frame are never used as a peak, and the
# last row is never flagged.
DELTA_TIME = 2  # time after max to be considered ready
DELTA_TEMP = 1  # temperature difference to be considered ready

# Work on plain NumPy arrays fetched once. Flags are collected in a separate
# array and assigned to the frame in one step: writing through
# `df.ready.values[i]` only works while `.values` is a writable view of the
# frame, which is not the case under pandas Copy-on-Write.
head_temp_arr = df['head_temp'].to_numpy()
peak_temp_arr = df['max_temp'].to_numpy()
batch_arr = df['batch'].to_numpy()
ready_flags = np.zeros(len(df), dtype=int)
last_row = len(df) - 1

for peak_row in range(1, last_row):
    # Non-peak rows hold NaN, and NaN > 0 is False, so they are skipped here.
    if not peak_temp_arr[peak_row] > 0:
        continue
    # Additional gates, currently not applied:
    # and df.weight.values[i] > MIN_WEIGHT:
    # and df.head_temp.values[i] > MIN_HEAD_TEMPERATURE \
    # and df.head_pressure.values[i] < MAX_HEAD_PRESSURE \
    peak_value = peak_temp_arr[peak_row]
    peak_batch = batch_arr[peak_row]

    # A dedicated scan index is used so the outer loop variable is left alone.
    scan_row = peak_row
    while scan_row < last_row and batch_arr[scan_row] == peak_batch:
        cooled_enough = peak_value - head_temp_arr[scan_row] > DELTA_TEMP
        late_enough = scan_row > peak_row + DELTA_TIME
        if cooled_enough and late_enough:
            ready_flags[scan_row] = 1
        scan_row += 1

df['ready'] = ready_flags

In [None]:
# max_temp was only needed to derive 'ready'; remove it before building features.
df = df.drop(columns=['max_temp'])

In [None]:
# Add shifted copies of every signal as extra feature columns.
#
# shift(-k) moves values up by k rows, so '<signal>_<k>' on a row holds the
# signal's value from k rows further down the frame.
# NOTE(review): the shift runs over the whole frame rather than per batch, so
# rows near the end of a batch pick up values from the next batch — confirm
# this is intended.
shift_steps = (20, 30, 35, 40)
shifted_signals = ('head_pressure', 'head_temp', 'weight', 'heat_time', 'vac_temp', 'phase')

for step in shift_steps:
    for signal in shifted_signals:
        df[signal + '_' + str(step)] = df[signal].shift(-step)

# Rows at the end of the frame have no value that far ahead and are removed.
df.dropna(inplace=True)

In [None]:
# Model inputs: the batch id, the six current signals, and the same six
# signals at each shift step (20, 30, 35, 40), in that order.
signal_names = ['phase', 'heat_time', 'weight', 'head_temp', 'head_pressure', 'vac_temp']
features = ['batch'] + signal_names
for shift_step in (20, 30, 35, 40):
    features += [name + '_' + str(shift_step) for name in signal_names]

label = ['ready']

# shuffle=False keeps row order: the first part of the frame is the training
# set and the remainder is the test set (default split proportions).
X_train, X_test, y_train, y_test = train_test_split(
    df[features],
    df[label],
    shuffle=False,
)

# Optional export of the test split; the reference-evaluation cell further
# down reads files with these names from './data/reference/H160/'.
# X_test.to_csv('./data/reference/' + HEATER + '/X_test.csv')
# y_test.to_csv('./data/reference/' + HEATER + '/y_test.csv')

In [None]:
# Gradient-boosted binary classifier for the 'ready' label.
# n_estimators is an upper bound on the number of boosting rounds.
xgb_settings = {
    'objective': 'binary:logistic',
    'n_estimators': 1000,
    'max_depth': 5,
    'learning_rate': 0.1,
    'seed': 42,
}
clf_xgb = xgb.XGBClassifier(**xgb_settings)

In [None]:
# Train with early stopping, tracking AUC on both the training and test sets.
#
# Early stopping and the evaluation metric are set on the estimator instead of
# being passed to fit(): as fit() arguments they are deprecated since
# xgboost 1.6 and rejected by newer releases. Requires xgboost >= 1.6.
clf_xgb.set_params(early_stopping_rounds=20, eval_metric='auc')

# With several eval sets, xgboost uses the last one (here the test split) to
# decide when to stop.
# NOTE(review): the same test split is used for the reported metrics below, so
# those metrics are optimistic — consider a separate validation split.
clf_xgb.fit(X_train,
            y_train,
            eval_set=[(X_train, y_train), (X_test, y_test)],
            verbose=True)

In [None]:
# Decision threshold applied to the predicted probability in the cells below.
thr = 0.7

# Keep only the second predict_proba column: the probability of ready == 1.
test_proba = clf_xgb.predict_proba(X_test)
pred = test_proba[:, 1]

In [None]:
from sklearn.metrics import confusion_matrix
import seaborn as sns

# Confusion matrix of the thresholded test predictions, drawn as an annotated
# heatmap (rows: true class, columns: predicted class).
class_names = ['Not Ready', 'Ready']
test_confusion = confusion_matrix(y_test, pred > thr)
sns.heatmap(
    test_confusion,
    annot=True,
    fmt='',
    cmap='Blues',
    cbar=False,
    xticklabels=class_names,
    yticklabels=class_names,
)

In [None]:
# Predicted 'ready' probability for the last 1000 rows of the test split.
recent_scores = pd.Series(pred[-1000:])
plt.plot(recent_scores)

In [None]:
from sklearn.metrics import classification_report

# Threshold the test probabilities once and reuse the result below.
test_pred_labels = pred > thr

# Echo the run settings ahead of the metrics so a pasted report is
# self-describing.
settings_line = (
    f'DATASET={DATASET}; MAX_HEAD_PRESSURE={MAX_HEAD_PRESSURE}; '
    f'MIN_HEAD_TEMPERATURE={MIN_HEAD_TEMPERATURE}; MIN_WEIGHT={MIN_WEIGHT}; '
    f'DELTA_TIME={DELTA_TIME}; DELTA_TEMP={DELTA_TEMP}'
)
report_parts = [
    HEATER,
    settings_line,
    f'Threshold= {thr}',
    f'features={features}',
    classification_report(y_test, test_pred_labels, target_names=['Not Ready','Ready']),
]
print(*report_parts, sep='\n')

# Raw confusion matrix followed by the class balance of the test labels.
# confusion_matrix is imported in the heatmap cell above.
print(confusion_matrix(y_test, test_pred_labels), y_test.value_counts(), sep='\n')

In [None]:
# Load the stored H160 reference split. The CSVs carry the saved index as an
# unnamed first column ('Unnamed: 0'), which is dropped so only the data
# columns remain.
X_test_60 = pd.read_csv('./data/reference/H160/X_test.csv').drop(columns=['Unnamed: 0'])
y_test_60 = pd.read_csv('./data/reference/H160/y_test.csv').drop(columns=['Unnamed: 0'])

In [None]:
# Score the reference set once, threshold it, and report the same metrics as
# for the test split.
reference_scores = clf_xgb.predict_proba(X_test_60)[:, 1]
reference_pred_labels = reference_scores > thr

print(HEATER, f'threshold={thr}')
print(classification_report(y_test_60, reference_pred_labels, target_names=['Not Ready','Ready']), sep='\n')
print(confusion_matrix(y_test_60, reference_pred_labels), y_test_60.value_counts(), sep='\n')

In [None]:
from sklearn.metrics import roc_curve
import matplotlib.pyplot as plti

In [None]:
# ROC curve of the test-set probabilities (threshold-independent view).
fpr, tpr, thresholds = roc_curve(y_test, pred)

roc_fig, roc_ax = plt.subplots()
roc_ax.plot(fpr, tpr)
roc_ax.set_xlabel('False Positive Rate')
roc_ax.set_ylabel('True Positive Rate')
roc_ax.set_title('ROC Curve')
plt.show()