In [70]:
import matplotlib.pyplot as plt
import numpy as np
import seaborn as sns
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn import metrics


%matplotlib inline

In [71]:
#Load data into dataframe
filepath = 'assets/nifi_prometheus_metrics.csv'
nifi_metrics = pd.read_csv(filepath)

# Remove unused id and clean up column names 
nifi_metrics = nifi_metrics.drop('id',axis=1)
updated_columns = [labels.split(':')[0] for labels in nifi_metrics.columns.values]
updated_columns = np.array(updated_columns)
nifi_metrics.columns=updated_columns
nifi_metrics = nifi_metrics.set_index('time')
nifi_metrics = nifi_metrics.drop(['timestamp'],axis=1)

#Add Datetime as a feature (did not get to do this in part 1)
#convert index to datetime
nifi_metrics.index = pd.to_datetime(nifi_metrics.index)

#convert datetime to features columns
nifi_metrics.loc[:,'hour_of_day'] = nifi_metrics.index.hour 
nifi_metrics.loc[:,'day_of_week'] = nifi_metrics.index.weekday_name

#Confirm Shape
nifi_metrics.shape

(3849, 26)

In [72]:
nifi_metrics.info()

<class 'pandas.core.frame.DataFrame'>
DatetimeIndex: 3849 entries, 2019-07-20 18:15:00 to 2019-07-27 17:00:00
Data columns (total 26 columns):
ActiveThreads                    3849 non-null float64
BytesQueued                      3849 non-null float64
BytesReadLast5Minutes            3849 non-null float64
BytesReceivedLast5Minutes        3849 non-null float64
BytesSentLast5Minutes            3849 non-null float64
BytesWrittenLast5Minutes         3849 non-null float64
FlowFilesQueued                  3849 non-null float64
FlowFilesReceivedLast5Minutes    3849 non-null float64
FlowFilesSentLast5Minutes        3849 non-null float64
TotalTaskDurationNanoSeconds     3849 non-null float64
TotalTaskDurationSeconds         3849 non-null float64
availableCores                   3849 non-null float64
jvmdaemon_thread_count           3849 non-null float64
jvmfile_descriptor_usage         3849 non-null float64
jvmheap_usage                    3849 non-null float64
jvmheap_used                    

Based on my results from EDA I want to attempt the following

1) Try non-linear models with Linear Regression

2) Perform cross-validation with KFold for training and test splits

3) Break out hour of day feature into dummy variables and assess (via correlation) if those are worth including

4) Attempt Random Forest Regressor for modeling

In [73]:
max_depth_range = range(1,21)

#Create features and target columns
feature_cols = ['ActiveThreads', 'BytesReceivedLast5Minutes', 'jvmthread_count', 'loadAverage1min']
target_col = 'jvmheap_used'

# Create feature and target dataframes
X = nifi_metrics.loc[:, feature_cols]
y = nifi_metrics.loc[:, target_col]


In [74]:
from sklearn.metrics.scorer import make_scorer

def neg_rmse(y_true, y_pred):
    mse = np.square(y_true - y_pred).mean()
    return -np.sqrt(mse)

#negative because higher is bettter which is what GridSearch expects
neg_rmse = make_scorer(neg_rmse) 

In [75]:
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import KFold
from sklearn.preprocessing import PolynomialFeatures

pf = PolynomialFeatures(interaction_only=True, include_bias=False)
pf.fit(X)
X_poly = pf.transform(X) 

kf = KFold(5, shuffle=True)
lr_grid = GridSearchCV(estimator=LinearRegression(), param_grid={}, cv=kf, return_train_score=True, scoring=neg_rmse)
lr_grid.fit(X_poly,y)

results = lr_grid.cv_results_
print("Training RMSE Heap Used in Megabytes:", results['mean_train_score'] *.000001)
print("Test RMSE Heap Used in Megabytes:",  results['mean_test_score'] * .000001)

Training RMSE Heap Used in Megabytes: [-59.22573848]
Test RMSE Heap Used in Megabytes: [-59.62779418]


In [77]:
from sklearn.ensemble import RandomForestRegressor

feature_cols = ['ActiveThreads', 'BytesReceivedLast5Minutes', 'jvmthread_count', 'loadAverage1min']
target_col = 'jvmheap_used'

# Create feature and target dataframes
X = nifi_metrics.loc[:, feature_cols]
y = nifi_metrics.loc[:, target_col]

rf = RandomForestRegressor(n_estimators = 100, random_state = 1)
kf = KFold(10, shuffle=True)
rf_grid = GridSearchCV(estimator=rf, param_grid={'max_depth': max_depth_range}, cv=kf, return_train_score=True, scoring=neg_rmse)
rf_grid.fit(X_poly,y)
results = rf_grid.cv_results_
print("Training RMSE Heap Used in Megabytes:", results['mean_train_score'] *.000001)
print("Test RMSE Heap Used in Megabytes:",  results['mean_test_score'] * .000001)



Training RMSE Heap Used in Megabytes: [-72.81212163 -59.3218767  -56.22830041 -53.70257416 -52.33567266
 -50.91937849 -49.5451993  -48.11579355 -46.60496855 -45.02152771
 -43.36682747 -41.64662426 -39.87937131 -38.08190155 -36.30876127
 -34.5820574  -32.90909369 -31.33582418 -29.8937224  -28.58314063]
Test RMSE Heap Used in Megabytes: [-72.97515269 -59.83731214 -57.11737073 -55.28535474 -54.93851037
 -54.83616557 -54.77755662 -54.80696964 -54.90962024 -54.925478
 -55.06431348 -55.21645367 -55.33268485 -55.49255281 -55.62035468
 -55.7903402  -55.96029017 -56.17822334 -56.31876997 -56.52389304]


In [45]:
#Let see if we have further improvements with hour of day, day of week
nifi_metrics = pd.get_dummies(nifi_metrics,columns=['hour_of_day'], drop_first=True)
time_cols = [col for col in nifi_metrics.columns if col.startswith(('hour'))]

nifi_metrics = pd.get_dummies(nifi_metrics,columns=['day_of_week'], drop_first=True)
day_cols = [col for col in day_metrics.columns if col.startswith(('day_of_week'))]


In [61]:
time_feature_cols = time_cols + feature_cols
target_col = 'jvmheap_used'

# Create feature and target dataframes
X = nifi_metrics.loc[:, time_feature_cols]
y = nifi_metrics.loc[:, target_col]

pf = PolynomialFeatures(interaction_only=True, include_bias=False)
pf.fit(X)
X_poly = pf.transform(X) 

kf = KFold(5, shuffle=True)
lr_grid = GridSearchCV(estimator=LinearRegression(), param_grid={}, cv=kf, return_train_score=True, scoring=neg_rmse)
lr_grid.fit(X_poly,y)

results = lr_grid.cv_results_
print("Training RMSE Heap Used in Megabytes:", results['mean_train_score'] *.000001)
print("Test RMSE Heap Used in Megabytes:",  results['mean_test_score'] * .000001)

Training RMSE Heap Used in Megabytes: [54.59220434]
Test RMSE Heap Used in Megabytes: [57.95049631]


In [59]:
time_feature_cols = time_cols + feature_cols
target_col = 'jvmheap_used'

# Create feature and target dataframes
X = nifi_metrics.loc[:, time_feature_cols]
y = nifi_metrics.loc[:, target_col]

rf = RandomForestRegressor(n_estimators = 100, random_state = 1)
kf = KFold(10, shuffle=True)
rf_grid = GridSearchCV(estimator=rf, param_grid={'max_depth': max_depth_range}, cv=kf, return_train_score=True, scoring=neg_rmse)
rf_grid.fit(X_poly,y)
results = rf_grid.cv_results_
print("Training RMSE Heap Used in Megabytes:", results['mean_train_score'] *.000001)
print("Test RMSE Heap Used in Megabytes:",  results['mean_test_score'] * .000001)


Training RMSE Heap Used in Megabytes: [20.53579628]
Test RMSE Heap Used in Megabytes: [54.74764017]
