In [159]:
import pandas as pd
from xgboost import XGBRegressor
from sklearn.model_selection import train_test_split
from sklearn.metrics import root_mean_squared_error

In [160]:
# Read the VM table from a CSV located relative to the working directory,
# then show the first rows as the cell's output.
vm_table_csv_path = "vm_table_data_azure.csv"
vm_table_data = pd.read_csv(vm_table_csv_path)
vm_table_data.head()

Unnamed: 0,serial,vm_id,subscription_id,deployment_id,vm_creation_timestamp,vm_deletion_timestamp,max_cpu,avg_cpu,p95_max_cpu,vm_category,vm_core_count,vm_memory
0,0,rKggHO/04j31UFy65mDTwtjdMQL/G03xWfl3xGeiilB4/W...,ub4ty8ygwOECrIz7eaZ/9hDwnCsERvZ3nJJ03sDSpD85et...,+ZraIDUNaWYDZMBiBtZm7xSjr+j3zcHGjup1+wyKxHFmyJ...,424500,425400,37.879261,3.325358,37.879261,Unknown,4,32
1,1,YrR8gPtBmfNaOdnNEW5If1SdTqQgGQHEnLHGPjySt53bKW...,9LrdYRcUfGbmL2fFfLR/JUg2OTkjGRe3iluwIhDRPnPDPa...,GEyIElfPSFupze8T+T1niQMepeqG88VpLNuxUMyIDbz8VF...,1133100,1133700,0.304368,0.220553,0.304368,Unknown,4,32
2,2,xzQ++JF1UAkh70CDhmzkiOo+DQn+E2TLErCFKEmSswv1pl...,0XnZZ8sMN5HY+Yg+0dykYB5oenlgsrCpzpgFSvn/MX42Ze...,7aCQS6fPUw9rwCPiqvghk/WCEbMV3KgNJjA+sssdfY5Ybl...,0,2591400,98.573424,30.340054,98.212503,Interactive,2,4
3,3,vZEivnhabRmImDr+JqKqZnpIM3WxtypwoxjfjnklR/idyR...,HUGaZ+piPP4eHjycCBki2yq0raJywdzrVuriR6nQceH3hA...,/s/D5VtTQDxyS6wq7N/VQAMczx61Ny1Ut3a3iFmDSOCXxp...,228300,229800,82.581449,13.876299,82.581449,Unknown,2,4
4,4,MqvcZ6Au5ouI6if56MJHmoSqHtX8oRv0dPkaxCId3aUcr1...,p14cXGYqCKCcF7b7OdV6bdr/0gCim+u1LeqKoyEkyNNMWf...,ZFCk80sIQzr43FUSqy2DOrcvBhuQkyfVz7gus8SORhyBxC...,1395600,1397700,0.097875,0.035215,0.097875,Unknown,4,32


In [162]:
# Derived features added as new columns on the loaded frame.
# NOTE(review): the /3600 and %24 arithmetic assumes the timestamps are in
# seconds -- confirm against the dataset description.
# Creation time folded into a 24-hour cycle (kept fractional, not truncated).
vm_table_data["vm_creation_hour_of_day"] = vm_table_data["vm_creation_timestamp"].div(3600).mod(24)
# Lifetime of the VM: deletion timestamp minus creation timestamp.
vm_table_data["total_time_running"] = vm_table_data["vm_deletion_timestamp"].sub(
    vm_table_data["vm_creation_timestamp"]
)
vm_table_data.head()

Unnamed: 0,serial,vm_id,subscription_id,deployment_id,vm_creation_timestamp,vm_deletion_timestamp,max_cpu,avg_cpu,p95_max_cpu,vm_category,vm_core_count,vm_memory,vm_creation_hour_of_day,total_time_running
0,0,rKggHO/04j31UFy65mDTwtjdMQL/G03xWfl3xGeiilB4/W...,ub4ty8ygwOECrIz7eaZ/9hDwnCsERvZ3nJJ03sDSpD85et...,+ZraIDUNaWYDZMBiBtZm7xSjr+j3zcHGjup1+wyKxHFmyJ...,424500,425400,37.879261,3.325358,37.879261,Unknown,4,32,21.916667,900
1,1,YrR8gPtBmfNaOdnNEW5If1SdTqQgGQHEnLHGPjySt53bKW...,9LrdYRcUfGbmL2fFfLR/JUg2OTkjGRe3iluwIhDRPnPDPa...,GEyIElfPSFupze8T+T1niQMepeqG88VpLNuxUMyIDbz8VF...,1133100,1133700,0.304368,0.220553,0.304368,Unknown,4,32,2.75,600
2,2,xzQ++JF1UAkh70CDhmzkiOo+DQn+E2TLErCFKEmSswv1pl...,0XnZZ8sMN5HY+Yg+0dykYB5oenlgsrCpzpgFSvn/MX42Ze...,7aCQS6fPUw9rwCPiqvghk/WCEbMV3KgNJjA+sssdfY5Ybl...,0,2591400,98.573424,30.340054,98.212503,Interactive,2,4,0.0,2591400
3,3,vZEivnhabRmImDr+JqKqZnpIM3WxtypwoxjfjnklR/idyR...,HUGaZ+piPP4eHjycCBki2yq0raJywdzrVuriR6nQceH3hA...,/s/D5VtTQDxyS6wq7N/VQAMczx61Ny1Ut3a3iFmDSOCXxp...,228300,229800,82.581449,13.876299,82.581449,Unknown,2,4,15.416667,1500
4,4,MqvcZ6Au5ouI6if56MJHmoSqHtX8oRv0dPkaxCId3aUcr1...,p14cXGYqCKCcF7b7OdV6bdr/0gCim+u1LeqKoyEkyNNMWf...,ZFCk80sIQzr43FUSqy2DOrcvBhuQkyfVz7gus8SORhyBxC...,1395600,1397700,0.097875,0.035215,0.097875,Unknown,4,32,3.666667,2100


In [163]:
def clean_metrics_with_initial_signs(metric_value):
    """Strip one leading ">" or "<" from a metric value.

    Parameters
    ----------
    metric_value : Any
        A single cell value. Strings such as ">24" or "<2" lose their first
        character; every other value is passed through untouched.

    Returns
    -------
    Any
        The string without its leading comparison sign, or ``metric_value``
        itself when it is not a string, is empty, or has no such prefix.
    """
    # Non-string cells (e.g. values already parsed or cast to numbers) and
    # empty strings cannot be indexed safely, so hand them back unchanged
    # instead of raising TypeError / IndexError.
    if not isinstance(metric_value, str) or not metric_value:
        return metric_value
    # Only a single leading sign is removed; ">>8" becomes ">8".
    if metric_value.startswith((">", "<")):
        return metric_value[1:]
    return metric_value

# Drop a leading ">" / "<" from each core-count entry, then store the column as float.
vm_table_data["vm_core_count"] = (
    vm_table_data["vm_core_count"]
    .apply(clean_metrics_with_initial_signs)
    .astype("float")
)

In [164]:
# Same clean-up as for the core count: remove a leading ">" / "<" and cast to float.
vm_table_data["vm_memory"] = (
    vm_table_data["vm_memory"]
    .apply(clean_metrics_with_initial_signs)
    .astype("float")
)
# Store the VM category labels with pandas' categorical dtype.
vm_table_data["vm_category"] = vm_table_data["vm_category"].astype("category")

In [None]:
# Columns used as model inputs; the regression target is "avg_cpu".
feature_columns = [
    "vm_creation_hour_of_day",
    "total_time_running",
    "vm_core_count",
    "vm_memory",
    "vm_category",
]

# Hold out 5% of the rows for evaluation. A fixed random_state makes the split
# (and therefore every metric computed from it) identical across re-runs.
training_data_X, testing_data_X, training_data_Y, testing_data_Y = train_test_split(
    vm_table_data[feature_columns],
    vm_table_data["avg_cpu"],
    test_size=0.05,
    random_state=42,
)

In [180]:
# Build the (unfitted) XGBoost regressor. enable_categorical is switched on
# because vm_category is cast to the pandas "category" dtype in this notebook.
xgboost_regressor_model = XGBRegressor(
    enable_categorical=True,
    n_estimators=1500,
)

In [None]:
# Train on the training split, then predict avg_cpu for the held-out rows.
xgboost_regressor_model.fit(X=training_data_X, y=training_data_Y)
avg_cpu_prediction_values = xgboost_regressor_model.predict(X=testing_data_X)

In [None]:
# RMSE of the held-out predictions against the true avg_cpu values (shown as cell output).
root_mean_squared_error(
    y_true=testing_data_Y,
    y_pred=avg_cpu_prediction_values,
)

14.832243010838491

In [200]:
# "Accuracy" here means: percentage of held-out predictions whose absolute
# error is at most `avg_cpu_error_tolerance` (same units as avg_cpu).
avg_cpu_error_tolerance = 10

# Absolute error per test row; the result is a Series because testing_data_Y is one.
abs_prediction_errors = abs(avg_cpu_prediction_values - testing_data_Y)
diff_in_prediction_vals_from_truth = abs_prediction_errors.to_list()

# Vectorized count of in-range predictions (NaN errors compare False and are
# not counted, exactly as in an element-by-element `<=` check).
prediction_in_range_counter = int((abs_prediction_errors <= avg_cpu_error_tolerance).sum())
model_avg_cpu_pred_accuracy = prediction_in_range_counter * 100 / len(diff_in_prediction_vals_from_truth)
print("Model's Average CPU Utilization Prediction accuracy is:", str(model_avg_cpu_pred_accuracy) + "%")

Model's Average CPU Utilization Precition accuracy is: 64.75611746724243%
