In [2]:
# import libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns


# load the data
data = pd.read_csv('data/root_cause_analysis.csv', index_col='ID')
data.head()

Unnamed: 0_level_0,CPU_LOAD,MEMORY_LOAD,DELAY,ERROR_1000,ERROR_1001,ERROR_1002,ERROR_1003,ROOT_CAUSE
ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
1,0,0,0,0,1,0,1,MEMORY
2,0,0,0,0,0,0,1,MEMORY
3,0,1,1,0,0,1,1,MEMORY
4,0,1,0,1,1,0,1,MEMORY
5,1,1,0,1,0,1,0,NETWORK_DELAY


In [3]:
# Assign Target and Feature
X = data.iloc[:, :7]
y = data.iloc[:, -1]

In [4]:
# Import machine learning libraries
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.metrics import confusion_matrix, accuracy_score, classification_report


SEED = 42


# Split data test and train
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state= SEED,
                                                    test_size=0.3, stratify=y)

In [8]:
# Initiate the model
dt = DecisionTreeClassifier(random_state=SEED)


# Make parameters grid
grid = {'max_depth':[10, 100, 1000, 10000],
        'min_samples_split':[5, 10, 15, 20],
        'min_samples_leaf':[5, 15, 30, 60]}


# Performing GridSearch
gs = GridSearchCV(estimator=dt,
                  param_grid=grid,
                  scoring='accuracy',
                  n_jobs=-1)


# Fitting model
gs.fit(X_train, y_train)

In [9]:
# selecting the best model
best_model = gs.best_estimator_


# selecting the best parameters
best_params = gs.best_params_
print('The best parameters combination is:', best_params)

The best parameters combination is: {'max_depth': 10, 'min_samples_leaf': 5, 'min_samples_split': 5}


In [13]:
# Making prediction
y_pred = best_model.predict(X_test)


# Measure model accuracy
accuracy = accuracy_score(y_test, y_pred)
print('The accuracy of the model is:', accuracy)

# Provide classification report
report = classification_report(y_test, y_pred)
print('The Classification Report \n', report)

# Check the confusion matrix
cf = confusion_matrix(y_test, y_pred)
print(cf)

The accuracy of the model is: 0.8033333333333333
The Classification Report 
                 precision    recall  f1-score   support

DATABASE_ISSUE       0.73      0.83      0.78       102
        MEMORY       0.80      0.85      0.82        97
 NETWORK_DELAY       0.90      0.73      0.81       101

      accuracy                           0.80       300
     macro avg       0.81      0.80      0.80       300
  weighted avg       0.81      0.80      0.80       300

[[85 13  4]
 [11 82  4]
 [20  7 74]]
