In [2]:
# Import data and libraries
import pandas as pd

# Read the root-cause dataset from disk, using its ID column as the row index,
# then preview the first five rows.
data = pd.read_csv(
    'data/root_cause_analysis.csv',
    index_col='ID',
)
data.head(5)

Unnamed: 0_level_0,CPU_LOAD,MEMORY_LOAD,DELAY,ERROR_1000,ERROR_1001,ERROR_1002,ERROR_1003,ROOT_CAUSE
ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
1,0,0,0,0,1,0,1,MEMORY
2,0,0,0,0,0,0,1,MEMORY
3,0,1,1,0,0,1,1,MEMORY
4,0,1,0,1,1,0,1,MEMORY
5,1,1,0,1,0,1,0,NETWORK_DELAY


In [4]:
# Inspect column dtypes and non-null counts to spot missing values
data.info()


# Summary statistics for the numeric columns; as the cell's last expression
# this is the table that gets displayed
data.describe()

<class 'pandas.core.frame.DataFrame'>
Index: 1000 entries, 1 to 1000
Data columns (total 8 columns):
 #   Column       Non-Null Count  Dtype 
---  ------       --------------  ----- 
 0   CPU_LOAD     1000 non-null   int64 
 1   MEMORY_LOAD  1000 non-null   int64 
 2   DELAY        1000 non-null   int64 
 3   ERROR_1000   1000 non-null   int64 
 4   ERROR_1001   1000 non-null   int64 
 5   ERROR_1002   1000 non-null   int64 
 6   ERROR_1003   1000 non-null   int64 
 7   ROOT_CAUSE   1000 non-null   object
dtypes: int64(7), object(1)
memory usage: 70.3+ KB


Unnamed: 0,CPU_LOAD,MEMORY_LOAD,DELAY,ERROR_1000,ERROR_1001,ERROR_1002,ERROR_1003
count,1000.0,1000.0,1000.0,1000.0,1000.0,1000.0,1000.0
mean,0.472,0.418,0.394,0.395,0.485,0.432,0.381
std,0.499465,0.493477,0.488879,0.489095,0.500025,0.495602,0.485876
min,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,0.0,0.0,0.0,0.0,0.0,0.0,0.0
50%,0.0,0.0,0.0,0.0,0.0,0.0,0.0
75%,1.0,1.0,1.0,1.0,1.0,1.0,1.0
max,1.0,1.0,1.0,1.0,1.0,1.0,1.0


In [7]:
# Set the Feature and Target.
# Columns are selected by name rather than by position, so adding or
# reordering columns in the CSV cannot silently leak the label into X or
# pick the wrong column as the target.
X = data.drop(columns='ROOT_CAUSE')  # every column except the label
y = data['ROOT_CAUSE']               # label column holding the root-cause class

In [8]:
# Importing classifier libraries
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import confusion_matrix, classification_report, accuracy_score
from sklearn.model_selection import train_test_split, GridSearchCV


# Shared seed, passed as random_state to the train/test split and to the
# tree-based estimators below so that results are repeatable between runs
SEED = 42

In [10]:
# Hold out 33% of the rows as a test set. The split is stratified on y and
# seeded with SEED so it is the same on every run.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.33,
    stratify=y,
    random_state=SEED,
)

In [23]:
# Candidate hyperparameter values explored for the decision tree.
# NOTE(review): every min_samples_split candidate is smaller than twice the
# smallest min_samples_leaf, so this axis probably never changes the fitted
# tree -- confirm against the scikit-learn docs before keeping it.
grid = dict(
    max_depth=[5, 10, 15],
    min_samples_leaf=[10, 20, 40, 70],
    min_samples_split=[4, 7, 10, 13],
)

# Seeded decision tree used as the base estimator for the search
dt = DecisionTreeClassifier(random_state=SEED)

In [24]:
# Grid search over `grid` for the decision tree, scored by accuracy.
# No cv argument is given, so the library's default cross-validation applies;
# n_jobs=-1 asks scikit-learn to parallelise over the available cores.
gs = GridSearchCV(dt, grid, scoring='accuracy', n_jobs=-1)

# Run the search on the training split only
gs.fit(X_train, y_train)

In [25]:
# Pull the winning hyperparameters and the corresponding estimator from the search
best_parameters = gs.best_params_
best_model = gs.best_estimator_
print('The best hyperparameters is', best_parameters)

# Predict on the held-out test split and compute the evaluation metrics
y_pred = best_model.predict(X_test)
report = classification_report(y_test, y_pred)
conf_matrix = confusion_matrix(y_test, y_pred)
accuracy = accuracy_score(y_test, y_pred)

# Report overall accuracy, then the confusion matrix, then per-class metrics
print('The accuracy of Decision Tree model is', accuracy)
print('The confusion matrix is shown bellow \n', conf_matrix)
print('The complete classification report \n', report)

The best hyperparameters is {'max_depth': 5, 'min_samples_leaf': 10, 'min_samples_split': 4}
The accuracy of Decision Tree model is 0.7909090909090909
The confusion matrix is shown bellow 
 [[92 15  5]
 [19 83  5]
 [22  3 86]]
The complete classification report 
                 precision    recall  f1-score   support

DATABASE_ISSUE       0.69      0.82      0.75       112
        MEMORY       0.82      0.78      0.80       107
 NETWORK_DELAY       0.90      0.77      0.83       111

      accuracy                           0.79       330
     macro avg       0.80      0.79      0.79       330
  weighted avg       0.80      0.79      0.79       330



In [26]:
# Candidate hyperparameter values for the random forest (rebinds `grid`).
# NOTE(review): every min_samples_split candidate is smaller than twice the
# smallest min_samples_leaf, so this axis probably never changes the fitted
# trees -- confirm against the scikit-learn docs before keeping it.
grid = dict(
    n_estimators=[150, 200, 300],
    max_depth=[5, 10, 15],
    min_samples_leaf=[10, 20, 40, 70],
    min_samples_split=[4, 7, 10, 13],
)

# Seeded random forest used as the base estimator for the search
rf = RandomForestClassifier(random_state=SEED)

# Accuracy-scored grid search over every combination (rebinds `gs`);
# n_jobs=-1 asks scikit-learn to parallelise over the available cores
gs = GridSearchCV(rf, grid, scoring='accuracy', n_jobs=-1)

# Run the search on the training split only
gs.fit(X_train, y_train)

In [28]:
# Pull the winning hyperparameters and the corresponding estimator from the search
best_parameters = gs.best_params_
best_model = gs.best_estimator_
print('The best hyperparameters is', best_parameters)

# Predict on the held-out test split and compute the evaluation metrics
y_pred = best_model.predict(X_test)
report = classification_report(y_test, y_pred)
conf_matrix = confusion_matrix(y_test, y_pred)
accuracy = accuracy_score(y_test, y_pred)

# Report overall accuracy, then the confusion matrix, then per-class metrics
print('The accuracy of Random Forest model is', accuracy)
print('The confusion matrix is shown bellow \n', conf_matrix)
print('The complete classification report \n', report)

The best hyperparameters is {'max_depth': 10, 'min_samples_leaf': 10, 'min_samples_split': 4, 'n_estimators': 150}
The accuracy of Random Forest model is 0.806060606060606
The confusion matrix is shown bellow 
 [[92 12  8]
 [14 86  7]
 [15  8 88]]
The complete classification report 
                 precision    recall  f1-score   support

DATABASE_ISSUE       0.76      0.82      0.79       112
        MEMORY       0.81      0.80      0.81       107
 NETWORK_DELAY       0.85      0.79      0.82       111

      accuracy                           0.81       330
     macro avg       0.81      0.81      0.81       330
  weighted avg       0.81      0.81      0.81       330



In [35]:
# Three hand-written observations to classify, one list entry per row.
# The DataFrame is built in a single step so `new_data` is only ever a
# DataFrame (it was previously bound to a dict first), and the bare
# `new_data` expression mid-cell was dropped: only a cell's last expression
# is displayed, so it had no effect.
new_data = pd.DataFrame({
    'CPU_LOAD':    [0, 0, 1],
    'MEMORY_LOAD': [0, 1, 1],
    'DELAY':       [0, 0, 0],
    'ERROR_1000':  [0, 1, 1],
    'ERROR_1001':  [0, 1, 0],
    'ERROR_1002':  [1, 1, 0],
    'ERROR_1003':  [1, 0, 0],
})

# Predict the root cause for each row with `best_model`, i.e. whichever
# estimator the most recently executed evaluation cell selected.
best_model.predict(new_data)

array(['MEMORY', 'DATABASE_ISSUE', 'DATABASE_ISSUE'], dtype=object)

In [40]:
from sklearn.svm import SVC

# Candidate values of C and gamma for the support vector classifier
# (rebinds `grid`)
grid = dict(
    C=[1, 10, 100, 1000],
    gamma=[1, 0.1, 0.01, 0.001],
)

# SVC with library-default settings; only C and gamma are tuned
svc = SVC()

# Accuracy-scored grid search over every combination (rebinds `gs`);
# n_jobs=-1 asks scikit-learn to parallelise over the available cores
gs = GridSearchCV(svc, grid, scoring='accuracy', n_jobs=-1)

# Run the search on the training split only
gs.fit(X_train, y_train)

In [41]:
# Pull the best estimator and its hyperparameters out of the grid search;
# at this point `gs` was last fitted with the SVC estimator
best_model = gs.best_estimator_
best_parameters = gs.best_params_


print('The best hyperparameters is', best_parameters)


# Measuring the accuracy of the model on the held-out test split
y_pred = best_model.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
conf_matrix = confusion_matrix(y_test, y_pred)
report = classification_report(y_test, y_pred)

# Label corrected: this cell evaluates the SVC search, not the random forest
print('The accuracy of SVC model is', accuracy)
print('The confusion matrix is shown bellow \n', conf_matrix)
print('The complete classification report \n', report)

The best hyperparameters is {'C': 1, 'gamma': 0.01}
The accuracy of Random Forest model is 0.7757575757575758
The confusion matrix is shown bellow 
 [[91 15  6]
 [19 85  3]
 [18 13 80]]
The complete classification report 
                 precision    recall  f1-score   support

DATABASE_ISSUE       0.71      0.81      0.76       112
        MEMORY       0.75      0.79      0.77       107
 NETWORK_DELAY       0.90      0.72      0.80       111

      accuracy                           0.78       330
     macro avg       0.79      0.78      0.78       330
  weighted avg       0.79      0.78      0.78       330

