In [70]:
import sklearn
import datetime
import dataclasses
import numpy as np
import pandas as pd
from tqdm import tqdm
from pprint import pprint
from typing import List, Dict
from sklearn.preprocessing import normalize
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import cross_val_score

In [123]:
df = pd.read_csv("ServiceNowEventLog.csv")
df.columns
# sorted(sklearn.metrics.SCORERS.keys())

Index(['number', 'incident_state', 'active', 'reassignment_count',
       'reopen_count', 'sys_mod_count', 'made_sla', 'caller_id', 'opened_by',
       'opened_at', 'sys_created_by', 'sys_created_at', 'sys_updated_by',
       'sys_updated_at', 'contact_type', 'location', 'category', 'subcategory',
       'u_symptom', 'cmdb_ci', 'impact', 'urgency', 'priority',
       'assignment_group', 'assigned_to', 'knowledge',
       'u_priority_confirmation', 'notify', 'problem_id', 'rfc', 'vendor',
       'caused_by', 'closed_code', 'resolved_by', 'resolved_at', 'closed_at'],
      dtype='object')

In [5]:
incident_states = set(df['incident_state'])
incident_states

{'-100',
 'Active',
 'Awaiting Evidence',
 'Awaiting Problem',
 'Awaiting User Info',
 'Awaiting Vendor',
 'Closed',
 'New',
 'Resolved'}

In [86]:
 sorted()

['Active',
 'Awaiting Evidence',
 'Awaiting Problem',
 'Awaiting User Info',
 'Awaiting Vendor',
 'Closed',
 'New',
 'Resolved']

## **Data Prepocessing**

In [None]:
# # numerical features.
# num_feats = [
#     'reassignment_count',
#     'reopen_count',
#     'sys_mod_count',
    
# ]
# # datetime features (ordinal).
# # compute timestamp difference between opened_at and ['sys_created_at', 'sys_updated_at', 'resolved_at']
# # ignore closed at.
# date_feats = [
#     'opened_at',
#     'sys_created_at',
#     'sys_updated_at',
#     'resolved_at' 
# ]
# # categorical features.
# # priority is calculated from impact and urgency
# categ_feats = [
#     'incident state', 
#     'active', 'made_sla',
#     'impact', 'urgency',
#     'priority', 'knowledge',
#     'u_priority_confirmation',
#     'notify', 
# ]

In [125]:
unique_incidents = set(df['number']) # Incident class
unique_openers = set(df['opened_by']) # User class
unique_victims = set(df['caller_id']) # User class
unique_locations = set(df['location']) # Location class
unique_updaters = set(df['sys_updated_by']) # User class 
unique_affected_items = set(df['cmdb_ci']) # Item class (report affected items)
unique_vendors = set(df['vendor']) # vendor in charge of incident (cat)
unique_support_groups = set(df['assignment_group']) # UserGroup class
unique_assignees = set(df['assigned_to']) # User class (user in charge of incident)
unique_problems = set(df['problem_id']) # Problem class (problem associated with id) 
# ignore: closed_at

# unique problems
# location
# vendor (cat)

# RFC class ()
print(f"{len(unique_incidents)} unique incidents")
# print(f"{len(unique_callers)} unique openers")
print(f"{len(unique_victims)} unique victims")
print(f"{len(unique_updaters)} unique updaters")
print(f"{len(unique_locations)} unique locations")
print(f"{len(unique_affected_items)} unique affected items")
print(f"{len(unique_vendors)} unique vendors")
print(f"{len(unique_support_groups)} unique support groups")
print(f"{len(unique_assignees)} unique report assignees")
print(f"{len(unique_problems)} unique problems")

24918 unique incidents
5245 unique victims
846 unique updaters
225 unique locations
51 unique affected items
5 unique vendors
79 unique support groups
235 unique report assignees
253 unique problems


In [122]:
# @dataclasses
# class User:
#     id: str
    
#     def __call__(self, **args) -> None:
#         pass

In [111]:
class ServiceNowDataset:
    def __init__(self, records: List[dict], 
                 id: str="number", target: str="resolved_at"):
        self.id = id
        self.target = target
        self.t_ref_key = 'opened_at'
        self.t_fmt_str = "%d/%m/%Y %H:%M"
        self.time_feats = [
            'sys_created_at',
            'sys_updated_at',
            'resolved_at' 
        ]
        self.num_feats = [
            'reassignment_count',
            'reopen_count',
            'sys_mod_count',

        ]
        self.categ_feats = [
            'incident_state', 
            'impact', 'urgency',
            'priority', 
        ]
        self.bool_feats = [
            'active', 'made_sla', 'knowledge',
            'u_priority_confirmation',  
        ]
        self.categ_map = {
            "incident_state": [
                 'Active', 
                 'Awaiting Evidence',
                 'Awaiting Problem',
                 'Awaiting User Info',
                 'Awaiting Vendor',
                 'Closed',
                 'New',
                 'Resolved'
            ],
            "urgency": [
                '1 - High', 
                '2 - Medium', 
                '3 - Low'
            ],
            "impact": [
                '1 - High', 
                '2 - Medium', 
                '3 - Low'
            ],
            "priority": [
                '1 - Critical', 
                '2 - High', 
                '3 - Moderate', 
                '4 - Low'
            ],
        }
        self.features = self.time_feats + self.num_feats
        self.data = self(records, train=True)
        self.raw_features = records[0].keys()
    
    def __str__(self):
        return str(self.data)
    
    def __repr__(self):
        return repr(self.data)
    
    def __call__(self, data: List[dict], train: bool=False):
        i = 0
        new_data = []
        for rec in tqdm(data, desc="processing data"):
            proc_rec = {}
            t_rec = self.convert_time_fields(rec)
            proc_rec.update(t_rec)
            for feat in self.num_feats:
                proc_rec[feat] = rec[feat]
            categ_rec = self.encode_categ_onehot(rec)
            bool_rec = self.encode_bool(rec)
            proc_rec.update(categ_rec)
            proc_rec.update(bool_rec)
            proc_rec["notify"] = [
                'Do Not Notify', 
                'Send Email',
            ].index(rec["notify"])
            proc_rec["id"] = rec["number"]
            new_data.append(proc_rec)
            i += 1
            # if i == 100: break
        return new_data
    
    def __getitem__(self, i: int):
        return self.data[i]
    
    def encode_categ_onehot(self, rec: dict):
        categ_rec = {}
        for feat in self.categ_feats:
            map_ = self.categ_map[feat]
            vec = [0 for i in range(len(map_))]
            try: vec[map_.index(rec[feat])] = 1
            except ValueError: pass
            categ_rec[feat] = vec
            
        return categ_rec
    
    def encode_bool(self, rec: dict):
        bool_rec = {}
        for feat in self.bool_feats:
            bool_rec[feat] = int(rec[feat])
        
        return bool_rec
        
    def convert_time_fields(self, rec: dict):
        t_ref = datetime.datetime.strptime(
            rec[self.t_ref_key], 
            self.t_fmt_str,
        ).timestamp()
        t_rec = {}
        for feat in self.time_feats:
            val = rec[feat]
            try:
                t = datetime.datetime.strptime(
                    val, self.t_fmt_str,
                ).timestamp()
                t_rec[feat] = t-t_ref
            except ValueError:
                t_rec[feat] = -1 # encode missing values as -1
        
        return t_rec
    
    def tolist(self):
        X, y = [], []
        for rec in tqdm(self.data, desc="converting to list"):
            rec_list = []
            for k, v in rec.items():
                if k == self.target: 
                    y.append(v)
                    continue
                # skip these fields.
                if isinstance(v, list):
                    rec_list += v
                elif isinstance(v, (float, int)):
                    rec_list += [v]
            X.append(rec_list)
        
        return X, y
        
    def aggregate(self, inplace=False):
        import copy
        cid = ""
        data = []
        for rec in tqdm(self.data, desc="aggregating"):
            id = rec["id"]
            if cid != id: 
                cid = id
                N = 1
                state = rec
            else:
                for feat in self.num_feats:
                    state[feat] = (N*state[feat]+rec[feat])/(N+1) 
                for feat in self.categ_feats:
                    state[feat] = (np.array(rec[feat]) + np.array(state[feat])).tolist()
                for feat in self.bool_feats+self.time_feats:
                    state[feat] = rec[feat]
                N += 1
            data.append(copy.deepcopy(state))
        if inplace: self.data = data
        else: return data
        
    def tonumpy(self):
        X, y = self.tolist()
        return np.array(X), np.array(y)
    
    def __len__(self):
        return len(self.data)

In [99]:
df = pd.read_csv("ServiceNowEventLog.csv")
df_records = df.to_dict("records")

In [113]:
data = ServiceNowDataset(df_records)
# print(data.raw_features)
# print(data.features)
# pprint(data.data)
print(len(data))

# sys created at info is missing in some cases.
missing_time = {'sys_created_at': 0, 'resolved_at': 0}
for i, rec in enumerate(data):
    
    for k in data.time_feats:
        if rec[k] == -1:
            missing_time[k] += 1
missing_time

processing data: 100%|██████████| 141712/141712 [00:12<00:00, 11246.49it/s]


141712


{'sys_created_at': 53076, 'resolved_at': 3141}

## Train Models

In [112]:
# print(len(set(i['id'] for i in data)))
# data[0]

In [14]:
X, y = data.tonumpy()

converting to list: 100%|██████████| 141712/141712 [00:01<00:00, 92655.72it/s] 


In [178]:
X[0:2], y[:10]

(array([[4.200e+02, 4.200e+02, 0.000e+00, 0.000e+00, 0.000e+00, 0.000e+00,
         0.000e+00, 0.000e+00, 0.000e+00, 0.000e+00, 0.000e+00, 1.000e+00,
         0.000e+00, 0.000e+00, 1.000e+00, 0.000e+00, 0.000e+00, 1.000e+00,
         0.000e+00, 0.000e+00, 0.000e+00, 1.000e+00, 0.000e+00, 1.000e+00,
         1.000e+00, 1.000e+00, 0.000e+00, 0.000e+00],
        [4.200e+02, 2.742e+04, 0.000e+00, 0.000e+00, 2.000e+00, 0.000e+00,
         0.000e+00, 0.000e+00, 0.000e+00, 0.000e+00, 0.000e+00, 0.000e+00,
         1.000e+00, 0.000e+00, 1.000e+00, 0.000e+00, 0.000e+00, 1.000e+00,
         0.000e+00, 0.000e+00, 0.000e+00, 1.000e+00, 0.000e+00, 1.000e+00,
         1.000e+00, 1.000e+00, 0.000e+00, 0.000e+00]]),
 array([ 36780.,  36780.,  36780.,  36780., 105120., 105120., 105120.,
        105120., 105120., 105120.]))

### Normalize the data

In [15]:
X_norm = normalize(X, norm="l2")
X_norm[0]
y[0]

36780.0

In [29]:
# decision_tree = DecisionTreeRegressor(random_state=2022)

In [119]:
class Evaluator:
    def __init__(self, model_class, seeds=[2022], **args):
        self.models = {} 
        self.seeds = seeds
        for seed in self.seeds: 
            self.models[seed] = model_class(**args, random_state=seed)
    
    def evaluate(self, X, y, folds=5, 
                 scoring="r2", **args):
        all_seed_scores = []
        print("scoring:", scoring)
        pbar = tqdm(self.seeds, desc="eval(seed='NA')")
        for seed in pbar:
            pbar.set_description(f"eval(seed={seed})")
            all_seed_scores.append(cross_val_score(
                self.models[seed], X, y, 
                cv=folds, scoring=scoring, 
                **args,
            ))
        all_seed_scores = np.stack([scores.mean() for scores in all_seed_scores])
                                 
        return all_seed_scores, all_seed_scores.mean(), all_seed_scores.var()**0.5
    
seeds = [1,2,3,4,5,6,7,8,9,2022]
folds = 10

In [72]:
decision_tree = Evaluator(DecisionTreeRegressor, seeds=seeds)
scores, mu, sigma = decision_tree.evaluate(
    X_norm, y, n_jobs=4, 
    folds=folds, verbose=1,
    scoring='explained_variance'
)
# mean absolute error
# 'explained_variance'
# 'max_error'
# 'neg_mean_absolute_error'
# 'neg_mean_squared_error'
# 'neg_root_mean_squared_error'
# 'neg_mean_squared_log_error'
# 'neg_median_absolute_error'
# 'r2'
# 'neg_mean_poisson_deviance'
# 'neg_mean_gamma_deviance'
# 'neg_mean_absolute_percentage_error'
print(f"R²={mu:.3f}±{sigma:.3f} ({folds} folds, {len(seeds)} seeds)")

eval(seed=1):   0%|          | 0/10 [00:00<?, ?it/s]   [Parallel(n_jobs=4)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=4)]: Done  10 out of  10 | elapsed:   18.4s finished
eval(seed=2):  10%|█         | 1/10 [00:18<02:46, 18.54s/it][Parallel(n_jobs=4)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=4)]: Done  10 out of  10 | elapsed:   12.5s finished
eval(seed=3):  20%|██        | 2/10 [00:31<02:00, 15.08s/it][Parallel(n_jobs=4)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=4)]: Done  10 out of  10 | elapsed:   12.4s finished
eval(seed=4):  30%|███       | 3/10 [00:43<01:37, 13.92s/it][Parallel(n_jobs=4)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=4)]: Done  10 out of  10 | elapsed:   12.3s finished
eval(seed=5):  40%|████      | 4/10 [00:56<01:19, 13.33s/it][Parallel(n_jobs=4)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=4)]: Done  10 out of  10 | elapse

R²=0.055±0.007 (10 folds, 10 seeds)





In [73]:
random_forest = Evaluator(RandomForestRegressor, seeds=seeds, max_depth=2)
scores, mu, sigma = random_forest.evaluate(
    X_norm, y, n_jobs=4, 
    folds=folds, verbose=1,
    scoring='r2'
)
print(f"R²={mu:.3f}±{sigma:.3f} ({folds} folds, {len(seeds)} seeds)")

eval(seed=1):   0%|          | 0/10 [00:00<?, ?it/s]   [Parallel(n_jobs=4)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=4)]: Done  10 out of  10 | elapsed:  1.5min finished
eval(seed=2):  10%|█         | 1/10 [01:32<13:48, 92.03s/it][Parallel(n_jobs=4)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=4)]: Done  10 out of  10 | elapsed:  1.5min finished
eval(seed=3):  20%|██        | 2/10 [03:00<12:01, 90.18s/it][Parallel(n_jobs=4)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=4)]: Done  10 out of  10 | elapsed:  1.5min finished
eval(seed=4):  30%|███       | 3/10 [04:28<10:24, 89.20s/it][Parallel(n_jobs=4)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=4)]: Done  10 out of  10 | elapsed:  1.5min finished
eval(seed=5):  40%|████      | 4/10 [05:57<08:53, 88.84s/it][Parallel(n_jobs=4)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=4)]: Done  10 out of  10 | elapse

R²=0.045±0.000 (10 folds, 10 seeds)





In [65]:
sigma

7.241055344541257e-05

In [34]:
folds = 10
tree_scores = []
tree_scores = cross_val_score(
    decision_tree, X_norm, y, 
    cv=folds, verbose=1, n_jobs=4
).tolist()
print(f"mean R² over {folds} folds={scores.mean():.3f}")

[Parallel(n_jobs=4)]: Using backend LokyBackend with 4 concurrent workers.


mean R² over 10 folds=0.051


[Parallel(n_jobs=4)]: Done  10 out of  10 | elapsed:   12.4s finished


In [35]:
random_forest = RandomForestRegressor(
    max_depth=2, random_state=2022
)
scores = cross_val_score(
    random_forest, X_norm, y, 
    cv=folds, verbose=1, n_jobs=4
)
print(f"mean R² over {folds} folds={scores.mean():.3f}")

[Parallel(n_jobs=4)]: Using backend LokyBackend with 4 concurrent workers.


mean R² over 10 folds=0.045


[Parallel(n_jobs=4)]: Done  10 out of  10 | elapsed:  1.4min finished


In [114]:
data.aggregate(inplace=True)
X, y = data.tonumpy()
X_norm = normalize(X, norm="l2")

aggregating: 100%|██████████| 141712/141712 [00:10<00:00, 13785.61it/s]
converting to list: 100%|██████████| 141712/141712 [00:01<00:00, 104026.67it/s]


In [120]:
decision_tree = Evaluator(DecisionTreeRegressor, seeds=seeds)
scores, mu, sigma = decision_tree.evaluate(
    X_norm, y, n_jobs=4, 
    folds=folds, verbose=1,
    scoring='explained_variance'
)
print(f"R²={mu:.3f}±{sigma:.3f} ({folds} folds, {len(seeds)} seeds)")

eval(seed=1):   0%|          | 0/10 [00:00<?, ?it/s]   

scoring: explained_variance


[Parallel(n_jobs=4)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=4)]: Done  10 out of  10 | elapsed:   36.8s finished
eval(seed=2):  10%|█         | 1/10 [00:36<05:32, 36.90s/it][Parallel(n_jobs=4)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=4)]: Done  10 out of  10 | elapsed:   16.8s finished
eval(seed=3):  20%|██        | 2/10 [00:53<03:21, 25.14s/it][Parallel(n_jobs=4)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=4)]: Done  10 out of  10 | elapsed:   16.0s finished
eval(seed=4):  30%|███       | 3/10 [01:09<02:27, 21.03s/it][Parallel(n_jobs=4)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=4)]: Done  10 out of  10 | elapsed:   16.6s finished
eval(seed=5):  40%|████      | 4/10 [01:26<01:55, 19.33s/it][Parallel(n_jobs=4)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=4)]: Done  10 out of  10 | elapsed:   16.8s finished
eval(seed=6):  50%|█████     | 5/10

R²=0.028±0.004 (10 folds, 10 seeds)





In [121]:
random_forest = Evaluator(RandomForestRegressor, seeds=seeds, max_depth=2)
scores, mu, sigma = random_forest.evaluate(
    X_norm, y, n_jobs=4, 
    folds=folds, verbose=1,
    scoring='explained_variance'
)
print(f"R²={mu:.3f}±{sigma:.3f} ({folds} folds, {len(seeds)} seeds)")

eval(seed=1):   0%|          | 0/10 [00:00<?, ?it/s]   

scoring: explained_variance


[Parallel(n_jobs=4)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=4)]: Done  10 out of  10 | elapsed:  1.8min finished
eval(seed=2):  10%|█         | 1/10 [01:47<16:11, 107.94s/it][Parallel(n_jobs=4)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=4)]: Done  10 out of  10 | elapsed:  1.8min finished
eval(seed=3):  20%|██        | 2/10 [03:35<14:21, 107.64s/it][Parallel(n_jobs=4)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=4)]: Done  10 out of  10 | elapsed:  1.8min finished
eval(seed=4):  30%|███       | 3/10 [05:22<12:32, 107.49s/it][Parallel(n_jobs=4)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=4)]: Done  10 out of  10 | elapsed:  1.8min finished
eval(seed=5):  40%|████      | 4/10 [07:11<10:47, 107.99s/it][Parallel(n_jobs=4)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=4)]: Done  10 out of  10 | elapsed:  1.8min finished
eval(seed=6):  50%|█████     | 

R²=0.049±0.000 (10 folds, 10 seeds)



