In [1]:
import pandas as pd
import seaborn as sns
from pandas_profiling import ProfileReport
import datetime
import numpy as np
from scipy import stats

In [2]:
import pickle
# Function to serialise object
def save_pickle(file_nm, obj):
    pickle_out = open(file_nm, 'wb')
    pickle.dump(obj, pickle_out)
    pickle_out.close()

# Function to load object from pickle file
def load_pickle(file_nm):
    pickle_in = open(file_nm, 'rb')
    obj = pickle.load(pickle_in)
    return obj

In [48]:
src_df = pd.read_csv('incident_event_log.csv')

In [49]:
src_df.head()

Unnamed: 0,number,incident_state,active,reassignment_count,reopen_count,sys_mod_count,made_sla,caller_id,opened_by,opened_at,...,u_priority_confirmation,notify,problem_id,rfc,vendor,caused_by,closed_code,resolved_by,resolved_at,closed_at
0,INC0000045,New,True,0,0,0,True,Caller 2403,Opened by 8,29/2/2016 01:16,...,False,Do Not Notify,?,?,?,?,code 5,Resolved by 149,29/2/2016 11:29,5/3/2016 12:00
1,INC0000045,Resolved,True,0,0,2,True,Caller 2403,Opened by 8,29/2/2016 01:16,...,False,Do Not Notify,?,?,?,?,code 5,Resolved by 149,29/2/2016 11:29,5/3/2016 12:00
2,INC0000045,Resolved,True,0,0,3,True,Caller 2403,Opened by 8,29/2/2016 01:16,...,False,Do Not Notify,?,?,?,?,code 5,Resolved by 149,29/2/2016 11:29,5/3/2016 12:00
3,INC0000045,Closed,False,0,0,4,True,Caller 2403,Opened by 8,29/2/2016 01:16,...,False,Do Not Notify,?,?,?,?,code 5,Resolved by 149,29/2/2016 11:29,5/3/2016 12:00
4,INC0000047,New,True,0,0,0,True,Caller 2403,Opened by 397,29/2/2016 04:40,...,False,Do Not Notify,?,?,?,?,code 5,Resolved by 81,1/3/2016 09:52,6/3/2016 10:00


In [170]:
# Convert all date fields to date time
datetime_vars_ls = ['opened_at', 'sys_created_at', 'sys_updated_at', 'resolved_at', 'closed_at']

for dt_var in datetime_vars_ls:
    src_df[dt_var] = [pd.to_datetime(dt, format='%d/%m/%Y %H:%M', errors='coerce') for dt in src_df[dt_var]]

In [51]:
src_df.head()

Unnamed: 0,number,incident_state,active,reassignment_count,reopen_count,sys_mod_count,made_sla,caller_id,opened_by,opened_at,...,u_priority_confirmation,notify,problem_id,rfc,vendor,caused_by,closed_code,resolved_by,resolved_at,closed_at
0,INC0000045,New,True,0,0,0,True,Caller 2403,Opened by 8,2016-02-29 01:16:00,...,False,Do Not Notify,?,?,?,?,code 5,Resolved by 149,2016-02-29 11:29:00,2016-03-05 12:00:00
1,INC0000045,Resolved,True,0,0,2,True,Caller 2403,Opened by 8,2016-02-29 01:16:00,...,False,Do Not Notify,?,?,?,?,code 5,Resolved by 149,2016-02-29 11:29:00,2016-03-05 12:00:00
2,INC0000045,Resolved,True,0,0,3,True,Caller 2403,Opened by 8,2016-02-29 01:16:00,...,False,Do Not Notify,?,?,?,?,code 5,Resolved by 149,2016-02-29 11:29:00,2016-03-05 12:00:00
3,INC0000045,Closed,False,0,0,4,True,Caller 2403,Opened by 8,2016-02-29 01:16:00,...,False,Do Not Notify,?,?,?,?,code 5,Resolved by 149,2016-02-29 11:29:00,2016-03-05 12:00:00
4,INC0000047,New,True,0,0,0,True,Caller 2403,Opened by 397,2016-02-29 04:40:00,...,False,Do Not Notify,?,?,?,?,code 5,Resolved by 81,2016-03-01 09:52:00,2016-03-06 10:00:00


### Initial data anlaysis - per incident state granularity

In [52]:
src_df.columns

Index(['number', 'incident_state', 'active', 'reassignment_count',
       'reopen_count', 'sys_mod_count', 'made_sla', 'caller_id', 'opened_by',
       'opened_at', 'sys_created_by', 'sys_created_at', 'sys_updated_by',
       'sys_updated_at', 'contact_type', 'location', 'category', 'subcategory',
       'u_symptom', 'cmdb_ci', 'impact', 'urgency', 'priority',
       'assignment_group', 'assigned_to', 'knowledge',
       'u_priority_confirmation', 'notify', 'problem_id', 'rfc', 'vendor',
       'caused_by', 'closed_code', 'resolved_by', 'resolved_at', 'closed_at'],
      dtype='object')

In [63]:
# Obtain the first chronological incident state of all incidents
# Sort the data frame by opend_at, followed by sys_created_at chronologically
src_by_date_df = src_df.sort_values(by=['opened_at', 'sys_created_at'])
# Extract the earliest incident state for each incident number
first_state_dct_ls = [{'number': number, 'first_state': src_by_date_df[src_by_date_df.number == number]['incident_state'].head(1)} for number in src_by_date_df.number.drop_duplicates()]
# Create extract into Data Frame
first_state_df = pd.DataFrame(first_state_dct_ls)
save_pickle('first_state_df.pickle')

In [78]:
first_state_df = load_pickle('first_state_df.pickle')
for i in range(0, first_state_df.shape[0]):
    first_state_df['first_state'].iloc[i] = first_state_df['first_state'].iloc[i].iloc[0]

In [85]:
first_state_summary_df = first_state_df.groupby('first_state').count()
first_state_summary_df

Unnamed: 0_level_0,number
first_state,Unnamed: 1_level_1
Active,6503
Awaiting Problem,6
Awaiting User Info,193
New,16397
Resolved,1819


Noted data quality issue: not all incidents start with "New" incident state.  This included where the first entry is already "Resolved".  We need to remove incidents which is not complete.

In [128]:
# Filter for only the new ticket creation events to ensure we do not include unseen information when training
# Note: for each 'New' incident observations, it includes the outcome of the resolution (i.e. the prediction target) already
src_df = src_df[src_df.incident_state == 'New']
print('Population of incidents and incident_state: {}'.format(src_df[['number', 'incident_state']].drop_duplicates().shape[0]))

Population of incidents and incident_state: 16397


In [129]:
src_incident_observations_cnt_df = src_df.groupby('number').count()['incident_state']
src_incident_observations_cnt_df = src_incident_observations_cnt_df.reset_index(drop=False)
src_incident_observations_cnt_df.groupby('incident_state').count()

Unnamed: 0_level_0,number
incident_state,Unnamed: 1_level_1
1,7613
2,4111
3,2034
4,1073
5,658
6,374
7,205
8,128
9,74
10,51


Noted data quality issue: all incidents are created multiple times (up to 23 times).  Therefore there is a need to extract only one "New" observation.

In [137]:
# Create a new data frame where incidents state as New and extract the most recent entry
# Sort chronologically (descending) to get the last incident creation entry
new_state_src_df = src_df.sort_values(by=['opened_at', 'sys_created_at'], ascending=False)
# Extract the last observation with the "New" incident state
last_state_dct_ls = [{'number': number, 'row': new_state_src_df[new_state_src_df.number == number].head(1)} for number in new_state_src_df.number.drop_duplicates()]

In [138]:
save_pickle('last_state_dct_ls.pickle', last_state_dct_ls)
last_state_dct_ls = load_pickle('last_state_dct_ls.pickle')

# Create a new data frame based on the most recent "New" incident state entries
new_state_src_df = pd.concat([dct.get('row') for dct in last_state_dct_ls], axis=0)
save_pickle('new_state_src_df.pickle', new_state_src_df)

In [139]:
new_state_src_df = load_pickle('new_state_src_df.pickle')
new_state_src_df.head()

Unnamed: 0,number,incident_state,active,reassignment_count,reopen_count,sys_mod_count,made_sla,caller_id,opened_by,opened_at,...,u_priority_confirmation,notify,problem_id,rfc,vendor,caused_by,closed_code,resolved_by,resolved_at,closed_at
141698,INC0120495,New,True,0,0,0,True,Caller 323,Opened by 14,2017-02-15 11:58:00,...,False,Do Not Notify,?,?,?,?,?,?,NaT,2017-02-16 09:51:00
141696,INC0120319,New,True,0,0,0,True,Caller 1899,Opened by 508,2017-02-15 07:09:00,...,False,Do Not Notify,?,?,?,?,code 6,?,NaT,2017-02-15 07:09:00
141691,INC0120303,New,True,0,0,0,True,Caller 1866,Opened by 508,2017-02-15 01:52:00,...,False,Do Not Notify,?,?,?,?,code 6,?,NaT,2017-02-15 01:52:00
141689,INC0120268,New,True,0,0,0,True,Caller 1057,Opened by 508,2017-02-14 18:31:00,...,False,Do Not Notify,?,?,?,?,code 6,?,NaT,2017-02-14 18:31:00
141687,INC0119987,New,True,0,0,0,True,Caller 831,Opened by 508,2017-02-14 11:15:00,...,False,Do Not Notify,?,?,?,?,code 6,?,NaT,2017-02-14 11:15:00


In [155]:
# Get list of variables except for the incident number
vars_ls = [variable for variable in list(new_state_src_df.columns) if variable != 'number']

# For each variable, count the number of possible levels and obtain modal
vars_levels_ls = []
population = new_state_src_df.shape[0]
for variable in vars_ls:
    vars_levels_count = len(new_state_src_df[variable].drop_duplicates())
    vars_modal_value = stats.mode(new_state_src_df[variable])[0]
    vars_modal_freq = stats.mode(new_state_src_df[variable])[1]
    vars_levels_ls += [{'variable': variable,
                        'levels_count': vars_levels_count,
                        'mode_value': vars_modal_value[0],
                        'values_modal_freq': vars_modal_freq[0],
                        'values_modal_pct': vars_modal_freq/population*100
                       }]

In [183]:
vars_levels_df = pd.DataFrame(vars_levels_ls)
vars_levels_df['values_modal_pct'] = [pct[0] for pct in vars_levels_df.values_modal_pct]
vars_levels_df.to_excel('vars_levels_df.xlsx', index=False)
vars_levels_df

Unnamed: 0,variable,levels_count,mode_value,values_modal_freq,values_modal_pct
0,incident_state,1,New,16397,100.0
1,active,1,True,16397,100.0
2,reassignment_count,1,0,16397,100.0
3,reopen_count,1,0,16397,100.0
4,sys_mod_count,4,0,16393,99.975605
5,made_sla,1,True,16397,100.0
6,caller_id,4523,Caller 4514,127,0.774532
7,opened_by,182,Opened by 17,5103,31.121547
8,opened_at,14114,2016-04-05T09:00:00.000000000,5,0.030493
9,sys_created_by,159,?,6450,39.336464


In [184]:
save_pickle('vars_levels_df.pickle', vars_levels_df)
vars_levels_df = load_pickle('vars_levels_df.pickle')

In [185]:
# Variables with the mode consisting of over 95% of the population
print('No. of variables with the mode cover over 95% population: {}'.format(vars_levels_df[vars_levels_df.values_modal_pct > 95].shape[0]))
vars_levels_df[vars_levels_df.values_modal_pct > 95]

No. of variables with the mode cover over 95% population: 17


Unnamed: 0,variable,levels_count,mode_value,values_modal_freq,values_modal_pct
0,incident_state,1,New,16397,100.0
1,active,1,True,16397,100.0
2,reassignment_count,1,0,16397,100.0
3,reopen_count,1,0,16397,100.0
4,sys_mod_count,4,0,16393,99.975605
5,made_sla,1,True,16397,100.0
13,contact_type,5,Phone,16205,98.829054
18,cmdb_ci,33,?,16361,99.780448
19,impact,3,2 - Medium,15796,96.334695
20,urgency,3,2 - Medium,15752,96.066354


Above variables are noted where 17 variables with the mode making up over 95% of the population.  Because of their low variability, they are to be excluded from the data sets for the models.

In [187]:
# Subsetting the data frames to those variables with the mode covering lower than 95%
new_state_src_subset_df = new_state_src_df
retain_vars_ls = ['number'] + list(vars_levels_df[vars_levels_df.values_modal_pct <= 95]['variable'])
new_state_src_subset_df = new_state_src_subset_df[retain_vars_ls]
save_pickle('new_state_src_subset_df.pickle', new_state_src_subset_df)
new_state_src_subset_df.head()

Unnamed: 0,number,caller_id,opened_by,opened_at,sys_created_by,sys_created_at,sys_updated_by,sys_updated_at,location,category,subcategory,u_symptom,assignment_group,assigned_to,knowledge,closed_code,resolved_by,resolved_at,closed_at
141698,INC0120495,Caller 323,Opened by 14,2017-02-15 11:58:00,?,NaT,Updated by 908,2017-02-15 11:58:00,Location 135,Category 59,Subcategory 210,Symptom 483,Group 73,?,False,?,?,NaT,2017-02-16 09:51:00
141696,INC0120319,Caller 1899,Opened by 508,2017-02-15 07:09:00,?,NaT,Updated by 908,2017-02-15 07:09:00,Location 246,Category 50,Subcategory 10,Symptom 533,Group 70,?,False,code 6,?,NaT,2017-02-15 07:09:00
141691,INC0120303,Caller 1866,Opened by 508,2017-02-15 01:52:00,?,NaT,Updated by 908,2017-02-15 01:52:00,Location 246,Category 50,Subcategory 10,Symptom 533,Group 70,?,False,code 6,?,NaT,2017-02-15 01:52:00
141689,INC0120268,Caller 1057,Opened by 508,2017-02-14 18:31:00,?,NaT,Updated by 908,2017-02-14 18:31:00,Location 246,Category 50,Subcategory 10,Symptom 533,Group 70,?,False,code 6,?,NaT,2017-02-14 18:31:00
141687,INC0119987,Caller 831,Opened by 508,2017-02-14 11:15:00,?,NaT,Updated by 908,2017-02-14 11:15:00,Location 246,Category 50,Subcategory 10,Symptom 533,Group 70,?,False,code 6,?,NaT,2017-02-14 11:15:00


In [91]:
# For each incident, compare their observations to find out the level of variability in each variable across the incident states
# Variables to review:
# active	reassignment_count	reopen_count	sys_mod_count	made_sla	caller_id	opened_by	opened_at	sys_created_by	sys_created_at	sys_updated_by	sys_updated_at	contact_type	location	category	subcategory	u_symptom	cmdb_ci	impact	urgency	priority	assignment_group	assigned_to	knowledge	u_priority_confirmation	notify	problem_id	rfc	vendor	caused_by	closed_code	resolved_by	resolved_at	closed_at
print('Generating the list of variables to count the levels...')
vars_ls = [v for v in list(src_df.columns) if v not in ['number']]
# Get distinct list of incident numbers
print('Generating the list of incidents...')
incident_number_ls = list(src_df.number.drop_duplicates())
print('No. of incidents - after stripping incident states: {}'.format(len(incident_number_ls)))

# For each incident number
incident_levels_cnt_ls = []
i = 0
print('Iterating through each incident:')
for number in incident_number_ls:
    if i % 2000 == 0:
        print('i = {}'.format(i))
    df = src_df[src_df.number == number]
    levels_count_dct = {'number': number}
    # For each variable to count the number of levels
    levels_count_dct_ls = [len(list(set(list(df[var_nm])))) for var_nm in vars_ls]
    # Add level counts to dictionary, with the variable name as the key
    levels_count_dct.update(zip(vars_ls, levels_count_dct_ls))
    # Append to the list
    incident_levels_cnt_ls += [levels_count_dct]
    i += 1
save_pickle('incident_levels_cnt_ls.pickle', incident_levels_cnt_ls)

Generating the list of variables to count the levels...
Generating the list of incidents...
No. of incidents - after stripping incident states: 16397
Iterating through each incident:
i = 0
i = 2000
i = 4000
i = 6000
i = 8000
i = 10000
i = 12000
i = 14000
i = 16000


In [92]:
incident_levels_cnt_ls = load_pickle('incident_levels_cnt_ls.pickle')
incident_levels_cnt_df = pd.DataFrame(incident_levels_cnt_ls)
incident_levels_cnt_df.reset_index(drop=True)
incident_levels_cnt_df.head()

Unnamed: 0,number,incident_state,active,reassignment_count,reopen_count,sys_mod_count,made_sla,caller_id,opened_by,opened_at,...,u_priority_confirmation,notify,problem_id,rfc,vendor,caused_by,closed_code,resolved_by,resolved_at,closed_at
0,INC0000045,1,1,1,1,1,1,1,1,1,...,1,1,1,1,1,1,1,1,1,1
1,INC0000047,1,1,1,1,1,1,1,1,1,...,1,1,1,1,1,1,1,1,1,1
2,INC0000057,1,1,1,1,5,1,1,1,1,...,1,1,2,1,1,1,1,1,1,1
3,INC0000060,1,1,1,1,1,1,1,1,1,...,1,1,1,1,1,1,1,1,1,1
4,INC0000062,1,1,2,1,6,1,1,1,1,...,1,1,1,1,1,1,1,1,1,1


In [93]:
# Aggregate the levels count by min, max, mean, modal, and variances of mean to modal
vars_ls = list(incident_levels_cnt_df.columns)[1:]
incident_levels_cnt_aggr_dct_ls = []
for variable in vars_ls:
    values_ls = incident_levels_cnt_df[variable].values
    values_min = np.min(values_ls)
    values_max = np.max(values_ls)
    values_mean = np.mean(values_ls)
    values_modal_value = stats.mode(values_ls)[0][0]
    values_modal_size = stats.mode(values_ls)[1][0]
    values_mean_modal_diff = values_mean - values_modal_value
    incident_levels_cnt_aggr_dct_ls += [{'variable': variable,
                                         'min': values_min,
                                         'mean': values_mean,
                                         'modal_value': values_modal_value,
                                         'modal_size': values_modal_size,
                                         'mean_modal_var': values_mean_modal_diff,
                                         'max': values_max
                                        }]
# Create a summary data frame and save to file
each_incident_levels_cnt_aggr_df = pd.DataFrame(incident_levels_cnt_aggr_dct_ls)
each_incident_levels_cnt_aggr_df.to_excel('each_incident_levels_cnt_aggr_df.xlsx', index=False)
each_incident_levels_cnt_aggr_df

Unnamed: 0,variable,min,mean,modal_value,modal_size,mean_modal_var,max
0,incident_state,1,1.0,1,16397,0.0,1
1,active,1,1.0,1,16397,0.0,1
2,reassignment_count,1,1.665671,1,9825,0.665671,14
3,reopen_count,1,1.0,1,16397,0.0,1
4,sys_mod_count,1,2.220284,1,7614,1.220284,23
5,made_sla,1,1.0,1,16397,0.0,1
6,caller_id,1,1.000122,1,16395,0.000122,2
7,opened_by,1,1.0,1,16397,0.0,1
8,opened_at,1,1.0,1,16397,0.0,1
9,sys_created_by,1,1.0,1,16397,0.0,1


From the above, the following variables have one or nearly one level for each incident based on the value of mean and mode of the number of levels per variable.  Therefore, in these case, the mode is taken as the value upon aggregating the different observations per incident:
- incident_state
- active
- opened_by
- opened_at
- sys_created_by
- sys_created_at
- notify
- closed_code
- resolved_by
- resolved_at
- closed_at
- vendor
- rfc
- porblem_id
- priority
- urgency
- impact
- cmdb_ci
- u_symptom
- subcategory
- category
- location
- contact_type
- caller_id
- made_sla
- reopen_count
- reassignment_count
- active

Whereas the following columns have high variability within each incident for a given incident state based on the difference between mode and mean.  This indicates skewness in the levels count distributions:
- sys_mod_count (max at 58)
- sys_updated_by (max at 18)
- sys_updated_at (max at 42)

As for the incident_state values itself, the aggregation will be by the count of each possible state per incident.

In [58]:
incidents_src_df.columns

Index(['number', 'incident_state', 'active', 'reassignment_count',
       'reopen_count', 'sys_mod_count', 'made_sla', 'caller_id', 'opened_by',
       'opened_at', 'sys_created_by', 'sys_created_at', 'sys_updated_by',
       'sys_updated_at', 'contact_type', 'location', 'category', 'subcategory',
       'u_symptom', 'cmdb_ci', 'impact', 'urgency', 'priority',
       'assignment_group', 'assigned_to', 'knowledge',
       'u_priority_confirmation', 'notify', 'problem_id', 'rfc', 'vendor',
       'caused_by', 'closed_code', 'resolved_by', 'resolved_at', 'closed_at'],
      dtype='object')

In [59]:
# Convert to appropriate data types
incidents_src_df.active = [1 if 'True' else 0 for x in incidents_src_df.active]

# Create factors for each category in a categorical variable
def create_factors(pd_series):
    pd_factor = pd.factorize(pd_series)
    dct = dict(zip(pd_factor[1], range(0, len(pd_factor[1]))))
    return dct

# Re-map categorical values from string to integer ordinal values
def to_factors(dct, pd_series):
    new_ls = [dct.get(x) for x in pd_series]
    return new_ls

var_to_factors_ls = ['incident_state', 'made_sla', 'caller_id', 'sys_created_by', 'opened_by', 'closed_code', 'resolved_by']
var_factors_dct_ls = []

incidents_factors_df = incidents_src_df[['number']]

for variable in var_to_factors_ls:
    print(variable)
    factors_dct = create_factors(incidents_src_df[variable])
    var_factors_dct_ls += [(variable, factors_dct)]
    incidents_factors_df[variable] = to_factors(factors_dct, incidents_src_df[variable])
    
incidents_factors_df.head()

incident_state
made_sla
caller_id
sys_created_by

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy



opened_by
closed_code
resolved_by


Unnamed: 0,number,incident_state,made_sla,caller_id,sys_created_by,opened_by,closed_code,resolved_by
0,INC0000045,0,0,0,0,0,0,0
1,INC0000045,1,0,0,0,0,0,0
2,INC0000045,1,0,0,0,0,0,0
4,INC0000047,0,0,0,1,1,0,1
5,INC0000047,2,0,0,1,1,0,1


In [52]:
# Append the integer columns
pd.concat([incidents_factors_df, incidents_src_df[['reassignment_count', 'reopen_count', 'sys_mod_count']]], axis=1)

Unnamed: 0,number,incident_state,made_sla,caller_id,sys_created_by,opened_by,closed_code,resolved_by,reassignment_count,reopen_count,sys_mod_count
3,INC0000045,0,0,0,0,0,0,0,0,0,4
12,INC0000047,0,0,0,1,1,0,1,1,0,8
19,INC0000057,0,0,1,2,0,1,2,0,0,6
23,INC0000060,0,0,2,3,2,2,3,0,0,3
31,INC0000062,0,1,3,3,2,3,4,1,0,7
...,...,...,...,...,...,...,...,...,...,...,...
141695,INC0120304,0,0,1131,2,0,4,2,0,0,2
141697,INC0120319,0,0,5244,2,206,4,135,0,0,1
141703,INC0120495,0,0,1172,2,169,13,135,1,0,5
141707,INC0120835,0,0,2816,2,71,9,112,1,0,4


In [50]:
incidents_src_df.head()

Unnamed: 0,number,incident_state,active,reassignment_count,reopen_count,sys_mod_count,made_sla,caller_id,opened_by,opened_at,...,u_priority_confirmation,notify,problem_id,rfc,vendor,caused_by,closed_code,resolved_by,resolved_at,closed_at
3,INC0000045,Closed,1,0,0,4,True,Caller 2403,Opened by 8,29/2/2016 01:16,...,False,Do Not Notify,?,?,?,?,code 5,Resolved by 149,29/2/2016 11:29,5/3/2016 12:00
12,INC0000047,Closed,1,1,0,8,True,Caller 2403,Opened by 397,29/2/2016 04:40,...,False,Do Not Notify,?,?,?,?,code 5,Resolved by 81,1/3/2016 09:52,6/3/2016 10:00
19,INC0000057,Closed,1,0,0,6,True,Caller 4416,Opened by 8,29/2/2016 06:10,...,False,Do Not Notify,Problem ID 2,?,?,?,code 10,Resolved by 5,1/3/2016 02:55,6/3/2016 03:00
23,INC0000060,Closed,1,0,0,3,True,Caller 4491,Opened by 180,29/2/2016 06:38,...,False,Do Not Notify,?,?,?,?,code 3,Resolved by 113,2/3/2016 12:06,7/3/2016 13:00
31,INC0000062,Closed,1,1,0,7,False,Caller 3765,Opened by 180,29/2/2016 06:58,...,False,Do Not Notify,?,?,?,?,code 7,Resolved by 62,29/2/2016 15:51,5/3/2016 16:00


In [44]:
# Separate dependent and independent variables
x_var_ls = ['number', 'incident_state', 'active', 'reassignment_count',
       'reopen_count', 'sys_mod_count', 'made_sla', 'caller_id', 'opened_by',
       'opened_at', 'sys_created_by', 'sys_created_at', 'sys_updated_by',
       'sys_updated_at', 'contact_type', 'location', 'category', 'subcategory',
       'u_symptom', 'cmdb_ci', 'impact', 'urgency', 'priority',
       'assignment_group', 'assigned_to', 'knowledge',
       'u_priority_confirmation', 'notify', 'problem_id', 'rfc', 'vendor']

fx_var_ls = ['opened_at', 'caused_by', 'closed_code', 'resolved_by', 'resolved_at', 'closed_at'] 

incidents_x_df = incidents_src_df[x_var_ls]
incidents_fx_df = incidents_src_df[fx_var_ls]

In [18]:
# Calculate resolution duration - use as prediction target
incidents_fx_df['duration'] = incidents_fx_df.closed_at - incidents_fx_df.opened_at

TypeError: unsupported operand type(s) for -: 'str' and 'str'