In [1]:
!pip install missingno --quiet

In [2]:
from sklearn import set_config
set_config(transform_output = "pandas")
import numpy as np
import pandas as pd
import missingno as msno
from datetime import datetime
import matplotlib.pyplot as plt
from sklearn.pipeline import Pipeline, make_pipeline
from sklearn.compose import ColumnTransformer
from sklearn import set_config
from sklearn.base import BaseEstimator, TransformerMixin
set_config(display='diagram')
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, MinMaxScaler, LabelEncoder, OneHotEncoder, OrdinalEncoder
from sklearn.metrics import accuracy_score
from tensorflow.keras.models import Sequential
from tensorflow.keras import models
from sklearn.impute import SimpleImputer
from tensorflow.keras.layers import Embedding, Flatten, Dense
from sklearn.ensemble import RandomForestRegressor
from sklearn.linear_model import LinearRegression
from sklearn.neighbors import KNeighborsRegressor
from sklearn.svm import SVR
import xgboost as xgb
from xgboost import XGBRegressor
from sklearn.metrics import mean_squared_error
from sklearn.metrics import mean_absolute_error
pd.set_option('display.max_columns', None)
pd.set_option('display.float_format', '{:.5f}'.format)
import pickle

In [4]:
%%time
# Load data
# Read the raw event log from the working directory and turn the '?' placeholder
# into NaN so that pandas treats those cells as missing.
data = pd.read_csv('incident_event_log-Copy1.csv')
#x = int(0.05*data.shape[0])
#data = data.head(x)
data.replace('?', np.nan, inplace=True)

# Handle missing values
# Number of missing cells per column, restricted to columns that have at least one.
# The dictionary is informational only: nothing else in this cell reads it.
missing_values_dict = {col: data[col].isnull().sum() for col in data.columns if data[col].isnull().sum() > 0}
# Remove the columns that are not used as model features.
data.drop(['number', 'caller_id', 'resolved_at', 'opened_by', 'assigned_to', 'assignment_group', 
           'u_symptom', 'sys_created_at', 'sys_created_by', 'cmdb_ci', 'problem_id', 'rfc', 
           'vendor', 'caused_by'], axis=1, inplace=True)

# Re-express both timestamps as the number of whole days elapsed since `opened_at`
# (`.dt.days` keeps only the day component of the timedelta). After this step
# `closed_at` is the regression target and `sys_updated_at` is a numeric feature.
data['closed_at'] = (pd.to_datetime(data['closed_at'], format='%d/%m/%Y %H:%M') - pd.to_datetime(data['opened_at'], format='%d/%m/%Y %H:%M')).dt.days
data['sys_updated_at'] = (pd.to_datetime(data['sys_updated_at'], format='%d/%m/%Y %H:%M') - pd.to_datetime(data['opened_at'], format='%d/%m/%Y %H:%M')).dt.days
# `opened_at` has served its purpose as the reference timestamp.
data.drop(['opened_at'],axis=1,inplace=True)

# Define variable groups
# Each list names the columns routed to one branch of the ColumnTransformer defined
# further down in this cell.
ordinal_variable = ['incident_state', 'impact', 'urgency', 'priority', 'u_priority_confirmation', 'notify']
nominal_variable = ['active', 'made_sla', 'contact_type', 'knowledge']
# String columns whose trailing number is extracted and used as a numeric feature.
to_continuous_variable = ['sys_updated_by', 'location', 'category', 'subcategory', 'closed_code', 'resolved_by']
continuous_variable = ['reassignment_count', 'reopen_count', 'sys_mod_count', 'sys_updated_at']

# Split data
# 70/30 row-wise split with a fixed seed; the target is `closed_at` (days to close).
# NOTE(review): if the log contains several rows per incident (the dropped `number`
# column), a row-wise split can place the same incident in both sets — confirm.
X_train, X_test, y_train, y_test = train_test_split(data.drop('closed_at', axis=1), data['closed_at'],
                                                    test_size=0.3, random_state=42)

#print (type(X_train))


class Cat2Vec(BaseEstimator, TransformerMixin):
    """Replace nominal categorical columns with target-supervised embeddings.

    For every column in ``nominal_variable`` a small Keras network (an embedding
    layer followed by dense layers) is trained in ``fit`` to predict the
    standardised target from the ordinal-encoded category.  The weights of the
    embedding layer become the vector of each category and ``transform`` swaps
    every category for its vector.

    Parameters
    ----------
    nominal_variable : list of str
        Names of the DataFrame columns to embed.
    epochs : int, default=15
        Number of training epochs of each per-column network.

    Attributes
    ----------
    encoders_ : dict of {str: OrdinalEncoder}
        Encoder fitted on the training data of each column.  ``transform``
        reuses it so that train and test rows share one category -> code mapping.
    <variable>_embeddings_dict : dict of {int: list of float}
        Learned vector of every category code, one attribute per column.

    Notes
    -----
    Categories that were not seen during ``fit`` are mapped to the mean of the
    learned vectors of that column.  ``X`` is assumed to contain no missing
    values in the embedded columns (put an imputer in front of this step).
    """

    _EMBEDDINGS_SUFFIX = '_embeddings_dict'

    def __init__(self, nominal_variable, epochs=15):
        self.nominal_variable = nominal_variable
        self.epochs = epochs
        # Placeholders that ``fit`` fills in, one per embedded column.
        for var in self.nominal_variable:
            setattr(self, f"{var}{self._EMBEDDINGS_SUFFIX}", None)

    def _learn_embedding(self, codes, y_scaled, num_categories):
        """Train the per-column network and return its embedding weight matrix.

        The returned array has one row per category code and
        ``int(sqrt(num_categories)) + 1`` columns.
        """
        model = models.Sequential()
        model.add(Embedding(input_dim = num_categories, output_dim = int(num_categories**0.5) + 1, input_length = 1, name="embedding"))
        model.add(Flatten())
        for units in (512, 256, 128, 64, 32):
            model.add(Dense(units, activation="relu"))
        model.add(Dense(1))
        model.compile(loss = "huber", optimizer = "adam", metrics=["mse"])
        model.fit(x = codes, y = y_scaled, epochs = self.epochs)
        return model.get_layer('embedding').get_weights()[0]

    @staticmethod
    def _lookup_table(mapping):
        """Stack a ``{code: vector}`` mapping into an array plus one fallback row.

        The extra last row is the mean vector; it is used for unseen categories.
        """
        table = np.asarray([mapping[code] for code in range(len(mapping))], dtype=float)
        return np.vstack([table, table.mean(axis=0)])

    def fit(self, X, y=None):
        """Learn one embedding table per column of ``nominal_variable``.

        Parameters
        ----------
        X : pandas.DataFrame
            Training data containing the columns listed in ``nominal_variable``.
        y : array-like of shape (n_samples,)
            Regression target the embeddings are trained against (required).

        Returns
        -------
        self

        Raises
        ------
        ValueError
            If ``y`` is not provided.
        """
        if y is None:
            raise ValueError("Cat2Vec.fit() requires the target y to learn the embeddings")
        print ('Inside Cat2Vec.fit()')
        # The scaled target is identical for every column, so compute it once.
        y_scaled = np.asarray(
            StandardScaler().fit_transform(np.asarray(y, dtype=float).reshape(-1, 1))
        )
        self.encoders_ = {}
        for variable in self.nominal_variable:
            print ('processing ', variable)
            # Unseen categories are encoded as -1 at transform time instead of raising.
            encoder = OrdinalEncoder(handle_unknown='use_encoded_value', unknown_value=-1)
            codes = np.asarray(encoder.fit_transform(X[[variable]])).ravel().astype('int32')
            weights = self._learn_embedding(codes, y_scaled, len(encoder.categories_[0]))
            self.encoders_[variable] = encoder
            setattr(
                self,
                f"{variable}{self._EMBEDDINGS_SUFFIX}",
                {i: list(weights[i]) for i in range(weights.shape[0])},
            )
        return self

    def transform(self, X):
        """Replace every configured column by its learned embedding columns.

        Parameters
        ----------
        X : pandas.DataFrame
            Data containing the columns listed in ``nominal_variable``.

        Returns
        -------
        pandas.DataFrame
            Indexed like ``X`` with columns ``<variable>_embedding_<i>``; the
            original categorical columns are not included.
        """
        from sklearn.utils.validation import check_is_fitted

        check_is_fitted(self, "encoders_")
        print ('Inside Cat2Vec.transform()')
        embedded_frames = []
        for variable in self.nominal_variable:
            print ('Transforming inside cat2vec for ', variable)
            table = self._lookup_table(getattr(self, f"{variable}{self._EMBEDDINGS_SUFFIX}"))
            # Reuse the encoder fitted on the training data: the codes must index
            # the same rows of the embedding table that were learned in fit().
            codes = np.asarray(self.encoders_[variable].transform(X[[variable]])).ravel().astype(int)
            codes[codes < 0] = table.shape[0] - 1  # unseen category -> mean vector
            columns = [variable + '_embedding_' + str(i) for i in range(table.shape[1])]
            embedded_frames.append(pd.DataFrame(table[codes], index=X.index, columns=columns))
        if not embedded_frames:
            return pd.DataFrame(index=X.index)
        return pd.concat(embedded_frames, axis=1)


class Cat2Vec_wpos(BaseEstimator, TransformerMixin):
    """Embed ordinal categorical columns and append a sinusoidal position encoding.

    For every column in ``ordinal_variable`` a small Keras network (an embedding
    layer followed by dense layers) is trained in ``fit`` to predict the
    standardised target from the ordinal-encoded category.  In addition, each
    category code receives a fixed sinusoidal encoding of its position.
    ``transform`` outputs both vectors for every row.

    Parameters
    ----------
    ordinal_variable : list of str
        Names of the DataFrame columns to embed.
    epochs : int, default=15
        Number of training epochs of each per-column network.

    Attributes
    ----------
    encoders_ : dict of {str: OrdinalEncoder}
        Encoder fitted on the training data of each column.  ``transform``
        reuses it so that train and test rows share one category -> code mapping.
    <variable>_embeddings_dict : dict of {int: list of float}
        Learned vector of every category code, one attribute per column.
    <variable>_pos_embeddings_dict : dict of {int: list of float}
        Sinusoidal position vector of every category code.

    Notes
    -----
    Category codes follow the sorted order chosen by ``OrdinalEncoder``.
    Categories that were not seen during ``fit`` are mapped to the mean row of
    each table.  ``X`` is assumed to contain no missing values in the embedded
    columns (put an imputer in front of this step).
    """

    _EMBEDDINGS_SUFFIX = '_embeddings_dict'
    _POSITION_SUFFIX = '_pos_embeddings_dict'
    # Base of the geometric progression used for the position encoding in fit().
    _POSITION_BASE = 100

    def __init__(self, ordinal_variable, epochs=15):
        self.ordinal_variable = ordinal_variable
        self.epochs = epochs
        # Placeholders that ``fit`` fills in, two per embedded column.
        for postfix in (self._EMBEDDINGS_SUFFIX, self._POSITION_SUFFIX):
            for var in self.ordinal_variable:
                setattr(self, f"{var}{postfix}", None)

    def getPositionEncoding(self,seq_len, d, n=10000):
        """Return the ``(seq_len, d)`` sinusoidal position-encoding matrix.

        Row ``k`` holds ``sin(k / n**(2i/d))`` in column ``2i`` and
        ``cos(k / n**(2i/d))`` in column ``2i + 1``.  When ``d`` is odd the last
        column is the sine term of the final frequency, so no column is left
        constant at zero.
        """
        P = np.zeros((seq_len, d))
        positions = np.arange(seq_len)[:, np.newaxis]
        pair_index = np.arange((d + 1) // 2)
        angles = positions / np.power(n, 2 * pair_index / d)
        P[:, 0::2] = np.sin(angles)
        P[:, 1::2] = np.cos(angles[:, : d // 2])
        return P

    def _learn_embedding(self, codes, y_scaled, num_categories):
        """Train the per-column network and return its embedding weight matrix.

        The returned array has one row per category code and
        ``int(sqrt(num_categories)) + 1`` columns.
        """
        model = models.Sequential()
        model.add(Embedding(input_dim = num_categories, output_dim = int(num_categories**0.5) + 1, input_length = 1, name="embedding"))
        model.add(Flatten())
        for units in (512, 256, 128, 64, 32):
            model.add(Dense(units, activation="relu"))
        model.add(Dense(1))
        model.compile(loss = "huber", optimizer = "adam", metrics=["mse"])
        model.fit(x = codes, y = y_scaled, epochs = self.epochs)
        return model.get_layer('embedding').get_weights()[0]

    @staticmethod
    def _lookup_table(mapping):
        """Stack a ``{code: vector}`` mapping into an array plus one fallback row.

        The extra last row is the mean vector; it is used for unseen categories.
        """
        table = np.asarray([mapping[code] for code in range(len(mapping))], dtype=float)
        return np.vstack([table, table.mean(axis=0)])

    def fit(self, X, y=None):
        """Learn the embedding and position tables of every configured column.

        Parameters
        ----------
        X : pandas.DataFrame
            Training data containing the columns listed in ``ordinal_variable``.
        y : array-like of shape (n_samples,)
            Regression target the embeddings are trained against (required).

        Returns
        -------
        self

        Raises
        ------
        ValueError
            If ``y`` is not provided.
        """
        if y is None:
            raise ValueError("Cat2Vec_wpos.fit() requires the target y to learn the embeddings")
        # The scaled target is identical for every column, so compute it once.
        y_scaled = np.asarray(
            StandardScaler().fit_transform(np.asarray(y, dtype=float).reshape(-1, 1))
        )
        self.encoders_ = {}
        for variable in self.ordinal_variable:
            print ('processing inside cat2vec_wpos for ', variable)
            # Unseen categories are encoded as -1 at transform time instead of raising.
            encoder = OrdinalEncoder(handle_unknown='use_encoded_value', unknown_value=-1)
            codes = np.asarray(encoder.fit_transform(X[[variable]])).ravel().astype('int32')
            num_categories = len(encoder.categories_[0])
            weights = self._learn_embedding(codes, y_scaled, num_categories)
            self.encoders_[variable] = encoder
            setattr(
                self,
                f"{variable}{self._EMBEDDINGS_SUFFIX}",
                {i: list(weights[i]) for i in range(weights.shape[0])},
            )

            # Same width as the learned embedding so both blocks line up per column.
            position_embeddings = self.getPositionEncoding(
                seq_len=num_categories, d=weights.shape[1], n=self._POSITION_BASE
            )
            setattr(
                self,
                f"{variable}{self._POSITION_SUFFIX}",
                {i: list(position_embeddings[i]) for i in range(position_embeddings.shape[0])},
            )
        return self

    def transform(self, X):
        """Replace every configured column by its embedding and position columns.

        Parameters
        ----------
        X : pandas.DataFrame
            Data containing the columns listed in ``ordinal_variable``.

        Returns
        -------
        pandas.DataFrame
            Indexed like ``X``.  For each variable the columns
            ``<variable>_embedding_<i>`` come first, followed by
            ``<variable>_pos_embedding_<i>``; the original categorical columns
            are not included.
        """
        from sklearn.utils.validation import check_is_fitted

        check_is_fitted(self, "encoders_")
        print ('Inside Cat2Vec_wpos.transform()')
        embedded_frames = []
        for variable in self.ordinal_variable:
            print ('processing ', variable)
            # Reuse the encoder fitted on the training data: the codes must index
            # the same rows of the tables that were built in fit().
            codes = np.asarray(self.encoders_[variable].transform(X[[variable]])).ravel().astype(int)
            for suffix, label in ((self._EMBEDDINGS_SUFFIX, '_embedding_'),
                                  (self._POSITION_SUFFIX, '_pos_embedding_')):
                table = self._lookup_table(getattr(self, f"{variable}{suffix}"))
                # Unseen category (-1) -> fallback mean row appended by _lookup_table.
                rows = np.where(codes < 0, table.shape[0] - 1, codes)
                columns = [variable + label + str(i) for i in range(table.shape[1])]
                embedded_frames.append(pd.DataFrame(table[rows], index=X.index, columns=columns))
        if not embedded_frames:
            return pd.DataFrame(index=X.index)
        return pd.concat(embedded_frames, axis=1)


class ExtractNumeric(BaseEstimator, TransformerMixin):
    """Stateless transformer that keeps the trailing integer of every string cell.

    Each value is split on whitespace and its last token is cast to ``int``,
    so a value such as ``"abc 143"`` becomes ``143``.
    """

    def fit(self, X, y=None):
        """Nothing is learned; the transformer is returned unchanged."""
        return self

    def transform(self, X):
        """Return a DataFrame with the columns of ``X`` holding the parsed integers."""
        parsed_columns = {
            name: X[name].str.split().str[-1].astype(int)
            for name in X.columns
        }
        return pd.DataFrame(parsed_columns)

# Define transformers
# One sub-pipeline per variable group. Every branch imputes missing values first
# and standard-scales its output last.

# Ordinal columns: mode imputation, then plain ordinal codes. The learned
# embedding + position-encoding variant (Cat2Vec_wpos) is currently disabled.
ordinal_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='most_frequent')),
    #('cat2vec_wpos_encoder', Cat2Vec_wpos(ordinal_variable=ordinal_variable)),
    ('ordinal_encoder', OrdinalEncoder()),
    ('scaler',StandardScaler())
])

# Nominal columns: mode imputation, then target-supervised embeddings (Cat2Vec).
nominal_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('cat2vec_encoder', Cat2Vec(nominal_variable=nominal_variable)),
    ('scaler',StandardScaler())
])

# Numeric columns: mean imputation. (The name is spelled "contionous" and is
# referenced with that spelling in the ColumnTransformer below.)
contionous_transformer = Pipeline(steps=[
            ('imputer', SimpleImputer(strategy='mean')),
            ('scaler', StandardScaler())])

# String columns that end in a number: mode imputation, then ExtractNumeric keeps
# the trailing integer of each value.
to_contionous_transformer = Pipeline(steps=[
            ('imputer', SimpleImputer(strategy='most_frequent')),
            ('extract_numeric', ExtractNumeric()),
            ('scaler', StandardScaler())
        ])

# Define preprocessor
# Route each variable group to its sub-pipeline; columns that belong to no group
# are passed through unchanged.
preprocessor = ColumnTransformer(
    transformers=[
        ('ordinal', ordinal_transformer, ordinal_variable),
        ('nominal', nominal_transformer, nominal_variable),
        ('continuous', contionous_transformer, continuous_variable),
        ('to_continuous', to_contionous_transformer, to_continuous_variable)
    ],
    remainder='passthrough'
)

# Define models
# Five regressors compared on the same preprocessed features. Only the random
# forest has non-default hyperparameters; the forest and XGBoost use a fixed seed.
rf_model = RandomForestRegressor(random_state=42, max_depth=10,min_samples_split=5,min_samples_leaf=5)
svm_model = SVR()
xgb_model = XGBRegressor(random_state=42)
lr_model = LinearRegression()
knn_model = KNeighborsRegressor()

# Fit the preprocessor on the training split only (y_train is needed because
# Cat2Vec learns its embeddings against the target), then apply the fitted
# transformation to both splits so the test set never influences the fit.
preprocessor.fit(X_train,y_train)
X_train_transformed = preprocessor.transform(X_train)
X_test_transformed = preprocessor.transform(X_test)

# Train every model on the same transformed training matrix.
rf_model.fit(X_train_transformed, y_train)
svm_model.fit(X_train_transformed, y_train)
xgb_model.fit(X_train_transformed, y_train)
lr_model.fit(X_train_transformed, y_train)
knn_model.fit(X_train_transformed, y_train)

# Define pipelines
#rf_pipeline = make_pipeline(preprocessor, rf_model)
#svm_pipeline = make_pipeline(preprocessor, svm_model)
#xgb_pipeline = make_pipeline(preprocessor, xgb_model)

# Fit pipelines
#rf_pipeline.fit(X_train, y_train)
#svm_pipeline.fit(X_train, y_train)
#xgb_pipeline.fit(X_train, y_train)






Inside Cat2Vec.transform()
processing  active
Epoch 1/15
Epoch 2/15
Epoch 3/15
Epoch 4/15
Epoch 5/15
Epoch 6/15
Epoch 7/15
Epoch 8/15
Epoch 9/15
Epoch 10/15
Epoch 11/15
Epoch 12/15
Epoch 13/15
Epoch 14/15
Epoch 15/15
processing  made_sla
Epoch 1/15
Epoch 2/15
Epoch 3/15
Epoch 4/15
Epoch 5/15
Epoch 6/15
Epoch 7/15
Epoch 8/15
Epoch 9/15
Epoch 10/15
Epoch 11/15
Epoch 12/15
Epoch 13/15
Epoch 14/15
Epoch 15/15
processing  contact_type
Epoch 1/15
Epoch 2/15
Epoch 3/15
Epoch 4/15
Epoch 5/15
Epoch 6/15
Epoch 7/15
Epoch 8/15
Epoch 9/15
Epoch 10/15
Epoch 11/15
Epoch 12/15
Epoch 13/15
Epoch 14/15
Epoch 15/15
processing  knowledge
Epoch 1/15
Epoch 2/15
Epoch 3/15
Epoch 4/15
Epoch 5/15
Epoch 6/15
Epoch 7/15
Epoch 8/15
Epoch 9/15
Epoch 10/15
Epoch 11/15
Epoch 12/15
Epoch 13/15
Epoch 14/15
Epoch 15/15
Inside Cat2Vec.transform()
Transforming inside cat2vec for  active
Transforming inside cat2vec for  made_sla
Transforming inside cat2vec for  contact_type
Transforming inside cat2vec for  knowledge
Insi

In [5]:
# Predictions and calculate MSE for train and test sets
def _predict_and_score(model, features, target):
    """Return ``(predictions, mse)`` of an already fitted ``model`` on ``features``."""
    predictions = model.predict(features)
    return predictions, mean_squared_error(target, predictions)


# The individual result names are kept at module level so that other cells can
# still use them.
y_pred_train_rf, mse_train_rf = _predict_and_score(rf_model, X_train_transformed, y_train)
y_pred_test_rf, mse_test_rf = _predict_and_score(rf_model, X_test_transformed, y_test)

y_pred_train_svm, mse_train_svm = _predict_and_score(svm_model, X_train_transformed, y_train)
y_pred_test_svm, mse_test_svm = _predict_and_score(svm_model, X_test_transformed, y_test)

y_pred_train_xgb, mse_train_xgb = _predict_and_score(xgb_model, X_train_transformed, y_train)
y_pred_test_xgb, mse_test_xgb = _predict_and_score(xgb_model, X_test_transformed, y_test)

y_pred_train_lr, mse_train_lr = _predict_and_score(lr_model, X_train_transformed, y_train)
y_pred_test_lr, mse_test_lr = _predict_and_score(lr_model, X_test_transformed, y_test)

y_pred_train_knn, mse_train_knn = _predict_and_score(knn_model, X_train_transformed, y_train)
y_pred_test_knn, mse_test_knn = _predict_and_score(knn_model, X_test_transformed, y_test)

# One header for the whole report: every model prints its train line followed by
# its test line. `lr_model` is a LinearRegression, so it is labelled as such.
print (' Train and Test MSE for all models ')
for label, train_mse, test_mse in (
    ("Random Forest", mse_train_rf, mse_test_rf),
    ("SVM", mse_train_svm, mse_test_svm),
    ("XGBoost", mse_train_xgb, mse_test_xgb),
    ("Linear Regression", mse_train_lr, mse_test_lr),
    ("KNN", mse_train_knn, mse_test_knn),
):
    print(f"Train MSE for {label} : {train_mse}")
    print(f"Test MSE for {label} : {test_mse}")

 Train MSE for all models 
Train MSE for Random Forest : 249.8113013290791
Test MSE for Random Forest : 288.0928474108967
Train MSE for SVM : 412.23819132311394
 Test MSE for all models 
Test MSE for SVM : 429.916829663866
Train MSE for XGBoost : 136.16091170349094
Test MSE for XGBoost : 190.58222162149582
Train MSE for Log Reg : 337.02614295643536
Test MSE for Log Reg : 354.3453805859388
Train MSE for KNN : 167.16940543156113
Test MSE for KNN : 273.39856047419676


In [6]:
preprocessor