In [1]:
import pandas as pd
import numpy as np 
import nltk
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.svm import SVC

In [2]:
df=pd.read_csv("C:/Users/Aiswarya/Downloads/final_alert_dataset.csv")
df

Unnamed: 0,Incident ID,Date,Time,Title,Description,Category,Priority,Status,Location,Department,Duration,Response Time,Resolution Time,Class
0,IN31001,25/06/2023,08:55:00,Application Bug,Users are experiencing unexpected errors in th...,Application,Medium,Open,Office,Applications,-,-,-,Application
1,IN1001,28/06/2022,03:22:55,Application Upgrade Incompatibility,An application upgrade introduces incompatibil...,Application Issue,High,Resolved,Server Room,Applications,60,20,40,Application
2,IN2001,26/12/2020,05:50:32,Database Query Timeout,Database queries are timing out causing delays...,Performance Issue,Medium,Resolved,Data Center,IT,45,15,30,Performance
3,IN3001,01/12/2023,14:10:00,Software License Expiry,The software license has expired.,Infrastructure,Low,Resolved,Office,IT,10,5,5,Infrastructure
4,IN4001,03/03/2021,13:35:32,Application Freeze or Hang,An application freezes or hangs becoming unres...,Performance,High,Resolved,Office,IT,15,5,10,Performance
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
853,IN3001,23/09/2023,13:30:00,Data Corruption,Data integrity issues detected in the database.,Data,High,In Progress,Office,IT,-,-,-,Data
854,IN4001,04/06/2021,10:42:03,Server Room Power Surge,There is a power surge in the server room pote...,Infrastructure Issue,Critical,Resolved,Server Room,Infrastructure,45,15,30,Infrastructure
855,IN5001,09/07/2021,07:20:40,Data Center Cooling Failure,The cooling system in the data center has fail...,Infrastructure Issue,High,Resolved,Data Center,Infrastructure,90,30,60,Infrastructure
856,IN6001,22/10/2021,21:08:04,Social Engineering Incident,An employee falls victim to a social engineeri...,Security,Medium,Resolved,Office,IT,15,5,10,Security


In [3]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 858 entries, 0 to 857
Data columns (total 14 columns):
 #   Column           Non-Null Count  Dtype 
---  ------           --------------  ----- 
 0   Incident ID      858 non-null    object
 1   Date             858 non-null    object
 2   Time             858 non-null    object
 3   Title            858 non-null    object
 4   Description      858 non-null    object
 5   Category         858 non-null    object
 6   Priority         858 non-null    object
 7   Status           858 non-null    object
 8   Location         858 non-null    object
 9   Department       858 non-null    object
 10  Duration         858 non-null    object
 11  Response Time    858 non-null    object
 12  Resolution Time  858 non-null    object
 13  Class            858 non-null    object
dtypes: object(14)
memory usage: 94.0+ KB


In [4]:
count=df['Class'].value_counts()
count

Application       176
Infrastructure    131
Data              102
Network           100
Performance        98
Security           94
Hardware           90
Configuration      67
Name: Class, dtype: int64

In [5]:
df['Title']=df['Title'].str.lower()
df['Title']

0                          application bug
1      application upgrade incompatibility
2                   database query timeout
3                  software license expiry
4               application freeze or hang
                      ...                 
853                        data corruption
854                server room power surge
855            data center cooling failure
856            social engineering incident
857               slow network performance
Name: Title, Length: 858, dtype: object

In [6]:
df['Description']=df['Description'].str.lower()
df['Description']

0      users are experiencing unexpected errors in th...
1      an application upgrade introduces incompatibil...
2      database queries are timing out causing delays...
3                      the software license has expired.
4      an application freezes or hangs becoming unres...
                             ...                        
853      data integrity issues detected in the database.
854    there is a power surge in the server room pote...
855    the cooling system in the data center has fail...
856    an employee falls victim to a social engineeri...
857    the network is experiencing slow performance c...
Name: Description, Length: 858, dtype: object

In [7]:
def tokenize_text(text):
    tokens = nltk.word_tokenize(text)
    return tokens


In [8]:
# Tokenize the desired column and create a new column to store the tokens
df['tokenized_column'] = df['Description'].apply(tokenize_text)

# Print the tokenized column
print(df['tokenized_column'])

0      [users, are, experiencing, unexpected, errors,...
1      [an, application, upgrade, introduces, incompa...
2      [database, queries, are, timing, out, causing,...
3              [the, software, license, has, expired, .]
4      [an, application, freezes, or, hangs, becoming...
                             ...                        
853    [data, integrity, issues, detected, in, the, d...
854    [there, is, a, power, surge, in, the, server, ...
855    [the, cooling, system, in, the, data, center, ...
856    [an, employee, falls, victim, to, a, social, e...
857    [the, network, is, experiencing, slow, perform...
Name: tokenized_column, Length: 858, dtype: object


In [9]:
df.head()

Unnamed: 0,Incident ID,Date,Time,Title,Description,Category,Priority,Status,Location,Department,Duration,Response Time,Resolution Time,Class,tokenized_column
0,IN31001,25/06/2023,08:55:00,application bug,users are experiencing unexpected errors in th...,Application,Medium,Open,Office,Applications,-,-,-,Application,"[users, are, experiencing, unexpected, errors,..."
1,IN1001,28/06/2022,03:22:55,application upgrade incompatibility,an application upgrade introduces incompatibil...,Application Issue,High,Resolved,Server Room,Applications,60,20,40,Application,"[an, application, upgrade, introduces, incompa..."
2,IN2001,26/12/2020,05:50:32,database query timeout,database queries are timing out causing delays...,Performance Issue,Medium,Resolved,Data Center,IT,45,15,30,Performance,"[database, queries, are, timing, out, causing,..."
3,IN3001,01/12/2023,14:10:00,software license expiry,the software license has expired.,Infrastructure,Low,Resolved,Office,IT,10,5,5,Infrastructure,"[the, software, license, has, expired, .]"
4,IN4001,03/03/2021,13:35:32,application freeze or hang,an application freezes or hangs becoming unres...,Performance,High,Resolved,Office,IT,15,5,10,Performance,"[an, application, freezes, or, hangs, becoming..."


In [10]:
#stop word removal
stopwords=set(stopwords.words('english'))
def remove_stop(x):
    return ",".join([word for word in str(x).split() if word not in stopwords])
df['NewDescription']=df['tokenized_column'].apply(lambda x: remove_stop(x))
df['NewDescription']


0      ['users',,'are',,'experiencing',,'unexpected',...
1      ['an',,'application',,'upgrade',,'introduces',...
2      ['database',,'queries',,'are',,'timing',,'out'...
3      ['the',,'software',,'license',,'has',,'expired...
4      ['an',,'application',,'freezes',,'or',,'hangs'...
                             ...                        
853    ['data',,'integrity',,'issues',,'detected',,'i...
854    ['there',,'is',,'a',,'power',,'surge',,'in',,'...
855    ['the',,'cooling',,'system',,'in',,'the',,'dat...
856    ['an',,'employee',,'falls',,'victim',,'to',,'a...
857    ['the',,'network',,'is',,'experiencing',,'slow...
Name: NewDescription, Length: 858, dtype: object

In [11]:
df.head()

Unnamed: 0,Incident ID,Date,Time,Title,Description,Category,Priority,Status,Location,Department,Duration,Response Time,Resolution Time,Class,tokenized_column,NewDescription
0,IN31001,25/06/2023,08:55:00,application bug,users are experiencing unexpected errors in th...,Application,Medium,Open,Office,Applications,-,-,-,Application,"[users, are, experiencing, unexpected, errors,...","['users',,'are',,'experiencing',,'unexpected',..."
1,IN1001,28/06/2022,03:22:55,application upgrade incompatibility,an application upgrade introduces incompatibil...,Application Issue,High,Resolved,Server Room,Applications,60,20,40,Application,"[an, application, upgrade, introduces, incompa...","['an',,'application',,'upgrade',,'introduces',..."
2,IN2001,26/12/2020,05:50:32,database query timeout,database queries are timing out causing delays...,Performance Issue,Medium,Resolved,Data Center,IT,45,15,30,Performance,"[database, queries, are, timing, out, causing,...","['database',,'queries',,'are',,'timing',,'out'..."
3,IN3001,01/12/2023,14:10:00,software license expiry,the software license has expired.,Infrastructure,Low,Resolved,Office,IT,10,5,5,Infrastructure,"[the, software, license, has, expired, .]","['the',,'software',,'license',,'has',,'expired..."
4,IN4001,03/03/2021,13:35:32,application freeze or hang,an application freezes or hangs becoming unres...,Performance,High,Resolved,Office,IT,15,5,10,Performance,"[an, application, freezes, or, hangs, becoming...","['an',,'application',,'freezes',,'or',,'hangs'..."


In [38]:
nltk.download('punkt')

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\Aiswarya\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [12]:
#selecing features
X=df['NewDescription']
y=df['Class']

In [13]:
X.shape

(858,)

In [14]:
y.shape

(858,)

In [15]:
X.head()

0    ['users',,'are',,'experiencing',,'unexpected',...
1    ['an',,'application',,'upgrade',,'introduces',...
2    ['database',,'queries',,'are',,'timing',,'out'...
3    ['the',,'software',,'license',,'has',,'expired...
4    ['an',,'application',,'freezes',,'or',,'hangs'...
Name: NewDescription, dtype: object

In [16]:
vectorizer=TfidfVectorizer()
vectorizer

TfidfVectorizer()

In [17]:
column_name = 'NewDescription'
corpus = df[column_name]

In [18]:
tfidf_matrix = vectorizer.fit_transform(corpus)
tfidf_matrix

<858x1019 sparse matrix of type '<class 'numpy.float64'>'
	with 9171 stored elements in Compressed Sparse Row format>

In [19]:
feature_names = vectorizer.get_feature_names()
feature_names



['ability',
 'abnormal',
 'abnormally',
 'about',
 'abruptly',
 'acceptable',
 'accepting',
 'access',
 'accessible',
 'accessing',
 'accidentally',
 'account',
 'accounts',
 'accuracy',
 'accurate',
 'acls',
 'acquisition',
 'across',
 'action',
 'activated',
 'activates',
 'activities',
 'activity',
 'added',
 'additional',
 'address',
 'addresses',
 'adhere',
 'adjusted',
 'affected',
 'affecting',
 'affects',
 'after',
 'again',
 'air',
 'airflow',
 'alarm',
 'alarms',
 'alert',
 'align',
 'alignment',
 'all',
 'allocation',
 'allowed',
 'allowing',
 'amount',
 'an',
 'analyzing',
 'and',
 'anomalies',
 'another',
 'any',
 'api',
 'apis',
 'appliance',
 'application',
 'applications',
 'approval',
 'archiving',
 'are',
 'area',
 'areas',
 'arise',
 'array',
 'as',
 'assets',
 'assign',
 'assignment',
 'assignments',
 'attachment',
 'attachments',
 'attack',
 'attacker',
 'attacks',
 'attempted',
 'attempting',
 'attempts',
 'audio',
 'audit',
 'auditability',
 'audits',
 'authentic

In [20]:
for i in range(len(corpus)):
    print("word{}: ".format(i+1))
    for j, feature in enumerate(feature_names):
        tfidf_value = tfidf_matrix[i, j]
        if tfidf_value > 0:
            print("   {}: {}".format(feature, tfidf_value))

word1: 
   application: 0.23949173983628053
   are: 0.38220267703526284
   errors: 0.4239429503873512
   experiencing: 0.4353929822653639
   in: 0.20720251942310108
   the: 0.14074605761133524
   unexpected: 0.48499540934519286
   users: 0.35959318980465305
word2: 
   an: 0.16481224413083384
   application: 0.13659487354876979
   components: 0.3263673743928153
   incompatibility: 0.41467858690335185
   introduces: 0.41467858690335185
   issues: 0.1695813584740449
   or: 0.16357610289386953
   other: 0.41467858690335185
   systems: 0.3145850868126828
   upgrade: 0.36087899805412865
   with: 0.21696936112471893
word3: 
   and: 0.15415307320898175
   are: 0.2552364010225657
   causing: 0.18646418489886324
   data: 0.17356159829864784
   database: 0.20579922359425035
   delays: 0.2993616190298879
   in: 0.13837063034354738
   out: 0.33447910059070934
   processing: 0.3821305538455292
   queries: 0.3568507566217633
   retrieval: 0.36235341384964603
   timing: 0.4225388387984642
word4: 
   e

In [21]:
X_train, X_test, y_train, y_test = train_test_split(tfidf_matrix,y, test_size=0.2, random_state=42)


In [22]:
X_train.shape

(686, 1019)

In [23]:
y_train.shape

(686,)

In [24]:
X_test.shape

(172, 1019)

In [25]:
svm_model = SVC(kernel='linear')

# Train the SVM model
svm_model.fit(X_train, y_train)


SVC(kernel='linear')

In [26]:
y_pred = svm_model.predict(X_test)

In [27]:
from sklearn.metrics import accuracy_score
accuracy = accuracy_score(y_test, y_pred)
print("Accuracy:", accuracy)

Accuracy: 0.8488372093023255


In [28]:
def predict_class(description, vectorizer, model):
    # Convert the description into a TF-IDF matrix using the provided vectorizer
    tfidf = vectorizer.transform([description])

    # Predict the class for the description using the provided model
    prediction = model.predict(tfidf)

    return prediction

In [29]:
new_description = "A server is misconfigured resulting in operational issues or vulnerabilities."


# Concatenate the description and title
new_text = new_description 

# Predict the class for the new data
predicted_class = predict_class(new_description, vectorizer, svm_model)

# Print the predicted class
print("Predicted Class:", predicted_class)


Predicted Class: ['Configuration']


In [30]:
from sklearn.metrics import precision_score,recall_score,f1_score,accuracy_score
accuracy = accuracy_score(y_test, y_pred)
print("Accuracy:", accuracy)
precision = precision_score(y_test, y_pred, average='weighted')
print("Precision:", precision)
recall = recall_score(y_test, y_pred, average='weighted')
print("Recall:", recall)
# Calculate F1-score
f1 = f1_score(y_test, y_pred, average='weighted')
print("F1-score:", f1)



Accuracy: 0.8488372093023255
Precision: 0.8547307585825028
Recall: 0.8488372093023255
F1-score: 0.8478649993634794


In [31]:
from sklearn.ensemble import RandomForestClassifier

In [32]:
X_train, X_test, y_train, y_test = train_test_split(tfidf_matrix, y, test_size=0.2, random_state=42)

# Initialize the Random Forest classifier
rf_model = RandomForestClassifier()

# Train the Random Forest model
rf_model.fit(X_train, y_train)

# Predict the labels for the test set
y_pred = rf_model.predict(X_test)

In [33]:
def predict_class(description, vectorizer, model):
    # Convert the description into a TF-IDF matrix using the provided vectorizer
    tfidf = vectorizer.transform([description])

    # Predict the class for the description using the provided model
    prediction = model.predict(tfidf)

    return prediction

In [34]:
new_description = "Internet connection dropping frequently"


# Concatenate the description and title
new_text = new_description 

# Predict the class for the new data
predicted_class = predict_class(new_description, vectorizer, svm_model)

# Print the predicted class
print("Predicted Class:", predicted_class)


Predicted Class: ['Network']


In [35]:
accuracy = accuracy_score(y_test, y_pred)
print("Accuracy:", accuracy)
precision = precision_score(y_test, y_pred, average='weighted')
print("Precision:", precision)
recall = recall_score(y_test, y_pred, average='weighted')
print("Recall:", recall)
# Calculate F1-score
f1 = f1_score(y_test, y_pred, average='weighted')
print("F1-score:", f1)

Accuracy: 0.8197674418604651
Precision: 0.8224986966061718
Recall: 0.8197674418604651
F1-score: 0.8192178607280113


In [36]:
col=['NewDescription']
sp=df[col]
sp

Unnamed: 0,NewDescription
0,"['users',,'are',,'experiencing',,'unexpected',..."
1,"['an',,'application',,'upgrade',,'introduces',..."
2,"['database',,'queries',,'are',,'timing',,'out'..."
3,"['the',,'software',,'license',,'has',,'expired..."
4,"['an',,'application',,'freezes',,'or',,'hangs'..."
...,...
853,"['data',,'integrity',,'issues',,'detected',,'i..."
854,"['there',,'is',,'a',,'power',,'surge',,'in',,'..."
855,"['the',,'cooling',,'system',,'in',,'the',,'dat..."
856,"['an',,'employee',,'falls',,'victim',,'to',,'a..."
