In [17]:
import pandas as pd

from sklearn.model_selection import train_test_split
from sklearn import metrics
from sklearn.metrics import accuracy_score
from sklearn.metrics import precision_score
from sklearn.metrics import recall_score
from sklearn.metrics import f1_score

from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer

from sklearn.model_selection import GridSearchCV
from sklearn.svm import SVC
from sklearn.naive_bayes import MultinomialNB

In [18]:
df = pd.read_csv(r'stpa-step1-dataset.csv')
display(df)
print(df['label'].value_counts())

Unnamed: 0,sentence,label,domain,year,title,url,slide,obs
0,Humans and/or human assets on earth are killed...,loss,aerospace,2012,STPA and CAST Tutorial,http://psas.scripts.mit.edu/home/get_pdf.php?n...,13,
1,Humans and/or human assets off of the earth ar...,loss,aerospace,2012,STPA and CAST Tutorial,http://psas.scripts.mit.edu/home/get_pdf.php?n...,13,
2,Organisms on any of the moons of the outer pla...,loss,aerospace,2012,STPA and CAST Tutorial,http://psas.scripts.mit.edu/home/get_pdf.php?n...,13,
3,The scientific data corresponding to the missi...,loss,aerospace,2012,STPA and CAST Tutorial,http://psas.scripts.mit.edu/home/get_pdf.php?n...,13,
4,The scientific data is rendered unusable (e.g....,loss,aerospace,2012,STPA and CAST Tutorial,http://psas.scripts.mit.edu/home/get_pdf.php?n...,14,
...,...,...,...,...,...,...,...,...
1073,The Laboratory Supervisors must be a part of t...,constraint,laboratory,2023,STPA Applied for Energetic Materials Handling ...,https://psas.scripts.mit.edu/home/wp-content/u...,13,
1074,Operators must conclude operational laboratory...,constraint,laboratory,2023,STPA Applied for Energetic Materials Handling ...,https://psas.scripts.mit.edu/home/wp-content/u...,14,
1075,Operational laboratory trainings must approach...,constraint,laboratory,2023,STPA Applied for Energetic Materials Handling ...,https://psas.scripts.mit.edu/home/wp-content/u...,14,
1076,The Laboratory Managers must periodically impl...,constraint,laboratory,2023,STPA Applied for Energetic Materials Handling ...,https://psas.scripts.mit.edu/home/wp-content/u...,14,


label
hazard          408
constraint      316
loss            254
exloss           47
exhazard         34
exconstraint     19
Name: count, dtype: int64


In [19]:
experiment = '1' # @param ['1', '2']

In [20]:
if(experiment=='1'):
  df = df[~df['label'].isin(['exconstraint', 'exloss', 'exhazard'])]

elif(experiment=='2'):
  df['label'] =  df['label'].replace(['exconstraint'], 'constraint')
  df['label'] =  df['label'].replace(['exloss'], 'loss')
  df['label'] =  df['label'].replace(['exhazard'], 'hazard')

print(df['label'].value_counts())

y = df['label'].to_list()
x = df['sentence'].to_list()
x_train, x_test, y_train, y_test = train_test_split(x,y, random_state=0, test_size=0.2, stratify=y)

print(len(x_train), len(x_test))

label
hazard        408
constraint    316
loss          254
Name: count, dtype: int64
782 196


In [21]:
def print_results(name, label, prediction):
  print(name)
  print('Acc:       ', accuracy_score(label, prediction))
  print('precision: ', precision_score(label, prediction, average='macro'))
  print('recall:    ', recall_score(label, prediction, average='macro'))
  print('F1:        ', f1_score(label, prediction, average='macro'))

In [22]:
pipe_mnb = Pipeline([('vect', CountVectorizer()),
                     ('tfidf', TfidfTransformer()),
                     ('nb', MultinomialNB(alpha=1)),
])

param_mnb = {
    'nb__alpha': [0.0001, 0.001, 0.1, 1, 10, 100, 1000],
}

gsearch_mnb = GridSearchCV(pipe_mnb, param_mnb, scoring='accuracy', cv=5, refit=True)
gsearch_mnb.fit(x_train, y_train)
print(gsearch_mnb.best_params_)

{'nb__alpha': 1}


In [23]:
pipe_svm = Pipeline([('vect', CountVectorizer()),
                     ('tfidf', TfidfTransformer()),
                     ('svm', SVC(random_state=0))
])

param_svm = {
    'svm__gamma': [0.0001, 0.001, 0.1, 1, 10],
    'svm__C': [0.0001, 0.001, 0.1, 1, 10],
    'svm__kernel': ['linear', 'poly', 'rbf', 'sigmoid']
}

gsearch_svm = GridSearchCV(pipe_svm, param_svm, scoring='accuracy', cv=5, refit=True)
gsearch_svm.fit(x_train, y_train)
print(gsearch_svm.best_params_)

{'svm__C': 1, 'svm__gamma': 0.0001, 'svm__kernel': 'linear'}


In [24]:
#Test classifiers

pipe_mnb_predicted_test = gsearch_mnb.predict(x_test)
print_results('MultinomialNB', y_test, pipe_mnb_predicted_test)

pipe_svm_predicted_test = gsearch_svm.predict(x_test)
print_results('SVM', y_test, pipe_svm_predicted_test)

MultinomialNB
Acc:        0.8571428571428571
precision:  0.8710554880894369
recall:     0.8639786537921402
F1:         0.8629277717072862
SVM
Acc:        0.9540816326530612
precision:  0.967032967032967
recall:     0.9424214130096483
F1:         0.9516234985446644
