In [None]:
# Fetch the WordNet corpus; the WordNetLemmatizer created later in this notebook needs it.
import nltk
nltk.download('wordnet')

[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [None]:
import re
import pandas as pd
import numpy as np
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
import keras
from sklearn.metrics import classification_report
from sklearn.metrics import accuracy_score
import math
import nltk

In [None]:
# Load the labelled sentences into a DataFrame (the code below uses its
# 'Text' and 'Label' columns) and display it.
dataset_path = '/content/synthetic_dataset.csv'
data = pd.read_csv(dataset_path)
data

Unnamed: 0,Text,Label
0,The use of AI in military operations should pr...,1
1,Deploying AI for targeted assassinations is a ...,0
2,Developing AI-driven surveillance systems must...,1
3,Using AI in warfare to autonomously make life ...,0
4,AI-powered medical assistance in conflict zone...,1
...,...,...
166,The use of autonomous weapons must adhere to i...,1
167,Deploying AI in a manner that causes harm to c...,0
168,Developing AI for defensive purposes to protec...,1
169,Using AI to target civilian populations is a c...,0


In [None]:
# Strip English stopwords from every sentence in data['Text'].
nltk.download('stopwords')
from nltk.corpus import stopwords
stop_words = set(stopwords.words('english'))


def remove_stopwords(text):
    """Return `text` with English stopwords removed.

    Word order and the original casing of the kept words are preserved.
    NLTK's English stopword list is all lowercase, so each word is lowercased
    for the membership test only; a plain case-sensitive test lets capitalised
    stopwords (e.g. a sentence-initial "The") slip through.
    """
    return ' '.join(word for word in text.split() if word.lower() not in stop_words)


data['Text'] = data['Text'].apply(remove_stopwords)

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [None]:
# Whitespace tokenizer and WordNet lemmatizer shared by lemmatize_text.
w_tokenizer = nltk.tokenize.WhitespaceTokenizer()
lemmatizer = nltk.stem.WordNetLemmatizer()


def lemmatize_text(text):
    """Lemmatize every whitespace-separated token of `text`.

    The lemmas are returned in their original order, each followed by a single
    space, so any non-empty result ends with a trailing space.
    """
    return "".join(
        lemmatizer.lemmatize(token) + " " for token in w_tokenizer.tokenize(text)
    )


data['Text'] = data['Text'].apply(lemmatize_text)
data

Unnamed: 0,Text,Label
0,The use AI military operation prioritize minim...,1
1,Deploying AI targeted assassination clear viol...,0
2,Developing AI-driven surveillance system must ...,1
3,Using AI warfare autonomously make life death ...,0
4,AI-powered medical assistance conflict zone co...,1
...,...,...
166,The use autonomous weapon must adhere internat...,1
167,Deploying AI manner cause harm civilian strict...,0
168,Developing AI defensive purpose protect human ...,1
169,Using AI target civilian population clear viol...,0


In [None]:
# Quick dataset statistics: mean sentence length (in whitespace-separated
# words) and the share of each class.
n_rows = data.shape[0]

# Total word count over all sentences, kept as a float.
s = float(sum(len(sentence.split()) for sentence in data['Text']))
print("Average length of each Text : ",s/n_rows)

# Rows labelled 1 are reported as "Ethical"; every other row as "Unethical".
pos = sum(1 for label in data['Label'] if label == 1)
neg = n_rows - pos
print("Percentage of text with Ethical  is "+str(pos/n_rows*100)+"%")
print("Percentage of text with Unethical  is "+str(neg/n_rows*100)+"%")

Average length of each Text :  11.163742690058479
Percentage of text with Ethical  is 56.72514619883041%
Percentage of text with Unethical  is 43.27485380116959%


In [None]:
# Pull the sentences and their labels out of the DataFrame as arrays.
text, labels = data['Text'].values, data['Label'].values

# Map the labels to consecutive integer class ids.
encoder = LabelEncoder()
encoder.fit(labels)
encoded_labels = encoder.transform(labels)

In [None]:
# Stratified train/test split (scikit-learn's default test share is used).
# A fixed random_state makes the partition, and therefore the test accuracy
# reported further down, repeatable across kernel restarts.
train_sentences, test_sentences, train_labels, test_labels = train_test_split(
    text, encoded_labels, stratify=encoded_labels, random_state=42
)

In [None]:
# Hyperparameters of the model
vocab_size = 3000        # Tokenizer keeps only the most frequent words (num_words)
oov_tok = '<OOV>'        # explicit placeholder for out-of-vocabulary words (was an empty string)
embedding_dim = 100
max_length = 200 # choose based on statistics, for example 150 to 200
padding_type='post'
trunc_type='post'

# Fit the tokenizer on the training sentences only, so the vocabulary is not
# influenced by the test set.
tokenizer = Tokenizer(num_words = vocab_size, oov_token=oov_tok)
tokenizer.fit_on_texts(train_sentences)
word_index = tokenizer.word_index

# Convert the train dataset to integer sequences and pad/truncate to max_length.
# padding_type / trunc_type are passed explicitly: without `truncating`,
# pad_sequences falls back to its default 'pre', contradicting trunc_type.
train_sequences = tokenizer.texts_to_sequences(train_sentences)
train_padded = pad_sequences(train_sequences, padding=padding_type,
                             truncating=trunc_type, maxlen=max_length)

# Same conversion for the test dataset, using the tokenizer fitted above.
test_sequences = tokenizer.texts_to_sequences(test_sentences)
test_padded = pad_sequences(test_sequences, padding=padding_type,
                            truncating=trunc_type, maxlen=max_length)

In [None]:
# Binary classifier: embedding -> bidirectional LSTM -> dense ReLU layer ->
# single sigmoid output unit.
layer_stack = [
    keras.layers.Embedding(vocab_size, embedding_dim, input_length=max_length),
    keras.layers.Bidirectional(keras.layers.LSTM(64)),
    keras.layers.Dense(24, activation='relu'),
    keras.layers.Dense(1, activation='sigmoid'),
]
model = keras.Sequential(layer_stack)

# Binary cross-entropy pairs with the sigmoid output; accuracy is tracked
# during training.
model.compile(
    optimizer='adam',
    loss='binary_crossentropy',
    metrics=['accuracy'],
)

# Print the layer-by-layer summary.
model.summary()

Model: "sequential_3"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding_3 (Embedding)     (None, 200, 100)          300000    
                                                                 
 bidirectional_3 (Bidirecti  (None, 128)               84480     
 onal)                                                           
                                                                 
 dense_6 (Dense)             (None, 24)                3096      
                                                                 
 dense_7 (Dense)             (None, 1)                 25        
                                                                 
Total params: 387601 (1.48 MB)
Trainable params: 387601 (1.48 MB)
Non-trainable params: 0 (0.00 Byte)
_________________________________________________________________


In [None]:
# Train for a fixed number of epochs, holding out 10% of the training data
# for validation.
num_epochs = 5
history = model.fit(
    x=train_padded,
    y=train_labels,
    validation_split=0.1,
    epochs=num_epochs,
    verbose=1,
)

Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


In [None]:
# Predicted probabilities for the held-out test set.
prediction = model.predict(test_padded)
# Threshold each probability: label 1 if p >= 0.5, else 0.
pred_labels = [1 if prob >= 0.5 else 0 for prob in prediction]
print("Accuracy of prediction on test set : ", accuracy_score(test_labels,pred_labels))

Accuracy of prediction on test set :  0.8372093023255814


In [None]:
# Sentences to classify. Nothing in this cell applies the stopword removal or
# lemmatization used on the training text.
sentence = ["Developing AI weapon target destroy specific individual group without due process unethical"
            ]
# Encode with the fitted tokenizer, then pad to the model's input length.
sequences = tokenizer.texts_to_sequences(sentence)
padded = pad_sequences(sequences, padding='post', maxlen=max_length)
# Threshold each probability: label 1 if p >= 0.5, else 0.
prediction = model.predict(padded)
pred_labels = [1 if prob >= 0.5 else 0 for prob in prediction]
# Echo every sentence with its predicted class (1 -> Ethical, 0 -> Unethical).
for text_item, label in zip(sentence, pred_labels):
    print(text_item)
    print("Predicted sentiment : ", 'Ethical' if label == 1 else 'Unethical')

Developing AI weapon target destroy specific individual group without due process unethical
Predicted sentiment :  Ethical
