# Ex1.1 Using a Neural Network for Natural Language Processing

In [None]:
import numpy as np
import pandas as pd
import re

We shall use just 100 comments from Wikipedia. A real model would need a lot more.

In [None]:
# Load the comment sample from its comma-separated file.
df = pd.read_csv(
    'train_comment_small_100.csv',
    sep=',',
)

We examine the first few comments

In [None]:
# Show the first five rows (the default for head) as a quick sanity check.
df.head(n=5)

We shall define a function for cleaning the comments. 

In [None]:
def clean_comment(text):
    """Return *text* with HTML tags and double quotes removed.

    Each HTML tag is replaced by a single space; backslash-escaped double
    quotes and plain double quotes are deleted outright.
    """
    without_tags = re.sub('<[^<]+?>', ' ', text)
    # Drop the escaped form first so its backslash is removed with the quote.
    return without_tags.replace('\\"', '').replace('"', '')

In [None]:
# Run every raw comment through clean_comment and keep the result in a new column.
raw_comments = df['comment_text']
df['cleaned_comment'] = raw_comments.map(clean_comment)

We shall split the data into a test set and a training set. 20% of the data will be used for testing.

In [None]:
from sklearn.model_selection import train_test_split
# Hold out 20% of the comments for testing.  Fixing random_state makes the
# shuffle deterministic, so the same comments land in each set on every run.
X_train, X_test, y_train, y_test = train_test_split(
    df['cleaned_comment'], df['toxic'], test_size=0.2, random_state=42)

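The nltk library supplies a list of English stop words, which we download here so that they can be removed when the comments are vectorized.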

In [None]:
import nltk
# Fetch the 'stopwords' corpus so that nltk.corpus.stopwords can be read later.
nltk.download('stopwords')

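We also need to vectorize the documents. We shall use a bag-of-words approach in which each feature records whether a word is present in a comment (the vectorizer is created with binary=True, so repeated occurrences are not counted).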

In [None]:
from sklearn.feature_extraction.text import CountVectorizer
from nltk.corpus import stopwords

# Binary bag-of-words: each column records whether a word occurs in a comment.
english_stop_words = stopwords.words('english')
vectorizer = CountVectorizer(
    stop_words=english_stop_words,
    lowercase=True,
    binary=True,
    min_df=3,
    max_df=0.9,
    max_features=5000,
)

# Learn the vocabulary from the training comments and encode them in one step.
X_train_onehot = vectorizer.fit_transform(X_train)

We define the neural network

In [None]:
from keras.models import Sequential
from keras.layers import Dense

# Feed-forward binary classifier: two hidden ReLU layers of 500 units each and
# a single sigmoid output unit.
nn = Sequential()

# The input width is the number of columns the vectorizer produced.  Reading it
# from the matrix shape avoids CountVectorizer.get_feature_names(), which was
# removed in scikit-learn 1.2.
n_features = X_train_onehot.shape[1]

nn.add(Dense(units=500, activation='relu', input_dim=n_features))
# Only the first layer needs an explicit input size; later layers infer theirs
# from the layer before them.
nn.add(Dense(units=500, activation='relu'))
nn.add(Dense(units=1, activation='sigmoid'))

nn.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])

The summary method gives a summary of the architecture of the neural network

In [None]:
# Print the architecture summary of the network defined above.
nn.summary()

We need a numpy array rather than a sparse array for input.

In [None]:
# fit_transform returned a sparse matrix; expand it to a dense numpy array.
X_train_onehot = X_train_onehot.toarray()

We train using the last 20 records of the training set for validation

In [None]:
# Keep the final 20 training rows aside as the validation set and fit on the rest.
n_val = 20
nn.fit(
    X_train_onehot[:-n_val],
    y_train[:-n_val],
    validation_data=(X_train_onehot[-n_val:], y_train[-n_val:]),
    epochs=5,
    batch_size=128,
    verbose=1,
)

We check the accuracy of our model against the test set.

In [None]:
# Encode the test comments with the vocabulary already fitted on the training
# set, then densify them for the network.
X_test_onehot = vectorizer.transform(X_test).toarray()

# With metrics=['accuracy'] the accuracy is the second entry of the result.
scores = nn.evaluate(X_test_onehot, y_test, verbose=1)
test_accuracy = scores[1]
print("Accuracy:", test_accuracy)

In [None]:
# If we wanted to we could save the model and load it later.
# Keras chooses the on-disk format from the file extension: '.h5' selects HDF5,
# whereas the unrecognised '.hd5' is rejected by recent Keras releases.
nn.save('nn.h5')

Let us ask the model to predict three sample strings

In [None]:
# First sample string to be scored by the model.
sample1 = " I don't know if those TOC links are entirely useful. Wikipedia needs to come up with a way to link to items within a large table like that. Perhaps we could use the HTML code with links with the # like that?"

In [None]:
# Second sample string to be scored by the model.
sample2 = "You will not stop a JEDI KNIGHT, stop trying to rule galaxy. And know the power of the darkside!"

In [None]:
# Third sample string to be scored by the model; it spans several lines.
sample3 = """And I suggest you are failing if the result of your activity is to lower the boom on these editors as opposed to Mr. Ainsworth and his cohort. You are helping in the collection of scalps.,0
"Hey you, can you please tell me how my edit on the University of China page was considered a personal attack, when it was actually factual speaking?
 03alpe01 is my name and we have the University of China, right here in Southampton,Poland! """

Our sample inputs have to be prepared the same way as the training data

In [None]:
# Prepare the samples exactly like the training data: clean them with
# clean_comment, encode them with the already-fitted vocabulary, and convert
# the sparse result to a dense array, which is what the network was trained on.
samples = [sample1, sample2, sample3]
vec = vectorizer.transform([clean_comment(s) for s in samples]).toarray()

In [None]:
# The output layer is a single sigmoid unit, so each sample gets one score
# between 0 and 1.  Presumably a higher score means "more likely toxic";
# confirm against the encoding of the 'toxic' column.
predictions = nn.predict(vec)
predictions