In [1]:
from keras.layers import  Dropout, Dense
from keras.models import Sequential
from keras.utils import np_utils
from keras.models import load_model
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
import numpy as np
import pandas as pd
import re
from sklearn import metrics
import pickle
import tika
import glob
from tika import parser

Using TensorFlow backend.


In [2]:
# NOTE(review): absolute, machine-specific location; point this at your local copy of the data.
root_path = "C:\\Users\\Alex.Chokwijitkul\\Desktop\\Document Classification\\data\\"

# One list of file paths per document class (direct children of each class folder).
# glob returns matches in a filesystem-dependent order, so the lists are sorted: row order
# feeds the seeded train/test split and positional lookups such as paper_paths[10].
agenda_paths = sorted(glob.glob(root_path + "Agendas/*"))
medicalrecord_paths = sorted(glob.glob(root_path + "MedicalRecords/*"))
paper_paths = sorted(glob.glob(root_path + "Papers/*"))
resume_paths = sorted(glob.glob(root_path + "Resumes/*"))

In [3]:
def preprocess_text(text):
    """Reduce raw document text to space-separated alphabetic tokens.

    Steps, in order: every character that is not an ASCII letter becomes a
    space, single letters standing between whitespace are dropped, and runs of
    whitespace are collapsed to one space. Case is preserved and leading or
    trailing spaces are not stripped.

    Parameters
    ----------
    text : str or None
        Raw text. ``None`` is accepted because callers pass
        ``parsed["content"]`` straight through, which can be ``None`` when no
        text was extracted from a file.

    Returns
    -------
    str
        The cleaned text; ``''`` for ``None`` or empty input.
    """
    if not text:
        return ''

    processed = re.sub('[^a-zA-Z]', ' ', text)
    # The pattern consumes the whitespace on both sides of the letter, so of
    # two adjacent single letters ("a b") only the first one is removed.
    processed = re.sub(r"\s+[a-zA-Z]\s+", ' ', processed)
    processed = re.sub(r'\s+', ' ', processed)

    return processed

def process_raw_data(paths, label):
    """Parse a list of files with Tika and return their cleaned text with a class label.

    Parameters
    ----------
    paths : list of str
        Files to parse; each path is printed as it is processed.
    label : str
        Class label assigned to every row.

    Returns
    -------
    pandas.DataFrame
        One row per path with columns ``'Content'`` (cleaned text) and
        ``'Type'`` (``label``). A file without extractable text yields an
        empty ``'Content'`` string.
    """
    contents = []

    for path in paths:
        print('Processing {}'.format(path))
        parsed = parser.from_file(path)
        # "content" may be None (or absent) when nothing could be extracted;
        # fall back to empty text instead of failing inside the regex cleanup.
        text = preprocess_text(parsed.get("content") or "")
        contents.append(text)

    data = {
        'Content': contents,
        'Type': [label] * len(contents)
    }

    return pd.DataFrame(data, columns = ['Content', 'Type'])

In [None]:
# Parse every agenda file; all resulting rows are labelled 'agenda'.
agenda_df = process_raw_data(agenda_paths, 'agenda')

In [None]:
# Parse every medical-record file; all resulting rows are labelled 'medicalrecord'.
medicalrecord_df = process_raw_data(medicalrecord_paths, 'medicalrecord')

In [None]:
# Parse every paper file; all resulting rows are labelled 'paper'.
paper_df = process_raw_data(paper_paths, 'paper')

In [None]:
# Parse every resume file; all resulting rows are labelled 'resume'.
resume_df = process_raw_data(resume_paths, 'resume')

In [9]:
# Stack the four per-class frames into one dataset. ignore_index gives the combined
# frame a fresh 0..n-1 index instead of repeating each source frame's own labels.
df = pd.concat([agenda_df, medicalrecord_df, paper_df, resume_df], axis=0, ignore_index=True)

In [17]:
def tfidf(X_train, X_test, num_words=5000):
    """Fit a TF-IDF vectorizer on the training texts and vectorize both splits.

    The vectorizer is fitted on ``X_train`` only and then applied to
    ``X_test``. The fitted vectorizer is pickled to ``'vectoriser.pkl'`` in the
    current working directory, and the feature count is printed.

    Parameters
    ----------
    X_train, X_test : iterable of str
        Training and test documents.
    num_words : int, default 5000
        Passed to ``TfidfVectorizer`` as ``max_features``.

    Returns
    -------
    tuple
        ``(X_train, X_test)`` as dense arrays.
    """
    vectorizer_x = TfidfVectorizer(max_features=num_words)
    X_train = vectorizer_x.fit_transform(X_train).toarray()
    X_test = vectorizer_x.transform(X_test).toarray()

    # Context manager so the file is flushed and closed even if pickling fails.
    with open('vectoriser.pkl', 'wb') as vectoriser_file:
        pickle.dump(vectorizer_x, vectoriser_file)
    print("tf-idf with", str(X_train.shape[1]), "features")

    return (X_train,X_test)

In [18]:
def build_DNN_model(shape, num_classes, dropout=0.2, node=512, num_layers=4):
    """Build and compile a fully connected softmax classifier.

    Architecture: an input ``Dense(node, relu)`` + ``Dropout`` block, followed
    by ``num_layers`` further ``Dense(node, relu)`` + ``Dropout`` blocks, and a
    ``Dense(num_classes, softmax)`` output. The network therefore contains
    ``num_layers + 1`` hidden Dense layers in total.

    Parameters
    ----------
    shape : int
        Number of input features (``input_dim`` of the first layer).
    num_classes : int
        Number of output units.
    dropout : float, default 0.2
        Dropout rate applied after every hidden Dense layer.
    node : int, default 512
        Units in every hidden Dense layer.
    num_layers : int, default 4
        Number of hidden blocks added after the input block.

    Returns
    -------
    keras.models.Sequential
        Model compiled with categorical cross-entropy loss, the Adam optimizer
        and accuracy as metric.
    """
    model = Sequential()
    model.add(Dense(node,input_dim=shape,activation='relu'))
    model.add(Dropout(dropout))

    for _ in range(num_layers):
        model.add(Dense(node,input_dim=node,activation='relu'))
        model.add(Dropout(dropout))

    model.add(Dense(num_classes, activation='softmax'))
    model.compile(loss='categorical_crossentropy',
                  optimizer='adam',
                  metrics=['accuracy'])

    return model

In [19]:
# Features are the cleaned document texts; targets are the class-label strings.
X = df['Content'].values
y = df['Type'].values

In [20]:
# Map the label strings to integer ids, then to one-hot rows for the softmax output.
encoder = LabelEncoder()
encoder.fit(y)
encoded_y = encoder.transform(y)
dummy_y = np_utils.to_categorical(encoded_y)

# Persist the fitted encoder so predictions can be mapped back to label strings later.
# The context manager makes sure the file is flushed and closed.
with open('encoder.pkl', 'wb') as encoder_file:
    pickle.dump(encoder, encoder_file)

In [21]:
# Hold out 20% of the documents as a test set; the fixed random_state makes the split
# repeatable for a given row order. No stratify argument is passed.
X_train, X_test, y_train, y_test = train_test_split(X, dummy_y, test_size=0.20, random_state=7)

In [22]:
# Vectorize both splits with a TF-IDF vocabulary learned from the training texts only.
# Side effect: tfidf() also writes the fitted vectorizer to 'vectoriser.pkl'.
X_train_tfidf, X_test_tfidf = tfidf(X_train, X_test)

tf-idf with 5000 features


In [23]:
# Size the network from the data: input width is the TF-IDF feature count and the
# output width is the number of one-hot label columns (rather than a hard-coded 4).
model = build_DNN_model(X_train_tfidf.shape[1], y_train.shape[1])

In [24]:
# Print the layer-by-layer architecture and parameter counts.
model.summary()

Model: "sequential_1"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
dense_1 (Dense)              (None, 512)               2560512   
_________________________________________________________________
dropout_1 (Dropout)          (None, 512)               0         
_________________________________________________________________
dense_2 (Dense)              (None, 512)               262656    
_________________________________________________________________
dropout_2 (Dropout)          (None, 512)               0         
_________________________________________________________________
dense_3 (Dense)              (None, 512)               262656    
_________________________________________________________________
dropout_3 (Dropout)          (None, 512)               0         
_________________________________________________________________
dense_4 (Dense)              (None, 512)              

In [25]:
# Train for 5 epochs in batches of 128, with 20% of the training rows used for validation.
history = model.fit(X_train_tfidf, y_train, validation_split=0.2, epochs=5, batch_size=128, verbose=1)

Train on 1024 samples, validate on 256 samples
Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


In [26]:
# Evaluate on the held-out test split. With the model compiled with metrics=['accuracy'],
# score[0] is the loss and score[1] the accuracy.
score = model.evaluate(X_test_tfidf, y_test, verbose=1)

print("Test Score:", score[0])
print("Test Accuracy:", score[1])

Test Score: 0.006668820339473314
Test Accuracy: 0.996874988079071


In [27]:
model.save('model.h5')  # writes the trained model to the HDF5 file 'model.h5' in the working directory
del model  # drop the in-memory model; a later cell reloads it from 'model.h5'

In [28]:
from keras.wrappers.scikit_learn import KerasClassifier
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import KFold

def tfidf_all(X, num_words=5000):
    """Fit a TF-IDF vectorizer on every document in ``X`` and return dense features.

    Unlike ``tfidf``, nothing is written to disk and there is no train/test
    distinction: the vocabulary is learned from all of ``X``.

    Parameters
    ----------
    X : iterable of str
        Documents to vectorize.
    num_words : int, default 5000
        Passed to ``TfidfVectorizer`` as ``max_features``.

    Returns
    -------
    numpy.ndarray
        Dense TF-IDF matrix with one row per document.
    """
    tfidf_matrix = TfidfVectorizer(max_features=num_words).fit_transform(X)
    dense_features = tfidf_matrix.toarray()

    n_features = dense_features.shape[1]
    print("tf-idf with", str(n_features), "features")

    return dense_features

def DNN_model():
    """Zero-argument model factory used as ``build_fn`` for ``KerasClassifier``.

    Builds the same network as ``build_DNN_model`` for 5000 input features and
    4 output classes (hidden width 512, four additional hidden blocks, dropout
    0.2), delegating to it instead of repeating the layer definitions.

    Returns
    -------
    keras.models.Sequential
        A freshly built, compiled model.
    """
    return build_DNN_model(5000, 4, dropout=0.2)

In [30]:
# 5-fold cross-validation of the same architecture and training settings.
estimator = KerasClassifier(build_fn=DNN_model, epochs=5, batch_size=128, verbose=1)
# Seed the shuffle so the fold assignment is repeatable between runs.
kfold = KFold(n_splits=5, shuffle=True, random_state=7)
# NOTE(review): tfidf_all fits the vectorizer on all of X before the folds are formed, so
# each fold's held-out documents contribute to the vocabulary and IDF weights.
results = cross_val_score(estimator, tfidf_all(X), dummy_y, cv=kfold)

print("Accuracy: %.2f%% (%.2f%%)" % (results.mean()*100, results.std()*100))

tf-idf with 5000 features
Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5
Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5
Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5
Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5
Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5
Accuracy: 99.62% (0.23%)


In [31]:
# Reload the three artifacts written earlier in this notebook: the trained network,
# the fitted TF-IDF vectorizer and the fitted label encoder.
model = load_model('model.h5')
# pickle.load can execute arbitrary code; only load pickle files you produced yourself.
# Context managers make sure the file handles are closed.
with open('vectoriser.pkl', 'rb') as vectoriser_file:
    vectoriser = pickle.load(vectoriser_file)
with open('encoder.pkl', 'rb') as encoder_file:
    encoder = pickle.load(encoder_file)

In [32]:
def preprocess_text(text):
    """Reduce raw document text to space-separated alphabetic tokens.

    Inference-time text has to be cleaned the same way as the text the
    vectorizer was fitted on. Steps, in order: every character that is not an
    ASCII letter becomes a space, single letters standing between whitespace
    are dropped, and runs of whitespace are collapsed to one space. Case is
    preserved and leading or trailing spaces are not stripped.

    Parameters
    ----------
    text : str or None
        Raw text. ``None`` is accepted because callers pass
        ``parsed["content"]`` straight through, which can be ``None`` when no
        text was extracted from a file.

    Returns
    -------
    str
        The cleaned text; ``''`` for ``None`` or empty input.
    """
    if not text:
        return ''

    processed = re.sub('[^a-zA-Z]', ' ', text)
    # The pattern consumes the whitespace on both sides of the letter, so of
    # two adjacent single letters ("a b") only the first one is removed.
    processed = re.sub(r"\s+[a-zA-Z]\s+", ' ', processed)
    processed = re.sub(r'\s+', ' ', processed)

    return processed

def process_input_data(paths):
    """Parse files with Tika and return their cleaned text, one entry per path.

    Parameters
    ----------
    paths : list of str
        Files to parse; each path is printed as it is processed.

    Returns
    -------
    list of str
        Cleaned text in the same order as ``paths``. A file without
        extractable text yields an empty string, so positions stay aligned
        with the input.
    """
    data = []

    for path in paths:
        print('Processing {}'.format(path))
        parsed = parser.from_file(path)
        # "content" may be None (or absent) when nothing could be extracted;
        # fall back to empty text instead of failing inside the regex cleanup.
        text = preprocess_text(parsed.get("content") or "")
        data.append(text)

    return data

In [45]:
# Smoke test on a single known document: the paper at position 10 of paper_paths.
papers = process_input_data([paper_paths[10]])

Processing C:\Users\Alex.Chokwijitkul\Desktop\Document Classification\data\Papers\1812.02993.pdf


In [46]:
# Vectorize with the vectorizer reloaded from disk (transform only, no refitting).
# The result is a dense array with a single row for the single document.
vector = vectoriser.transform([papers[0]]).toarray()
vector

array([[0., 0., 0., ..., 0., 0., 0.]])

In [47]:
# Class probabilities from the softmax output: one row per document, one column per class.
prediction = model.predict([vector])
prediction

array([[8.7061787e-13, 1.8738636e-22, 1.0000000e+00, 1.0185268e-28]],
      dtype=float32)

In [48]:
# Turn the probabilities of the first (only) document into a one-hot vector by marking
# the most probable class. Rounding alone would give an all-zero vector whenever no
# probability exceeds 0.5, leaving nothing to decode.
probabilities = prediction[0]
prediction = np.zeros_like(probabilities)
prediction[np.argmax(probabilities)] = 1
prediction

array([0., 0., 1., 0.], dtype=float32)

In [49]:
# Decode the marked position back to its label string. np.where returns a tuple of index
# arrays (one per dimension); hand the encoder the 1-D index array itself, not the tuple.
prediction = encoder.inverse_transform(np.where(prediction == 1)[0])
prediction[0]

'paper'