In [41]:
# Chapter5. Author Attribution. Solution 

################################################# STEP 1/ Prepare the data.#################################
import fileinput     
import glob 
import os
here = os.path.dirname(__file__) if "__file__" in locals() else "."
files = [("AuthorA", os.path.join(here, "./papers//A//*.txt")),
         ("AuthorB", os.path.join(here, "./papers//B//*.txt"))]

"""preprocess_text: Gets the contents of all .txt files from a folder and join them to a long string
--------------- Paras: --------------------------------------------------------------------------
@Author: The output file Name 
@DirName: Str indicates file location
--------------- Returns:-------------------------------------------------------------------------
@Text: a long tring representing the joined files contents 
"""
def preprocess_text(Author,DirName):
    read_files = glob.glob(DirName)
    # .read() method of a file handle 
    # reads the contents of a file and produces a long string
    with open(Author +".txt", "wb") as outfile:
        for file in read_files:
            with open(file, "rb") as infile:
                outfile.write(infile.read().lower())
        # Read the text with long string and lowercase it then export it 
        Tempfile = open(Author +".txt", 'rt').read().replace('hamilton','').replace('madison','')
        Text = ' '.join(Tempfile.replace("\n",' ').split()) # rmv newlines, whitespaces
        # write to file IF Need
        # print(Text,  file=open(Author +'.txt', 'w'))
        # print(Author + " text length is: {}".format(len(Text))) #print text length
    return Text

All_Text = {} # Dynamic variable in python 
for Author, DirName in files:
    All_Text[Author] = preprocess_text(Author, DirName)
    # print(Author + " text length is: {}".format(len(All_Text[Author]))) 
# # Access Dynamic variable in python 
# print("A text length is: {}".format(len(All_Text["AuthorA"]))) #print text length
# print("B text length is: {}".format(len(All_Text["AuthorB"]))) #print text length 

###################### STEP 2/Break the long text for each author into smaller sequences.####################
from keras.preprocessing.text import Tokenizer
from sklearn.model_selection import train_test_split
import numpy as np
SEQ_LEN = 30 
# Classes for A/B/Unknown
A = 0
B = 1
UNKNOWN = -1

"""make_subsequences: Gets the contents of all .txt files from a folder and join them to a long string
--------------- Paras: --------------------------------------------------------------------------
@long_sequence:(str) The input text which is a long string joined from multiple .txt files 
@label: (str) Label name 
@sequence_length: the length of sequences 
--------------- Returns:-------------------------------------------------------------------------
@X,y: Matrixes for preparing datasets. 
-------------------------------------------------------------------------------------------------
"""
def make_subsequences(long_sequence, label, sequence_length = SEQ_LEN):
    len_seq = len(long_sequence)
    X = np.zeros(((len_seq - sequence_length)+1, sequence_length)) # Why not "/" but "-" 
    y = np.zeros((X.shape[0],1)) # This is for label <kept the X rows - X.shape[0]>
    for i in range(X.shape[0]): 
        X[i] = long_sequence[i:i+sequence_length] # put the sequence to X matrix
        y[i] = label # label the sequence
    return X,y

# Use the Tokenizer class from Keras to convert the long texts into a sequence of characters ? (looks like numbers) (not words)
tokenizer = Tokenizer(char_level= True)
# Make sure to fit all characters in texts from both authors
tokenizer.fit_on_texts(All_Text["AuthorA"] + All_Text["AuthorB"])

authorA_long_sequence = tokenizer.texts_to_sequences([All_Text["AuthorA"]])[0]
authorB_long_sequence = tokenizer.texts_to_sequences([All_Text["AuthorB"]])[0]

# Convert the long sequencese into sequence and label pairs
X_authorA, y_authorA = make_subsequences(authorA_long_sequence, A)
X_authorB, y_authorB = make_subsequences(authorB_long_sequence, B)

# Print sizes of available data
print("Number of characters: {}".format(len(tokenizer.word_index)))
print('author A sequences: {}'.format(X_authorA.shape))
print('author B sequences: {}'.format(X_authorB.shape))

# Calculate the number of unique words in the text 
# Compare the number of raw characters to the number of labeled sequences for each author

word_tokenizer = Tokenizer()
word_tokenizer.fit_on_texts([All_Text["AuthorA"], All_Text["AuthorB"]])
print("Total word count: ", len((All_Text["AuthorA"] + ' ' + All_Text["AuthorB"]).split(' ')))
print ("Total number of unique words: ", len(word_tokenizer.word_index))


Number of characters: 52
author A sequences: (216441, 30)
author B sequences: (230897, 30)
Total word count:  74412
Total number of unique words:  6337


In [42]:
# How we can join and split word after removing all the whitespaces and newlines 
string = "         spacious \n      Hai          \n kkkk    \n Third   NOwWW  "
print(string)
" ".join(string.lower().replace("\n",'').replace("Hai",'').split())


spacious 
      Hai          
 kkkk    
 Third   NOwWW  


'spacious hai kkkk third nowww'

In [42]:
###################### STEP 3: proceed to create our train, validation sets.##################################
# 1.Stack x data together and y data together
# 2.use train_test_split to split the dataset into 80% training and 20% validation
# 3.Reshape the data to make sure that they are sequences of correct length

#1. Stacking
X = np.vstack((X_authorA, X_authorB))
y = np.vstack((y_authorA, y_authorB))

#2. Split
X_train, X_val, y_train, y_val = train_test_split(X,y, train_size =0.8)

#3. Ensure that the data is the same size as expected (batch_size, sequence length) (optional)
# -1, the value is inferred from the length of the array and remaining dimension. (10,10,2) = (5,5,8) = (5,5,-1)
X_train = X_train.reshape(-1, SEQ_LEN) 
X_val = X_val.reshape(-1, SEQ_LEN)

# Print the shapes of the train, validation and test sets
print("X_train shape: {}".format(X_train.shape))
print("y_train shape: {}".format(y_train.shape))
print("X_validate shape: {}".format(X_val.shape))
print("y_validate shape: {}".format(y_val.shape))

X_train shape: (357870, 30)
y_train shape: (357870, 1)
X_validate shape: (89468, 30)
y_validate shape: (89468, 1)


In [55]:
###################### STEP 4: Buid and Train RNN, then save RNN model ##################################
from keras.layers import SimpleRNN, Embedding, Dense
from keras.models import Sequential
from keras.optimizers import SGD, Adadelta, Adam
Embedding_size = 100
RNN_size = 156
Batch_size = 4096
Epochs = 20

RNN = Sequential()
RNN.add(Embedding(len(tokenizer.word_index) + 1,Embedding_size, input_length = SEQ_LEN))
RNN.add(SimpleRNN(RNN_size, return_sequences = False))
RNN.add(Dense(1,activation='sigmoid'))

RNN.compile(optimizer= 'adam', loss = 'binary_crossentropy', metrics = ['accuracy'])
RNN.summary()

RNN.fit(X_train, y_train, batch_size= Batch_size, epochs= Epochs, validation_data=(X_val,y_val))
# Evaluate model 
scores = RNN.evaluate(X_val,y_val, verbose= 0)
RNN.save("RNN.h5")
print("Saved model RNN to disk")


Model: "sequential_5"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_5 (Embedding)      (None, 30, 100)           5300      
_________________________________________________________________
simple_rnn_4 (SimpleRNN)     (None, 156)               40092     
_________________________________________________________________
dense_4 (Dense)              (None, 1)                 157       
Total params: 45,549
Trainable params: 45,549
Non-trainable params: 0
_________________________________________________________________
Train on 357870 samples, validate on 89468 samples
Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20
Saved model RNN to disk


In [57]:
#######################STEP 5:Applying the model to Unknown papers ###################################
for paper in  sorted(os.listdir('./papers/Unknown/')):
    unknown = preprocess_text('Unknown','./papers/Unknown/' + paper)
    unknown_long_sequences = tokenizer.texts_to_sequences([unknown])[0]
    X_sequences, _ = make_subsequences(unknown_long_sequences, UNKNOWN)
    X_sequences = X_sequences.reshape((-1,SEQ_LEN))

    votes_A = 0
    votes_B = 0

    y = RNN.predict(X_sequences)
    # >0.5, convert the sequence of 0 and 1
    y = y>0.5
    votes_A = np.sum(y==0)
    votes_B = np.sum(y==1)

    print("Paper {} is predicted to have been written by {}, {} to {}"
                                            .format(paper.replace('paper_','').replace('.txt',''),
                                            ("Author A" if votes_A > votes_B else "Author B"),
                                            max(votes_A,votes_B),
                                            min(votes_A,votes_B)))



Paper 1 is predicted to have been written by Author B, 12293 to 8484
Paper 2 is predicted to have been written by Author B, 11384 to 8265
Paper 3 is predicted to have been written by Author A, 6753 to 6634
Paper 4 is predicted to have been written by Author A, 5109 to 4667
Paper 5 is predicted to have been written by Author A, 6808 to 4949


In [58]:
#######################STEP 6:load and evaluate a saved model ###################################
from keras.models import load_model
# load model
Loaded_RNN = load_model('RNN.h5')
# summarize model.
Loaded_RNN.summary()
score = Loaded_RNN.evaluate(X_val, y_val, verbose=0)
print("%s: %.2f%%" % (Loaded_RNN.metrics_names[1], score[1]*100))

 ## Applying the loaded model to Unknown papers 
for paper in  sorted(os.listdir('./papers/Unknown/')):
    unknown = preprocess_text('Unknown','./papers/Unknown/' + paper)
    unknown_long_sequences = tokenizer.texts_to_sequences([unknown])[0]
    X_sequences, _ = make_subsequences(unknown_long_sequences, UNKNOWN)
    X_sequences = X_sequences.reshape((-1,SEQ_LEN))

    votes_A = 0
    votes_B = 0

    y = Loaded_RNN.predict(X_sequences)
    # >0.5, convert the sequence of 0 and 1
    y = y>0.5
    votes_A = np.sum(y==0)
    votes_B = np.sum(y==1)

    print("[Loaded_RNN] Paper {} is predicted to have been written by {}, {} to {}"
                                            .format(paper.replace('paper_','').replace('.txt',''),
                                            ("Author A" if votes_A > votes_B else "Author B"),
                                            max(votes_A,votes_B),
                                            min(votes_A,votes_B)))

Model: "sequential_5"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_5 (Embedding)      (None, 30, 100)           5300      
_________________________________________________________________
simple_rnn_4 (SimpleRNN)     (None, 156)               40092     
_________________________________________________________________
dense_4 (Dense)              (None, 1)                 157       
Total params: 45,549
Trainable params: 45,549
Non-trainable params: 0
_________________________________________________________________
accuracy: 86.65%
[Loaded_RNN] Paper 1 is predicted to have been written by Author B, 12293 to 8484
[Loaded_RNN] Paper 2 is predicted to have been written by Author B, 11384 to 8265
[Loaded_RNN] Paper 3 is predicted to have been written by Author A, 6753 to 6634
[Loaded_RNN] Paper 4 is predicted to have been written by Author A, 5109 to 4667
[Loaded_RNN] Paper 5 is predicted to 