## Import Related Libraries for Text Loading and Processing

In [1]:
import numpy as np
import nltk
import glob
import os
import operator
import textwrap
from sklearn.feature_extraction.text import CountVectorizer
from scipy.cluster.vq import whiten

## Load training data
### 10 English books with labels from 0~9 (can be found in the folders)
###  Book Labels, Book Names and Authors

| Label | Book | Author |
| ---- | ---- | ---- |
| 0 | A Tale of Two Cities | Charles Dickens |
| 1 | Meditations | Marcus Aurelius |
| 2 | Dracula | Bram Stoker |
| 3 | Grimms' Fairy Tales | Grimm brothers |
| 4 | The Practice and Science of Drawing | Harold Speed |
| 5 | Pride and Prejudice | Jane Austen |
| 6 | Beyond Good and Evil | Friedrich Nietzsche |
| 7 | Dubliners | James Joyce |
| 8 | The Souls of Black Folk | W. E. B. Du Bois |
| 9 | The Picture of Dorian Gray | Oscar Wilde |

In [2]:
##  Create a dictionary for the final prediction
Book_dict={0:"Charles Dickens", 1:"Marcus Aurelius", 2:"Bram Stoker",
           3:"Grimm brothers", 4:"Harold Speed", 5:"Jane Austen",
           6:"Friedrich Nietzsche", 7:"James Joyce", 8:"W. E. B. Du Bois", 9:"Oscar Wilde"}

### From these 10 books, here I pick every 2000 strings as one sample, attached with the corresponding label.

In [3]:
# Create a list of sample labels representing the authorships
sample_labels=[] 
# Create a list of samples
samples = []
# Load data from the folder
files = glob.glob(os.path.join("Data", "*.txt"))
for fn in files:
    with open(fn, encoding="utf8") as f:
            print(fn)
            for segment in textwrap.wrap(f.read().replace('\n',' '),2000):
                samples.append(segment)
                sample_labels.append(int(fn.split('$')[1]))
all_text = ''.join(str(samples))
num_samples = len(samples) # 2370 samples in total

Data\$0$ A_Tale_of_Two_Cities.txt
Data\$1$ Meditations.txt
Data\$2$ Dracula.txt
Data\$3$ Grimms'_Fairy_Tales.txt
Data\$4$ The_Practice_and_Science_of_Drawing.txt
Data\$5$ Pride_and_prejudice.txt
Data\$6$ Beyond_Good_And_Evil.txt
Data\$7$ Dubliners.txt
Data\$8$ The_Souls_of_Black_Folk.txt
Data\$9$ The_Picture_of_Dorian_Gray.txt


## Feature Extraction
## Based on my literature survey and online resource seaching, here we define 3 functions to extract different features from the text segments:

## 1. Lexical and Punctuation features
- ### Lexical features:
    - #### The average number of words per sentence
    - #### Sentence length variation
    - #### Lexical diversity, which is a measure of the richness of the author’s vocabulary
- ### Punctuation features:
    - #### Average number of commas, semicolons and colons etc. per sentence

In [4]:
sentence_tokenizer = nltk.data.load('tokenizers/punkt/english.pickle')
word_tokenizer = nltk.tokenize.RegexpTokenizer(r'\w+')

def Lexical_Punctuation(data):
    # Creatte feature vector           
    fvs_lexical = np.zeros((len(data),3), np.float64)
    fvs_punct = np.zeros((len(data),5), np.float64)
    for e, ch_text in enumerate(data):
        # note: the nltk.word_tokenize includes punctuation
        tokens = nltk.word_tokenize(ch_text.lower())
        words = word_tokenizer.tokenize(ch_text.lower()) # words without punctuation
        sentences = sentence_tokenizer.tokenize(ch_text)
        vocab = set(words)
        words_per_sentence = np.array([len(word_tokenizer.tokenize(s))
                                       for s in sentences])

        # average number of words per sentence
        fvs_lexical[e, 0] = words_per_sentence.mean()
        # sentence length variation
        fvs_lexical[e, 1] = words_per_sentence.std()
        # Lexical diversity
        fvs_lexical[e, 2] = len(vocab) / float(len(words))

        # Commas per sentence
        fvs_punct[e, 0] = tokens.count(',') / float(len(sentences))
        # Semicolons per sentence
        fvs_punct[e, 1] = tokens.count(';') / float(len(sentences))
        # Colons per sentence
        fvs_punct[e, 2] = tokens.count(':') / float(len(sentences))
        # Question marks per sentence
        fvs_punct[e, 3] = tokens.count('?') / float(len(sentences))
        # Question marks per sentence
        fvs_punct[e, 4] = tokens.count('"') / float(len(sentences))
    # apply whitening to decorrelate the features
    fvs_lexical = whiten(fvs_lexical)
    fvs_punct = whiten(fvs_punct)
    
    return fvs_lexical, fvs_punct
    

## 2. Bag of Words features
###   Bag of words represents the frequencies of different words in each chapter

In [5]:
# get most common words in all books
NUM_TOP_WORDS = 10
all_tokens = nltk.word_tokenize(all_text)
fdist = nltk.FreqDist(all_tokens)
vocab = sorted(fdist.items(), key=operator.itemgetter(1),reverse=True) 
vocab = list(dict(vocab).keys())[:NUM_TOP_WORDS]

In [6]:
# use sklearn to create the bag for words feature vector for each chapter
vectorizer = CountVectorizer(vocabulary=vocab, tokenizer=nltk.word_tokenize)

def Bag_of_words(data):
    fvs_bow = vectorizer.fit_transform(data).toarray().astype(np.float64)
    # normalise by dividing each row by its Euclidean norm
    fvs_bow /= np.c_[np.apply_along_axis(np.linalg.norm, 1, fvs_bow)]
    return fvs_bow

## 3. Synthetic Features
###   For the last feature, here I extract syntactic features of the text. Part of speech (POS) is a classification of each token into a lexical category (e.g. noun). NLTK has a function for POS labeling, and our feature vector is comprised of frequencies for the most common POS tags:

In [7]:
# get part of speech for each token in each chapter
def token_to_pos(ch):
    tokens = nltk.word_tokenize(ch)
    return [p[1] for p in nltk.pos_tag(tokens)]

def Synthetic_features(data):
    data_pos = [token_to_pos(ch) for ch in data]

    # count frequencies for common POS types
    pos_list = ['NN', 'NNP', 'DT', 'IN', 'JJ', 'NNS']
    fvs_syntax = np.array([[ch.count(pos) for pos in pos_list]
                           for ch in data_pos]).astype(np.float64)

    # normalise by dividing each row by number of tokens in the books
    fvs_syntax /= np.c_[np.array([len(ch) for ch in data_pos])]
    
    return fvs_syntax

## Supervised Learning: MLP/SVM/KNN/RandomForest based classification

In [8]:
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
##   Build a classifier and then vote for 
def Author_Predictor_SVM(fvs_train, fvs_test, y_train):
    svcm = SVC()
    svcm.fit(fvs_train,y_train)
    return svcm.predict(fvs_test)

def Author_Predictor_MLP(fvs_train, fvs_test, y_train):
    MLP = MLPClassifier(hidden_layer_sizes=(150, ),max_iter=200)
    MLP.fit(fvs_train,y_train)
    return MLP.predict(fvs_test)

def Author_Predictor_RF(fvs_train, fvs_test, y_train):
    rf = RandomForestClassifier(max_depth=5, n_estimators=10, max_features=15)
    rf.fit(fvs_train,y_train)
    return rf.predict(fvs_test)

def Author_Predictor_KNN(fvs_train, fvs_test, y_train):
    KNN = KNeighborsClassifier(20)
    KNN.fit(fvs_train,y_train)
    return KNN.predict(fvs_test)

## Training and Testing data split Using 5-fold Cross Validation here
### 5 different splits of data set, in each split :80% for training, remaining 20% for testing

In [10]:
samples = np.array(samples)
sample_labels = np.array(sample_labels)

from sklearn.model_selection import train_test_split, StratifiedKFold
accuracy_result = []
classification_report_result = []
confusion_matrix_result = []
target_names = [Book_dict[i] for i in range(10)]
skf = StratifiedKFold(n_splits=5)
fold_n = 0
for train_index, test_index in skf.split(samples, sample_labels):
    X_train, X_test = samples[train_index],samples[test_index]
    y_train, y_test = sample_labels[train_index], sample_labels[test_index]
    
    # Here we concatenate all these four different feature vectors together to get the final feature vectors
    # Feature vectors for training
    train_fvs = np.hstack((Lexical_Punctuation(X_train)[0],Lexical_Punctuation(X_train)[1], Bag_of_words(X_train), Synthetic_features(X_train)))
    # Feature vectors for testing
    test_fvs = np.hstack((Lexical_Punctuation(X_test)[0],Lexical_Punctuation(X_test)[1], Bag_of_words(X_test), Synthetic_features(X_test)))
    
    # Show the 5 fold validation results of our model
    fold_n += 1
    print("***************************"+"Result for Fold "+str(fold_n)+"***************************************")
    acc = accuracy_score(y_test, Author_Predictor_MLP(train_fvs, test_fvs, y_train))
    print("The test accuracy for Fold"+str(fold_n)+"is", acc)
    accuracy_result.append(acc)
    cla_report = classification_report(y_test, Author_Predictor_MLP(train_fvs, test_fvs, y_train), target_names = target_names )
    print(cla_report)
    classification_report_result.append(cla_report)
    con_matrix = confusion_matrix(y_test, Author_Predictor_MLP(train_fvs, test_fvs, y_train))
    print(con_matrix)
    confusion_matrix_result.append(con_matrix)
    print("***************************"+"Result for Fold "+str(fold_n)+"***************************************")
    print(' ')



***************************Result for Fold2***************************************
The test accuracy for Fold2is 0.572992700729927
                     precision    recall  f1-score   support

    Charles Dickens       0.39      0.73      0.51        78
    Marcus Aurelius       1.00      0.19      0.32        42
        Bram Stoker       0.70      0.22      0.33        87
     Grimm brothers       0.92      0.84      0.88        55
       Harold Speed       0.74      0.61      0.67        46
        Jane Austen       0.67      0.54      0.59        71
Friedrich Nietzsche       0.40      0.78      0.52        41
        James Joyce       0.86      0.49      0.62        39
   W. E. B. Du Bois       0.43      0.72      0.54        43
        Oscar Wilde       0.81      0.83      0.82        46

        avg / total       0.68      0.58      0.57       548

[[64  0  0  1  1  1  7  0  3  1]
 [ 5  8  1  0  0  6 14  0  7  1]
 [43  0 15  0  2  6  1  0 17  3]
 [ 7  0  0 48  0  0  0  0  0  0]
 [



[[55  0  0  3  1 15  1  0  2  1]
 [ 8 26  1  2  0  1  3  0  1  0]
 [17  0 41  5  1 16  0  3  2  2]
 [ 4  0  0 50  0  0  0  0  0  0]
 [ 0  0  0  2 40  1  1  2  0  0]
 [ 2  0  0  0  0 68  1  0  0  0]
 [ 1  3  0  0  0  4 28  2  1  2]
 [ 2  0  2  1  1  0  0 33  0  0]
 [10  0  0  0  0  4  8  0 20  0]
 [ 2  0  0  0  0  1  0  1  1 41]]
***************************Result for Fold3***************************************
***************************Result for Fold4***************************************
The test accuracy for Fold4is 0.7371323529411765
                     precision    recall  f1-score   support

    Charles Dickens       0.55      0.85      0.67        78
    Marcus Aurelius       0.92      0.83      0.88        42
        Bram Stoker       0.76      0.75      0.75        87
     Grimm brothers       0.90      0.98      0.94        54
       Harold Speed       0.64      0.91      0.75        46
        Jane Austen       0.89      0.56      0.69        71
Friedrich Nietzsche       

In [11]:
accuracy_result

[0.572992700729927,
 0.7271062271062271,
 0.7371323529411765,
 0.7527675276752768,
 0.49074074074074076]

In [12]:
avg_accuracy = np.mean(accuracy_result)
print("The average accuracy of the test data is:", avg_accuracy)

The average accuracy of the test data is: 0.6561479098386697


In [None]:
print(confusion_matrix(y_test, Author_Predictor_MLP(train_fvs, test_fvs)))

In [None]:
target_names = [Book_dict[i] for i in range(10)]

print(classification_report(y_test, Author_Predictor_MLP(train_fvs, test_fvs), target_names = target_names ))

## Unsupervised Learning: KMeans Clustering 

In [None]:
from sklearn.cluster import KMeans
km = KMeans(n_clusters=10, init='k-means++', n_init=10, verbose=0)
km.fit(fvs)
