In [1]:
import re
import nltk
from nltk.corpus import brown
import pandas as pd

In [3]:
import pandas as pd
import numpy as np

from keras import layers
from keras.models import Sequential
from keras.preprocessing.text import Tokenizer, one_hot
from keras.preprocessing.sequence import pad_sequences
from keras.layers import Embedding, Conv1D, GlobalMaxPooling1D, Dense, Dropout, Flatten, MaxPooling1D, Input, Concatenate
from keras.models import load_model

import sklearn
from sklearn.model_selection import train_test_split
from sklearn.utils import class_weight
from sklearn.metrics import classification_report, accuracy_score, confusion_matrix

import matplotlib.pyplot as plt

In [4]:
# Brown corpus categories that make up each of the two target classes.
fiction = [
    'adventure',
    'fiction',
    'mystery',
    'romance',
    'science_fiction',
]
nonfiction = [
    'government',
    'hobbies',
    'learned',
    'news',
    'reviews',
]

In [5]:
# Flatten the per-category file ids into one list per class,
# keeping the category order given in `fiction` / `nonfiction`.
fiction_ids = [
    fileid
    for category in fiction
    for fileid in brown.fileids(categories=category)
]
nonfiction_ids = [
    fileid
    for category in nonfiction
    for fileid in brown.fileids(categories=category)
]

In [6]:
# Build one record per Brown-corpus paragraph that has 5 or 6 sentences.
# Each record holds a unique id, the lower-cased paragraph text and the
# binary class label (1 = fiction, 0 = non-fiction).
MIN_SENTENCES = 5
MAX_SENTENCES = 6

# Set lookup instead of scanning the fiction_ids list once per file.
fiction_id_set = set(fiction_ids)

data = []
for fileid in fiction_ids + nonfiction_ids:
    label = 1 if fileid in fiction_id_set else 0
    for j, p in enumerate(brown.paras(fileids=fileid)):
        # `p` is a list of sentences, each sentence a list of tokens.
        if MIN_SENTENCES <= len(p) <= MAX_SENTENCES:
            # Tokens are joined with single spaces, sentences likewise.
            text = ' '.join(' '.join(sent) for sent in p).strip().lower()
            data.append({
                'id': f'{fileid}_para_{j}',
                'para': text,
                'label': label,
            })

In [7]:
# One row per paragraph record; columns: id, para, label.
df = pd.DataFrame(data)

In [8]:
# Plain Python lists: paragraph texts (features) and their 0/1 labels (targets).
X = df['para'].tolist()
y = df['label'].tolist()

In [9]:
def load_glove_vectors(path):
    """Parse a GloVe-style text file into a ``{word: vector}`` dictionary.

    Every non-empty line is expected to hold a word followed by its
    whitespace-separated vector components.

    Parameters
    ----------
    path : str
        Location of the embeddings text file.

    Returns
    -------
    dict
        Maps each word to a 1-D ``float32`` numpy array.

    Raises
    ------
    OSError
        If the file cannot be opened.
    ValueError
        If a vector component cannot be parsed as a float.
    """
    vectors = {}
    # Read as UTF-8 explicitly: relying on the platform default encoding
    # fails on systems where it is not UTF-8 (e.g. cp1252 on Windows).
    # The `with` block closes the file, so no explicit close() is needed.
    with open(path, 'r', encoding='utf-8') as f:
        for line in f:
            values = line.split()
            if not values:
                # Tolerate blank lines instead of failing on values[0].
                continue
            vectors[values[0]] = np.asarray(values[1:], dtype='float32')
    return vectors


# Word -> 100-dimensional GloVe vector; read by emb() below.
emmbed_dict = load_glove_vectors('../resources/glove.6B.100d.txt')

In [10]:
def emb(vocab_size, words_to_index, embedding_dim=100, embeddings=None):
    """Build the initial weight matrix for an embedding layer.

    Row ``i`` of the result holds the pre-trained vector of the word whose
    index is ``i``. Rows of words that have no pre-trained vector, and any
    row no word maps to (such as the reserved index 0), stay all-zero.

    Parameters
    ----------
    vocab_size : int
        Number of rows of the matrix; must be larger than every index in
        ``words_to_index``.
    words_to_index : dict
        Maps each word to its integer row index.
    embedding_dim : int, optional
        Length of each word vector (number of columns). Defaults to 100.
    embeddings : dict, optional
        Maps words to their vectors. When omitted, the module-level
        ``emmbed_dict`` is used.

    Returns
    -------
    numpy.ndarray
        Array of shape ``(vocab_size, embedding_dim)``.

    Raises
    ------
    IndexError
        If a word's index is not smaller than ``vocab_size``.
    ValueError
        If a vector's length does not match ``embedding_dim``.
    """
    if embeddings is None:
        # Default to the globally loaded vectors so existing two-argument
        # calls keep working unchanged.
        embeddings = emmbed_dict
    emb_matrix = np.zeros((vocab_size, embedding_dim))
    for word, index in words_to_index.items():
        embedding_vector = embeddings.get(word)
        if embedding_vector is not None:
            emb_matrix[index, :] = embedding_vector
    return emb_matrix

In [21]:
# Repeated hold-out evaluation: 10 random 70/30 splits, with a freshly
# initialised CNN trained and scored on each split.
train_scores = []  # training accuracy per split
test_scores = []   # test accuracy per split
vocab_sizes = []   # vocabulary size (including the reserved index 0) per split
reports = []       # classification_report dictionaries per split
embedding_dim = 100  # width of the matrix returned by emb()

for i in range(10):
    # Split train & test; the run index seeds the split.
    text_train, text_test, y_train, y_test = train_test_split(
        X, y, test_size=0.3, random_state=i)

    # Tokenize and transform to integer index. The vocabulary is fitted on
    # the training texts only.
    tokenizer = Tokenizer()
    tokenizer.fit_on_texts(text_train)
    X_train = tokenizer.texts_to_sequences(text_train)
    X_test = tokenizer.texts_to_sequences(text_test)
    words_to_index = tokenizer.word_index
    vocab_size = len(words_to_index) + 1  # Adding 1 because of reserved 0 index
    vocab_sizes.append(vocab_size)
    maxlen = max(len(x) for x in X_train)

    # Add padding to ensure all vectors have same dimensionality.
    # np.asfarray was removed in NumPy 2.0; np.asarray with an explicit
    # float64 dtype produces the same arrays.
    X_train = np.asarray(
        pad_sequences(X_train, padding='post', maxlen=maxlen), dtype=np.float64)
    X_test = np.asarray(
        pad_sequences(X_test, padding='post', maxlen=maxlen), dtype=np.float64)
    # Labels stay floats so the report keys remain '0.0' / '1.0', which the
    # F1 summary cells index by.
    y_train = np.asarray(y_train, dtype=np.float64)
    y_test = np.asarray(y_test, dtype=np.float64)

    weight = emb(vocab_size, words_to_index)

    model = Sequential()
    model.add(layers.Embedding(vocab_size, embedding_dim,
                               input_length=maxlen, weights=[weight]))
    model.add(layers.Conv1D(100, 3, activation='relu'))
    model.add(layers.GlobalMaxPooling1D())
    model.add(layers.Dense(10, activation='relu'))
    model.add(layers.Dense(1, activation='sigmoid'))
    model.compile(optimizer='adam',
                  loss='binary_crossentropy',
                  metrics=['accuracy'])

    # Fit model. The test split is passed as validation data for monitoring
    # only; no callback uses it for model selection.
    history = model.fit(X_train, y_train,
                        epochs=3,
                        verbose=True,
                        validation_data=(X_test, y_test),
                        batch_size=10)

    train_loss, train_accuracy = model.evaluate(X_train, y_train, verbose=True)
    test_loss, test_accuracy = model.evaluate(X_test, y_test, verbose=False)
    train_scores.append(train_accuracy)
    test_scores.append(test_accuracy)

    # Threshold the sigmoid outputs at 0.5 in one vectorised step.
    pred = model.predict(X_test)
    X_pred = (pred.ravel() > 0.5).astype(np.float64)
    report = classification_report(y_test, X_pred, output_dict=True)
    reports.append(report)

Epoch 1/3
Epoch 2/3
Epoch 3/3
Epoch 1/3
Epoch 2/3
Epoch 3/3
Epoch 1/3
Epoch 2/3
Epoch 3/3
Epoch 1/3
Epoch 2/3
Epoch 3/3
Epoch 1/3
Epoch 2/3
Epoch 3/3
Epoch 1/3
Epoch 2/3
Epoch 3/3
Epoch 1/3
Epoch 2/3
Epoch 3/3
Epoch 1/3
Epoch 2/3
Epoch 3/3
Epoch 1/3
Epoch 2/3
Epoch 3/3
Epoch 1/3
Epoch 2/3
Epoch 3/3


In [14]:
# Training accuracy over all splits, in percent: (mean, standard deviation)
train_mean, train_std = np.mean(train_scores), np.std(train_scores)
100 * train_mean, 100 * train_std

(99.72563982009888, 0.2945011366921104)

In [13]:
# Test accuracy over all splits, in percent: (mean, standard deviation)
test_mean, test_std = np.mean(test_scores), np.std(test_scores)
100 * test_mean, 100 * test_std

(93.6644583940506, 0.8083805374216849)

In [19]:
# F1 score for the fiction class (label '1.0') over all splits: (mean, standard deviation)
fiction_f1 = [report['1.0']['f1-score'] for report in reports]
np.mean(fiction_f1), np.std(fiction_f1)

(0.9393975799218485, 0.008094618048528169)

In [20]:
# F1 score for the non-fiction class (label '0.0') over all splits: (mean, standard deviation)
nonfiction_f1 = [report['0.0']['f1-score'] for report in reports]
np.mean(nonfiction_f1), np.std(nonfiction_f1)

(0.9334320976468365, 0.008835571417982356)

In [17]:
# Full classification report of the first split (random_state=0), as an example.
reports[0]

{'0.0': {'precision': 0.9800995024875622,
  'recall': 0.9036697247706422,
  'f1-score': 0.9403341288782816,
  'support': 218},
 '1.0': {'precision': 0.9166666666666666,
  'recall': 0.9829787234042553,
  'f1-score': 0.9486652977412732,
  'support': 235},
 'accuracy': 0.9448123620309051,
 'macro avg': {'precision': 0.9483830845771144,
  'recall': 0.9433242240874488,
  'f1-score': 0.9444997133097774,
  'support': 453},
 'weighted avg': {'precision': 0.9471928437283781,
  'recall': 0.9448123620309051,
  'f1-score': 0.9446560376703412,
  'support': 453}}

## For visualization 

In [11]:
%load_ext tensorboard

In [12]:
import tensorboard
tensorboard.__version__

'2.8.0'

In [20]:
# Prepare one fixed train/test split (seed 10) for the TensorBoard run.
text_train, text_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, random_state=10)

# Tokenize and transform to integer index. The vocabulary is fitted on the
# training texts only.
tokenizer = Tokenizer()
tokenizer.fit_on_texts(text_train)

X_train = tokenizer.texts_to_sequences(text_train)
X_test = tokenizer.texts_to_sequences(text_test)

words_to_index = tokenizer.word_index
vocab_size = len(words_to_index) + 1  # Adding 1 because of reserved 0 index

maxlen = max(len(x) for x in X_train)

# Add padding to ensure all vectors have same dimensionality.
# np.asfarray was removed in NumPy 2.0; np.asarray with an explicit float64
# dtype produces the same arrays.
X_train = np.asarray(
    pad_sequences(X_train, padding='post', maxlen=maxlen), dtype=np.float64)
X_test = np.asarray(
    pad_sequences(X_test, padding='post', maxlen=maxlen), dtype=np.float64)
y_train = np.asarray(y_train, dtype=np.float64)
y_test = np.asarray(y_test, dtype=np.float64)

# Initial embedding weights taken from the pre-trained vectors.
weight = emb(vocab_size, words_to_index)
embedding_dim = 100



In [22]:
# CNN text classifier: embedding layer initialised from `weight`, one 1-D
# convolution, global max pooling, then a small dense head with a sigmoid output.
model = Sequential([
    layers.Embedding(vocab_size, embedding_dim,
                     input_length=maxlen, weights=[weight]),
    layers.Conv1D(filters=100, kernel_size=3, activation='relu'),
    layers.GlobalMaxPooling1D(),
    layers.Dense(10, activation='relu'),
    layers.Dense(1, activation='sigmoid'),
])
model.compile(
    loss='binary_crossentropy',
    optimizer='adam',
    metrics=['accuracy'],
)
model.summary()

Model: "sequential_6"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding_6 (Embedding)     (None, 292, 100)          1315900   
                                                                 
 conv1d_2 (Conv1D)           (None, 290, 100)          30100     
                                                                 
 global_max_pooling1d (Globa  (None, 100)              0         
 lMaxPooling1D)                                                  
                                                                 
 dense (Dense)               (None, 10)                1010      
                                                                 
 dense_1 (Dense)             (None, 1)                 11        
                                                                 
Total params: 1,347,021
Trainable params: 1,347,021
Non-trainable params: 0
____________________________________________

In [23]:
from datetime import datetime
import keras

# Each run logs into its own timestamped sub-directory of logs/fit/.
logdir = f"logs/fit/{datetime.now():%Y%m%d-%H%M%S}"
tensorboard_callback = keras.callbacks.TensorBoard(log_dir=logdir)

# Train for three epochs, streaming metrics to TensorBoard through the callback.
history = model.fit(
    X_train,
    y_train,
    batch_size=10,
    epochs=3,
    validation_data=(X_test, y_test),
    callbacks=[tensorboard_callback],
    verbose=True,
)

Epoch 1/3
Epoch 2/3
Epoch 3/3


In [24]:
%tensorboard --logdir logs

Reusing TensorBoard on port 6006 (pid 26020), started 0:30:18 ago. (Use '!kill 26020' to kill it.)

In [None]:
Doc2Vec: you can train a Doc2Vec model on your dataset and then use the sentence vectors it produces.

In [None]:
Average of Word2Vec vectors: you can simply take the average of all the word vectors in a sentence. This average vector then represents your sentence vector.

In [None]:
Average of Word2Vec vectors with TF-IDF: this is one of the best approaches, and the one I recommend. Multiply each word vector by that word's TF-IDF score, then take the average of the weighted vectors; the result represents your sentence vector.

In [1]:
# # summarize history for loss
# plt.plot(history.history['loss'])
# plt.plot(history.history['val_loss'])
# plt.title('model loss')
# plt.ylabel('loss')
# plt.xlabel('epoch')
# plt.legend(['train', 'test'], loc='upper right')
# plt.show()

330