In [1]:
import os
import re
import nltk
from nltk.corpus import brown
import pandas as pd
import xml.etree.ElementTree as ET
from nltk.corpus.reader.bnc import BNCCorpusReader

In [3]:
import pandas as pd
import numpy as np

from keras import layers
from keras.models import Sequential
from keras.preprocessing.text import Tokenizer, one_hot
from keras.preprocessing.sequence import pad_sequences
from keras.layers import Embedding, Conv1D, GlobalMaxPooling1D, Dense, Dropout, Flatten, MaxPooling1D, Input, Concatenate
from keras.models import load_model

import sklearn
from sklearn.model_selection import train_test_split
from sklearn.utils import class_weight
from sklearn.metrics import classification_report, accuracy_score, confusion_matrix

import matplotlib.pyplot as plt

In [4]:
# Brown corpus categories treated as the fiction class (label 1 downstream).
fiction = [
    'adventure',
    'fiction',
    'mystery',
    'romance',
    'science_fiction',
]
# Brown corpus categories treated as the non-fiction class (label 0 downstream).
nonfiction = [
    'government',
    'hobbies',
    'learned',
    'news',
    'reviews',
]

In [5]:
# Flatten the per-category file ids into one list per class, keeping the
# category order given in `fiction` / `nonfiction`.
fiction_ids = []
for category in fiction:
    fiction_ids.extend(brown.fileids(categories=category))

nonfiction_ids = []
for category in nonfiction:
    nonfiction_ids.extend(brown.fileids(categories=category))

In [6]:
# Build one record per Brown paragraph that has 5 or 6 sentences.
# Each record holds a unique id, the lower-cased paragraph text and a binary
# label (1 = fiction, 0 = non-fiction).
fiction_id_set = set(fiction_ids)  # constant-time membership test per file

data = []
for fileid in fiction_ids + nonfiction_ids:
    label = 1 if fileid in fiction_id_set else 0
    for j, p in enumerate(brown.paras(fileids=fileid)):
        # Keep only paragraphs with more than 4 and fewer than 7 sentences.
        if 4 < len(p) < 7:
            # Each sentence is a list of tokens: join tokens, then sentences,
            # with single spaces.
            text = ' '.join(' '.join(sent) for sent in p).strip().lower()
            data.append({
                'id': f'{fileid}_para_{j}',
                'para': text,
                'label': label,
            })

In [7]:
# One row per selected Brown paragraph; columns come from the record keys.
df_brown = pd.DataFrame(data)

In [8]:
# Training inputs: paragraph texts as a plain Python list.
text_train = df_brown['para'].tolist()
# Training targets: the 0/1 label column (still a pandas Series at this point).
y_train = df_brown['label']

In [9]:
# Load pre-trained GloVe vectors into a {word: float32 vector} mapping.
# Each line is parsed as a token followed by its vector components.
# NOTE(review): absolute, machine-specific path — consider moving it to a
# configurable data directory.
GLOVE_PATH = '/home/mindbowser/MS/MS_SEM_9_Final/Brown_Corpus_Analysis/Analysis/Deep Learning model/glove.6B.100d.txt'

emmbed_dict = {}
# Read explicitly as UTF-8 so parsing does not depend on the platform's
# default encoding; the `with` block closes the file on exit.
with open(GLOVE_PATH, 'r', encoding='utf-8') as f:
    for line in f:
        values = line.split()
        if not values:
            # Skip blank lines instead of failing on values[0].
            continue
        word = values[0]
        emmbed_dict[word] = np.asarray(values[1:], dtype='float32')

In [10]:
def emb(vocab_size, words_to_index, embedding_dim=100, embeddings=None):
    """Build an embedding matrix whose rows line up with tokenizer indices.

    Parameters
    ----------
    vocab_size : int
        Number of rows in the returned matrix.
    words_to_index : dict
        Mapping from word to its integer row index.
    embedding_dim : int, optional
        Length of each embedding vector (number of columns). Defaults to 100,
        the value that was previously hard-coded.
    embeddings : mapping, optional
        Mapping from word to vector. When omitted, the module-level
        ``emmbed_dict`` is used, as before.

    Returns
    -------
    numpy.ndarray
        Array of shape ``(vocab_size, embedding_dim)``. Rows for words without
        a known vector stay all-zero.
    """
    if embeddings is None:
        embeddings = emmbed_dict
    emb_matrix = np.zeros((vocab_size, embedding_dim))
    for word, index in words_to_index.items():
        if index >= vocab_size:
            # The index does not fit in the matrix (vocabulary was truncated);
            # skip it rather than raise IndexError.
            continue
        embedding_vector = embeddings.get(word)
        if embedding_vector is not None:
            emb_matrix[index, :] = embedding_vector
    return emb_matrix

In [14]:
# Fit the tokenizer on the Brown training paragraphs only, then encode those
# same paragraphs. The fitted `tokenizer` is reused later to encode test text.
tokenizer = Tokenizer()
tokenizer.fit_on_texts(text_train)
X_train = tokenizer.texts_to_sequences(text_train)

In [15]:
# Word -> integer index mapping learned by the tokenizer.
words_to_index = tokenizer.word_index
vocab_size = len(words_to_index) + 1  # Adding 1 because of reserved 0 index
# Longest encoded training paragraph; used as the padding length for both
# training and test sequences.
maxlen = max(len(x) for x in X_train)

In [16]:
# Pad every encoded paragraph to `maxlen` (padding added at the end) and
# convert inputs and labels to float64 arrays.
# np.asfarray was removed in NumPy 2.0; np.asarray with an explicit float64
# dtype produces the same result.
X_train = np.asarray(pad_sequences(X_train, padding='post', maxlen=maxlen), dtype=np.float64)
y_train = np.asarray(y_train, dtype=np.float64)

In [17]:
# Embedding matrix aligned with the tokenizer's word indices; passed to the
# Embedding layer as its initial weights.
weight = emb(vocab_size, words_to_index)
# Must match the number of columns produced by emb().
embedding_dim = 100

In [32]:
# CNN text classifier: embedding initialised from `weight` -> 1D convolution
# -> global max pooling -> small dense head with a sigmoid output.
model = Sequential([
    layers.Embedding(vocab_size, embedding_dim, input_length=maxlen, weights=[weight]),
    layers.Conv1D(100, 3, activation='relu'),
    layers.GlobalMaxPooling1D(),
    layers.Dense(10, activation='relu'),
    layers.Dense(1, activation='sigmoid'),
])
model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])

# Fit on the whole Brown set; no validation data is passed here.
history = model.fit(X_train, y_train, batch_size=10, epochs=3, verbose=True)

# Loss and accuracy measured on the same data the model was trained on.
train_loss, train_accuracy = model.evaluate(X_train, y_train, verbose=True)

Epoch 1/3
Epoch 2/3
Epoch 3/3


In [33]:
# Accuracy on the training data itself (from model.evaluate on X_train), so it
# is not an estimate of generalisation.
print("Training Accuracy:  {:.4f}".format(train_accuracy))

Training Accuracy:  0.9993


In [34]:
# test_loss, test_accuracy = model.evaluate(X_test, y_test, verbose=True)
# test_accuracy

# Baby BNC

In [35]:
# File names of the Baby BNC fiction and academic texts.
# os.listdir returns entries in arbitrary order; sorting fixes the row order
# of the frames built from these lists, so the seeded sampling done later
# selects the same paragraphs on every machine.
fict_ids = sorted(os.listdir(r"../../baby_bnc_corpus/Texts/fic/"))
non_fict_ids = sorted(os.listdir(r"../../baby_bnc_corpus/Texts/aca/"))

In [36]:
def load_bnc_paragraphs(directory, file_names, label, min_sents=5, max_sents=6):
    """Collect mid-length paragraphs from BNC XML files as labelled records.

    Parameters
    ----------
    directory : str
        Directory that contains the XML files.
    file_names : iterable of str
        Names of the XML files inside ``directory``.
    label : int
        Class label stored on every record from these files.
    min_sents, max_sents : int, optional
        Inclusive bounds on the number of ``<s>`` children a paragraph must
        have to be kept (defaults keep paragraphs with 5 or 6 sentences).

    Returns
    -------
    list of dict
        One ``{'id', 'para', 'label'}`` record per kept paragraph, in file
        order and then document order.
    """
    records = []
    for xmlfile in file_names:
        root = ET.parse(os.path.join(directory, xmlfile)).getroot()
        # NOTE(review): the path has a leading '.' with no '/'; it appears to
        # be resolved like './wtext/div/p' — confirm against ElementTree docs.
        for j, p in enumerate(root.findall('.wtext/div/p')):
            sents = p.findall('s')
            if not min_sents <= len(sents) <= max_sents:
                continue
            # Child texts are concatenated without a separator; presumably the
            # corpus tokens carry their own trailing whitespace — TODO confirm.
            sen_list = [
                ''.join(w.text.lower() for w in s if w.text).strip()
                for s in sents
            ]
            records.append({
                'id': f'{xmlfile}_para_{j}',
                'para': ' '.join(sen_list),
                'label': label,
            })
    return records


# Fiction paragraphs are labelled 1, academic (non-fiction) paragraphs 0.
fic_data = load_bnc_paragraphs("../../baby_bnc_corpus/Texts/fic/", fict_ids, label=1)
non_fic_data = load_bnc_paragraphs("../../baby_bnc_corpus/Texts/aca/", non_fict_ids, label=0)

In [38]:
# All Baby BNC records, fiction first and non-fiction after.
# Note: `data` is rebound here; it previously held the Brown records.
data = [*fic_data, *non_fic_data]
df_baby = pd.DataFrame(data)

In [39]:
# All non-fiction rows, renumbered from 0 with the old index discarded.
is_non_fiction = df_baby.label == 0
df_non_fict = df_baby[is_non_fiction].reset_index(drop=True)

In [40]:
# Build 10 test sets: each combines a different seeded sample of 250 fiction
# paragraphs with the full non-fiction set.
fiction_rows = df_baby[df_baby.label == 1]  # filter once, not on every iteration

dfs = {}
for i in range(10):
    df_fict = fiction_rows.sample(250, random_state=i).reset_index(drop=True)
    # DataFrame.append was removed in pandas 2.0; pd.concat is the replacement.
    df_final = pd.concat([df_fict, df_non_fict], ignore_index=True)
    dfs[i] = df_final

In [41]:
# Evaluate the trained model on every resampled Baby BNC test set.
X_predictions = {}  # resample index -> raw sigmoid outputs
scores = []         # test accuracy per resample
reports = []        # classification_report dict per resample
for i, test in dfs.items():
    # Test text and labels for this resample.
    text_test = test['para'].to_list()
    # Labels stay float so classification_report keys its per-class entries
    # as '0.0' / '1.0', which the summary cells look up.
    # np.asfarray was removed in NumPy 2.0; np.asarray with float64 is equivalent.
    y_test = np.asarray(test['label'].to_list(), dtype=np.float64)

    # Encode with the tokenizer fitted on the training text and pad to the
    # training length.
    X_test = tokenizer.texts_to_sequences(text_test)
    X_test = np.asarray(pad_sequences(X_test, padding='post', maxlen=maxlen), dtype=np.float64)

    test_loss, test_accuracy = model.evaluate(X_test, y_test, verbose=False)
    scores.append(test_accuracy)

    # Predict once and reuse the result for both the report and the stored output.
    pred = model.predict(X_test)
    # Threshold the outputs at 0.5 to get hard 0/1 class predictions.
    X_pred = (pred.ravel() > 0.5).astype(np.float64)
    report = classification_report(y_test, X_pred, output_dict=True)
    reports.append(report)

    X_predictions[i] = pred

In [42]:
# Mean and standard deviation of test accuracy across resamples, in percent.
tuple(stat(scores) * 100 for stat in (np.mean, np.std))

(96.93711996078491, 0.41021850939284377)

In [44]:
# Mean and standard deviation of the F1 score for class '1.0' across resamples.
tuple(stat([x['1.0']['f1-score'] for x in reports]) for stat in (np.mean, np.std))

(0.9689696564277064, 0.0042891988044478566)

In [45]:
# Mean and standard deviation of the F1 score for class '0.0' across resamples.
tuple(stat([x['0.0']['f1-score'] for x in reports]) for stat in (np.mean, np.std))

(0.9697614409026892, 0.003923716746956489)

In [43]:
# Test accuracy of each resample, in the order the evaluation loop produced them.
scores

[0.9695740342140198,
 0.9675456285476685,
 0.9655172228813171,
 0.9716024398803711,
 0.9736308455467224,
 0.9716024398803711,
 0.9716024398803711,
 0.9614604711532593,
 0.9756592512130737,
 0.9655172228813171]