In [8]:
import os
import csv
os.environ['TF_CPP_MIN_LOG_LEVEL'] = '2'
import tensorflow as tf
import logging
tf.get_logger().setLevel(logging.ERROR)
from tensorflow import keras
tf.load_library("/etc/alternatives/libcudnn_so")
import matplotlib.pyplot as plt

import re
import numpy as np
import utils

# import ner_module
# from ner_module import *

In [34]:
MAX_LEN = 64
class ImportData():
    def __init__(self, **kwargs):
        super().__init__()
        
        for key, value in kwargs.items():
            setattr(self, key, value)        
            
    def export_to_file(self,export_file_path, data):
        export_file_path_features = export_file_path + "_features.txt"
        export_file_path_labels = export_file_path + "_labels.txt"
        with open(export_file_path_features, "w") as f:
            for record in data:
                ner_tags = record["ner_tags"]
                tokens = record["tokens"]
                if len(tokens) > 0:
                    binary_tags = self.per_tags(ner_tags)
                    f.write(
                          "\t".join(tokens)
                        + "\n"
                    )

        with open(export_file_path_labels, "w") as f:
            for record in data:
                ner_tags = record["ner_tags"]
                tokens = record["tokens"]
                if len(tokens) > 0:
                    binary_tags = self.per_tags(ner_tags)
                    f.write(
                          "\t".join(map(str, binary_tags))
                        + "\n"
                    )

    def per_tags(self,nertags):
        return [1 if x in (1,2) else 0 for x in nertags]

    def write_vocab_to_file(self,file_path,vocab):
        if not os.path.exists(file_path):
            with open(file_path, 'w') as file:
                file.write("This is a new file.")
            print(f"File {file_path} created successfully.")
        else:
            print(f"File {file_path} exists. Overwriting existing file.")

        with open(file_path, 'w', encoding='utf-8') as file:
            for item in vocab:
                file.write(item+"\n")

    def write_training_and_validation_data_to_file(self,training,validation):
        if os.path.exists("./data"):
            self.export_to_file("./data/train", training)
            self.export_to_file("./data/val", validation)
            print("training and validation data saved to ",os.path.join(current_directory,'data'))
        else:
            print("Creating directory for training and validation data: ",os.path.join(current_directory,'data'))
            os.mkdir("data")
            self.export_to_file("./data/train", training)
            self.export_to_file("./data/val", validation)
            print("training and validation data saved to ",os.path.join(current_directory,'data'))

    def import_data(self):
        if self.source == "conll":
            conll_data = load_dataset("conll2003")
            print("Importing conll data")
            self.write_training_and_validation_data_to_file(conll_data["train"],conll_data["validation"])
            
            with open('./data/train_features.txt', 'r') as f:
                train_features = f.read()
                
            vocabulary = set()
            for line in train_features:
                split_line = line.split('\t')
                for item in split_line:
                    vocabulary.update(item)
            vocabulary = sorted(vocabulary)
            with open('./data/vocabulary.txt', 'w') as vocab_file:
                for word in vocabulary:
                    vocab_file.write(word + '\n')
                
                
            with open('./data/train_labels.txt', 'r') as f:
                train_labels = f.read()                  
                
            with open('./data/val_features.txt', 'r') as f:
                val_features = f.read()
            with open('./data/val_labels.txt', 'r') as f:
                val_labels = f.read()                  
                
            train_features = train_features[:100]
            train_labels = train_labels[:100]
        return train_features, train_labels, val_features, val_labels, vocabulary


class PreProcessData():
    def __init__(self, vocab_size=20000,source='conll',**kwargs):    
        self.vocab_size=vocab_size
        self.source=source
        # self.lookup_layer = tf.keras.layers.StringLookup(vocabulary=vocabulary)
        # self.vocabulary = None    
    
   
    def preprocess(self, features, labels):
        all_input_ids = []
        all_attention_mask = []
        all_labels = []
        data = {'features': features, 'labels': labels}
        tokens = []
        seen_tokens = set()        
        data = list(zip(features.split('\n'), labels.split('\n')))
        for line in data:
            combine_line = list(zip(line[0].split('\t'), line[1].split('\t')))
            for item in combine_line:
                token_str = item[0].lower()
                if token_str not in seen_tokens and item[1] != '':
                    print(item)
                    token = tokenizer(token_str, padding="max_length", truncation=True, max_length=MAX_LEN)
                    print("TOKEN: ",token)
                    label_matrix = np.zeros(2)
                    label_matrix[int(item[1])] = 1
                    print("LABEL MATRIX: ", label_matrix)
                    all_input_ids.append(token['input_ids'])
                    all_attention_mask.append(token['attention_mask'])
                    all_labels.append(label_matrix.tolist())
                    seen_tokens.add(token_str)
        target_len = len(data)
        
        print("ALL LABELS: ",all_labels)
        print("ALL INPUT IDS: ",all_input_ids)        
        
        all_input_ids = all_input_ids[:target_len] + [[0] * MAX_LEN] * (target_len - len(all_input_ids))
        all_attention_mask = all_attention_mask[:target_len] + [[0] * MAX_LEN] * (target_len - len(all_attention_mask))
        all_labels = all_labels[:target_len] + [[0, 0]] * (target_len - len(all_labels))
        



        return {"input_ids": all_input_ids[0], "attention_mask": all_attention_mask[0], "labels": all_labels[0]}
    

In [3]:
# test = self.map_record_to_training_data(train_data)
# print(test)

In [4]:
cluster_spec = {
    "worker": ["worker1:port", "worker2:port"],
    "chief": ["chief1:port"]
}
cluster_resolver = tf.distribute.cluster_resolver.TFConfigClusterResolver(cluster_spec)
strategy = tf.distribute.experimental.MultiWorkerMirroredStrategy(
    communication=tf.distribute.experimental.CollectiveCommunication.AUTO,
    cluster_resolver=cluster_resolver
)
print('Number of devices: {}'.format(strategy.num_replicas_in_sync))

Number of devices: 2


In [35]:
ner_data_importer = ImportData(source="conll", vocab_size=100000)
train_features, train_labels, val_features, val_labels, vocabulary = ner_data_importer.import_data()

Importing conll data
training and validation data saved to  /media/daniel/HDD1/AI574/Project/data


In [36]:
ner_data_preprocess = PreProcessData()
train_dataset = ner_data_preprocess.preprocess(train_features, train_labels)
# validation_dataset = ner_data_preprocess.preprocess(val_features, val_labels)

('EU', '0')
TOKEN:  {'input_ids': [101, 7327, 102, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], 'token_type_ids': [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], 'attention_mask': [1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]}
LABEL MATRIX:  [1. 0.]
('rejects', '0')
TOKEN:  {'input_ids': [101, 19164, 102, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], 'token_type_ids': [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0

In [None]:
def preprocess(self, features, labels):
    all_input_ids = []
    all_attention_mask = []
    all_labels = []
    data = {'features': features, 'labels': labels}
    tokens = []
    seen_tokens = set()        
    data = list(zip(features.split('\n'), labels.split('\n')))
    for line in data:
        combine_line = list(zip(line[0].split('\t'), line[1].split('\t')))
        for item in combine_line:
            token_str = item[0].lower()
            if token_str not in seen_tokens and item[1] != '':
                token = tokenizer(token_str, padding="max_length", truncation=True, max_length=MAX_LEN)
                label_matrix = np.zeros(2)
                label_matrix[int(item[1])] = 1
                all_input_ids.append(token['input_ids'])
                all_attention_mask.append(token['attention_mask'])
                all_labels.append(label_matrix.tolist())
                seen_tokens.add(token_str)
    target_len = len(data)
    all_input_ids = all_input_ids[:target_len] + [[0] * MAX_LEN] * (target_len - len(all_input_ids))
    all_attention_mask = all_attention_mask[:target_len] + [[0] * MAX_LEN] * (target_len - len(all_attention_mask))
    all_labels = all_labels[:target_len] + [[0, 0]] * (target_len - len(all_labels))

    print(all_input_ids)
    print(all_labels)

In [None]:
# I think this is part of the solution:

# X_train = list(zip(train_dataset["input_ids"], train_dataset["attention_mask"]))
# y_train = [np.argmax(label) for label in train_dataset["labels"]]
# X_train_array = np.array(X_train)
# X_train_flat = X_train_array.reshape(X_train_array.shape[0], -1)

In [7]:
train_dataset_tf = tf.data.Dataset.from_tensor_slices((train_dataset["input_ids"],train_dataset["labels"]))
validation_dataset_tf = tf.data.Dataset.from_tensor_slices((validation_dataset["input_ids"],validation_dataset["labels"]))

ValueError: Dimensions 512 and 2 are not compatible

In [None]:
train_dataset_shuf = train_dataset_tf.shuffle(512).batch(64).prefetch(tf.data.AUTOTUNE)
validation_dataset_shuf = validation_dataset_tf.shuffle(512).batch(64).prefetch(tf.data.AUTOTUNE)

In [None]:
# batch_size = 32
# train_dataset = (
#     train_data.map(self.map_record_to_training_data)
#     .map(lambda x, y: (self.lowercase_and_convert_to_ids(x), y))
#     .padded_batch(batch_size)
# )
# val_dataset = (
#     val_data.map(self.map_record_to_training_data)
#     .map(lambda x, y: (self.lowercase_and_convert_to_ids(x), y))
#     .padded_batch(batch_size)
# )            

In [None]:
train = "Y"
if train == "Y":
    train_model = TrainModel()
    train_model.build_model()
    train_model.compile_model()
    history = train_model.train(train_data=train_dataset_shuf,
                                val_data = validation_dataset_shuf,
                                epochs=500, batch_size=32)
    train_model.save_model('NER_saved_recent')

    plt.figure(figsize=(14, 5))
    plt.subplot(1, 2, 1)
    plt.plot(history.history['accuracy'])
    plt.plot(history.history['val_accuracy'])
    plt.title('Model accuracy')
    plt.ylabel('Accuracy')
    plt.xlabel('Epoch')
    plt.legend(['Train', 'Test'], loc='upper left')

    plt.subplot(1, 2, 2)
    plt.plot(history.history['loss'])
    plt.plot(history.history['val_loss'])
    plt.title('Model loss')
    plt.ylabel('Loss')
    plt.xlabel('Epoch')
    plt.legend(['Train', 'Test'], loc='upper left')

    plt.tight_layout()
    plt.show()

# Everything below this applies tags to the original text using the model

In [None]:
# ner_model = tf.keras.models.load_model('NER_saved_recent', custom_objects={'CustomNonPaddingTokenLoss': CustomNonPaddingTokenLoss})
# def lowercase_and_convert_to_ids(tokens):
#     tokens = tf.strings.lower(tokens)
#     return lookup_layer(tokens)
# def tokenize_and_convert_to_ids(text):
#     tokens = text.split()
#     return lowercase_and_convert_to_ids(tokens)

In [None]:
# def lowercase_and_convert_to_ids(tokens):
#     tokens = tf.strings.lower(tokens)
#     return lookup_layer(tokens)
# def tokenize_and_convert_to_ids(text):
#     tokens = text.split()
#     return lowercase_and_convert_to_ids(tokens)

# sample_input = tokenize_and_convert_to_ids(
#     "eu rejects german call to boycott british lamb from Steve parson the funky Parson"
# )
# sample_input = tf.reshape(sample_input, shape=[1, -1])
# print(sample_input)

# output = ner_model.predict(sample_input)
# prediction = np.argmax(output, axis=-1)[0]
# prediction = [MAPPING[i] for i in prediction]
# print(prediction)

In [None]:
from itertools import groupby
from operator import itemgetter
def group_consecutive_words(text_w_word_indicators):
    grouped_words = []
    for key, group in groupby(text_w_word_indicators, key=itemgetter(1)):
        words = [word for word, _ in group]
        grouped_words.append((words, key))
    return grouped_words    

def per_tree(text):
    tree_str = ""
    for group in text:
        words, indicator = group
        if indicator == 'B-PER': 
            quoted_group = ' '.join([f"{word}" for word in words])
            tree_str += f"(PER {quoted_group})"
        else:
            tree_str += ' '.join(words) + " "       
    tree_str = tree_str.strip()
    return tree_str

In [None]:
# from concurrent.futures import ThreadPoolExecutor

# class QuotationIndicator():
#     def __init__(self, num_threads=4, **kwargs):
#         self.pattern = re.compile(r'(".*?")')
#         self.num_threads = num_threads
#         for key, value in kwargs.items():
#             setattr(self, key, value) 

#     def process_chunk(self, chunk):
#         matches = [(m.start(1), m.end(1)) for m in self.pattern.finditer(chunk)]
#         words = chunk.split()
#         indicators = [0] * len(words)
#         word_start = 0
#         for i, word in enumerate(words):
#             word_end = word_start + len(word)
#             if any(start + 1 <= word_start < end - 1 for start, end in matches):
#                 indicators[i] = 1
#             word_start = word_end + 1
#         return indicators
    
#     def combine_text_and_indicators(self, text, indicators):
#         text = text.split()
#         word_indicators = list(zip(text, indicators))
#         print(word_indicators)
#         return word_indicators
    
#     def group_consecutive_words(self, text_w_word_indicators):
#         grouped_words = []
#         for key, group in groupby(text_w_word_indicators, key=itemgetter(1)):
#             words = [word for word, _ in group]
#             grouped_words.append((words, key))
#         return grouped_words    
    
#     def indicators_for_sentence(self, text):
#         text = utils.consolidate_double_quotes(text)
#         words = text.split()
#         num_threads = self.num_threads
#         chunk_size = len(words) // num_threads
#         chunks = [' '.join(words[i:i + chunk_size]) for i in range(0, len(words), chunk_size)]
#         indicators = [0] * len(words)
#         with ThreadPoolExecutor(max_workers=num_threads) as executor:
#             results = list(executor.map(self.process_chunk, chunks))
#         flat_indicators = [indicator for sublist in results for indicator in sublist]
#         quotes_at_word_level = self.combine_text_and_indicators(text,flat_indicators[:len(words)])
#         quotes_at_sentence_level = self.group_consecutive_words(quotes_at_word_level)
#         return quotes_at_sentence_level
        
#     def quote_tree(self, fname, text):
#         tree_str = ""
#         # previous_indicator = 0
#         for group in self.indicators_for_sentence(text):
#             words, indicator = group
#             if indicator == 1: 
#                 quoted_group = ' '.join([f"{word}" for word in words])
#                 tree_str += f"(QUOTATION {quoted_group})"
#             else:
#                 tree_str += ' '.join(words) + " "       
#         tree_str = "(" + fname + " " + tree_str.strip() + ")"
#         return tree_str
    
    

In [None]:
# quote_indicator = QuotationIndicator(num_threads=2)

In [None]:
# text = 'insult. "What were you doing behind the curtain?" he asked. "I was reading." "Show the book." I returned to the window and fetched it thence. "You have no business"'
# test = quote_indicator.quote_tree('TEST', text)
# test 

In [None]:
# text = 'I never had an idea of replying to it; my care was how to endure the blow which would certainly follow the insult. "What were you doing behind the curtain?" he asked. "I was reading." "Show the book." I returned to the window and fetched it thence. "You have no business to take our books; you are a dependent, mama says; you have no money; your father left you none; you ought to beg, and not to live here with gentlemen’s children like us, and eat the same meals we do, and wear clothes at our mama’s expense. Now, I’ll teach you to rummage my bookshelves: for they _are_ mine; all the house belongs to me, or will do in a few years. Go and stand by the door, out of the way of the mirror and the windows." I did so, not at first aware what was his intention; but when I saw him lift and poise the book and stand in act to hurl it, I instinctively started aside with a cry of alarm: not soon enough, however; the volume was flung, it hit me, and I fell, striking my head against the door and cutting it. The cut bled, the pain was sharp: my terror had passed its climax; other feelings succeeded. "Wicked and cruel boy!" I said.'
# test = quote_indicator.quote_tree('TEST', text)
# test 

In [None]:
# desired = '(TEST I never had an idea of replying to it; my care was how to endure the blow which would certainly follow the insult. (QUOTATION "What were you doing behind the curtain?") he asked. (QUOTATION "I was reading.") (QUOTATION "Show the book.") I returned to the window and fetched it thence. (QUOTATION "You have no business to take our books; you are a dependent, mama says; you have no money; your father left you none; you ought to beg, and not to live here with gentlemen’s children like us, and eat the same meals we do, and wear clothes at our mama’s expense. Now, I’ll teach you to rummage my bookshelves: for they _are_ mine; all the house belongs to me, or will do in a few years. Go and stand by the door, out of the way of the mirror and the windows.") I did so, not at first aware what was his intention; but when I saw him lift and poise the book and stand in act to hurl it, I instinctively started aside with a cry of alarm: not soon enough, however; the volume was flung, it hit me, and I fell, striking my head against the door and cutting it. The cut bled, the pain was sharp: my terror had passed its climax; other feelings succeeded. (QUOTATION "Wicked and cruel boy!")I said.)'
# desired

In [None]:
import importlib
importlib.reload(utils)
quote_indicator = utils.QuotationIndicator(num_threads=1)
import spacy
nlp = spacy.load("en_core_web_sm")

In [None]:
data_dir = '/media/daniel/HDD1/AI574/gutenberg/data/raw'
selected_works_path = '/media/daniel/HDD1/AI574/Project/selected_works_dive.csv'
line_chunk_size = 600
txt_all = ''
problem_files = []
with open(selected_works_path, mode='r', newline='') as file:
    csv_reader = csv.reader(file)
    for i, row in enumerate(csv_reader):
        if i % 500 == 1:
            print(i)
        target_path = os.path.join('/media/daniel/HDD1/AI574/gutenberg/data/raw',row[0]+'_raw.txt')
        lines_as_strings = []
        try:
            with open(target_path, mode='r', newline='') as file:
                lines = file.readlines()
                for j in range(0, len(lines), line_chunk_size):
                    chunk_lines = lines[j:j+line_chunk_size]
                    chunk_as_string = ' '.join(chunk_lines)
                    
                    chunk_as_string = chunk_as_string.replace('(','<')
                    chunk_as_string = chunk_as_string.replace(')','>')                      
                    
                    chunk_list = chunk_as_string.split()
                    chunk_as_string_clean = utils.remove_special_characters(chunk_as_string).lower()
                    chunk_for_ner = tokenize_and_convert_to_ids((chunk_as_string))
                    chunk_for_ner = tf.reshape(chunk_for_ner, shape=[1, -1])
                    
                    ner_output = ner_model.predict(chunk_for_ner, verbose=0)
                    ner_prediction = np.argmax(ner_output, axis=-1)[0]
                    ner_prediction_desc = [MAPPING[i] for i in ner_prediction]
                    
#                     text_nlp = nlp(chunk_as_string)
#                     ner_prediction_desc = [(word.ent_type_ if word.ent_type_ == "PERSON" else "0") for word in text_nlp]                    
#                     ner_prediction_desc = ["B-PER" if ent_type == "PERSON" else ent_type for ent_type in ner_prediction_desc]
                    
                    list_words_w_ner_indicators = list(zip(chunk_list, ner_prediction_desc))
                    str_words_w_ner_indicators = group_consecutive_words(list_words_w_ner_indicators)
                  
                    rejoined_words = per_tree(str_words_w_ner_indicators)
                    lines_as_strings.append(rejoined_words)
            file_lines = ' '.join(lines_as_strings)

            text_w_quotation_indicator = quote_indicator.quote_tree(row[0], file_lines)
            txt_all += text_w_quotation_indicator
        except Exception as e:
            problem_files.append(target_path)
            print(f"Error processing {target_path}: {e}")

print(f"Processed {i} files with {len(problem_files)} errors.")


In [None]:
txt_all[:100]

In [None]:
raw_text_train = txt_all.split('\n')
with open('jan_eyre_train.txt', mode='w', encoding='utf-8') as file:
    for record in raw_text_train:
        file.write(record + '\n')
        
        

In [None]:
from nltk.tree import Tree
tree = Tree.fromstring(txt_all)
# tree

In [None]:
def extract_ner_from_tree(tree):
    names = []
    for subtree in tree:
        if isinstance(subtree, Tree):  # Check if it is a subtree
            if subtree.label() == "PER":  # Check for the "PER" label
                names.append(' '.join(subtree.leaves()).lower())
            else:
                names.extend(extract_ner_from_tree(subtree))
    names = set(names)
    return names
ner_names = extract_ner_from_tree(tree)
ner_names

In [None]:
from nltk.tree import Tree
statement_words = ['said','exclaimed','replied','whispered','uttered','asserted','declared','stated','announced','mentioned','remarked','commented','noted','disclosed','pronounced','muttered','murmured','suggested','reported','articulated','narrated']
tree = Tree.fromstring(txt_all)

def replace_words(tree, replacement_text="_"):
    for idx, subtree in enumerate(tree):
        if isinstance(subtree, Tree):
            if subtree.label() == "PER":
                continue 
            else:
                replace_words(subtree, replacement_text)
        elif any(word.lower() in ner_names for word in tree[idx].split()):
            continue                
        elif any(word.lower() in statement_words for word in tree[idx].split()):
            continue
        else:
            tree[idx] = replacement_text

replace_words(tree)

tree_string_modified = tree.pformat(margin=100000000)
# print(tree_string_modified)


In [None]:
# input_file_path = 'jan_eyre_train.txt'
# output_file_path = 'jan_eyre_train.txt'

# with open(input_file_path, 'r', encoding='utf-8') as file:
#     lines = file.readlines()

# # Remove the first and last character from each line
# modified_lines = [line[1:-1] if len(line) > 1 else '' for line in lines]

# with open(output_file_path, 'w', encoding='utf-8') as file:
#     file.writelines(modified_lines)

In [None]:
data_dir = '/media/daniel/HDD1/AI574/gutenberg/data/raw'
selected_works_path = '/media/daniel/HDD1/AI574/Project/selected_works_dive2.csv'
line_chunk_size = 600
txt_all = ''
problem_files = []
with open(selected_works_path, mode='r', newline='') as file:
    csv_reader = csv.reader(file)
    for i, row in enumerate(csv_reader):
        if i % 500 == 1:
            print(i)
        target_path = os.path.join('/media/daniel/HDD1/AI574/gutenberg/data/raw',row[0]+'_raw.txt')
        lines_as_strings = []
        try:
            with open(target_path, mode='r', newline='') as file:
                lines = file.readlines()
                for j in range(0, len(lines), line_chunk_size):
                    chunk_lines = lines[j:j+line_chunk_size]
                    chunk_as_string = ' '.join(chunk_lines)
                    
                    chunk_as_string = chunk_as_string.replace('(','<')
                    chunk_as_string = chunk_as_string.replace(')','>')                      
                    
                    chunk_list = chunk_as_string.split()
                    chunk_as_string_clean = utils.remove_special_characters(chunk_as_string).lower()
                    chunk_for_ner = tokenize_and_convert_to_ids((chunk_as_string))
                    chunk_for_ner = tf.reshape(chunk_for_ner, shape=[1, -1])
                    ner_output = ner_model.predict(chunk_for_ner, verbose=0)
                    ner_prediction = np.argmax(ner_output, axis=-1)[0]
                    ner_prediction_desc = [MAPPING[i] for i in ner_prediction]
                    list_words_w_ner_indicators = list(zip(chunk_list, ner_prediction_desc))
                    str_words_w_ner_indicators = group_consecutive_words(list_words_w_ner_indicators)
                  
                    rejoined_words = per_tree(str_words_w_ner_indicators)
                    lines_as_strings.append(rejoined_words)
            file_lines = ' '.join(lines_as_strings)

            text_w_quotation_indicator = quote_indicator.quote_tree(row[0], file_lines)
            txt_all += text_w_quotation_indicator
        except Exception as e:
            problem_files.append(target_path)
            print(f"Error processing {target_path}: {e}")

print(f"Processed {i} files with {len(problem_files)} errors.")


In [None]:
raw_text_validate = txt_all.split('\n')
with open('wuthering_heights_validate.txt', mode='w', newline = '\n', encoding='utf-8') as file:
    writer = csv.writer(file)
    for record in raw_text_validate:
        writer.writerow([record])

In [None]:
import numpy as np
import string
import nltk
from nltk.tree import Tree
import svgling
from collections import Counter
from sklearn.metrics import f1_score, roc_auc_score, accuracy_score
from imblearn.over_sampling import SMOTE
samp = SMOTE()
from transformers import AutoTokenizer, EarlyStoppingCallback, AutoModelForSequenceClassification
from keras.utils import to_categorical
from torch.utils.data import Dataset as TorchDataset
from torch.utils.data import DataLoader
from torch.cuda.amp import autocast, GradScaler
from transformers import EvalPrediction, TrainingArguments, Trainer
import torch
scaler = GradScaler()
from datasets import load_dataset
from datasets import Dataset, DatasetDict
tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")

In [None]:
import string
from transformers import AutoTokenizer, EarlyStoppingCallback, AutoModelForSequenceClassification
tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")


from datasets import load_dataset
from datasets import Dataset, DatasetDict

MAX_LEN = 256
BATCH = 8
METRIC = 'eval_F1'
EPOCH = 30

In [None]:
def extract_sentiment_tree(tree):
    phrases = []
    phrases.append((' '.join(tree.leaves()), tree.label()))
    for subtree in tree:
        if isinstance(subtree, Tree):
            phrases.extend(extract_sentiment_tree(subtree))
    return phrases

In [None]:
def preprocess(data):
    all_input_ids = []
    all_attention_mask = []
    all_labels = []
    
    for raw_text in data['text']:
        
        tokens = []
        seen_tokens = set()
        
        raw_text = raw_text.strip()
        tree = Tree.fromstring(raw_text)    
        text_and_labels = extract_sentiment_tree(tree)
        for each in text_and_labels:
            line_token_str = str(each[0])
            if line_token_str not in seen_tokens:
                line_token_str = ''.join(ch for ch in line_token_str.lower() if ch not in string.punctuation)
                line_token = tokenizer(line_token_str, padding="max_length", truncation=True, max_length=MAX_LEN)
                label_idx = int(each[1])
                label_matrix = np.zeros(5)
                label_matrix[label_idx] = 1
                
                all_input_ids.append(line_token['input_ids'])
                all_attention_mask.append(line_token['attention_mask'])
                all_labels.append(label_matrix.tolist())
                seen_tokens.add(line_token_str)
                
    target_len = len(data["text"])
    
    all_input_ids = all_input_ids[:target_len] + [[0] * MAX_LEN] * (target_len - len(all_input_ids))
    all_attention_mask = all_attention_mask[:target_len] + [[0] * MAX_LEN] * (target_len - len(all_attention_mask))
    all_labels = all_labels[:target_len] + [[0, 0, 0, 0, 0]] * (target_len - len(all_labels))
    
    return {"input_ids": all_input_ids, "attention_mask": all_attention_mask, "labels": all_labels}

In [None]:
dataset = load_dataset('text', data_files={'train': 'jan_eyre_train.txt', 'validation': 'wuthering_heights_validate.txt'})
train_dataset = preprocess(dataset['train'])
validation_dataset = preprocess(dataset['validation'])

In [None]:
train_dataset = Dataset.from_dict(train_dataset)
validation_dataset= Dataset.from_dict(validation_dataset)

encoded_dataset = DatasetDict({
    "train":train_dataset,
    "validation": validation_dataset
})
encoded_dataset.set_format("torch")

In [None]:
loader = DataLoader(
    encoded_dataset,
    batch_size=BATCH,
    shuffle=True,
    num_workers=8,  # adjust based on your system's capabilities
    pin_memory=True
)

In [None]:
example = encoded_dataset['train'][0]
print(example.keys())

In [None]:
labels = ['Root', 'Quotation', 'Per', 'None1', 'None2']
id2label = {idx:label for idx, label in enumerate(labels)}
label2id = {label:idx for idx, label in enumerate(labels)}

In [None]:
model = AutoModelForSequenceClassification.from_pretrained("bert-base-uncased", 
                                                           problem_type="multi_label_classification", 
                                                           hidden_dropout_prob=0.5, #overfitting issue - overrode dropout default 0.1 to 0.2
                                                           num_labels=len(labels),
                                                           id2label=id2label,
                                                           label2id=label2id)

In [None]:
args = TrainingArguments(
    f"bert-finetuned-sem_eval-english",
    evaluation_strategy = "epoch",
    save_strategy = "epoch",
    learning_rate=2e-5,
    per_device_train_batch_size=BATCH,
    per_device_eval_batch_size=BATCH,
    num_train_epochs=EPOCH,
    weight_decay=0.02, #Initial value 0.01 resulted in overfitting - tried 0.02 but did not correct
    load_best_model_at_end=True,
    metric_for_best_model=METRIC,
    save_total_limit = 3,
)

In [None]:
def multi_label_metrics(predictions, labels, threshold=0.5):
    sigmoid = torch.nn.Sigmoid()
    probs = sigmoid(torch.Tensor(predictions))
    y_pred = np.zeros(probs.shape)
    y_pred[np.where(probs >= threshold)] = 1
    y_true = labels
    f1_micro_average = f1_score(y_true=y_true, y_pred=y_pred, average='micro')
    roc_auc = roc_auc_score(y_true, y_pred, average = 'micro')
    accuracy = accuracy_score(y_true, y_pred)
    metrics = {'eval_F1': f1_micro_average,
               'eval_roc_auc': roc_auc,
               'eval_accuracy': accuracy}
    return metrics

def compute_metrics(p: EvalPrediction):
    preds = p.predictions[0] if isinstance(p.predictions, 
            tuple) else p.predictions
    result = multi_label_metrics(
        predictions=preds, 
        labels=p.label_ids)
    return result

In [None]:
print(encoded_dataset['train']['input_ids'][0].unsqueeze(0).shape)
print(encoded_dataset['train']['attention_mask'][0].unsqueeze(0).shape)
print(encoded_dataset['train']['labels'][0].unsqueeze(0).shape)

In [None]:
outputs = model(
    input_ids=encoded_dataset['train']['input_ids'][0].unsqueeze(0),
    attention_mask=encoded_dataset['train']['attention_mask'][0].unsqueeze(0),
    labels=encoded_dataset['train']['labels'][0].unsqueeze(0)
)

In [None]:
trainer = Trainer(
    model,
    args,
    train_dataset=encoded_dataset["train"],
    eval_dataset=encoded_dataset["validation"],
    tokenizer=tokenizer,
    compute_metrics=compute_metrics,
    callbacks=[EarlyStoppingCallback(early_stopping_patience = 2)]
)

In [None]:
trainer.train()

In [None]:
# data_dir = '/media/daniel/HDD1/AI574/gutenberg/data/raw'
# selected_works_path = '/media/daniel/HDD1/AI574/Project/selected_works_idx.csv'
# import csv
# import re
# txt_all = ''
# problem_files = []
# line_chunk_size = 10
# with open(selected_works_path, mode='r', newline='') as file:
#     csv_reader = csv.reader(file)
#     i = 0
#     for row in csv_reader:
#         if i % 500 == 1:
#             print(i)
#         if i == 2:
#             break
#         target_path = os.path.join('/media/daniel/HDD1/AI574/gutenberg/data/raw',row[0]+'_raw.txt')
#         lines_as_strings = []
#         with open(target_path, mode='r', newline='') as file:
#             lines = file.readlines()
#             j = 0
#             for k in range(0, len(lines), line_chunk_size):
#                 j += 1
#                 if j == 30:
#                     break
#                 chunk_lines = lines[j:j+line_chunk_size]
#                 # print(chunk_lines)
                
#                 line_string = ' '.join(chunk_lines)
                
#                 line_list = line_string.split()
#                 line_for_ner = tokenize_and_convert_to_ids((line_string))
#                 line_for_ner = tf.reshape(line_for_ner, shape=[1, -1])
#                 ner_output = ner_model.predict(line_for_ner, verbose=0)
#                 ner_prediction = np.argmax(ner_output, axis=-1)[0]
#                 ner_prediction = [MAPPING[i] for i in ner_prediction]
#                 list_words_w_ner_indicators = list(zip(line_list, ner_prediction))
#                 words_w_ner_indicators = group_consecutive_words(list_words_w_ner_indicators)
#                 rejoined_words = per_tree(words_w_ner_indicators)
#                 lines_as_strings.append(rejoined_words)
                
# #              
                
#         file_lines = ' '.join(lines_as_strings)
#         text_w_quotation_indicator = quote_indicator.quote_tree(row[0], file_lines)
#         i += 1
                              

In [None]:
# txt_all

In [None]:
# data_dir = '/media/daniel/HDD1/AI574/gutenberg/data/raw'
# # selected_works_path = '/media/daniel/HDD1/AI574/Project/selected_works_idx.csv'
# selected_works_path = '/media/daniel/HDD1/AI574/Project/selected_works_dive.csv'
# import csv
# import re
# txt_all = ''
# problem_files = []
# with open(selected_works_path, mode='r', newline='') as file:
#     csv_reader = csv.reader(file)
#     i = 0
#     for row in csv_reader:
#         if i % 500 == 1:
#             print(i)
#         if i == 5:
#             break
#         target_path = os.path.join('/media/daniel/HDD1/AI574/gutenberg/data/raw',row[0]+'_raw.txt')
#         # print(target_path)
#         lines_as_strings = []
#         # try:
#         with open(target_path, mode='r', newline='') as file:
#             # csv_reader = csv.reader(file)
#             lines = file.readlines()
#             j = 0
#             for line in lines:
#                 j += 1
#                 if j == 24:
#                     break
#                 line_string = ' '.join(line)
#                 line_list = line.split()
#                 line_for_ner = tokenize_and_convert_to_ids((line))
#                 line_for_ner = tf.reshape(line_for_ner, shape=[1, -1])
#                 ner_output = ner_model.predict(line_for_ner, verbose=0)
#                 ner_prediction = np.argmax(ner_output, axis=-1)[0]
#                 ner_prediction = [MAPPING[i] for i in ner_prediction]
#                 words_w_ner_indicators = list(zip(line_list, ner_prediction))
#                 words_w_ner_indicators = group_consecutive_words(words_w_ner_indicators)                 
#                 rejoined_words = per_tree(words_w_ner_indicators)
#                 lines_as_strings.append(rejoined_words)
#         file_lines = ' '.join(lines_as_strings)

#         # all_lines = utils.remove_special_characters(all_lines, remove_digits=True).lower()    
#         # all_lines = all_lines[1:100]
#         text_w_quotation_indicator = quote_indicator.quote_tree(row[0], file_lines)
        

#         i += 1
                              

In [None]:
# text_w_quotation_indicator

In [None]:
test = '(' + text_w_quotation_indicator[950:1250] + ')'
test

In [None]:
from nltk.tree import Tree
tree = Tree.fromstring(test)
# tree.pretty_print()
tree

In [None]:
records = txt_all.split('\n')

with open('gutenberg_scored.txt', mode='w', newline = '\n', encoding='utf-8') as file:
    writer = csv.writer(file)
    for record in records:
        writer.writerow([record])

In [None]:
cur_txt = cur_txt[:20000]
orig_words = cur_txt.split()
# print(cur_txt)

In [None]:
cur_txt

In [None]:
split_by_sentence = cur_txt.split('.')
split_by_sentence

In [None]:
sample_input = tokenize_and_convert_to_ids(cur_txt)
sample_input = tf.reshape(sample_input, shape=[1, -1])
# print(sample_input)

output = ner_model.predict(sample_input)
prediction = np.argmax(output, axis=-1)[0]
prediction = [MAPPING[i] for i in prediction]
# print(prediction)

In [None]:
positions = [index for index, value in enumerate(prediction) if value != 'O']
NER_words = list(set([orig_words[index] for index in positions]))
NER_words


In [None]:
utils.named_persons_w_spacy(cur_txt)