<a href="https://colab.research.google.com/github/archietelfer7/AMNAS/blob/main/AMNAS_v1_4.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [60]:
#libraries
import numpy as np
import pandas as pd
import tensorflow as tf
from tensorflow import keras
from keras import layers

In [65]:
# Reading the articles from text file
with open('train.txt.src.tokenized.fixed.cleaned.final.truncated.txt', 'r') as f:
  articles = f.readlines()

# Reading the summaries from text file
with open('train.txt.tgt.tokenized.fixed.cleaned.final.truncated.txt', 'r') as f:
  summaries = f.readlines()

# combining the articles and summaries into a single dataset 
dataset = list(zip(articles, summaries))

In [37]:
# Defining a Textvectorization Layer with a vocab-size limit = 8000,
# outputting sequences of 700, padded with '0'
vectorizer = tf.keras.layers.TextVectorization(
    max_tokens=8000,
    standardize='lower_and_strip_punctuation',
    split='whitespace',
    ngrams=None,
    output_mode='int',
    output_sequence_length=700,
    pad_to_max_tokens=False,
    vocabulary=None,
    idf_weights=None,
    sparse=False,
    ragged=False,
    encoding='utf-8',
)

# calling the adapt() function on the dataset to generate mappings between string
# tokens and indices
vectorizer.adapt([article for article, summary in dataset] + [summary for article, summary in dataset])
print(vectorizer.get_vocabulary())

tf.Tensor(
[ 169 4412 1296   14   19    8    9   81  386 3927   14   19    8    2
   78  159    5    2  263   56   11   47    1 1008    2 3479    5  506
  350  147    6  982  350   28   64    1   10   47  500 1047    3 2312
 1378   10    2   62 4363  935   11  839 3484    2    1   17  660  324
  303   48  121   81 1732   47    5    2  108  767    1   10  115    2
 1424    1  102    2 1010  263 4566   28   11    1   48 1047    4   26
  690   96    4    1    1   23 4477 1167    1 1578    4    1 1047  436
    7 1083    2 3654 1156   13  490    4  978 6237   17 7336  176   17
    1    1   53   19  407   26 1732 2920 6421   98    6   39    2  283
    1    2    1  784  553    4    1 3079  521    2  688 2273    6   97
   51   19  149  176  294  177   19   38   51  659   61    4 3268    2
  166   41 4968 1070  124 6177   10 4362   87  263    3 5724    9    2
   62 1424  100   36   26 1250 1766  527    4    1    1  609 1953    2
 3654 1156    1   29 1776  107   49  590 1260  529   11  219    2 

In [67]:
# Define the embedding dimensionality
embedding_dim = 128

# Define the encoder network:
# input layer to take in article/summary vectors of dimensionality 700
encoder_inputs = tf.keras.Input(shape=(700,), dtype=tf.int64)
# embedding layer with input dimensions == vocab size + 1 for OOV token 
encoder_embeddings = tf.keras.layers.Embedding(input_dim=7001, output_dim=embedding_dim)(encoder_inputs)
# LSTM layer that takes the embeddings as input
encoder_lstm = tf.keras.layers.LSTM(units=256)(encoder_embeddings)
# the encoder output layer (dense)
encoder_outputs = tf.keras.layers.Dense(units=256, activation='relu')(encoder_lstm)

# Define the decoder network:
# decoder inputs, shape is set to 699 to account for summary offset,
# shape was originally set to 700
decoder_inputs = tf.keras.Input(shape=(699,), dtype=tf.int64)
# decoder embedding layer 
decoder_embeddings = tf.keras.layers.Embedding(input_dim=7001, output_dim=embedding_dim)(decoder_inputs)
# decoder lstm layer
decoder_lstm = tf.keras.layers.LSTM(units=256, return_sequences=True)(decoder_embeddings, initial_state=[encoder_outputs, encoder_outputs])
# dense layer wrapped in timedistirbuted layer to help capture dependencies 
# across multiple timesteps
decoder_outputs = tf.keras.layers.TimeDistributed(tf.keras.layers.Dense(units=7001, activation='softmax'))(decoder_lstm)

# Define the encoder-decoder model
model = tf.keras.Model(inputs=[encoder_inputs, decoder_inputs], outputs=decoder_outputs)

# Compile the model using adam optimiser and SCC as the loss function with
# accuracy metric to analyse summary accuracy against targets
model.compile(optimizer='adam', loss='sparse_categorical_crossentropy', metrics=['accuracy'])

# view the model architecture
model.summary()

Model: "model_7"
__________________________________________________________________________________________________
 Layer (type)                   Output Shape         Param #     Connected to                     
 input_16 (InputLayer)          [(None, 700)]        0           []                               
                                                                                                  
 embedding_15 (Embedding)       (None, 700, 128)     896128      ['input_16[0][0]']               
                                                                                                  
 input_17 (InputLayer)          [(None, 699)]        0           []                               
                                                                                                  
 lstm_15 (LSTM)                 (None, 256)          394240      ['embedding_15[0][0]']           
                                                                                            

In [68]:
# vectorise the articles and summaries before fitting the model to the data
encoder_input = vectorizer(articles)
decoder_input = vectorizer(summaries)

In [69]:
# attempting to fit the model to the data - currently loss : 'nan' during training
history = model.fit(
    [encoder_input, decoder_input[:, :-1]],  # inputs
    decoder_input[:, 1:],  # targets (shifted by one)
    epochs=5, batch_size=32)


Epoch 1/5

KeyboardInterrupt: ignored