In [1]:
# Import necessary libraries
from pyspark.sql.types import StructType, StructField, StringType
from pyspark.sql import SparkSession

# Create a SparkSession
spark = SparkSession.builder \
    .appName("SummarizationApp") \
    .config("spark.some.config.option", "some-value") \
    .getOrCreate()

# Define the schema for the data
schema = StructType([
    StructField("id", StringType(), True),
    StructField("article", StringType(), True),
    StructField("highlights", StringType(), True)
])

# Read CSV files in HDFS using the defined schema
train_data = spark.read.schema(schema).option("header", "true").csv("hdfs:///cnn_dailymail/train.csv")
test_data = spark.read.schema(schema).option("header", "true").csv("hdfs:///cnn_dailymail/test.csv")
validation_data = spark.read.schema(schema).option("header", "true").csv("hdfs:///cnn_dailymail/validation.csv")

# Show the first 5 rows of each dataset (optional)
train_data.show(5)
test_data.show(5)
validation_data.show(5)

                                                                                

+--------------------+--------------------+--------------------+
|                  id|             article|          highlights|
+--------------------+--------------------+--------------------+
|0001d1afc246a7964...|By . Associated P...|Bishop John Folda...|
|He contracted the...|                null|                null|
|Church members in...| Grand Forks and ...|                null|
|0002095e55fcbd3a2...|"(CNN) -- Ralph M...|"" of using his r...|
|          Ralph Mata| an internal affa...| allegedly helped...|
+--------------------+--------------------+--------------------+
only showing top 5 rows

+--------------------+--------------------+--------------------+
|                  id|             article|          highlights|
+--------------------+--------------------+--------------------+
|92c514c913c0bdfe2...|Ever noticed how ...|Experts question ...|
|U.S consumer advi...|                null|                null|
|Safety tests cond...|                null|                null|


## Data Cleaning

In [2]:
# Remove the "id" column from the dataframes
train_data = train_data.drop("id")
test_data = test_data.drop("id")
validation_data = validation_data.drop("id")

# Drop rows with missing values in the "article" or "highlights" columns
train_data = train_data.dropna(subset=["article", "highlights"])
test_data = test_data.dropna(subset=["article", "highlights"])
validation_data = validation_data.dropna(subset=["article", "highlights"])

## Text Cleaning

In [3]:
from pyspark.sql.functions import udf
import re

# Define a function to clean text
def clean_text(text):
    text = re.sub(r"http\S+|www\S+|https\S+", '', text, flags=re.MULTILINE)
    text = re.sub(r'\@\w+|\#','', text)
    text = re.sub(r"[^a-zA-Z0-9]", " ", text)
    text = text.lower().strip()
    return text

# UDF for PySpark DataFrame
clean_udf = udf(clean_text, StringType())

# Apply cleaning to the datasets
train_data = train_data.withColumn("article", clean_udf(train_data["article"])).withColumn("highlights", clean_udf(train_data["highlights"]))
test_data = test_data.withColumn("article", clean_udf(test_data["article"])).withColumn("highlights", clean_udf(test_data["highlights"]))
validation_data = validation_data.withColumn("article", clean_udf(validation_data["article"])).withColumn("highlights", clean_udf(validation_data["highlights"]))
# Show the first 5 rows of dataframes
train_data.show(5)
test_data.show(5)
validation_data.show(5)


[Stage 3:>                                                          (0 + 1) / 1]

                                                                                

+--------------------+--------------------+
|             article|          highlights|
+--------------------+--------------------+
|by   associated p...|bishop john folda...|
|cnn     ralph mat...|of using his role...|
|an internal affai...|allegedly helped ...|
|a drunk driver wh...|craig eccleston t...|
|cnn     with a br...|nina dos santos s...|
+--------------------+--------------------+
only showing top 5 rows

+--------------------+--------------------+
|             article|          highlights|
+--------------------+--------------------+
|ever noticed how ...|experts question ...|
|a drunk teenage b...|drunk teenage boy...|
|                  17|ran towards anima...|
|dougie freedman i...|nottingham forest...|
|liverpool target ...|fiorentina goalke...|
+--------------------+--------------------+
only showing top 5 rows

+--------------------+--------------------+
|             article|          highlights|
+--------------------+--------------------+
|sally forrest  an...|sall

## Data Inspection and Preparation

In [4]:
# Print the row count of each dataframe
print("Train Data Row Count:", train_data.count())
print("Test Data Row Count:", test_data.count())
print("Validation Data Row Count:", validation_data.count())

                                                                                

Train Data Row Count: 351784
Test Data Row Count: 14149
Validation Data Row Count: 16586


In [5]:
# Define the desired row limits for each dataset
row_limits = {
    "train_data": 1000,
    "test_data": 250,
    "validation_data": 250
}

# Limit each dataset to the specified number of rows
for dataset_name, row_limit in row_limits.items():
    globals()[dataset_name] = globals()[dataset_name].limit(row_limit)

print("Train Data Row Count:", train_data.count())
print("Test Data Row Count:", test_data.count())
print("Validation Data Row Count:", validation_data.count())



                                                                                

Train Data Row Count: 1000
Test Data Row Count: 250
Validation Data Row Count: 250


## Text Transformation and Preparation

In [6]:
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

# Define the tokenizer
tokenizer = Tokenizer(oov_token="<OOV>")

# Fit the tokenizer on the training data (both articles and highlights)
all_train_texts = train_data.select("article").rdd.flatMap(lambda x: x).collect() + train_data.select("highlights").rdd.flatMap(lambda x: x).collect()
tokenizer.fit_on_texts(all_train_texts)

# Convert text data into sequences of integers
train_articles_seq = tokenizer.texts_to_sequences(train_data.select("article").rdd.flatMap(lambda x: x).collect())
train_highlights_seq = tokenizer.texts_to_sequences(train_data.select("highlights").rdd.flatMap(lambda x: x).collect())

test_articles_seq = tokenizer.texts_to_sequences(test_data.select("article").rdd.flatMap(lambda x: x).collect())
test_highlights_seq = tokenizer.texts_to_sequences(test_data.select("highlights").rdd.flatMap(lambda x: x).collect())

val_articles_seq = tokenizer.texts_to_sequences(validation_data.select("article").rdd.flatMap(lambda x: x).collect())
val_highlights_seq = tokenizer.texts_to_sequences(validation_data.select("highlights").rdd.flatMap(lambda x: x).collect())


2023-10-10 20:43:07.754844: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  AVX2
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.
2023-10-10 20:43:08.257385: W tensorflow/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcudart.so.11.0'; dlerror: libcudart.so.11.0: cannot open shared object file: No such file or directory
2023-10-10 20:43:08.257411: I tensorflow/stream_executor/cuda/cudart_stub.cc:29] Ignore above cudart dlerror if you do not have a GPU set up on your machine.
2023-10-10 20:43:08.320055: E tensorflow/stream_executor/cuda/cuda_blas.cc:2981] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
2023-10-10 20:43:09.625053: W tensorflow/stream_executor/platform/defaul

In [24]:
# Define the maximum lengths for sequences
MAX_LEN_ARTICLE = 1768
MAX_LEN_HIGHLIGHT = 1769

# Perform padding
train_articles_seq = pad_sequences(train_articles_seq, maxlen=MAX_LEN_ARTICLE, padding="post", truncating="post")
train_highlights_seq = pad_sequences(train_highlights_seq, maxlen=MAX_LEN_HIGHLIGHT, padding="post", truncating="post")

test_articles_seq = pad_sequences(test_articles_seq, maxlen=MAX_LEN_ARTICLE, padding="post", truncating="post")
test_highlights_seq = pad_sequences(test_highlights_seq, maxlen=MAX_LEN_HIGHLIGHT, padding="post", truncating="post")

val_articles_seq = pad_sequences(val_articles_seq, maxlen=MAX_LEN_ARTICLE, padding="post", truncating="post")
val_highlights_seq = pad_sequences(val_highlights_seq, maxlen=MAX_LEN_HIGHLIGHT, padding="post", truncating="post")


In [25]:
import random
# Select random indices
random_idx = random.sample(range(len(train_articles_seq)), 5)
# Display five randomly selected examples
for idx in random_idx:
    # Display the original article and its tokenized and padded version
    print("Original Article:", train_data.select("article").rdd.flatMap(lambda x: x).collect()[idx])
    print("Tokenized and Padded Article:", train_articles_seq[idx])
    # Display the original highlight and its tokenized and padded version
    print("Original Highlight:", train_data.select("highlights").rdd.flatMap(lambda x: x).collect()[idx])
    print("Tokenized and Padded Highlight:", train_highlights_seq[idx])
     # Draw a line between two items
    print("-" * 50)


                                                                                

Original Article: fracking firms will be allowed to access vast reserves of underground gas without the permission of landowners under controversial laws being drawn up by the government  ministers are preparing an overhaul of trespass legislation to make it easier for firms to ignore objections  one source said the reform  which will infuriate anti fracking campaigners  was likely to be included in the queen s speech setting out the government s plans for its final year  anger  police tackle protesters at balcombe  west sussex  who were objecting to a test drilling site there   chancellor george osborne is offering generous tax breaks to kickstart the technology  he believes fracking for shale gas could herald an energy revolution that will boost the economy  make britain more self sufficient and put an end to sky high bills from greedy energy firms  scientists say the uk is sitting on deposits of enough shale gas to supply the whole country for at least 40 years  mirroring the north 

Traceback (most recent call last):
  File "/usr/local/spark/python/lib/pyspark.zip/pyspark/daemon.py", line 186, in manager
  File "/usr/local/spark/python/lib/pyspark.zip/pyspark/daemon.py", line 74, in worker
  File "/usr/local/spark/python/lib/pyspark.zip/pyspark/worker.py", line 663, in main
    if read_int(infile) == SpecialLengths.END_OF_STREAM:
  File "/usr/local/spark/python/lib/pyspark.zip/pyspark/serializers.py", line 564, in read_int
    raise EOFError
EOFError
  File "/usr/local/spark/python/lib/pyspark.zip/pyspark/daemon.py", line 186, in manager
  File "/usr/local/spark/python/lib/pyspark.zip/pyspark/daemon.py", line 74, in worker
  File "/usr/local/spark/python/lib/pyspark.zip/pyspark/worker.py", line 663, in main
    if read_int(infile) == SpecialLengths.END_OF_STREAM:
  File "/usr/local/spark/python/lib/pyspark.zip/pyspark/serializers.py", line 564, in read_int
    raise EOFError
EOFError
Traceback (most recent call last):
  File "/usr/local/spark/python/lib/pyspark.zi

Original Highlight: scientists say uk is sitting on enough shale gas for 40 years  supply
Tokenized and Padded Highlight: [1862  176  258 ...    0    0    0]
--------------------------------------------------


                                                                                

Tokenized and Padded Article: [  21 1276 4510 ...    0    0    0]


  File "/usr/local/spark/python/lib/pyspark.zip/pyspark/daemon.py", line 186, in manager
  File "/usr/local/spark/python/lib/pyspark.zip/pyspark/daemon.py", line 74, in worker
  File "/usr/local/spark/python/lib/pyspark.zip/pyspark/worker.py", line 663, in main
    if read_int(infile) == SpecialLengths.END_OF_STREAM:
  File "/usr/local/spark/python/lib/pyspark.zip/pyspark/serializers.py", line 564, in read_int
    raise EOFError
EOFError
Traceback (most recent call last):
  File "/usr/local/spark/python/lib/pyspark.zip/pyspark/daemon.py", line 186, in manager
  File "/usr/local/spark/python/lib/pyspark.zip/pyspark/daemon.py", line 74, in worker
  File "/usr/local/spark/python/lib/pyspark.zip/pyspark/worker.py", line 663, in main
    if read_int(infile) == SpecialLengths.END_OF_STREAM:
  File "/usr/local/spark/python/lib/pyspark.zip/pyspark/serializers.py", line 564, in read_int
    raise EOFError
EOFError
                                                                                

Original Highlight: jodie watson caught chlamydia from a partner when she was 16
Tokenized and Padded Highlight: [9001 2018  883 ...    0    0    0]
--------------------------------------------------


                                                                                

Original Article: robin emmons has grown more than 26
Tokenized and Padded Article: [ 3077 14237    26 ...     0     0     0]


Traceback (most recent call last):
  File "/usr/local/spark/python/lib/pyspark.zip/pyspark/daemon.py", line 186, in manager
  File "/usr/local/spark/python/lib/pyspark.zip/pyspark/daemon.py", line 74, in worker
  File "/usr/local/spark/python/lib/pyspark.zip/pyspark/worker.py", line 663, in main
    if read_int(infile) == SpecialLengths.END_OF_STREAM:
  File "/usr/local/spark/python/lib/pyspark.zip/pyspark/serializers.py", line 564, in read_int
    raise EOFError
EOFError
Traceback (most recent call last):====>                            (5 + 3) / 10]
  File "/usr/local/spark/python/lib/pyspark.zip/pyspark/daemon.py", line 186, in manager
  File "/usr/local/spark/python/lib/pyspark.zip/pyspark/daemon.py", line 74, in worker
  File "/usr/local/spark/python/lib/pyspark.zip/pyspark/worker.py", line 663, in main
    if read_int(infile) == SpecialLengths.END_OF_STREAM:
  File "/usr/local/spark/python/lib/pyspark.zip/pyspark/serializers.py", line 564, in read_int
    raise EOFError
EOFError


Original Highlight: 000 pounds of fruits and vegetables
Tokenized and Padded Highlight: [  94 1098    6 ...    0    0    0]
--------------------------------------------------


                                                                                

Original Article: by   ted thornhill   last updated at 2 18 pm on 30th december 2011   bored of your surroundings and want to liven things up  or fancy destroying that broken bit of office machinery  now there s an app that ll add a bit of action movie magic into your life by super imposing dramatic special effects over mobile phone footage  action movie fx for iphone comes courtesy of j j  abrams  production company bad robot interactive  which is behind star trek  super 8 and mission impossible 3  scroll down for mailonline s blockbuster debut   kaboom  a mailonline reporter finds himself in the line of fire   dramatic  the app s effects are incredibly realistic   leave it to the professionals  the company behind the app  bad robot  has plenty of expertise with movie special effects   the free app comes with two effects   a missile strike and a car crash  and the results are incredible  simply hit record and film a scene  then use a slider to mark the point at which you want the  fx 

Traceback (most recent call last):
  File "/usr/local/spark/python/lib/pyspark.zip/pyspark/daemon.py", line 186, in manager
  File "/usr/local/spark/python/lib/pyspark.zip/pyspark/daemon.py", line 74, in worker
  File "/usr/local/spark/python/lib/pyspark.zip/pyspark/worker.py", line 663, in main
    if read_int(infile) == SpecialLengths.END_OF_STREAM:
  File "/usr/local/spark/python/lib/pyspark.zip/pyspark/serializers.py", line 564, in read_int
    raise EOFError
EOFError
Traceback (most recent call last):
  File "/usr/local/spark/python/lib/pyspark.zip/pyspark/daemon.py", line 186, in manager
  File "/usr/local/spark/python/lib/pyspark.zip/pyspark/daemon.py", line 74, in worker
  File "/usr/local/spark/python/lib/pyspark.zip/pyspark/worker.py", line 663, in main
    if read_int(infile) == SpecialLengths.END_OF_STREAM:
  File "/usr/local/spark/python/lib/pyspark.zip/pyspark/serializers.py", line 564, in read_int
    raise EOFError
EOFError
  File "/usr/local/spark/python/lib/pyspark.zi

Original Highlight: the free iphone app comes with a  missile strike  and a  car crash
Tokenized and Padded Highlight: [   2  514 1572 ...    0    0    0]
--------------------------------------------------


                                                                                

Original Article: cnn     harvard is caught up in a student cheating scandal that its dean of undergraduate education calls   unprecedented in its scope and magnitude    as a harvard grad
Tokenized and Padded Article: [ 128 5750   13 ...    0    0    0]


Traceback (most recent call last):
Traceback (most recent call last):
  File "/usr/local/spark/python/lib/pyspark.zip/pyspark/daemon.py", line 186, in manager
  File "/usr/local/spark/python/lib/pyspark.zip/pyspark/daemon.py", line 74, in worker
  File "/usr/local/spark/python/lib/pyspark.zip/pyspark/worker.py", line 663, in main
    if read_int(infile) == SpecialLengths.END_OF_STREAM:
  File "/usr/local/spark/python/lib/pyspark.zip/pyspark/serializers.py", line 564, in read_int
    raise EOFError
EOFError
  File "/usr/local/spark/python/lib/pyspark.zip/pyspark/daemon.py", line 186, in manager
  File "/usr/local/spark/python/lib/pyspark.zip/pyspark/daemon.py", line 74, in worker
  File "/usr/local/spark/python/lib/pyspark.zip/pyspark/worker.py", line 663, in main
    if read_int(infile) == SpecialLengths.END_OF_STREAM:
  File "/usr/local/spark/python/lib/pyspark.zip/pyspark/serializers.py", line 564, in read_int
    raise EOFError
EOFError
                                              

Original Highlight: i am embarrassed
Tokenized and Padded Highlight: [  22  436 4549 ...    0    0    0]
--------------------------------------------------


In [26]:
# Calculate the vocabulary size
vocab_size = len(tokenizer.word_index) + 1
print(f"Vocabulary Size: {vocab_size}")

Vocabulary Size: 24972


In [27]:
# Calculate the lengths of articles and highlights
article_lengths = [len(seq) for seq in train_articles_seq]
highlight_lengths = [len(seq) for seq in train_highlights_seq]

# Calculate and print average article length
print(f"Average Article Length: {sum(article_lengths)/len(article_lengths)}")

# Calculate and print average highlight length
print(f"Average Highlight Length: {sum(highlight_lengths)/len(highlight_lengths)}")

# Calculate and print maximum article length
print(f"Max Article Length: {max(article_lengths)}")

# Calculate and print maximum highlight length
print(f"Max Highlight Length: {max(highlight_lengths)}")

Average Article Length: 1768.0
Average Highlight Length: 1769.0
Max Article Length: 1768
Max Highlight Length: 1769


In [28]:
# Find empty article sequences (sequences with length 0)
empty_articles = [seq for seq in train_articles_seq if len(seq) == 0]

# Find empty highlight sequences (sequences with length 0)
empty_highlights = [seq for seq in train_highlights_seq if len(seq) == 0]

# Print the number of empty article sequences
print(f"Number of empty article sequences: {len(empty_articles)}")

# Print the number of empty highlight sequences
print(f"Number of empty highlight sequences: {len(empty_highlights)}")

Number of empty article sequences: 0
Number of empty highlight sequences: 0


In [29]:

# Get the index of the OOV token in the word index
oov_token_index = tokenizer.word_index[tokenizer.oov_token]

# Count the number of articles containing OOV tokens
oov_in_articles = sum([1 for seq in train_articles_seq if oov_token_index in seq])

# Count the number of highlights containing OOV tokens
oov_in_highlights = sum([1 for seq in train_highlights_seq if oov_token_index in seq])

# Print the number of articles with OOV tokens
print(f"Number of articles with OOV tokens: {oov_in_articles}")

# Print the number of highlights with OOV tokens
print(f"Number of highlights with OOV tokens: {oov_in_highlights}")

Number of articles with OOV tokens: 0
Number of highlights with OOV tokens: 0


## Model Building

In [20]:
import tensorflow as tf
from tensorflow.keras.models import Model
from tensorflow.keras.layers import Input, LSTM, Dense, Embedding

# Hyperparameters
EMBEDDING_DIM = 256  # Dimension of word embeddings
HIDDEN_UNITS = 512   # Number of LSTM units

# Vocabulary sizes - add 1 for padding token
vocab_size_articles = 24972 + 1
vocab_size_highlights = 24972 + 1

# Maximum sequence lengths
max_length_articles = 1768
max_length_highlights = 1768

# Encoder
encoder_inputs = Input(shape=(max_length_articles,))
encoder_embedding = Embedding(vocab_size_articles, EMBEDDING_DIM)(encoder_inputs)
encoder_lstm = LSTM(HIDDEN_UNITS, return_state=True)
encoder_outputs, state_h, state_c = encoder_lstm(encoder_embedding)
encoder_states = [state_h, state_c]

# Decoder
decoder_inputs = Input(shape=(max_length_highlights,))
decoder_embedding_layer = Embedding(vocab_size_highlights, EMBEDDING_DIM)
decoder_embedding = decoder_embedding_layer(decoder_inputs)
decoder_lstm = LSTM(HIDDEN_UNITS, return_sequences=True, return_state=True)
decoder_outputs, _, _ = decoder_lstm(decoder_embedding, initial_state=encoder_states)
decoder_dense = Dense(vocab_size_highlights, activation='softmax')
decoder_outputs = decoder_dense(decoder_outputs)

# Model
model = Model([encoder_inputs, decoder_inputs], decoder_outputs)

# Compilation
model.compile(optimizer='rmsprop', loss='sparse_categorical_crossentropy', metrics=['accuracy'])

model.summary()


Model: "model_4"
__________________________________________________________________________________________________
 Layer (type)                   Output Shape         Param #     Connected to                     
 input_7 (InputLayer)           [(None, 1768)]       0           []                               
                                                                                                  
 input_8 (InputLayer)           [(None, 1768)]       0           []                               
                                                                                                  
 embedding_3 (Embedding)        (None, 1768, 256)    6393088     ['input_7[0][0]']                
                                                                                                  
 embedding_4 (Embedding)        (None, 1768, 256)    6393088     ['input_8[0][0]']                
                                                                                            

## Model Training

In [21]:
from tensorflow.keras.layers import Embedding

# Dimension of word embeddings (a hyperparameter you can define)
embedding_dim = 256

# Create an embedding matrix and specify its dimensions
embedding_layer = Embedding(vocab_size_articles, embedding_dim, input_length=max_length_articles)

In [22]:
encoder_inputs = Input(shape=(max_length_articles,))
encoder_embedding = embedding_layer(encoder_inputs)  # Transformation with the embedding matrix
encoder_lstm = LSTM(HIDDEN_UNITS, return_state=True)
encoder_outputs, state_h, state_c = encoder_lstm(encoder_embedding)
encoder_states = [state_h, state_c]

In [24]:
from tensorflow.keras.models import Model

# Create the encoder model
encoder_model = Model(encoder_inputs, encoder_states)

# Define decoder inputs and initial_state
decoder_state_input_h = Input(shape=(HIDDEN_UNITS,))
decoder_state_input_c = Input(shape=(HIDDEN_UNITS,))
decoder_states_input = [decoder_state_input_h, decoder_state_input_c]

# Reuse the embedding layer and LSTM layer for the decoder
decoder_embedding2 = embedding_layer(decoder_inputs)
decoder_outputs2, state_h2, state_c2 = decoder_lstm(decoder_embedding2, initial_state=decoder_states_input)
decoder_states2 = [state_h2, state_c2]

# Reuse the output layer
decoder_outputs2 = decoder_dense(decoder_outputs2)

# Create the decoder model
decoder_model = Model([decoder_inputs] + decoder_states_input, [decoder_outputs2] + decoder_states2)

# Now, combine the encoder and decoder models
encoder_outputs, state_h, state_c = encoder_lstm(encoder_embedding)
encoder_states = [state_h, state_c]
decoder_outputs, _, _ = decoder_lstm(decoder_embedding, initial_state=encoder_states)
decoder_outputs = decoder_dense(decoder_outputs)

# Create and compile the overall model
model = Model([encoder_inputs, decoder_inputs], decoder_outputs)
model.compile(optimizer='rmsprop', loss='sparse_categorical_crossentropy', metrics=['accuracy'])

In [18]:
# Prepare decoder input data (remove the last token)
decoder_input_data = train_highlights_seq[:, :-1]

# Prepare decoder target data (remove the first token)
decoder_target_data = train_highlights_seq[:, 1:]

# Prepare training data
encoder_input_data = train_articles_seq  # Articles as encoder inputs

# Check the data dimensions
print("Encoder Input Shape:", encoder_input_data.shape)
print("Decoder Input Shape:", decoder_input_data.shape)
print("Decoder Target Shape:", decoder_target_data.shape)

# Train the model
model.fit([encoder_input_data, decoder_input_data], decoder_target_data, batch_size=64, epochs=10, validation_split=0.2)

Encoder Input Shape: (1000, 1768)
Decoder Input Shape: (1000, 1768)
Decoder Target Shape: (1000, 1768)
Epoch 1/10


ValueError: in user code:

    File "/home/hduser/.local/lib/python3.10/site-packages/keras/engine/training.py", line 1160, in train_function  *
        return step_function(self, iterator)
    File "/home/hduser/.local/lib/python3.10/site-packages/keras/engine/training.py", line 1146, in step_function  **
        outputs = model.distribute_strategy.run(run_step, args=(data,))
    File "/home/hduser/.local/lib/python3.10/site-packages/keras/engine/training.py", line 1135, in run_step  **
        outputs = model.train_step(data)
    File "/home/hduser/.local/lib/python3.10/site-packages/keras/engine/training.py", line 994, in train_step
        loss = self.compute_loss(x, y, y_pred, sample_weight)
    File "/home/hduser/.local/lib/python3.10/site-packages/keras/engine/training.py", line 1052, in compute_loss
        return self.compiled_loss(
    File "/home/hduser/.local/lib/python3.10/site-packages/keras/engine/compile_utils.py", line 265, in __call__
        loss_value = loss_obj(y_t, y_p, sample_weight=sw)
    File "/home/hduser/.local/lib/python3.10/site-packages/keras/losses.py", line 152, in __call__
        losses = call_fn(y_true, y_pred)
    File "/home/hduser/.local/lib/python3.10/site-packages/keras/losses.py", line 272, in call  **
        return ag_fn(y_true, y_pred, **self._fn_kwargs)
    File "/home/hduser/.local/lib/python3.10/site-packages/keras/losses.py", line 1990, in categorical_crossentropy
        return backend.categorical_crossentropy(
    File "/home/hduser/.local/lib/python3.10/site-packages/keras/backend.py", line 5529, in categorical_crossentropy
        target.shape.assert_is_compatible_with(output.shape)

    ValueError: Shapes (None, 1768) and (None, 1768, 24973) are incompatible
