<a href="https://colab.research.google.com/github/aishwarya0708/Redfin_ETL/blob/main/GPT2.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [27]:
import tensorflow as tf
from tensorflow.keras.layers import Layer, Embedding, Dense, LayerNormalization,Dropout
from tensorflow.keras import Model

**Self-Attention** - lets the model learn how the words in a sequence are related to each other

In [36]:
class MultiHeadSelfAttention(Layer):
    """Multi-head scaled dot-product attention.

    The query, key and value inputs are each projected with a Dense layer,
    split into ``num_heads`` heads of size ``embed_dim // num_heads``,
    attended over independently per head, merged back to ``embed_dim`` and
    passed through a final Dense projection.

    Args:
        embed_dim: Width of the projections and of the output.
        num_heads: Number of attention heads; must divide ``embed_dim``.

    Raises:
        ValueError: If ``embed_dim`` is not divisible by ``num_heads``.
    """

    def __init__(self, embed_dim, num_heads):
        super().__init__()
        if embed_dim % num_heads != 0:
            # Without this check the reshape in split_heads would fail later
            # with a much less helpful shape error.
            raise ValueError(
                f"embed_dim ({embed_dim}) must be divisible by "
                f"num_heads ({num_heads})"
            )
        self.embed_dim = embed_dim
        self.num_heads = num_heads
        self.attn_head_size = embed_dim // num_heads
        self.query_dense = Dense(embed_dim)
        self.key_dense = Dense(embed_dim)
        self.value_dense = Dense(embed_dim)

        # Output projection applied after the heads are merged back together.
        self.dense = Dense(embed_dim)

    def split_heads(self, x, batch_size):
        """Reshape ``x`` to (batch, num_heads, seq_len, attn_head_size)."""
        x = tf.reshape(x, (batch_size, -1, self.num_heads, self.attn_head_size))
        return tf.transpose(x, perm=[0, 2, 1, 3])

    def call(self, query_dense, key_dense, value_dense, mask=None):
        """Apply multi-head attention.

        Args:
            query_dense: Input used to compute the queries.
            key_dense: Input used to compute the keys.
            value_dense: Input used to compute the values.
            mask: Optional tensor broadcastable to
                (batch, num_heads, query_len, key_len). Entries equal to 1
                mark positions that must not be attended to.

        Returns:
            Tensor whose last dimension is ``embed_dim``.
        """
        batch_size = tf.shape(query_dense)[0]

        # Project, then split every projection into heads so that each head
        # attends within its own attn_head_size-wide subspace.
        query = self.split_heads(self.query_dense(query_dense), batch_size)
        key = self.split_heads(self.key_dense(key_dense), batch_size)
        value = self.split_heads(self.value_dense(value_dense), batch_size)

        matmul_qk = tf.matmul(query, key, transpose_b=True)
        # Scale by the per-head key width, not by the full embed_dim.
        dk = tf.cast(self.attn_head_size, matmul_qk.dtype)
        scaled_attention_logits = matmul_qk / tf.math.sqrt(dk)

        # Masked positions receive a large negative logit, so their softmax
        # weight is effectively zero.
        if mask is not None:
            scaled_attention_logits += (mask * -1e9)

        attention_weights = tf.nn.softmax(scaled_attention_logits, axis=-1)
        output = tf.matmul(attention_weights, value)

        # Undo split_heads: move the head axis next to the feature axis and
        # flatten the two back into embed_dim.
        output = tf.transpose(output, perm=[0, 2, 1, 3])
        concat_attention = tf.reshape(output, shape=(batch_size, -1, self.embed_dim))
        return self.dense(concat_attention)

Feed Forward Neural Network


In [33]:
class FeedForwardNetwork(Layer):
    """Two-layer feed-forward network.

    Expands the input to ``dff`` units with a GELU activation, then projects
    it back down to ``embed_dim`` units.

    Args:
        embed_dim: Number of units in the output projection.
        dff: Number of units in the hidden (expansion) layer.
    """

    def __init__(self, embed_dim, dff):
        super().__init__()
        # Hidden expansion layer followed by the projection back to embed_dim.
        self.dense1 = Dense(units=dff, activation='gelu')
        self.dense2 = Dense(units=embed_dim)

    def call(self, x):
        """Return ``dense2(dense1(x))``."""
        hidden = self.dense1(x)
        projected = self.dense2(hidden)
        return projected

Implement the Transformer block as a Keras `Layer`


In [30]:
class TransformerBlock(Layer):
    """Self-attention followed by a feed-forward network.

    Each of the two sub-layers is followed by dropout, a residual connection
    and layer normalization.

    Args:
        embed_dim: Width of the block's input and output.
        num_heads: Number of attention heads.
        dff: Hidden width of the feed-forward network.
        dropout_rate: Dropout rate applied to each sub-layer's output.
    """

    def __init__(self, embed_dim, num_heads, dff, dropout_rate=0.1):
        super().__init__()
        self.att = MultiHeadSelfAttention(embed_dim, num_heads)
        self.ffn = FeedForwardNetwork(embed_dim, dff)
        self.norm1 = LayerNormalization(epsilon=1e-6)
        self.norm2 = LayerNormalization(epsilon=1e-6)
        self.dropout1 = Dropout(dropout_rate)
        self.dropout2 = Dropout(dropout_rate)

    def call(self, x, mask=None):
        """Run the block on ``x``.

        Args:
            x: Input tensor; used as query, key and value of the attention.
            mask: Optional attention mask forwarded to the attention layer.

        Returns:
            Tensor produced by the second layer normalization.
        """
        # Self-attention sub-layer: query, key and value are all x.
        attn_output = self.att(x, x, x, mask)
        attn_output = self.dropout1(attn_output)
        out1 = self.norm1(x + attn_output)

        # Feed-forward sub-layer with its own residual connection.
        ffn_output = self.ffn(out1)
        ffn_output = self.dropout2(ffn_output)
        return self.norm2(out1 + ffn_output)

In [40]:
class GPT2(Model):
    """Decoder-only Transformer language model built from TransformerBlock.

    Args:
        vocab_size: Number of token ids; sizes the token embedding table and
            the output projection.
        max_length: Maximum sequence length; sizes the positional embedding
            table, so inputs must not be longer than this.
        embed_dim: Embedding / hidden width.
        num_heads: Number of attention heads per block.
        dff: Hidden width of each block's feed-forward network.
        num_layers: Number of Transformer blocks.
        dropout_rate: Dropout rate used inside every block.
    """

    def __init__(self, vocab_size, max_length, embed_dim=768, num_heads=12, dff=3072, num_layers=12, dropout_rate=0.1):
        super().__init__()
        self.token_emb = Embedding(vocab_size, embed_dim)
        # Positions range over [0, max_length), so the table is sized by
        # max_length rather than by the vocabulary size.
        self.pos_emb = Embedding(max_length, embed_dim)
        # The Sequential is used only as a container; call() iterates over
        # its layers so that the mask can be passed to every block.
        self.transformer_blocks = tf.keras.Sequential(
            [TransformerBlock(embed_dim, num_heads, dff, dropout_rate) for _ in range(num_layers)]
        )
        self.norm = LayerNormalization(epsilon=1e-6)
        self.out = Dense(vocab_size)

    def create_casual_mask(self, seq_len):
        """Return a (seq_len, seq_len) causal mask.

        Entries above the main diagonal (future positions) are 1; the
        diagonal and everything below it are 0.
        """
        mask = tf.linalg.band_part(tf.ones((seq_len, seq_len)), -1, 0)
        return 1 - mask

    def call(self, x):
        """Compute vocabulary logits for every position of ``x``.

        Args:
            x: Integer token ids, indexed as (batch, seq_len).

        Returns:
            Tensor whose last dimension is ``vocab_size``.
        """
        seq_len = tf.shape(x)[1]
        casual_mask = self.create_casual_mask(seq_len)
        token_embeddings = self.token_emb(x)
        pos_embeddings = self.pos_emb(tf.range(start=0, limit=seq_len, delta=1))
        # The positional embeddings carry no batch axis and broadcast over it.
        embeddings = token_embeddings + pos_embeddings

        # Run every Transformer block in order with the same causal mask.
        for transformer in self.transformer_blocks.layers:
            embeddings = transformer(embeddings, casual_mask)

        # Final layer normalization before projecting to vocabulary logits.
        embeddings = self.norm(embeddings)
        return self.out(embeddings)

In [42]:
# Vocabulary size and fixed input sequence length used to build the model.
VOCAB_SIZE = 50257
# NOTE(review): 1824 is an unusual context length; possibly meant to be
# 1024 - confirm before training.
MAX_LENGTH = 1824

# Wrap the subclassed GPT2 model in a functional Model so that summary()
# can report concrete input and output shapes.
inputs = tf.keras.Input(shape=(MAX_LENGTH,), dtype=tf.int32)
gpt2_core = GPT2(vocab_size=VOCAB_SIZE, max_length=MAX_LENGTH)
outputs = gpt2_core(inputs)
gpt2 = Model(inputs=inputs, outputs=outputs)

gpt2.summary()

