In [2]:
!pip install transformers




In [3]:
import os
# Directory that holds the cleaned diff/message files. It defaults to the
# original Google Drive location; set the NNGEN_DATA_DIR environment variable
# to point the notebook at a copy stored somewhere else.
file_path = os.environ.get('NNGEN_DATA_DIR', '/content/drive/MyDrive/nlp/proj/nngen/data')

# Training split: diffs and their commit messages live in sibling files.
train_diffs_path = os.path.join(file_path, 'cleaned.train.diff')
train_msgs_path = os.path.join(file_path, 'cleaned.train.msg')

# Validation split, same layout as the training split.
valid_diffs_path = os.path.join(file_path, 'cleaned.valid.diff')
valid_msgs_path = os.path.join(file_path, 'cleaned.valid.msg')

In [4]:
import os
import numpy as np
import tensorflow as tf
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

def load_data(filepath):
    """Read a UTF-8 text file and return its lines as a list of strings.

    Whitespace at the very start and end of the file is stripped before the
    content is split on newlines, so a final newline does not produce a
    trailing empty entry. Blank lines inside the file are kept as empty
    strings.

    Args:
        filepath: Path of the text file to read.

    Returns:
        A list with one string per line, or an empty list when the file is
        empty or contains only whitespace.
    """
    with open(filepath, 'r', encoding='utf-8') as file:
        content = file.read().strip()

    # ''.split('\n') evaluates to [''], which would masquerade as a dataset
    # containing a single empty sample, so report "no data" explicitly.
    if not content:
        return []
    return content.split('\n')

# Load the data
train_diffs = load_data(train_diffs_path)
train_msgs = load_data(train_msgs_path)
# NOTE(review): the validation lists are loaded here but are not referenced by
# any other cell visible in this notebook — confirm whether they are needed.
valid_diffs = load_data(valid_diffs_path)
valid_msgs = load_data(valid_msgs_path)

# Row i of the diffs is trained against row i of the messages, so stop early
# with a readable error if the training pair is empty or out of step instead of
# failing later inside max() or model.fit().
if not train_diffs or len(train_diffs) != len(train_msgs):
    raise ValueError(
        'Training data must be non-empty and line-aligned: '
        f'{len(train_diffs)} diffs vs {len(train_msgs)} messages'
    )

# Prepare the tokenizer and fit it on both the diffs and the commit messages,
# so the two sides share a single vocabulary.
tokenizer = Tokenizer()
tokenizer.fit_on_texts(train_diffs + train_msgs)

# Convert texts to sequences of integer token ids
train_diffs_seq = tokenizer.texts_to_sequences(train_diffs)
train_msgs_seq = tokenizer.texts_to_sequences(train_msgs)

# Pad every sequence to the length of the longest one on its side
# (you may choose a shorter fixed length if that suits your data better).
max_len_diffs = max(len(seq) for seq in train_diffs_seq)
max_len_msgs = max(len(seq) for seq in train_msgs_seq)

# padding='post' appends the filler values after the real tokens.
train_diffs_seq_padded = pad_sequences(train_diffs_seq, maxlen=max_len_diffs, padding='post')
train_msgs_seq_padded = pad_sequences(train_msgs_seq, maxlen=max_len_msgs, padding='post')


In [5]:
from tensorflow.keras.models import Model
from tensorflow.keras.layers import Input, LSTM, Dense, Embedding, Concatenate, Attention

# Define model parameters
# The +1 leaves room for index 0, which the padded sequences use as filler
# (assumes Tokenizer word indices start at 1 — TODO confirm).
vocab_size = len(tokenizer.word_index) + 1
# Width of each token embedding vector.
embedding_dim = 256
# Hidden size used by both the encoder and the decoder LSTM.
lstm_units = 128

# Encoder: embeds the padded diff token ids and runs them through an LSTM.
# return_sequences=True keeps the per-timestep outputs (consumed by the
# attention layer below); return_state=True also returns the final hidden and
# cell states, which seed the decoder.
# NOTE(review): neither Embedding sets mask_zero, so padded positions appear to
# be processed like ordinary tokens — confirm whether masking is wanted.
encoder_inputs = Input(shape=(max_len_diffs,))
encoder_embedding = Embedding(vocab_size, embedding_dim)(encoder_inputs)
encoder_lstm = LSTM(lstm_units, return_sequences=True, return_state=True)
encoder_outputs, state_h, state_c = encoder_lstm(encoder_embedding)
encoder_states = [state_h, state_c]

# Decoder: has its own embedding layer (not shared with the encoder) and an
# LSTM that starts from the encoder's final states. The decoder's own returned
# states are discarded here.
decoder_inputs = Input(shape=(max_len_msgs,))
decoder_embedding = Embedding(vocab_size, embedding_dim)
decoder_lstm = LSTM(lstm_units, return_sequences=True, return_state=True)
decoder_outputs, _, _ = decoder_lstm(decoder_embedding(decoder_inputs), initial_state=encoder_states)

# Attention layer: called with [decoder outputs, encoder outputs], i.e. the
# decoder timesteps attend over the encoder timesteps
# (first element is the query, second the value, per the Keras Attention API).
attention = Attention()
attention_out = attention([decoder_outputs, encoder_outputs])

# Concat attention output and decoder LSTM output along the feature axis
decoder_concat_input = Concatenate(axis=-1)([decoder_outputs, attention_out])

# Dense layer: softmax over the whole vocabulary at every decoder timestep.
# Note that `decoder_outputs` is rebound here from the LSTM outputs to the
# softmax outputs.
decoder_dense = Dense(vocab_size, activation='softmax')
decoder_outputs = decoder_dense(decoder_concat_input)

# Define the model that will turn encoder_input_data & decoder_input_data into decoder_target_data
model = Model([encoder_inputs, decoder_inputs], decoder_outputs)

# Compile the model. The sparse variant of the cross-entropy loss is used so
# the targets can stay as integer token ids instead of one-hot vectors.
model.compile(optimizer='rmsprop', loss='sparse_categorical_crossentropy')

# Print the layer table (in the recorded run the inputs are 120 and 27 steps long).
model.summary()


Model: "model"
__________________________________________________________________________________________________
 Layer (type)                Output Shape                 Param #   Connected to                  
 input_1 (InputLayer)        [(None, 120)]                0         []                            
                                                                                                  
 input_2 (InputLayer)        [(None, 27)]                 0         []                            
                                                                                                  
 embedding (Embedding)       (None, 120, 256)             1268454   ['input_1[0][0]']             
                                                          4                                       
                                                                                                  
 embedding_1 (Embedding)     (None, 27, 256)              1268454   ['input_2[0][0]']         

In [6]:
# One-hot encode the target sequences
def one_hot_encode(sequences, max_len, vocab_size):
    """Expand integer token sequences into a dense one-hot tensor.

    Args:
        sequences: Sized iterable of sequences of integer token ids, one
            sequence per sample.
        max_len: Length of the time axis of the result.
        vocab_size: Length of the class axis of the result.

    Returns:
        A float32 array of shape (len(sequences), max_len, vocab_size) that is
        1.0 at [sample, position, token_id] for every token in the input and
        0.0 everywhere else.
    """
    encoded = np.zeros((len(sequences), max_len, vocab_size), dtype='float32')

    # Walk every (sample, position, token id) triple lazily and switch on the
    # matching cell of the result.
    hot_cells = (
        (sample, position, token_id)
        for sample, token_ids in enumerate(sequences)
        for position, token_id in enumerate(token_ids)
    )
    for sample, position, token_id in hot_cells:
        encoded[sample, position, token_id] = 1.

    return encoded

# Teacher forcing: at step t the decoder has to be fed the tokens that come
# *before* t and be asked to predict token t. Passing the very same array as
# decoder input and as target would let the model minimise the loss by simply
# copying its input, so the two arrays are built separately below.

# Message ids as a 2-D (samples, timesteps) array. Reshaping to the first two
# dimensions also undoes the trailing axis added further down, which keeps this
# cell safe to re-run.
decoder_target_ids = train_msgs_seq_padded.reshape(train_msgs_seq_padded.shape[:2])

# Decoder input = target shifted right by one step. Position 0 holds id 0 and
# acts as the start signal, so decoding at inference time has to begin from
# id 0 as well (assumes the tokenizer never assigns id 0 to a word — TODO confirm).
decoder_input_ids = np.zeros_like(decoder_target_ids)
decoder_input_ids[:, 1:] = decoder_target_ids[:, :-1]

# No need to one-hot encode, just ensure the targets are properly shaped:
# the integer ids only get a trailing axis of size 1.
train_msgs_seq_padded = decoder_target_ids.reshape(
    (decoder_target_ids.shape[0], decoder_target_ids.shape[1], 1)
)

# Train the model with integer sequences
model.fit(
    [train_diffs_seq_padded, decoder_input_ids],  # encoder input, shifted decoder input
    train_msgs_seq_padded,  # Integer sequences used directly as targets
    batch_size=64,
    epochs=10,
    validation_split=0.2
)



Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<keras.src.callbacks.History at 0x7b88cc2afa30>

In [6]:
# Write the trained model to the project folder as an .h5 file.
model.save(filepath='/content/drive/MyDrive/nlp/proj/nngen/s2s.h5')