# **Mounting Google Drive**

In [None]:
from google.colab import drive

# Mount Google Drive at /content/drive so the dataset and saved models under
# MyDrive can be read and written as ordinary files. force_remount=True is
# passed so the call remounts even if an earlier mount is still present.
# NOTE(review): Colab may still show an authorization prompt on first use — confirm.
drive.mount('/content/drive', force_remount=True)

Mounted at /content/drive


# **Using GPU**

In [None]:
pip install torch



In [None]:
# Choose the compute device: CUDA when PyTorch can see a GPU, otherwise the CPU.
import torch

if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")
device

device(type='cuda')

# **Loading the dataset and building a code-token vocabulary**

In [None]:
import torch
from io import open
import json
from sklearn.preprocessing import LabelEncoder
import tokenize
from io import BytesIO

class CodeVocabulary:
    """Two-way lookup between code tokens and integer ids.

    The sequence markers "<s>" and "</s>" are pre-registered as ids 0 and 1.
    Any other token is given the next free id the first time it is requested
    through getIndex.
    """

    def __init__(self):
        boundary_tokens = ("<s>", "</s>")
        self.token2index = {}
        self.index2token = {}
        for position, marker in enumerate(boundary_tokens):
            self.token2index[marker] = position
            self.index2token[position] = marker
        # Number of registered tokens; also the id the next new token receives.
        self.n_tokens = len(boundary_tokens)

    def getIndex(self, token):
        """Return the id of ``token``, registering the token first if it is new."""
        known_id = self.token2index.get(token)
        if known_id is not None:
            return known_id
        fresh_id = self.n_tokens
        self.token2index[token] = fresh_id
        self.index2token[fresh_id] = token
        self.n_tokens = fresh_id + 1
        return fresh_id

def tokenizeCode(code):
    """Return the NAME and OP tokens of a Python snippet, in source order.

    Only tokens whose type is ``tokenize.NAME`` (identifiers and keywords) or
    ``tokenize.OP`` (operators and punctuation) are kept; every other token
    type (numbers, strings, comments, layout tokens, ...) is dropped.

    Args:
        code: Source text of the snippet (str).

    Returns:
        list[str]: The kept token strings. If the snippet cannot be tokenized
        to the end (for example an unclosed bracket or string), the tokens
        read before the failure are returned instead of raising.
    """
    tokens = []
    try:
        for tok in tokenize.tokenize(BytesIO(code.encode('utf-8')).readline):
            if tok.type in (tokenize.NAME, tokenize.OP):
                tokens.append(tok.string)
    except (tokenize.TokenError, SyntaxError):
        # Mined snippets are often incomplete fragments. Keep what was read so
        # a single malformed snippet does not abort loading the whole corpus.
        # (IndentationError is a subclass of SyntaxError.)
        pass
    return tokens

def normalizeCode(code):
    """Return the tokens kept by tokenizeCode for ``code``, joined by single spaces."""
    return ' '.join(tokenizeCode(code))

def readCodeCorpus(file, num_lines=50000, return_encoder=False):
    """Load a JSON-lines corpus and prepare snippets and integer intent labels.

    Each non-blank line must be a JSON object with a "snippet" and an "intent"
    key. Snippets are normalized with normalizeCode and wrapped in "<s>" /
    "</s>" markers; intents are mapped to integer class ids with a
    LabelEncoder fitted on the intents that were read.

    Args:
        file: Path of the ``.jsonl`` file.
        num_lines: Number of leading lines to read. ``None`` reads the whole
            file; a negative value keeps list-slice semantics (all but the
            last ``-num_lines`` lines).
        return_encoder: When True, also return the fitted LabelEncoder so that
            predicted class ids can be turned back into intent strings.

    Returns:
        ``(code_corpus, encoded_intents)``, or
        ``(code_corpus, encoded_intents, label_encoder)`` when
        ``return_encoder`` is True.
    """
    from itertools import islice

    with open(file, 'r', encoding='utf-8') as f:
        if num_lines is None or num_lines >= 0:
            # Stream just the requested prefix instead of loading the whole
            # file into memory and slicing afterwards.
            lines = list(islice(f, num_lines))
        else:
            lines = f.readlines()[:num_lines]

    # Blank lines are not valid JSON documents; skip them rather than crash.
    data = [json.loads(line) for line in lines if line.strip()]
    code_corpus = ["<s> " + normalizeCode(item["snippet"]) + " </s>" for item in data]

    # Extract intents and encode them as integer class ids
    intents = [item["intent"] for item in data]
    label_encoder = LabelEncoder()
    encoded_intents = label_encoder.fit_transform(intents)

    if return_encoder:
        return code_corpus, encoded_intents, label_encoder
    return code_corpus, encoded_intents

def readCodeCorpus_json(file, return_encoder=False):
    """Load a JSON-array corpus and prepare snippets and integer intent labels.

    The file must contain a JSON list of objects with a "snippet" and an
    "intent" key. Snippets are normalized with normalizeCode and wrapped in
    "<s>" / "</s>" markers; intents are mapped to integer class ids with a
    LabelEncoder fitted on this file's intents only.

    NOTE(review): because the encoder is fitted here, the class ids are not
    guaranteed to match those produced when loading a different file —
    confirm this is acceptable before comparing against a trained model.

    Args:
        file: Path of the ``.json`` file.
        return_encoder: When True, also return the fitted LabelEncoder.

    Returns:
        ``(code_corpus, encoded_intents)``, or
        ``(code_corpus, encoded_intents, label_encoder)`` when
        ``return_encoder`` is True.
    """
    with open(file, 'r', encoding='utf-8') as f:
        data = json.load(f)

    # normalizeCode expects the snippet as a string; passing a pre-split list
    # fails inside tokenizeCode, which calls .encode() on its argument.
    code_corpus = ["<s> " + normalizeCode(item["snippet"]) + " </s>" for item in data]

    # Extract intents and encode them as integer class ids
    intents = [item["intent"] for item in data]
    label_encoder = LabelEncoder()
    encoded_intents = label_encoder.fit_transform(intents)

    if return_encoder:
        return code_corpus, encoded_intents, label_encoder
    return code_corpus, encoded_intents

# Build the training corpus, then map every space-separated token of every
# prepared snippet to its vocabulary id (new tokens are registered on the fly).
code_file_path = "/content/drive/MyDrive/conala-mined.jsonl"
code_vocab = CodeVocabulary()
code_corpus, encoded_intents = readCodeCorpus(code_file_path, num_lines=50000)
code_numbered_corpus = [
    list(map(code_vocab.getIndex, code.split(' ')))
    for code in code_corpus
]


# test_file_path = "/content/drive/MyDrive/conala-test.json"
# test_code_corpus, test_intents = readCodeCorpus_json(test_file_path)
# test_numbered_corpus = [[code_vocab.getIndex(token) for token in code.split(' ')] for code in test_code_corpus]

# **Defining a GRU-based RNN model (adapted from a machine translation example)**

In [None]:
# similarity_threshold = 0.05
class CodeRNN(torch.nn.Module):
    """Single-step GRU classifier over token ids.

    Each call to forward consumes one token id together with the running
    hidden state and returns log-probabilities over ``output_size`` classes
    plus the updated hidden state.
    """

    def __init__(self, vocab_size, hidden_size, output_size):
        super().__init__()
        nn = torch.nn
        self.hidden_size = hidden_size
        # The embedding width equals the GRU width, so one size serves both.
        self.embedding = nn.Embedding(vocab_size, hidden_size)
        self.gru = nn.GRU(hidden_size, hidden_size)
        self.out = nn.Linear(hidden_size, output_size)
        self.softmax = nn.LogSoftmax(dim=1)

    def forward(self, input, hidden):
        """Advance the GRU by one token; return (log-probabilities, new hidden)."""
        # Reshape the looked-up vector to (seq_len=1, batch=1, hidden_size).
        step_input = self.embedding(input).view(1, 1, -1)
        gru_out, next_hidden = self.gru(step_input, hidden)
        class_scores = self.out(gru_out[0])
        return self.softmax(class_scores), next_hidden

    def initHidden(self, device):
        """Return an all-zero hidden state of shape (1, 1, hidden_size) on ``device``."""
        blank_state = torch.zeros(1, 1, self.hidden_size)
        return blank_state.to(device)

def trainCodeModel(vocab, numbered_corpus, intents, hidden_size=256, learning_rate=0.01, n_epochs=30, device=device):
    """Train a CodeRNN to predict the intent label of each token-id sequence.

    For every sequence the tokens except the last one are fed through the
    network one at a time; the output after the final fed token is scored
    against the sequence's intent label with cross-entropy, and one SGD step
    is taken per sequence.

    Args:
        vocab: Vocabulary object; ``vocab.n_tokens`` sizes the embedding.
        numbered_corpus: Iterable of token-id sequences (lists of int).
        intents: Integer class label per sequence, aligned with
            ``numbered_corpus``. The number of output classes is
            ``len(set(intents))``, so labels are assumed to be the contiguous
            ids ``0..K-1`` (as produced by a LabelEncoder).
        hidden_size: Width of the embedding and the GRU state.
        learning_rate: SGD learning rate.
        n_epochs: Number of passes over the corpus.
        device: Device on which the model and tensors are placed.

    Returns:
        The trained CodeRNN.
    """
    code_rnn = CodeRNN(vocab.n_tokens, hidden_size, len(set(intents))).to(device)
    optimizer = torch.optim.SGD(code_rnn.parameters(), lr=learning_rate)
    # CodeRNN.forward already returns log-probabilities; CrossEntropyLoss
    # applies log-softmax again, which leaves log-probabilities unchanged.
    criterion = torch.nn.CrossEntropyLoss()

    for epoch in range(n_epochs):
        total_loss = 0.0
        correct_predictions = 0
        total_predictions = 0

        for code_sequence, intent in zip(numbered_corpus, intents):
            # With fewer than two tokens nothing is fed to the network, so
            # there is no output to score; skip rather than reuse a stale one.
            if len(code_sequence) < 2:
                continue

            optimizer.zero_grad()
            hidden = code_rnn.initHidden(device)

            # Feed every token except the last; only the final output is used.
            for token_id in code_sequence[:-1]:
                input_tensor = torch.tensor([[token_id]], device=device)
                output, hidden = code_rnn(input_tensor, hidden)

            # Score the final output against the sequence's intent label.
            intent_tensor = torch.tensor([intent], device=device).view(-1)
            code_loss = criterion(output, intent_tensor)

            total_loss += code_loss.item()
            # Calculate accuracy
            predicted = output.argmax(dim=1)
            total_predictions += 1
            correct_predictions += (predicted == intent_tensor).sum().item()

            code_loss.backward()
            optimizer.step()

        # Average over the sequences actually trained on; guard the empty case.
        if total_predictions:
            mean_loss = total_loss / total_predictions
            accuracy = correct_predictions / total_predictions
        else:
            mean_loss = float("nan")
            accuracy = float("nan")
        print(f"Epoch {epoch + 1}, Loss: {mean_loss}, Train Accuracy: {accuracy}")

    return code_rnn

# Train on the full numbered corpus with the default hyper-parameters.
trained_code_model = trainCodeModel(
    vocab=code_vocab,
    numbered_corpus=code_numbered_corpus,
    intents=encoded_intents,
)

Epoch 1, Loss: 9.62987787319546, Train Accuracy: 0.0048
Epoch 2, Loss: 8.822366762380302, Train Accuracy: 0.014733333333333333
Epoch 3, Loss: 7.788806717820155, Train Accuracy: 0.038533333333333336
Epoch 4, Loss: 6.642575925120774, Train Accuracy: 0.0823
Epoch 5, Loss: 5.545598973507496, Train Accuracy: 0.1435
Epoch 6, Loss: 4.563882820221036, Train Accuracy: 0.23246666666666665
Epoch 7, Loss: 3.7094886477756623, Train Accuracy: 0.3364
Epoch 8, Loss: 2.9854286018696303, Train Accuracy: 0.4395
Epoch 9, Loss: 2.4027039745451884, Train Accuracy: 0.5349666666666667
Epoch 10, Loss: 1.9612885263240585, Train Accuracy: 0.6151666666666666
Epoch 11, Loss: 1.6421351078849906, Train Accuracy: 0.6697333333333333
Epoch 12, Loss: 1.4089344823644807, Train Accuracy: 0.7149333333333333
Epoch 13, Loss: 1.2344058115921295, Train Accuracy: 0.7483333333333333
Epoch 14, Loss: 1.1033259812533078, Train Accuracy: 0.7736333333333333
Epoch 15, Loss: 1.006251284126239, Train Accuracy: 0.7909
Epoch 16, Loss: 0.9

# **Saving the model for testing in real time**

In [None]:
# Destination on Google Drive for the trained network.
model_save_path = "/content/drive/MyDrive/vlg_model.pth"

# Passing the module object itself pickles the whole model rather than only
# its state_dict, so the CodeRNN class must be defined wherever it is loaded.
torch.save(obj=trained_code_model, f=model_save_path)

print(f"Model saved to {model_save_path}")

# **Real Time Testing**

In [None]:
class CodeRNN(torch.nn.Module):
    """Single-step GRU classifier over token ids.

    Each call to forward consumes one token id together with the running
    hidden state and returns log-probabilities over ``output_size`` classes
    plus the updated hidden state. The class is defined in this cell as well
    so that a model pickled as a whole object can be loaded without running
    the training section.
    """

    def __init__(self, vocab_size, hidden_size, output_size):
        super().__init__()
        nn = torch.nn
        self.hidden_size = hidden_size
        # The embedding width equals the GRU width, so one size serves both.
        self.embedding = nn.Embedding(vocab_size, hidden_size)
        self.gru = nn.GRU(hidden_size, hidden_size)
        self.out = nn.Linear(hidden_size, output_size)
        self.softmax = nn.LogSoftmax(dim=1)

    def forward(self, input, hidden):
        """Advance the GRU by one token; return (log-probabilities, new hidden)."""
        # Reshape the looked-up vector to (seq_len=1, batch=1, hidden_size).
        step_input = self.embedding(input).view(1, 1, -1)
        gru_out, next_hidden = self.gru(step_input, hidden)
        class_scores = self.out(gru_out[0])
        return self.softmax(class_scores), next_hidden

    def initHidden(self, device):
        """Return an all-zero hidden state of shape (1, 1, hidden_size) on ``device``."""
        blank_state = torch.zeros(1, 1, self.hidden_size)
        return blank_state.to(device)

def trainCodeModel(vocab, numbered_corpus, intents, hidden_size=256, learning_rate=0.01, n_epochs=30, device=device):
    """Train a CodeRNN to predict the intent label of each token-id sequence.

    For every sequence the tokens except the last one are fed through the
    network one at a time; the output after the final fed token is scored
    against the sequence's intent label with cross-entropy, and one SGD step
    is taken per sequence.

    Args:
        vocab: Vocabulary object; ``vocab.n_tokens`` sizes the embedding.
        numbered_corpus: Iterable of token-id sequences (lists of int).
        intents: Integer class label per sequence, aligned with
            ``numbered_corpus``. The number of output classes is
            ``len(set(intents))``, so labels are assumed to be the contiguous
            ids ``0..K-1`` (as produced by a LabelEncoder).
        hidden_size: Width of the embedding and the GRU state.
        learning_rate: SGD learning rate.
        n_epochs: Number of passes over the corpus.
        device: Device on which the model and tensors are placed.

    Returns:
        The trained CodeRNN.
    """
    code_rnn = CodeRNN(vocab.n_tokens, hidden_size, len(set(intents))).to(device)
    optimizer = torch.optim.SGD(code_rnn.parameters(), lr=learning_rate)
    # CodeRNN.forward already returns log-probabilities; CrossEntropyLoss
    # applies log-softmax again, which leaves log-probabilities unchanged.
    criterion = torch.nn.CrossEntropyLoss()

    for epoch in range(n_epochs):
        total_loss = 0.0
        correct_predictions = 0
        total_predictions = 0

        for code_sequence, intent in zip(numbered_corpus, intents):
            # With fewer than two tokens nothing is fed to the network, so
            # there is no output to score; skip rather than reuse a stale one.
            if len(code_sequence) < 2:
                continue

            optimizer.zero_grad()
            hidden = code_rnn.initHidden(device)

            # Feed every token except the last; only the final output is used.
            for token_id in code_sequence[:-1]:
                input_tensor = torch.tensor([[token_id]], device=device)
                output, hidden = code_rnn(input_tensor, hidden)

            # Score the final output against the sequence's intent label.
            intent_tensor = torch.tensor([intent], device=device).view(-1)
            code_loss = criterion(output, intent_tensor)

            total_loss += code_loss.item()
            # Calculate accuracy
            predicted = output.argmax(dim=1)
            total_predictions += 1
            correct_predictions += (predicted == intent_tensor).sum().item()

            code_loss.backward()
            optimizer.step()

        # Average over the sequences actually trained on; guard the empty case.
        if total_predictions:
            mean_loss = total_loss / total_predictions
            accuracy = correct_predictions / total_predictions
        else:
            mean_loss = float("nan")
            accuracy = float("nan")
        print(f"Epoch {epoch + 1}, Loss: {mean_loss}, Train Accuracy: {accuracy}")

    return code_rnn

In [None]:
import torch

# Load the trained model.
# - map_location=device lets a checkpoint saved on a GPU be loaded on a
#   CPU-only runtime (and vice versa).
# - weights_only=False is needed because the file holds a whole pickled module
#   rather than a state_dict. Unpickling can execute arbitrary code, so only
#   load checkpoint files you created yourself.
# NOTE(review): the save cell writes ".../vlg_model.pth" while this cell reads
# ".../dsg_model_1.pth" — confirm that loading a different file is intended.
trained_code_model = torch.load(
    "/content/drive/MyDrive/dsg_model_1.pth",
    map_location=device,
    weights_only=False,
)

def predict_intent(code_snippet, model, vocab, device=device, label_encoder=None):
    """Predict the intent class of a single code snippet.

    The snippet goes through the same preprocessing as the training corpus:
    it is normalized with normalizeCode, wrapped in "<s>" / "</s>" and split
    on single spaces. Tokens that are not in ``vocab`` are dropped, because
    the trained embedding has no row for them. As in training, every token
    except the last is fed to the model and the final output is used.

    Args:
        code_snippet: Source text of the snippet.
        model: Trained CodeRNN.
        vocab: The vocabulary used to number the training corpus. It is only
            read, never extended.
        device: Device on which the input tensors are created.
        label_encoder: Optional LabelEncoder fitted on the training intents.
            When given, the predicted class id is converted back to its
            intent string.

    Returns:
        The intent string when ``label_encoder`` is given, otherwise the
        predicted integer class id. The string "Unknown" is returned if no
        token could be fed to the model.
    """
    model.eval()
    with torch.no_grad():
        prepared = "<s> " + normalizeCode(code_snippet) + " </s>"
        # Look ids up directly instead of calling vocab.getIndex: getIndex
        # would register unseen tokens under ids beyond the embedding's range.
        code_sequence = [
            vocab.token2index[token]
            for token in prepared.split(' ')
            if token in vocab.token2index
        ]

        hidden = model.initHidden(device)
        output = None

        for token_id in code_sequence[:-1]:
            input_tensor = torch.tensor([[token_id]], device=device)
            output, hidden = model(input_tensor, hidden)

        if output is None:
            return "Unknown"

        predicted_class = output.argmax(dim=1).item()

    # The model's outputs index intent classes, not vocabulary tokens, so the
    # class id must be decoded with the intent encoder (when one is supplied).
    if label_encoder is not None:
        return label_encoder.inverse_transform([predicted_class])[0]
    return predicted_class

# Classify one example snippet with the loaded model and show the result.
code_snippet = "pd.get_dummies(df)"
predicted_intent = predict_intent(
    code_snippet,
    trained_code_model,
    code_vocab,
    device=device,
)
print(f"Predicted Intent: {predicted_intent}")


Predicted Intent: Unknown
