In [None]:
''' Adapted from the tutorial in the official Keras documentation on LSTM sequence-to-sequence models. '''
# from __future__ import print_function

from keras.models import Model
from keras.layers import Input, LSTM, Dense
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import pymongo
from pymongo import MongoClient
from keras.utils import Sequence
from keras.utils.np_utils import to_categorical
from sklearn.utils import shuffle

%load_ext autoreload
%autoreload 2

# Address of a remote host; it is not referenced anywhere in this cell.
# NOTE(review): hardcoded address -- consider moving it to configuration.
IP_ADDRESS = '13.58.253.233'

# Fixed seed so the shuffle (and therefore the selected sample and the
# later validation split) is the same on every Restart & Run All.
SEED = 42

max_code_length = 300  # Upper bound on sequence length, in characters.
max_examples = -1      # Rows to keep after shuffling; negative keeps all.

# NOTE(review): unpickling can execute arbitrary code -- only load
# 'train_df.pkl' from a trusted source.
df = pd.read_pickle('train_df.pkl')
df = shuffle(df, random_state=SEED)

# A negative limit means "no limit". Slicing with iloc[0:-1] would
# silently drop the last row, so the slice is applied only for real limits.
if max_examples >= 0:
    df = df.iloc[:max_examples]

# Keep only pairs where both programs have at most max_code_length - 2
# characters. The C++ text later gets a start and an end token added,
# so it still fits within max_code_length timesteps.
df = df[df['cpp_code'].map(len) < max_code_length - 1]
df = df[df['python_code'].map(len) < max_code_length - 1]
len(df)

In [45]:
# Sequence delimiters for the decoder targets: "«" opens a target
# sequence and "»" closes it.
start_token = '\xAB'
end_token = '\xBB'
print(start_token,end_token)

batch_size = 256  # Number of samples per gradient update.
latent_dim = 256  # Size of the LSTM state vectors.

# Collect the raw text pairs. The Python program is the model input; the
# C++ program, wrapped in the start/end tokens, is the target.
input_texts = df['python_code'].tolist()
target_texts = [start_token + cpp_source + end_token
                for cpp_source in df['cpp_code']]

# Fixed vocabularies: the 128 ASCII code points for the input, and the
# same set plus the two delimiter tokens for the target.
# NOTE(review): any character outside these sets has no index and will
# raise a KeyError when the texts are one-hot encoded.
ascii_characters = [chr(code) for code in range(128)]
input_characters = sorted(set(ascii_characters))
target_characters = sorted(set(ascii_characters + [start_token, end_token]))

num_encoder_tokens = len(input_characters)
num_decoder_tokens = len(target_characters)

# Both sequence lengths are pinned to the configured maximum rather than
# measured from the data.
max_encoder_seq_length = max_code_length
max_decoder_seq_length = max_code_length

print('Number of samples:', len(input_texts))
print('Number of unique input tokens:', num_encoder_tokens)
print('Number of unique output tokens:', num_decoder_tokens)
print('Max sequence length for inputs:', max_encoder_seq_length)
print('Max sequence length for outputs:', max_decoder_seq_length)

« »
Number of samples: 448
Number of unique input tokens: 128
Number of unique output tokens: 130
Max sequence length for inputs: 200
Max sequence length for outputs: 200


In [40]:
# Character -> integer index lookup tables; the index of a character is
# its position in the corresponding sorted character list.
input_token_index = {
    char: position for position, char in enumerate(input_characters)}
target_token_index = {
    char: position for position, char in enumerate(target_characters)}

In [41]:
# Display the full target character -> index mapping for inspection.
target_token_index

{'\x00': 0,
 '\x01': 1,
 '\x02': 2,
 '\x03': 3,
 '\x04': 4,
 '\x05': 5,
 '\x06': 6,
 '\x07': 7,
 '\x08': 8,
 '\t': 9,
 '\n': 10,
 '\x0b': 11,
 '\x0c': 12,
 '\r': 13,
 '\x0e': 14,
 '\x0f': 15,
 '\x10': 16,
 '\x11': 17,
 '\x12': 18,
 '\x13': 19,
 '\x14': 20,
 '\x15': 21,
 '\x16': 22,
 '\x17': 23,
 '\x18': 24,
 '\x19': 25,
 '\x1a': 26,
 '\x1b': 27,
 '\x1c': 28,
 '\x1d': 29,
 '\x1e': 30,
 '\x1f': 31,
 ' ': 32,
 '!': 33,
 '"': 34,
 '#': 35,
 '$': 36,
 '%': 37,
 '&': 38,
 "'": 39,
 '(': 40,
 ')': 41,
 '*': 42,
 '+': 43,
 ',': 44,
 '-': 45,
 '.': 46,
 '/': 47,
 '0': 48,
 '1': 49,
 '2': 50,
 '3': 51,
 '4': 52,
 '5': 53,
 '6': 54,
 '7': 55,
 '8': 56,
 '9': 57,
 ':': 58,
 ';': 59,
 '<': 60,
 '=': 61,
 '>': 62,
 '?': 63,
 '@': 64,
 'A': 65,
 'B': 66,
 'C': 67,
 'D': 68,
 'E': 69,
 'F': 70,
 'G': 71,
 'H': 72,
 'I': 73,
 'J': 74,
 'K': 75,
 'L': 76,
 'M': 77,
 'N': 78,
 'O': 79,
 'P': 80,
 'Q': 81,
 'R': 82,
 'S': 83,
 'T': 84,
 'U': 85,
 'V': 86,
 'W': 87,
 'X': 88,
 'Y': 89,
 'Z': 90,
 '[': 91,


In [42]:
# One-hot tensors shaped (samples, timesteps, vocabulary size). Positions
# past the end of a text stay all-zero.
num_samples = len(input_texts)
encoder_input_data = np.zeros(
    (num_samples, max_encoder_seq_length, num_encoder_tokens),
    dtype='float32')
decoder_input_data = np.zeros(
    (num_samples, max_decoder_seq_length, num_decoder_tokens),
    dtype='float32')
decoder_target_data = np.zeros(
    (num_samples, max_decoder_seq_length, num_decoder_tokens),
    dtype='float32')

for i, (input_text, target_text) in enumerate(zip(input_texts, target_texts)):
    # Translate each text into the vocabulary indices of its characters.
    source_ids = np.fromiter(
        (input_token_index[char] for char in input_text), dtype=np.intp)
    target_ids = np.fromiter(
        (target_token_index[char] for char in target_text), dtype=np.intp)

    # Timestep t gets a 1 in the column of the t-th character.
    encoder_input_data[i, np.arange(source_ids.size), source_ids] = 1.
    decoder_input_data[i, np.arange(target_ids.size), target_ids] = 1.

    # The decoder target is the decoder input shifted one step earlier,
    # so it does not contain the start character.
    decoder_target_data[
        i, np.arange(target_ids.size - 1), target_ids[1:]] = 1.

# Encoder: reads a sequence of one-hot character vectors (the `None`
# timestep dimension leaves the sequence length unspecified) and
# returns its output together with the final hidden and cell states.
encoder_inputs = Input(shape=(None, num_encoder_tokens))
encoder = LSTM(latent_dim, return_state=True)
encoder_outputs, state_h, state_c = encoder(encoder_inputs)
# `encoder_outputs` is not used further; only the two state tensors are
# passed on to the decoder.
encoder_states = [state_h, state_c]

# Decoder: an LSTM over the target characters whose initial state is
# `encoder_states`.
decoder_inputs = Input(shape=(None, num_decoder_tokens))
# The decoder returns the full output sequence as well as its internal
# states. The states are ignored here (bound to `_`) but the same layer
# object is reused later to build the sampling model, where they are needed.
decoder_lstm = LSTM(latent_dim, return_sequences=True, return_state=True)
decoder_outputs, _, _ = decoder_lstm(decoder_inputs,
                                     initial_state=encoder_states)
# Project every decoder timestep onto a softmax distribution over the
# target vocabulary.
decoder_dense = Dense(num_decoder_tokens, activation='softmax')
decoder_outputs = decoder_dense(decoder_outputs)

In [43]:
# Training model: takes `encoder_input_data` and `decoder_input_data`
# as its two inputs and is fitted against `decoder_target_data`.
model = Model(inputs=[encoder_inputs, decoder_inputs],
              outputs=decoder_outputs)

# Configure the loss and optimizer; the actual training happens in a
# later cell.
model.compile(loss='categorical_crossentropy', optimizer='rmsprop')

# To continue from a previously saved checkpoint instead:
# from keras.models import load_model
# model = load_model('s2s.h5')

In [46]:
max_epochs = 100  # Number of epochs to train for.

# Inference mode (sampling):
# 1) encode the input and retrieve the initial decoder state
# 2) run one step of the decoder with this initial state and a
#    "start of sequence" token as target; the output is the next token
# 3) repeat with the current target token and the current states
#
# The sampling models are built ONCE, before the training loop. They
# reuse the `decoder_lstm` and `decoder_dense` layer objects of the
# training model, so they always see the latest trained weights.
# Rebuilding them every epoch would only add new graph nodes each time.
encoder_model = Model(encoder_inputs, encoder_states)

decoder_state_input_h = Input(shape=(latent_dim,))
decoder_state_input_c = Input(shape=(latent_dim,))
decoder_states_inputs = [decoder_state_input_h, decoder_state_input_c]
# Distinct names are used for the inference tensors so that the training
# graph's `decoder_outputs`, `state_h` and `state_c` are not overwritten;
# re-running the cell that builds `model` therefore keeps working.
decoder_inference_outputs, inference_state_h, inference_state_c = decoder_lstm(
    decoder_inputs, initial_state=decoder_states_inputs)
decoder_states = [inference_state_h, inference_state_c]
decoder_inference_outputs = decoder_dense(decoder_inference_outputs)
decoder_model = Model(
    [decoder_inputs] + decoder_states_inputs,
    [decoder_inference_outputs] + decoder_states)

# Reverse-lookup token index to decode sequences back to
# something readable.
reverse_input_char_index = {
    i: char for char, i in input_token_index.items()}
reverse_target_char_index = {
    i: char for char, i in target_token_index.items()}


def decode_sequence(input_seq):
    """Greedily decode one encoded input sequence into target text.

    Args:
        input_seq: one-hot encoded input passed to `encoder_model.predict`;
            the sampling loop assumes a batch of size 1.

    Returns:
        The decoded string. Decoding stops once the end token has been
        produced (it is included in the result) or once the result is
        longer than `max_decoder_seq_length` characters.
    """
    # Encode the input as state vectors.
    states_value = encoder_model.predict(input_seq)

    # Target sequence of length 1 holding only the start character.
    target_seq = np.zeros((1, 1, num_decoder_tokens))
    target_seq[0, 0, target_token_index[start_token]] = 1.

    stop_condition = False
    decoded_sentence = ''
    while not stop_condition:
        output_tokens, h, c = decoder_model.predict(
            [target_seq] + states_value)

        # Greedy sampling: take the most probable next character.
        sampled_token_index = np.argmax(output_tokens[0, -1, :])
        sampled_char = reverse_target_char_index[sampled_token_index]
        decoded_sentence += sampled_char

        # Exit condition: either hit max length
        # or find stop character.
        if (sampled_char == end_token or
                len(decoded_sentence) > max_decoder_seq_length):
            stop_condition = True

        # Feed the sampled character back in as the next decoder input.
        target_seq = np.zeros((1, 1, num_decoder_tokens))
        target_seq[0, 0, sampled_token_index] = 1.

        # Carry the decoder state over to the next step.
        states_value = [h, c]

    return decoded_sentence


# Number of samples to decode after each epoch; bounded by the data size
# so that small datasets do not index past the end.
num_preview = min(10, len(input_texts))

for epoch in range(max_epochs):
    print(f"Epoch: {epoch}/{max_epochs}")
    # Train for a single epoch so progress can be inspected in between.
    model.fit([encoder_input_data, decoder_input_data], decoder_target_data,
              batch_size=batch_size,
              epochs=1,
              validation_split=0.05)

    # Save model
    model.save('s2s.h5')

    for seq_index in range(num_preview):
        # Take one sequence (from the start of the data set)
        # for trying out decoding.
        input_seq = encoder_input_data[seq_index: seq_index + 1]
        decoded_sentence = decode_sequence(input_seq)
        print('---')
        print('Input python program: \n'+input_texts[seq_index])
        print('-')
        print('Decoded C++ program:\n'+decoded_sentence)

Epoch: 0/100
Train on 425 samples, validate on 23 samples
Epoch 1/1


  '. They will not be included '


---
Input python program: 
n = input()
su=0
n=int(n)
s1=input()
B = [int(x) for x in s1.split(' ')]
b=sum(B)
print(b)
    
-
Decoded C++ program:
#iiiinin                                                                                                                                                                                                 
---
Input python program: 
n = int(input())
arr = [0] * n
arr = input().split()
for i in range(0,len(arr)):
    arr[i] = int(arr[i])
arr.sort()
print(arr[len(arr)//2])
-
Decoded C++ program:
#iiiinin                                                                                                                                                                                                 
---
Input python program: 
input()
counts = [0 for i in range(100)]

for i in input().split():
    i = int(i)
    counts[i] += 1

print(" ".join([str(i) for i in counts]))
-
Decoded C++ program:
#iiiinin                                                        

---
Input python program: 
input()
counts = [0 for i in range(100)]

for i in input().split():
    i = int(i)
    counts[i] += 1

print(" ".join([str(i) for i in counts]))
-
Decoded C++ program:
#iinnnuueeiii                                                                                                                                                                                            
---
Input python program: 
n = int(input())
l = list(map(int,input().split()))
sum = 0
for i in l:
    sum+=i
print(sum)
-
Decoded C++ program:
#iinnnuueeiii                                                                                                                                                                                            
---
Input python program: 
n=int(input())
a=list(map(int,input().split()))
p=max(a)
dic={i:0 for i in range(p+1)}
for i in a:
    dic[i]=dic[i]+1
for i in dic:
    print(dic[i],end=" ")
-
Decoded C++ program:
#iinnnuueeiii                                   

KeyboardInterrupt: 