## Generate New Laravel Code Using Recurrent Neural Network

In [0]:
import numpy as np
import torch
from torch import nn
import torch.nn.functional as F


In [0]:
# open text file and read in data as `text`
with open('/content/full_rt_project.txt', 'r') as f:
    text = f.read()

In [93]:
len(text)


1846504

In [94]:
text[:10]

'<?php\n\nnam'

In [0]:
text = text[:len(text)//2]

In [0]:
def replace_with_empty(text, string):
    return text.replace(string, "")

In [0]:
text = replace_with_empty(text, "<?php")

In [98]:
print(text[:1000])



namespace App;

use Illuminate\Database\Eloquent\Model;

class Email extends Model
{
    protected $guarded = ['id'];
}


namespace App;

use Illuminate\Database\Eloquent\Model;

class Recommendation extends Model
{
    protected $guarded = ['id'];
}


namespace App;

use Illuminate\Database\Eloquent\Model;

class FundManager extends Model
{
    /**
     * Guard properties
     *
     * @var array
     **/
    protected $guarded = ['id'];
    
    /**
     * The table associated with the model. This is a Pivot Table!
     *
     * @var string
     */
    protected $table = 'fund_managers_pivot';
}


namespace App;

use Illuminate\Database\Eloquent\Model;

class ServiceRelation extends Model
{
    /**
     * The attributes that are mass assignable.
     *
     * @var array
     */
    protected $fillable = ['organization_id', 'subtype_id', 'service_id'];

    public $timestamps = false;

    /**
     * Retrive a single service
     * 
     * @return Relation 
     */
    public functi

In [0]:
chars = tuple(set(text))
int2char = dict(enumerate(chars))
char2int = {ch: ii for ii, ch in int2char.items()}

# encode the text
encoded = np.array([char2int[ch] for ch in text])

In [100]:
encoded[:100]

array([30, 30, 35, 23, 62,  5, 90, 68, 23, 82,  5, 13, 91, 68, 68, 15, 30,
       30,  0, 90,  5, 13, 73, 48, 48,  0, 62, 97, 35, 23, 50,  5, 92, 17,
       23, 50, 23, 43, 23, 90,  5, 92, 64, 48, 40, 38,  0,  5, 35, 50, 92,
       14, 40, 88,  5, 48, 15, 30, 30, 82, 48, 23, 90, 90, 13, 64, 62, 23,
       97, 48, 13,  5, 49, 50,  5, 35, 88, 90, 13, 14, 40, 88,  5, 48, 30,
        3, 30, 13, 13, 13, 13, 68, 53, 40, 50,  5, 82, 50,  5, 88])

In [101]:
len(chars)

99

In [0]:
def one_hot_encode(arr, n_labels):
    
    # Initialize the the encoded array
    one_hot = np.zeros((arr.size, n_labels), dtype=np.float32)
    
    # Fill the appropriate elements with ones
    one_hot[np.arange(one_hot.shape[0]), arr.flatten()] = 1.
    
    # Finally reshape it to get back to the original array
    one_hot = one_hot.reshape((*arr.shape, n_labels))
    
    return one_hot

In [103]:
# check that the function works as expected
test_seq = np.array([[3, 5, 1]])
one_hot = one_hot_encode(test_seq, 8)

print(one_hot)

[[[0. 0. 0. 1. 0. 0. 0. 0.]
  [0. 0. 0. 0. 0. 1. 0. 0.]
  [0. 1. 0. 0. 0. 0. 0. 0.]]]


## Making training mini-batches


To train on this data, we also want to create mini-batches for training. We want our batches to be multiple sequences of some desired number of sequence steps. Considering a simple example, our batches would look like this:

In [0]:
def get_batches(arr, batch_size, seq_length):
    '''Create a generator that returns batches of size
       batch_size x seq_length from arr.
       
       Arguments
       ---------
       arr: Array you want to make batches from
       batch_size: Batch size, the number of sequences per batch
       seq_length: Number of encoded chars in a sequence
    '''
    
    batch_size_total = batch_size * seq_length
    # total number of batches we can make
    n_batches = len(arr)//batch_size_total
    
    # Keep only enough characters to make full batches
    arr = arr[:n_batches * batch_size_total]
    # Reshape into batch_size rows
    arr = arr.reshape((batch_size, -1))
    
    # iterate through the array, one sequence at a time
    for n in range(0, arr.shape[1], seq_length):
        # The features
        x = arr[:, n:n+seq_length]
        # The targets, shifted by one
        y = np.zeros_like(x)
        try:
            y[:, :-1], y[:, -1] = x[:, 1:], arr[:, n+seq_length]
        except IndexError:
            y[:, :-1], y[:, -1] = x[:, 1:], arr[:, 0]
        yield x, y

In [0]:
batches = get_batches(encoded, 8, 50)
x, y = next(batches)

In [106]:
# printing out the first 10 items in a sequence
print('x\n', x[:10, :10])
print('\ny\n', y[:10, :10])

x
 [[30 30 35 23 62  5 90 68 23 82]
 [50 97 40 35 36 97 88 88  5 35]
 [13 13 13 13 13 26 30 13 13 13]
 [46 41 97 88 70 30 13 13 13 13]
 [40 35 12  5 53  1 97 82  5 11]
 [40 68  5 53 50 42 13 97 35 50]
 [62  5 53 65  5 10 30 63 63 63]
 [ 0 90  5 53 34  5 68 40 15 63]]

y
 [[30 35 23 62  5 90 68 23 82  5]
 [97 40 35 36 97 88 88  5 35 80]
 [13 13 13 13 26 30 13 13 13 13]
 [41 97 88 70 30 13 13 13 13 13]
 [35 12  5 53  1 97 82  5 11 30]
 [68  5 53 50 42 13 97 35 50 13]
 [ 5 53 65  5 10 30 63 63 63 32]
 [90  5 53 34  5 68 40 15 63 30]]


## Defining the network with PyTorch


In [107]:
# check if GPU is available
train_on_gpu = torch.cuda.is_available()
if(train_on_gpu):
    print('Training on GPU!')
else: 
    print('No GPU available, training on CPU; consider making n_epochs very small.')

Training on GPU!


In [0]:
class LaravelRNN(nn.Module):
    
    def __init__(self, tokens, n_hidden=256, n_layers=2,
                               drop_prob=0.5, lr=0.001):
        super().__init__()
        self.drop_prob = drop_prob
        self.n_layers = n_layers
        self.n_hidden = n_hidden
        self.lr = lr
        
        # creating character dictionaries
        self.chars = tokens
        self.int2char = dict(enumerate(self.chars))
        self.char2int = {ch: ii for ii, ch in self.int2char.items()}
        
        ## Define the LSTM
        self.lstm = nn.LSTM(len(self.chars), n_hidden, n_layers, 
                            dropout=drop_prob, batch_first=True)
        
        ## Define a dropout layer
        self.dropout = nn.Dropout(drop_prob)
        
        ## Define the final, fully-connected output layer
        self.fc = nn.Linear(n_hidden, len(self.chars))
      
    
    def forward(self, x, hidden):
        ''' Forward pass through the network. 
            These inputs are x, and the hidden/cell state `hidden`. '''
                
        ## Get the outputs and the new hidden state from the lstm
        r_output, hidden = self.lstm(x, hidden)
        
        ## TPass through a dropout layer
        out = self.dropout(r_output)
        
        # Stack up LSTM outputs using view
        # you may need to use contiguous to reshape the output
        out = out.contiguous().view(-1, self.n_hidden)
        
        ## TODO: put x through the fully-connected layer
        out = self.fc(out)
        
        # return the final output and the hidden state
        return out, hidden
    
    
    def init_hidden(self, batch_size):
        ''' Initializes hidden state '''
        # Create two new tensors with sizes n_layers x batch_size x n_hidden,
        # initialized to zero, for hidden state and cell state of LSTM
        weight = next(self.parameters()).data
        
        if (train_on_gpu):
            hidden = (weight.new(self.n_layers, batch_size, self.n_hidden).zero_().cuda(),
                  weight.new(self.n_layers, batch_size, self.n_hidden).zero_().cuda())
        else:
            hidden = (weight.new(self.n_layers, batch_size, self.n_hidden).zero_(),
                      weight.new(self.n_layers, batch_size, self.n_hidden).zero_())
        
        return hidden
        

## Time to train

The train function gives us the ability to set the number of epochs, the learning rate, and other parameters.

Below we're using an Adam optimizer and cross entropy loss since we are looking at character class scores as output. We calculate the loss and perform backpropagation, as usual!

A couple of details about training: 
>* Within the batch loop, we detach the hidden state from its history; this time setting it equal to a new *tuple* variable because an LSTM has a hidden state that is a tuple of the hidden and cell states.
* We use [`clip_grad_norm_`](https://pytorch.org/docs/stable/_modules/torch/nn/utils/clip_grad.html) to help prevent exploding gradients.

In [0]:
# change the name, for saving multiple files
def load_model(net):
    with open('laravel_rnn.net', 'rb') as f:
        checkpoint = torch.load(f)
    
    loaded = LaravelRNN(checkpoint['tokens'], n_hidden=checkpoint['n_hidden'], n_layers=checkpoint['n_layers'])
    loaded.load_state_dict(checkpoint['state_dict'])
    return loaded

In [0]:
# change the name, for saving multiple files
def save_model(net, n_epochs):
    model_name = 'laravel_rnn.net'

    checkpoint = {'n_hidden': net.n_hidden,
              'n_layers': net.n_layers,
              'state_dict': net.state_dict(),
              'tokens': net.chars}
    print('Saving new model ...')
    with open(model_name, 'wb') as f:
        torch.save(checkpoint, f)

In [0]:
min_loss = float("inf")
def train(net, data, epochs=10, batch_size=10, seq_length=50, lr=0.001, clip=5, val_frac=0.1, print_every=10, min_loss=min_loss):
    ''' Training a network 
    
        Arguments
        ---------
        
        net: CharRNN network
        data: text data to train the network
        epochs: Number of epochs to train
        batch_size: Number of mini-sequences per mini-batch, aka batch size
        seq_length: Number of character steps per mini-batch
        lr: learning rate
        clip: gradient clipping
        val_frac: Fraction of data to hold out for validation
        print_every: Number of steps for printing training and validation loss
    
    '''
    net.train()
    
    opt = torch.optim.Adam(net.parameters(), lr=lr)
    criterion = nn.CrossEntropyLoss()
    
    # create training and validation data
    val_idx = int(len(data)*(1-val_frac))
    data, val_data = data[:val_idx], data[val_idx:]
    if(train_on_gpu):
        net.cuda()
    
    counter = 0
    n_chars = len(net.chars)
    
    for e in range(epochs):
        # initialize hidden state
        h = net.init_hidden(batch_size)
        
        for x, y in get_batches(data, batch_size, seq_length):
            counter += 1
            
            # One-hot encode our data and make them Torch tensors
            x = one_hot_encode(x, n_chars)
            inputs, targets = torch.from_numpy(x), torch.from_numpy(y)
            
            if(train_on_gpu):
                inputs, targets = inputs.cuda(), targets.cuda()

            # Creating new variables for the hidden state, otherwise
            # we'd backprop through the entire training history
            h = tuple([each.data for each in h])

            # zero accumulated gradients
            net.zero_grad()
            
            # get the output from the model
            output, h = net(inputs, h)
            #print("MAIN OUTPUT: ", output.shape, output)
            # calculate the loss and perform backprop
            loss = criterion(output, targets.view(batch_size*seq_length).long())
            loss.backward()
            # `clip_grad_norm` helps prevent the exploding gradient problem in RNNs / LSTMs.
            nn.utils.clip_grad_norm_(net.parameters(), clip)
            opt.step()
            
            # loss stats
            if counter % print_every == 0:
                # Get validation loss
                val_h = net.init_hidden(batch_size)
                val_losses = []
                net.eval()
                for x, y in get_batches(val_data, batch_size, seq_length):
                    # One-hot encode our data and make them Torch tensors
                    x = one_hot_encode(x, n_chars)
                    x, y = torch.from_numpy(x), torch.from_numpy(y)
                    # Creating new variables for the hidden state, otherwise
                    # we'd backprop through the entire training history
                    val_h = tuple([each.data for each in val_h])
                    
                    inputs, targets = x, y
                    if(train_on_gpu):
                        inputs, targets = inputs.cuda(), targets.cuda()

                    output, val_h = net(inputs, val_h)
                    val_loss = criterion(output, targets.view(batch_size*seq_length).long())
                    val_losses.append(val_loss.item())
                
                net.train() # reset to train mode after iterationg through validation data
                if (np.mean(val_losses) < min_loss):
                    min_loss = np.mean(val_losses)
                    save_model(net, epochs)
                    
                print("Epoch: {}/{}...".format(e+1, epochs),
                      "Step: {}...".format(counter),
                      "Loss: {:.4f}...".format(loss.item()),
                      "Val Loss: {:.4f}".format(np.mean(val_losses)))

In [115]:
# define and print the net
# n_hidden=512
# n_layers=2


# try for laravel core only of 78k chars
n_hidden=512
n_layers=2

net = LaravelRNN(chars, n_hidden, n_layers)
print(net)

LaravelRNN(
  (lstm): LSTM(99, 512, num_layers=2, batch_first=True, dropout=0.5)
  (dropout): Dropout(p=0.5, inplace=False)
  (fc): Linear(in_features=512, out_features=99, bias=True)
)


In [0]:
def count_parameters(model):
    return sum(p.numel() for p in model.parameters() if p.requires_grad)

In [117]:
print(count_parameters(net))

3407459


In [118]:
batch_size = 64
seq_length = 20
n_epochs = 100 # start smaller if you are just testing initial behavior

# net = load_model(net)

# train the model
train(net, encoded, epochs=n_epochs, batch_size=batch_size, seq_length=seq_length, lr=0.0001, print_every=20, val_frac=0.1)

Saving new model ...
Epoch: 1/100... Step: 20... Loss: 4.4461... Val Loss: 4.4949
Saving new model ...
Epoch: 1/100... Step: 40... Loss: 3.5030... Val Loss: 3.7703
Saving new model ...
Epoch: 1/100... Step: 60... Loss: 3.3795... Val Loss: 3.6848
Saving new model ...
Epoch: 1/100... Step: 80... Loss: 3.2587... Val Loss: 3.6469
Epoch: 1/100... Step: 100... Loss: 3.2969... Val Loss: 3.6639
Saving new model ...
Epoch: 1/100... Step: 120... Loss: 3.2901... Val Loss: 3.6389
Saving new model ...
Epoch: 1/100... Step: 140... Loss: 3.3632... Val Loss: 3.6094
Saving new model ...
Epoch: 1/100... Step: 160... Loss: 3.3705... Val Loss: 3.5932
Saving new model ...
Epoch: 1/100... Step: 180... Loss: 3.3910... Val Loss: 3.5817
Saving new model ...
Epoch: 1/100... Step: 200... Loss: 3.3883... Val Loss: 3.5688
Saving new model ...
Epoch: 1/100... Step: 220... Loss: 3.3307... Val Loss: 3.5488
Saving new model ...
Epoch: 1/100... Step: 240... Loss: 3.2277... Val Loss: 3.5370
Saving new model ...
Epoch: 1

---
## Making Predictions

Now that the model is trained, we'll want to sample from it and make predictions about next characters! To sample, we pass in a character and have the network predict the next character. Then we take that character, pass it back in, and get another predicted character. Just keep doing this and you'll generate a bunch of text!

### A note on the `predict`  function

The output of our RNN is from a fully-connected layer and it outputs a **distribution of next-character scores**.

> To actually get the next character, we apply a softmax function, which gives us a *probability* distribution that we can then sample to predict the next character.

### Top K sampling

Our predictions come from a categorical probability distribution over all the possible characters. We can make the sample text and make it more reasonable to handle (with less variables) by only considering some $K$ most probable characters. This will prevent the network from giving us completely absurd characters while allowing it to introduce some noise and randomness into the sampled text. Read more about [topk, here](https://pytorch.org/docs/stable/torch.html#torch.topk).


In [0]:
def predict(net, char, h=None, top_k=None):
        ''' Given a character, predict the next character.
            Returns the predicted character and the hidden state.
        '''
        
        # tensor inputs
        x = np.array([[net.char2int[char]]])
        x = one_hot_encode(x, len(net.chars))
        inputs = torch.from_numpy(x)
        
        if(train_on_gpu):
            inputs = inputs.cuda()
        
        # detach hidden state from history
        h = tuple([each.data for each in h])
        # get the output of the model
        out, h = net(inputs, h)

        # get the character probabilities
        p = F.softmax(out, dim=1).data
        if(train_on_gpu):
            p = p.cpu() # move to cpu
        
        # get top characters
        if top_k is None:
            top_ch = np.arange(len(net.chars))
        else:
            p, top_ch = p.topk(top_k)
            top_ch = top_ch.numpy().squeeze()
        
        # select the likely next character with some element of randomness
        p = p.numpy().squeeze()
        char = np.random.choice(top_ch, p=p/p.sum())
        
        # return the encoded value of the predicted char and the hidden state
        return net.int2char[char], h

### Priming and generating text 

Typically you'll want to prime the network so you can build up a hidden state. Otherwise the network will start out generating characters at random. In general the first bunch of characters will be a little rough since it hasn't built up a long history of characters to predict from.

In [0]:
def sample(net, size, prime='function', top_k=None):
        
    if(train_on_gpu):
        net.cuda()
    else:
        net.cpu()
    
    net.eval() # eval mode
    
    # First off, run through the prime characters
    chars = [ch for ch in prime]
    h = net.init_hidden(1)
    for ch in prime:
        char, h = predict(net, ch, h, top_k=top_k)

    chars.append(char)
    
    # Now pass in the previous character and get a new one
    for ii in range(size):
        char, h = predict(net, chars[-1], h, top_k=top_k)
        chars.append(char)

    return ''.join(chars)

In [126]:
net = load_model(net)
print(sample(net, 5000, prime='<?php', top_k=5))

<?php::class;
    }

    /**
     * Select an organization is boorgany an Investment by this on user
     */
    public function surveys()->section_number_objact_data)
    {
        // $user_id = $request->input('organization_id');
        $organization_id = $request->input('user_id');

        $response = $getOrganizationPath()->resull();
        $organization_id = $request->input('section_id');

        $user->organization_id = $organization->id : NULL);
        $results = $this->suvetyRepo->getInvestmentStags(), [
				'name' => $this->getSurveyAttachmentsWytementingBy:clans),
{
        $this->roundtableRepo->all();
        $organization = User::whereIn('email', $entiny);
    }

    public function removeContact(Request $request, $organization_id,
        Is as $this->inviter_organization_id]));

        route($survey_relation->thas();                    $this->event_medbades = (new ConkectRecommenderion();
            $suggested_investment->filter_data);
        }

        return re