# Neural nets generates text

## LSTM with 3 layers
 
This code is heavily modified from https://github.com/benjelloo/RapNet 
 
 ### Processing Lyrics Dataset

 dataprep.py
 
Perform data set preprocessing, including printing of text length, processing of punctuation marks, lowercase conversion of text, segmentation of text, and mapping of characters to integer indexes.
Used to prepare text data for subsequent neural network model training.

###  build a model and set the hyperparameters

model.py

The main purpose of this code is to define an RNN model, including the model structure and forward propagation operation, and provides the function of initializing the hidden state.

### Training

In [17]:
import torch
import torch.nn as nn
import torch.optim as optim
import numpy as np
import time
import dataprep
import model

# Perform a single forward and backward pass
def fb_props(rnn,optimizer,criterion,inp,target,hidden,gpu_avail):
    rnn.zero_grad()
    if gpu_avail:
        inp,target = inp.cuda(),target.cuda()
    
    hidden = tuple([x.data for x in hidden])

    output,hidden = rnn.forward(inp,hidden)
    loss = criterion(output.squeeze(),target)
    loss.backward()
    nn.utils.clip_grad_norm_(rnn.parameters(),5)
    optimizer.step()

    return loss.item(),hidden

# Train the model over multiple epochs
def training_loop(rnn,batch_size,optimizer,criterion,n_epochs,gpu_avail):
    losses=[]
    rnn.train()
    t_start = time.time()
    for epoch in range(n_epochs):
        hidden = rnn.init_hidden_weights(batch_size,gpu_avail)
        for batch, (inputs,targets) in enumerate(train_loader,1):
            #ensure it's a full batch before props
            n_batches = len(train_loader.dataset)//batch_size
            if(batch > n_batches):
                break
            loss,hidden = fb_props(rnn,optimizer,criterion,inputs,targets,hidden,gpu_avail)
            losses.append(loss)
            
            if batch%100==0:
                t_end = time.time()
                print('Epoch {}/{} \t Loss: {} \t Progress: {}% \t Time Elapsed: {} minutes'.format(
                    epoch+1,
                    n_epochs,
                    np.average(losses),
                    (epoch*n_batches+batch)/(n_epochs*n_batches)*100,
                    (t_end-t_start)/60))
                losses=[]
    return rnn

# Pre-process and load the data
data_dir = '../Troye Sivan_Lyrics.txt'
seq_length = 32
batch_size = 128

v_to_i,i_to_v,text_nums = dataprep.data_processor(data_dir)
train_loader = dataprep.data_batcher(text_nums,seq_length,batch_size)


# Define and initialize the LSTM model and set hyperparameters
hypers = model.HyperParams(len(v_to_i), len(v_to_i), num_layers=3)  # Set num_layers to 3

net = model.Rnn(hypers.vocab_size, hypers.output_size, hypers.embedding_dim, hypers.hidden_dim, hypers.num_layers, hypers.dropout)
print(net)

net = model.Rnn(hypers.vocab_size,hypers.output_size,hypers.embedding_dim,hypers.hidden_dim,hypers.num_layers,hypers.dropout)
print(net)

#check for a gpu
if torch.cuda.is_available():
    print('GPU is available!')
    gpu_avail=True
    net.cuda()
else:
    print('GPU not found, will train on CPU!')
    gpu_avail=False

# An optimizer (Adam optimizer in this case) and a loss function (cross-entropy loss) are defined
optimizer = optim.Adam(net.parameters(),lr=hypers.learning_rate)
criterion = nn.CrossEntropyLoss()

# Start the training process
trained_model = training_loop(net,batch_size,optimizer,criterion,hypers.epochs,gpu_avail)

# save to a file
torch.save(trained_model.state_dict(),'../Neural-Net_layer3/trained_model.pt')
print('model successfully trained and saved!')



Rnn(
  (embedding): Embedding(2157, 256)
  (lstm): LSTM(256, 500, num_layers=3, batch_first=True, dropout=0.5)
  (fc): Linear(in_features=500, out_features=2157, bias=True)
  (dropout): Dropout(p=0.2, inplace=False)
)
Rnn(
  (embedding): Embedding(2157, 256)
  (lstm): LSTM(256, 500, num_layers=3, batch_first=True, dropout=0.5)
  (fc): Linear(in_features=500, out_features=2157, bias=True)
  (dropout): Dropout(p=0.2, inplace=False)
)
GPU not found, will train on CPU!
Epoch 1/10 	 Loss: 5.699097857475281 	 Progress: 3.6101083032490973% 	 Time Elapsed: 0.5346979022026062 minutes
Epoch 1/10 	 Loss: 5.501242408752441 	 Progress: 7.2202166064981945% 	 Time Elapsed: 1.1332733710606893 minutes
Epoch 2/10 	 Loss: 5.161659518204166 	 Progress: 13.610108303249097% 	 Time Elapsed: 2.272845137119293 minutes
Epoch 2/10 	 Loss: 4.7849772262573245 	 Progress: 17.220216606498195% 	 Time Elapsed: 3.1383707880973817 minutes
Epoch 3/10 	 Loss: 4.340528424850292 	 Progress: 23.610108303249095% 	 Time Elapse

### Generate text



In [26]:
import model
import torch
import torch.nn.functional as F
import dataprep
import numpy as np

# Load and processe the text data
# Creating mappings between words and numerical indices.
data_dir = '../Troye Sivan_Lyrics.txt'
v_to_i, i_to_v, text_nums = dataprep.data_processor(data_dir)

# Initializes the hyperparameters and the trained model
hypers = model.HyperParams(len(v_to_i), len(v_to_i), num_layers=3)  # Set num_layers to 3
seq_length = 32


trained_net = model.Rnn(hypers.vocab_size, hypers.output_size, hypers.embedding_dim, hypers.hidden_dim, hypers.num_layers, hypers.dropout)
trained_net.load_state_dict(torch.load('../Neural-Net_layer3/trained_model.pt'))
trained_net.eval()

# Sets up the initial conditions for text generation, including the desired length of the generated text and the starting word.
gen_length = 600
start_word = "my"
start_word_ind = v_to_i[start_word]
generated = [start_word]

init_seq = np.full((1, seq_length), v_to_i['<newline>'])
init_seq[-1][-1] = start_word_ind

# Generate new words by looping
# In each loop, the next word is selected based on the model's output and probability distribution, and then the input sequence is updated.
gpu_avail = torch.cuda.is_available()
for x in range(gen_length):
    if gpu_avail:
        init_seq = torch.LongTensor(init_seq).cuda()
    else:
        init_seq = torch.LongTensor(init_seq)
    
    # Modify the way to initialize the hidden state to ensure compatibility with 3 LSTM layers
    hidden = trained_net.init_hidden_weights(init_seq.size(0), gpu_avail)
    output, _ = trained_net.forward(init_seq, hidden)
    probs = F.softmax(output, dim=1).data

    top_k = 10
    probs, top_inds = probs.topk(top_k)
    top_inds = top_inds.numpy().squeeze()
    probs =  probs.numpy().squeeze()
    chosen_word_ind = np.random.choice(top_inds, p=probs/probs.sum())
    generated.append(i_to_v[chosen_word_ind])

    init_seq = np.roll(init_seq, -1, 1)
    init_seq[-1][-1] = chosen_word_ind

# Joins the generated words into a single string and handles punctuation
generated = ' '.join(generated)
generated = dataprep.punctuation_handler(generated, for_gen=True)

try:
    f = open("../New lyrics/neural_net3_lyrics.txt", "a")
except:
    f = open("../New lyrics/neural_net3_lyrics.txt", "w")
f.write(generated)
f.close()

 ### postprocess the text
 
 This code is heavily modified from: https://github.com/vickyyyyyyy/lyrics-lstm/blob/main/preprocess.py

 Reference code(better_profanity): https://pypi.org/project/better-profanity/0.1/
 
 This code is mainly used to format the content in the text file and perform basic sensitive word filtering to improve the readability of the text and avoid inappropriate content.

 This code is mainly used to filter basic sensitive words in text files to improve the readability of the text and avoid inappropriate content.

 Split the text content, break lines when encountering punctuation marks, and correct capitalization and capitalization.

In [27]:
import re
import string
from better_profanity import profanity

def postprocess(lyrics, censored):
    # Capitalize the beginning of sentences
    sentence_case = re.compile(r'(?<=[.?!\n]\s)(\w+)|(^\w+)')
    lyrics = sentence_case.sub(lambda match: match.group().capitalize(), lyrics)
    # Finds a standalone lowercase 'i' and converts it to uppercase
    lyrics = re.sub(fr'\si[{string.punctuation}|\s]|\si$', lambda match: match.group().upper(), lyrics)
    # Remove extra spaces around punctuation marks
    lyrics = re.sub(fr" (?=[{string.punctuation}\n])|(?<=\n) ", "", lyrics)
    return profanity.censor(lyrics) if censored else lyrics

# Censorship of indecent terms
def process_file(file_path, censored=False):
    with open(file_path, 'r', encoding='utf-8') as file:
        lyrics = file.read()

    processed_lyrics = postprocess(lyrics, censored)

    with open('../New lyrics/neural_net_lyrics.txt', 'w', encoding='utf-8') as file:
        file.write(processed_lyrics)

file_path = '../New lyrics/neural_net_lyrics.txt'
process_file(file_path, censored=True)