Based on https://machinelearningmastery.com/how-to-develop-a-word-level-neural-language-model-in-keras/

In [203]:
import os
import re
import pandas as pd
import numpy as np
import glob
import string
from collections import Counter

In [None]:
from nltk import ngrams
from itertools import chain

import nltk
# Download the 'punkt_tab' NLTK resource; presumably needed by word_tokenize, which is used further down — confirm.
nltk.download('punkt_tab')


Loading Data

In [205]:
def get_path(filename):
    """
    Makes a list of all the paths that fit the search requirement

    :param filename: A glob pattern (shell-style wildcards, not a regular expression)
                     that the file names inside the songtexts folder must match
    :return  Returns a sorted list of all the matching pathnames
    """
    # place the songtexts folder in the same directory as this notebook
    current_directory = os.getcwd()

    # glob.glob() returns its matches in arbitrary order; sorting keeps the order
    # of the songs (and everything derived from it) identical between runs.
    paths = sorted(glob.glob(os.path.join(current_directory, 'songtexts', filename)))

    if not paths:
        print('Your file list is empty. The code looks for the folder '+current_directory+'/songtexts, but could not find it.')
    else:
        print("You loaded: ", len(paths), "files")
    return paths

In [206]:
def load_data(pathset):
    """
    Loads the song texts into a dataframe

    Only paths that end in ``song<digits>.txt`` are read. Any other path is
    skipped, so the two columns always have the same length.

    :param pathset:  A list of paths
    :return  A dataframe with two columns: Path and Songtext
    """
    # Escaped dot and end anchor: match the literal ".txt" extension only.
    pattern = re.compile(r'song[0-9]*\.txt$')
    kept_paths = []
    songtext = []
    for path in pathset:
        if not pattern.search(path):
            continue
        # The context manager closes the file handle even if reading fails.
        with open(path, "r") as handle:
            songtext.append(handle.read())
        kept_paths.append(path)
    return pd.DataFrame({'Path': kept_paths, 'Songtext': songtext},
                        columns=['Path', 'Songtext'])

In [None]:
# Collect the song files via get_path and load their texts into the `data` dataframe.
paths = get_path('song[0-9]*.txt')
data = load_data(paths)
data.head()

### Tokenize

In [208]:
import nltk
from nltk.tokenize import word_tokenize

def nltk_tokenizer(text):
    """
    Tokenizes a text with NLTK's word_tokenize

    :param text: The text to tokenize
    :return  The list of tokens
    """
    # The former debug print of the full token list was dropped: it flooded the
    # cell output for every text without affecting the returned value.
    return word_tokenize(text)

In [209]:

def tokenize_songs(songs):
    """
    Strips punctuation from each song text, lower-cases it and tokenizes it

    :param songs: An iterable of song texts (strings)
    :return  A list with one list of tokens per song, in input order
    """
    # The translation table only depends on string.punctuation, so build it once
    # instead of once per song.
    translator = str.maketrans('', '', string.punctuation)

    tokenized_songs = []
    for song in songs:
        cleaned = song.translate(translator).lower()
        tokenized_songs.append(word_tokenize(cleaned))

    return tokenized_songs


In [None]:
# Store the token list of every song text in a new 'Toks' column.
data['Toks'] = tokenize_songs(data['Songtext'])
data.head()

#### Calculating the average sentence length

# A "sentence" here is one line of a song text (the text is split on "\n").
lengths = []   # word count of every line, across all songs
words = []     # total word count of each song

for song_text in data['Songtext']:
    line_lengths = [len(line.split()) for line in song_text.split("\n")]
    lengths.extend(line_lengths)
    words.append(sum(line_lengths))

print(np.mean(words))      ##words per song
print(np.mean(lengths))    #words per sentence
print(np.sum(lengths))     #total words


In [211]:
def get_vocabulary(unigram_frequ, threshold=24):
    """
    Keeps only the words that occur more often than a frequency threshold

    :param unigram_frequ: A dict mapping each word to its frequency
    :param threshold: A word is kept only if its frequency is strictly greater
                      than this value (default 24, the previously hard-coded cut-off)
    :return  A dict with the retained words and their frequencies
    """
    return {word: freq for word, freq in unigram_frequ.items() if freq > threshold}

def get_frequencies(ngrams):
    """
    Counts how often every item occurs

    :param ngrams: An iterable of hashable items (e.g. words or n-grams)
    :return  A plain dict mapping each distinct item to its number of occurrences
    """
    return dict(Counter(ngrams))



Making ngrams

In [212]:
# Flatten the token lists of all songs into a single token stream.
# NOTE: because of the flattening, the 6-token windows below also span song boundaries.
unigrams = list(chain.from_iterable(data['Toks']))

# Keep the windows under their own name: assigning them to `ngrams` would rebind
# the nltk function imported at the top, and running this cell a second time
# would then fail with "TypeError: 'list' object is not callable".
ngram_tuples = list(ngrams(unigrams, 6))

# Join every window back into one space-separated string.
ngram_joined = [" ".join(gram) for gram in ngram_tuples]

    

Looking for the most frequent words in our vocabulary

In [None]:
unigram_frequencies = get_frequencies(unigrams)
vocabulary = get_vocabulary(unigram_frequencies)
# Only the last expression of a cell is displayed, so the former bare
# `vocabulary` line had no effect; show the words from most to least frequent.
sorted(vocabulary.items(), key=lambda item: item[1], reverse=True)


In [None]:
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras.models import Sequential

from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.utils import to_categorical
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense
from tensorflow.keras.layers import LSTM
from tensorflow.keras.layers import Embedding
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential

# Fit the Keras tokenizer on the joined n-grams and encode each of them as a
# list of integer word ids.
tokenizer = Tokenizer()
tokenizer.fit_on_texts(ngram_joined)
sequences = tokenizer.texts_to_sequences(ngram_joined)

# Vocabulary size is the number of known words plus one
# (presumably because id 0 is reserved by the tokenizer — confirm).
voca_size = 1 + len(tokenizer.word_index)

In [None]:
# Split every encoded sequence into model input (all ids but the last)
# and target (the last id, one-hot encoded over the vocabulary).
sequences = np.array(sequences)
X = sequences[:, :-1]
y = to_categorical(sequences[:, -1], num_classes=voca_size)
seq_length = X.shape[1]

In [None]:
# Embedding -> two stacked LSTMs -> dense hidden layer -> softmax over the vocabulary.
model = Sequential()
model.add(Embedding(voca_size, 150, input_length=seq_length))
# return_sequences=True hands the full sequence to the second LSTM layer.
model.add(LSTM(150, return_sequences=True))
model.add(LSTM(150))
model.add(Dense(150, activation='relu'))
model.add(Dense(voca_size, activation='softmax'))
# summary() prints the table itself and returns None, so wrapping it in print()
# only added a stray "None" line to the output.
model.summary()

In [None]:
# Configure training: categorical cross-entropy loss, Adam optimizer, accuracy metric.
model.compile(
    loss='categorical_crossentropy',
    optimizer='adam',
    metrics=['accuracy'],
)
# Reset the model's states before fitting, then train on the (X, y) pairs.
model.reset_states()
model.fit(X, y, batch_size=128, epochs=50)

This is the first word sequence of the song. 

In [226]:
# generate a sequence from a language model
def generate_seq(model, tokenizer, seq_length, seed_text, n_words):
    """
    Generates text by repeatedly predicting the next word and feeding it back in

    :param model: The trained model; must provide predict()
    :param tokenizer: The fitted tokenizer used to encode the training text
    :param seq_length: Number of word ids the model expects as input
    :param seed_text: The text the generation starts from
    :param n_words: How many words to generate
    :return  The seed text followed by the generated words, joined by spaces
    """
    # Build the id -> word lookup once instead of scanning word_index for every
    # generated word.
    index_to_word = {index: word for word, index in tokenizer.word_index.items()}

    result = [seed_text]
    in_text = seed_text
    for _ in range(n_words):
        # encode the text as integer
        encoded = tokenizer.texts_to_sequences([in_text])[0]
        # pad or truncate (from the front) to the fixed input length
        encoded = pad_sequences([encoded], maxlen=seq_length, truncating='pre')
        # Sequential.predict_classes() was removed in TensorFlow 2.6; taking the
        # argmax over the predicted probabilities gives the same class id.
        probabilities = model.predict(encoded, verbose=0)
        predicted_index = int(np.argmax(probabilities, axis=-1)[0])
        # ids without a word (e.g. 0) map to the empty string, as before
        out_word = index_to_word.get(predicted_index, '')
        # append to input
        in_text += ' ' + out_word
        result.append(out_word)
    return ' '.join(result)

In [None]:
# Generate 50 new words, starting from a one-word seed text.
seed_text = "Christmas"
generated = generate_seq(model, tokenizer, seq_length, seed_text, n_words=50)
print(generated)
# Write the generated text to a file as a single line.
np.savetxt("Christmas_song.txt", [generated], fmt="%s")