Based on https://machinelearningmastery.com/how-to-develop-a-word-level-neural-language-model-in-keras/

In [103]:
import os
import re
import pandas as pd
import numpy as np
import glob
import string
from collections import Counter

In [104]:
from nltk import ngrams
from itertools import chain



Loding Data

In [105]:
def get_path(filename):
    """
    Makes a list of all the paths that fit the search requirement
    
    :param filename: A regular expression that defines the search requirement for the filenames
    :return  Returns a list of all the pathnames
    """
    # place the movies folder in the same directory as this notebook
    current_directory = os.getcwd()

    # glob.glob() is a pattern-matching path finder, it searches for the reviews in the movies folder based on a Regular Expression
    paths = glob.glob(current_directory + '/songtexts/' + filename)
    
    if len(paths) == 0:
        print('Your file list is empty. The code looks for the folder '+current_directory+'/songtexts, but could not find it.')
    else: 
        print("You loaded: ", len(paths), "files")
    return paths

In [106]:
def load_data(pathset):
    """
    Loads the data into a dataframe
    
    :param pathset:  A list of paths
    :return  A dataframe with three columns: Path, Review (Text) and Label
    """
    # Files are named by sentiment (P for positive, N for negative)
    pattern = re.compile('song[0-9]*.txt')
    songtext = []
    df = pd.DataFrame(columns = ['Path', 'Songtext'])
    for path in pathset:
        if re.search(pattern, path):
            text = open(path, "r").read()
            songtext.append(text)
        else:
            text = open(path, "r").read()
            reviews.append(text)
    df['Path'] = pathset
    df['Songtext'] = songtext
    return df

In [107]:
paths = get_path('song[0-9]*.txt')
data = load_data(paths)
data.head()

You loaded:  100 files


Unnamed: 0,Path,Songtext
0,c:\Users\anouk\Documents\Git-repositories\chri...,Chestnuts roasting on an open fire\nJack Frost...
1,c:\Users\anouk\Documents\Git-repositories\chri...,It was Christmas Eve babe\nIn the drunk tank\n...
2,c:\Users\anouk\Documents\Git-repositories\chri...,\nDeck the halls with boughs of holly\nFa-la-l...
3,c:\Users\anouk\Documents\Git-repositories\chri...,Christmas time is here\nHappiness and cheer\nF...
4,c:\Users\anouk\Documents\Git-repositories\chri...,Santa tell me if you're really there\nDon't ma...


### Tokenize

In [108]:
import nltk
from nltk.tokenize import word_tokenize

def nltk_tokenizer(text):

    tokenized_text = word_tokenize(text)
    print(tokenized_text)
    
    
    
    return tokenized_text

In [109]:
from nltk.tokenize import sent_tokenize

def tokenize_songs(songs):

    tokenized_songs = []

    for x in songs:
      
        
        translator = x.maketrans('', '', string.punctuation) 
        data = x.translate(translator)                       #remove the punctuation 
        data = data.lower()                                  #making the data all lower case
        print(data);
        tokenized = word_tokenize(x)
      
        tokenized_songs.append(tokenized)    
    
    return tokenized_songs


In [110]:
data['Toks'] = tokenize_songs(data['Songtext'])
data.head()

####Calculating the average sentence length

lengths = []
words = []

for x in data['Songtext']:
    sentence = x.split("\n")
   
    count_words = 0
    
    for y in sentence:
        
        length = len(y.split())
        lengths.append(length)
        #print(length)
        count_words += length
        
    words.append(count_words)
    
 
print(np.mean(words))      ##words per song
print(np.mean(lengths))    #words per sentence
print(np.sum(lengths))     #total words


chestnuts roasting on an open fire
jack frost nipping at your nose
yuletide carols being sung by a choir
and folks dressed up like eskimos

everybody knows a turkey and some mistletoe
help to make the season bright
tiny tots with their eyes all aglow
will find it hard to sleep tonight
they know that santas on his way
hes loaded lots of toys and goodies on his sleigh
and every mothers child is gonna spy
to see if reindeer really know how to fly

and so im offering this simple phrase
to kids from one to ninetytwo
although its been said many times many ways
merry christmas to you

and so im offering this simple phrase
to kids from one to ninetytwo
although its been said many times many ways
merry christmas to you



LookupError: 
**********************************************************************
  Resource [93mpunkt_tab[0m not found.
  Please use the NLTK Downloader to obtain the resource:

  [31m>>> import nltk
  >>> nltk.download('punkt_tab')
  [0m
  For more information see: https://www.nltk.org/data.html

  Attempted to load [93mtokenizers/punkt_tab/english/[0m

  Searched in:
    - 'C:\\Users\\anouk/nltk_data'
    - 'c:\\Python311\\nltk_data'
    - 'c:\\Python311\\share\\nltk_data'
    - 'c:\\Python311\\lib\\nltk_data'
    - 'C:\\Users\\anouk\\AppData\\Roaming\\nltk_data'
    - 'C:\\nltk_data'
    - 'D:\\nltk_data'
    - 'E:\\nltk_data'
**********************************************************************


In [12]:
def get_vocabulary(unigram_frequ):
    

    
    vocabulary = {words:freq for (words,freq) in unigram_frequ.items() if freq > 24}
  #  vocabulary = sorted(vocabulary.items(), key=lambda x: x[1], reverse = True)
    

        
    return vocabulary

def get_frequencies(ngrams):

    ngram_frequencies = Counter(ngrams)
    ngram_frequencies = dict(ngram_frequencies) 
    


    return ngram_frequencies



Making ngrams

In [13]:
unigrams = list(chain.from_iterable(data['Toks']))
ngrams = list(ngrams(unigrams, 6))

ngram_joined  = []

for x in ngrams:
    y = " ".join(x)

    ngram_joined.append(y)

    

Looking for the most frequent words in our vocabulary

In [47]:
unigram_frequencies = get_frequencies(unigrams)
vocabulary = get_vocabulary(unigram_frequencies)
vocabulary
sorted(vocabulary.items(), key=lambda x: x[1], reverse = True)


[('the', 1166),
 ('and', 621),
 ('a', 544),
 ('christmas', 530),
 ('to', 491),
 ('you', 486),
 ('i', 368),
 ('on', 328),
 ('in', 308),
 ('of', 296),
 ('is', 253),
 ('all', 232),
 ('me', 210),
 ('oh', 197),
 ('for', 193),
 ('be', 193),
 ('this', 192),
 ('it', 187),
 ('with', 181),
 ('my', 171),
 ('santa', 167),
 ('its', 163),
 ('that', 156),
 ('merry', 146),
 ('so', 139),
 ('we', 131),
 ('day', 123),
 ('love', 120),
 ('but', 119),
 ('come', 114),
 ('time', 110),
 ('your', 109),
 ('he', 108),
 ('year', 103),
 ('claus', 90),
 ('here', 85),
 ('know', 82),
 ('have', 81),
 ('if', 79),
 ('when', 79),
 ('bells', 78),
 ('are', 78),
 ('na', 77),
 ('let', 77),
 ('just', 77),
 ('will', 75),
 ('what', 74),
 ('night', 73),
 ('up', 70),
 ('yeah', 70),
 ('his', 69),
 ('im', 69),
 ('no', 69),
 ('out', 69),
 ('snow', 69),
 ('good', 68),
 ('dont', 67),
 ('down', 67),
 ('like', 66),
 ('one', 66),
 ('as', 66),
 ('see', 64),
 ('was', 64),
 ('him', 63),
 ('go', 63),
 ('run', 63),
 ('pum', 63),
 ('wish', 62),

In [188]:
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras.models import Sequential

from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.utils import to_categorical
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense
from tensorflow.keras.layers import LSTM
from tensorflow.keras.layers import Embedding
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential

tokenizer = Tokenizer()
tokenizer.fit_on_texts(ngram_joined)
sequences = tokenizer.texts_to_sequences(ngram_joined)
                                         
voca_size = len(tokenizer.word_index) + 1

In [189]:
# separate into input and output
sequences = np.array(sequences)
X, y = sequences[:,:-1], sequences[:,-1]
y = to_categorical(y, num_classes=voca_size)
seq_length = X.shape[1]

In [223]:
model = Sequential()
model.add(Embedding(voca_size, 150, input_length=seq_length))
model.add(LSTM(150, return_sequences=True))
model.add(LSTM(150))
model.add(Dense(150, activation='relu'))
model.add(Dense(voca_size, activation='softmax'))
print(model.summary())

Model: "sequential_17"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_12 (Embedding)     (None, 5, 150)            374400    
_________________________________________________________________
lstm_24 (LSTM)               (None, 5, 150)            180600    
_________________________________________________________________
lstm_25 (LSTM)               (None, 150)               180600    
_________________________________________________________________
dense_24 (Dense)             (None, 150)               22650     
_________________________________________________________________
dense_25 (Dense)             (None, 2496)              376896    
Total params: 1,135,146
Trainable params: 1,135,146
Non-trainable params: 0
_________________________________________________________________
None


In [225]:
# compile model
model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])
# fit model
model.reset_states()
model.fit(X, y, batch_size=128, epochs=50)

Epoch 1/50
Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50
Epoch 7/50
Epoch 8/50
Epoch 9/50
Epoch 10/50
Epoch 11/50
Epoch 12/50
Epoch 13/50
Epoch 14/50
Epoch 15/50
Epoch 16/50
Epoch 17/50
Epoch 18/50
Epoch 19/50
Epoch 20/50
Epoch 21/50
Epoch 22/50
Epoch 23/50
Epoch 24/50
Epoch 25/50
Epoch 26/50
Epoch 27/50
Epoch 28/50
Epoch 29/50
Epoch 30/50
Epoch 31/50
Epoch 32/50
Epoch 33/50
Epoch 34/50
Epoch 35/50
Epoch 36/50
Epoch 37/50
Epoch 38/50
Epoch 39/50
Epoch 40/50
Epoch 41/50
Epoch 42/50
Epoch 43/50
Epoch 44/50
Epoch 45/50
Epoch 46/50
Epoch 47/50
Epoch 48/50
Epoch 49/50
Epoch 50/50


<tensorflow.python.keras.callbacks.History at 0x2323dd05f40>

This is the first word sequence of the song. 

In [226]:
# generate a sequence from a language model
def generate_seq(model, tokenizer, seq_length, seed_text, n_words):
    result = list()
    in_text = seed_text
    result.append(in_text)
    for _ in range(n_words):
        # encode the text as integer
        encoded = tokenizer.texts_to_sequences([in_text])[0]
        # truncate sequences to a fixed length
        encoded = pad_sequences([encoded], maxlen=seq_length, truncating='pre')
        # predict probabilities for each word
        yhat = model.predict_classes(encoded, verbose=0)
        # map predicted word index to word
        out_word = ''
        
        for word, index in tokenizer.word_index.items():
            if index == yhat:
                out_word = word
                break
            
        # append to input
        in_text += ' ' + out_word
        result.append(out_word)
    return ' '.join(result)

In [229]:
# generate new text
seed_text = "Christmas"   
generated = generate_seq(model, tokenizer, seq_length, seed_text, 50)
print(generated)
np.savetxt("Christmas_song.txt", [generated], fmt="%s")

Christmas grandpa forever oh night the bells in we shine peace war is all now i have about the poor little baby cant he want for christmas the things your eyes made slowly horse whisper and what singing the christmas now the very and just really good to forgot the little
