# CS510 NLP Final Project

Author - Truman Daniels

## Libraries

In [None]:
%%capture
import pandas as pd
import tensorflow as tf
import numpy as np
import re
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, SimpleRNN, Dense, Bidirectional
from tensorflow.keras.utils import to_categorical
import sys, os # Importing data

pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', None)

### Load Data

In [None]:
%%capture

# downloads the .csv files from google drive only if it's not already in directory
path = "/content/spotify_millsongdata.csv"
if os.path.isfile(path) == False:
  !gdown --id 1wGtLywxyCq858JTVtizWHR5dtIf4Di8v

df = pd.read_csv(path)

### Only grab Rihanna songs

In [None]:
# Keep only the rows whose artist is Rihanna and renumber them from 0 so that
# positional and label-based lookups agree further down.
df_rih = df.loc[df["artist"] == "Rihanna"].reset_index(drop=True)
# Optional: drop the Christmas cover by un-commenting the next line.
# df_rih = df_rih[df_rih["song"] != "A Child Is Born"]

# The number of songs
print(len(df_rih))
df_rih.head()

143


Unnamed: 0,artist,song,link,text
0,Rihanna,A Child Is Born,/r/rihanna/a+child+is+born_20897863.html,As I was walkin' down the road to Bethlehem on...
1,Rihanna,A Girl Like Me,/r/rihanna/a+girl+like+me_20409523.html,Some girls play the game \r\nThey all walk an...
2,Rihanna,Afterparty,/r/rihanna/afterparty_20901270.html,"Mc, Nicki, Riri \r\nAfter Party \r\n \r\nTu..."
3,Rihanna,American Oxygen,/r/rihanna/american+oxygen_21096458.html,"[Chorus] \r\nBreathe out, breathe in \r\nAme..."
4,Rihanna,California King Bed,/r/rihanna/california+king+bed_20910592.html,Chest to chest \r\nNose to nose \r\nPalm to ...


In [None]:
!pip install nltk
import nltk
from nltk.tokenize import word_tokenize
nltk.download('punkt')

# One list of NLTK word tokens per song.
tokens = df_rih['text'].map(word_tokenize)
# Corpus size = sum of the per-song token counts.
total_tokens = tokens.map(len).sum()
print(f"Total number of tokens: {total_tokens}")



[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


Total number of tokens: 59089


In [None]:
# Find the song with the most characters (string length of the lyrics, not
# the number of tokens). Ties keep the first song encountered.
longest_song_length = 0
longest_song_index = None

# Walk the 'text' column as (row label, lyrics) pairs.
for song_idx, lyrics in df_rih['text'].items():
    n_chars = len(lyrics)
    # Strictly greater, so an earlier song wins a tie.
    if n_chars > longest_song_length:
        longest_song_length, longest_song_index = n_chars, song_idx

# Report the winner.
print("Longest song length:", longest_song_length)
print("Index of the longest song:", longest_song_index)

Longest song length: 3765
Index of the longest song: 90


In [None]:
# Title of the longest song; uses the label computed by the scan above rather
# than a hard-coded row number, so it stays correct if the data changes.
df_rih['song'].loc[longest_song_index]

'Break It Off'

In [None]:
import plotly.express as px
def display_token_length_distribution(df):
    """Show a histogram of how many tokens each song contains.

    A fresh Keras ``Tokenizer`` (default settings) is fit on ``df['text']``
    and every text is converted to an integer sequence; the sequence lengths
    are plotted with Plotly using 60 bins.

    Args:
        df: DataFrame with a ``'text'`` column holding one lyric per row.

    Returns:
        None. The figure is rendered via ``fig.show()``.
    """
    length_tokenizer = Tokenizer()
    length_tokenizer.fit_on_texts(df['text'])

    # One length per song, in row order.
    lengths_per_song = list(map(len, length_tokenizer.texts_to_sequences(df['text'])))
    token_length_df = pd.DataFrame({'token_length': lengths_per_song})

    fig = px.histogram(
        token_length_df,
        x='token_length',
        nbins=60,
        title='Distribution of Token Lengths',
    )
    # Cosmetic tweaks: small gap between bars and explicit axis titles.
    fig.update_layout(bargap=0.1)
    fig.update_xaxes(title_text='Length of Song (Number of Tokens)')
    fig.update_yaxes(title_text='Frequency')

    # Show the plot
    fig.show()

# Plot the token-length distribution for the Rihanna songs in 'df_rih'
display_token_length_distribution(df_rih)

In [None]:
# Lyrics of the longest song; looked up by the computed label instead of a
# hard-coded row number.
df_rih['text'].loc[longest_song_index]

"Breaking it off and setting it off in the real way  \r\nMaking the girls, them chill their mind (feel fine)  \r\nMaking them have a good time  \r\nYeah, man, S Peezy, yo, long side Ri, Ri  \r\nCome down now, Rihanna  \r\nTake it to them, take it to them, girl  \r\n  \r\nBreak it off, boy, this has got me feeling naughty  \r\nI wanna know, boy, if I can be your shorty (most definitely)  \r\nSet it off, boy, make me hot all over my body  \r\n(Break it off, take it off, no miss take it off)  \r\nBreak it off tonight (Yo)  \r\nBreak it off, boy, this has got me feeling naughty  \r\nI wanna know, boy, if I can be your shorty (yo, yo)  \r\nSet it off, boy, make me hot all over my body (yeah, yeah)  \r\nBreak it off tonight (yo, yo)  \r\n  \r\nStruggle and pain wha she fell all de while  \r\nAnd she want a good man to give her de style  \r\nWild child  \r\nSo she dere pon me file  \r\nIt's a long time now me wha plow  \r\nPlow de sile meh car run, run like de nile  \r\nAnd de most energy meh

### Data Preprocessing

In [None]:
# Take all of Rihanna's songs, concatenate their text together.
text_data = df_rih['text'].str.cat(sep=' ')

# Split into lyric lines and re-attach the newline as its own space-separated
# token so the model can learn where a line ends.
lyric_lines = [line + ' \n' for line in text_data.split('\n')]

# OOV means "out of vocab"
oov_tok = '<OOV>'
# Filter out punctuation, tabs and carriage returns (text is lower-cased by
# default); '\n' is deliberately NOT filtered so it survives as a token.
tokenizer = Tokenizer(filters='!"#$%&()*+,-./:;<=>?@[\\]^_`{|}~\t\r', oov_token=oov_tok)
# Fit on the same per-line strings that are encoded below. The tokenizer only
# splits on spaces, so fitting on the raw concatenated text glues each '\n'
# to the following word ("\nthey"), adding vocabulary entries that never
# occur in the training sequences.
tokenizer.fit_on_texts(lyric_lines)
total_words = len(tokenizer.word_index) + 1  # +1 because index 0 is reserved for padding

# For every line emit all growing prefixes of length >= 2:
# [w0, w1], [w0, w1, w2], ... The last id of each prefix is the prediction target.
input_sequences = []
for token_list in tokenizer.texts_to_sequences(lyric_lines):
    for i in range(1, len(token_list)):
        input_sequences.append(token_list[:i+1])

# Left-pad so every sequence has the same length and the target stays in the last column.
max_sequence_length = max(len(seq) for seq in input_sequences)
input_sequences = pad_sequences(input_sequences, maxlen=max_sequence_length, padding='pre')

# Split into X and y where y is the next word in a sequence and X is all the words leading up until that word
X, y = input_sequences[:, :-1], input_sequences[:, -1]
y = to_categorical(y, num_classes=total_words)

In [None]:
input_sequences[:, -1]

array([280, 157,   4, ...,  36,   2,   2], dtype=int32)

In [None]:
word_index = tokenizer.word_index
word_index["\n"]

2

### Train Model

In [None]:
# Model Architecture: embedding -> bidirectional simple RNN -> softmax over the vocabulary.
model = Sequential([
    Embedding(total_words, 256, input_length=max_sequence_length-1),
    Bidirectional(SimpleRNN(256)),
    Dense(total_words, activation='softmax'),
])

# Training: one-hot targets, hence categorical cross-entropy.
model.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['accuracy'])
# shuffle=False keeps the samples in corpus order; the last 5% are held out for validation.
model.fit(X, y, batch_size=64, epochs=10, validation_split=0.05, shuffle=False, verbose=2)

Epoch 1/10
750/750 - 99s - loss: 5.7518 - accuracy: 0.1458 - val_loss: 6.1079 - val_accuracy: 0.1391 - 99s/epoch - 132ms/step
Epoch 2/10
750/750 - 91s - loss: 4.9887 - accuracy: 0.1820 - val_loss: 6.0014 - val_accuracy: 0.1014 - 91s/epoch - 121ms/step
Epoch 3/10
750/750 - 92s - loss: 4.5472 - accuracy: 0.2137 - val_loss: 6.0281 - val_accuracy: 0.1446 - 92s/epoch - 123ms/step
Epoch 4/10
750/750 - 91s - loss: 4.1938 - accuracy: 0.2406 - val_loss: 6.0508 - val_accuracy: 0.1541 - 91s/epoch - 122ms/step
Epoch 5/10
750/750 - 94s - loss: 3.9593 - accuracy: 0.2612 - val_loss: 6.3237 - val_accuracy: 0.1589 - 94s/epoch - 125ms/step
Epoch 6/10
750/750 - 91s - loss: 3.5434 - accuracy: 0.3083 - val_loss: 6.2957 - val_accuracy: 0.1628 - 91s/epoch - 121ms/step
Epoch 7/10
750/750 - 91s - loss: 3.2389 - accuracy: 0.3490 - val_loss: 6.3732 - val_accuracy: 0.1660 - 91s/epoch - 121ms/step
Epoch 8/10
750/750 - 93s - loss: 3.0025 - accuracy: 0.3812 - val_loss: 6.4136 - val_accuracy: 0.1815 - 93s/epoch - 124

<keras.src.callbacks.History at 0x7c1276be9ba0>

### Generate New Text

In [None]:
def smart_capitalize(sentence):
    r"""Capitalise the first word of every sentence/line and the pronoun "I".

    The text is split on '.', '!' and '\n'. After each of these delimiters
    (and at the very start) the first non-whitespace character of the next
    segment is upper-cased. The rest of the segment is left untouched, so
    existing capitals such as "YMCA" or "Take On Me" are preserved. Finally
    every standalone lower-case "i" (including "i'm", "i've", "i'll", "i'd")
    becomes "I".

    Originally drafted with ChatGPT.

    Args:
        sentence: Text to capitalise; may contain several lines.

    Returns:
        The capitalised text. Delimiters and whitespace are unchanged.

    Example:
        >>> smart_capitalize("hello world! i'm an example.\ni've seen it all.")
        "Hello world! I'm an example.\nI've seen it all."
    """
    # The capturing group keeps the delimiters so the pieces re-join losslessly.
    segments = re.split(r'([.!\n])', sentence)
    capitalize_next = True

    for i, segment in enumerate(segments):
        if segment in ('.', '!', '\n'):
            capitalize_next = True
        elif capitalize_next and segment.strip():
            # Skip leading whitespace so " heat" becomes " Heat"; only the first
            # character is changed (str.capitalize would lowercase the rest).
            body = segment.lstrip()
            indent = segment[:len(segment) - len(body)]
            segments[i] = indent + body[0].upper() + body[1:]
            capitalize_next = False

    output = ''.join(segments)
    # Word-boundary match covers "i", "i'm", "i've", "i'll", "i'd" anywhere in the text.
    return re.sub(r"\bi\b", "I", output)

def generate_text(seed_text, num_words, model, max_sequence_length):
    """Extend ``seed_text`` by sampling ``num_words`` tokens from ``model``.

    Each step re-encodes the text generated so far with the module-level
    ``tokenizer``, left-pads/truncates it to ``max_sequence_length - 1`` ids,
    and draws the next token id at random according to the model's predicted
    distribution (sampling, not argmax).

    Args:
        seed_text: Starting text.
        num_words: Number of tokens to append.
        model: Model whose ``predict`` returns a probability vector over the vocabulary.
        max_sequence_length: Context length plus one; only the last
            ``max_sequence_length - 1`` tokens are fed to the model.

    Returns:
        ``seed_text`` followed by the sampled words, each preceded by a space.
    """
    context_len = max_sequence_length - 1
    generated = seed_text
    for _step in range(num_words):
        encoded = tokenizer.texts_to_sequences([generated])[0]
        model_input = pad_sequences([encoded], maxlen=context_len, padding='pre')
        probs = model.predict(model_input, verbose=0)[0]

        # Sample the word based on predicted probabilities
        next_index = np.random.choice(len(probs), p=probs)
        # Ids with no vocabulary entry (e.g. the padding id 0) map to an empty string.
        next_word = tokenizer.index_word.get(next_index, '')
        generated = generated + " " + next_word
    return generated

In [None]:
repr(smart_capitalize(generate_text("Take On Me", 100, model, 2)))

'"Take on me said i\'d live without you gon burn and sign me felt gonna please \\nFooled carriage \\n heat amo someone break he beats story proud to fight a story huh \\n get \\nVery sugar show me believe me knows better check stock \\nRed I should\'ve kick them traffic stressy \\n save invited \\n evil diamonds from wildin\' \\n someone take \'em light you stick from between me mean it kills heaven \\n piece icicle \\n gon live yesterday \\n stockings \\n beyond line whatever we\'d ever wanted to stick from compelling me standing on me that deserved \\n where heaven \\n"'

In [None]:
repr(smart_capitalize(generate_text("YMCA", 100, model, 2)))

'"Ymca inspiration \\n maintain \\n ay xiii come alive in central station you wa excuse me was using him push nurse top \\nBette \\nGyal stole least long power about it takes a fortress you\'ve cap righteous covers the buried adore you enticed \\n disguise \\n wlong \\n torn cities \\n taste and end plain stars like penelope \\n shockin \\n \\nBad dj say it feels like engine\'s settling \\n voice full blow issues your pride all wrapped different \\n ray round chicks \\n hush \\n turn me hole entertainment ends mo\' mr dj dream like obama am yea \\n that\'s what"'

### Example Outputs

Take on me said i'd live without you gon burn and sign me felt gonna please

Fooled carriage

heat amo someone break he beats story proud to fight a story huh \n get \nVery sugar show me believe me knows better check stock \nRed I should've kick them traffic stressy \n save invited \n evil diamonds from wildin' \n someone take 'em light you stick from between me mean it kills heaven \n piece icicle \n gon live yesterday \n stockings \n beyond line whatever we'd ever wanted to stick from compelling me standing on me that deserved \n where heaven \n"

"Take on me said i'd live without you gon burn and sign me felt gonna please \nFooled carriage \n heat amo someone break he beats story proud to fight a story huh \n get \nVery sugar show me believe me knows better check stock \nRed I should've kick them traffic stressy \n save invited \n evil diamonds from wildin' \n someone take 'em light you stick from between me mean it kills heaven \n piece icicle \n gon live yesterday \n stockings \n beyond line whatever we'd ever wanted to stick from compelling me standing on me that deserved \n where heaven \n"

### Sources:
*   ChatGPT
*   https://towardsdatascience.com/word-and-character-based-lstms-12eb65f779c2

In [None]:
%%capture
!sudo apt-get update
!sudo apt-get install texlive-xetex texlive-fonts-recommended

In [None]:
!jupyter nbconvert --to pdf "/content/drive/MyDrive/NLPGroupProject/RNN_baseline_final.ipynb"

This application is used to convert notebook files (*.ipynb)
        to various other formats.


Options
The options below are convenience aliases to configurable class-options,
as listed in the "Equivalent to" description-line of the aliases.
To see all configurable class-options for some <cmd>, use:
    <cmd> --help-all

--debug
    set log level to logging.DEBUG (maximize logging output)
    Equivalent to: [--Application.log_level=10]
--show-config
    Show the application's configuration (human-readable format)
    Equivalent to: [--Application.show_config=True]
--show-config-json
    Show the application's configuration (json format)
    Equivalent to: [--Application.show_config_json=True]
--generate-config
    generate default config file
    Equivalent to: [--JupyterApp.generate_config=True]
-y
    Answer yes to any questions instead of prompting.
    Equivalent to: [--JupyterApp.answer_yes=True]
--execute
    Execute the notebook prior to export.
    Equivalent to: [--ExecutePr