# Taylor Swift Lyric Generation

In [76]:
import numpy as np
import tensorflow as tf
import pandas as pd
from keras.preprocessing.text import Tokenizer
import re

## Importing the data

In [77]:
lyrics_dataframe = pd.read_csv("taylor_swift_lyrics.csv")
lyrics_dataframe

Unnamed: 0,Artist,Album,Song,Track_Number,Lyrics,Line_Number,Year
0,Taylor Swift,Taylor Swift,Tim McGraw,1,He said the way my blue eyes shined,1,2006
1,Taylor Swift,Taylor Swift,Tim McGraw,1,Put those Georgia stars to shame that night,2,2006
2,Taylor Swift,Taylor Swift,Tim McGraw,1,"I said, ""That's a lie""",3,2006
3,Taylor Swift,Taylor Swift,Tim McGraw,1,Just a boy in a Chevy truck,4,2006
4,Taylor Swift,Taylor Swift,Tim McGraw,1,That had a tendency of gettin' stuck,5,2006
...,...,...,...,...,...,...,...
4857,Taylor Swift,reputation,New Year's Day,15,"(Hold on to the memories, they will hold on to...",43,2017
4858,Taylor Swift,reputation,New Year's Day,15,Please don't ever become a stranger,44,2017
4859,Taylor Swift,reputation,New Year's Day,15,"(Hold on to the memories, they will hold on to...",45,2017
4860,Taylor Swift,reputation,New Year's Day,15,Whose laugh I could recognize anywhere,46,2017


## Initial Data Exploration and Preprocessing
Lets poke around at the data and learn about. One of the first things we should probably figure out is how many albums and songs we are looking at

In [78]:
# Counting the number of albums
num_albums = lyrics_dataframe["Album"].drop_duplicates().size
# Counting the number of songs
num_songs = lyrics_dataframe["Song"].drop_duplicates().size
print("Number of albums : " + str(num_albums))
print("Number of songs  : " + str(num_songs))

Number of albums : 6
Number of songs  : 94


Before we go father, it might be useful to preprocess the lyrics with a function

In [90]:
# let's first build a preprocessing function which we can make adjustments to later
def preprocessLyric(line):
    # Set the line to lowercase
    line = line.lower()
    # Get rid of the parenthesis
    parenthesisopen = re.findall('\(',line)
    parenthesisclose = re.findall('\)',line)
    for i in parenthesisopen:
        line = line.replace(i, "")
    for i in parenthesisclose:
        line = line.replace(i,"")
    # Get rid of quotation marks and apostrophes
    quotes = re.findall('\"', line)
    apostrophes = re.findall("\'", line)
    for i in quotes:
        line = line.replace(i,"")
    for i in apostrophes:
        line = line.replace(i,"")
    # Make sure there is a space between words and punctuation so they dont get combined
    punc = re.findall("!,?.,", line)
    for i in punc:
        line = line.replace(i," " + i[0] + " ")
    # Just for sanity check let's lastly get rid of the double spaces
    line = re.sub("\s\s", " ", line)
    return line

In [91]:
# Now let's apply the function to the lyrics
lyrics_dataframe["Lyrics"] = lyrics_dataframe["Lyrics"].apply(preprocessLyric)
# Let's see our changes
lyrics_dataframe

Unnamed: 0,Artist,Album,Song,Track_Number,Lyrics,Line_Number,Year
0,Taylor Swift,Taylor Swift,Tim McGraw,1,he said the way my blue eyes shined,1,2006
1,Taylor Swift,Taylor Swift,Tim McGraw,1,put those georgia stars to shame that night,2,2006
2,Taylor Swift,Taylor Swift,Tim McGraw,1,"i said, thats a lie",3,2006
3,Taylor Swift,Taylor Swift,Tim McGraw,1,just a boy in a chevy truck,4,2006
4,Taylor Swift,Taylor Swift,Tim McGraw,1,that had a tendency of gettin stuck,5,2006
...,...,...,...,...,...,...,...
4857,Taylor Swift,reputation,New Year's Day,15,"hold on to the memories, they will hold on to you",43,2017
4858,Taylor Swift,reputation,New Year's Day,15,please dont ever become a stranger,44,2017
4859,Taylor Swift,reputation,New Year's Day,15,"hold on to the memories, they will hold on to you",45,2017
4860,Taylor Swift,reputation,New Year's Day,15,whose laugh i could recognize anywhere,46,2017
