In [28]:
#Import all necessary packages
import numpy as np
import pandas as pd 
import re
from sklearn.feature_extraction import _stop_words as stop_words 
from nltk.stem.porter import PorterStemmer
stemmer = PorterStemmer()
import nltk
nltk.download('wordnet')
from nltk.stem import WordNetLemmatizer
lem = WordNetLemmatizer()
from sklearn.metrics.pairwise import cosine_similarity as cosine
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer

[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\ROG\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [29]:
pip install nltk

Note: you may need to restart the kernel to use updated packages.


# Week 4 Lecture
## Word Importance and Topic Modelling

### TF/IDF

Up until this point, we've seen how counting words, and looking at the most frequent can gives us some insight into a single document. If we want to start comparing documents with more certainty, or getting smarter about our representations, we can try and get a set of numbers for each documents that not only represents **word frequency**, but also **word importance**. 

What we will end up with is a measurement called **TF/IDF** or **T**erm **F**requency x **I**nverse **D**ocument **F**requency. 

### TF

**TF** stands for **term frequency** and we've been using it a lot already in our Bags of Words. By itself, it tells us how many times a particular term appears in a document. Can we do better?


In [30]:
import re
fs = open('../data/hacking.txt', 'r') 
# Remember that "../" gets us up 1 directory, looking for data in the parent directory of this file
book = fs.read()

In [31]:
#A reasonably good tokeniser for this task
def my_tokeniser(doc):
    #Split on spaces
    tokens = re.split(r'[-\s.,;!?]+', doc)
    return [lem.lemmatize(t.lower()) for t in tokens if not t in stop_words.ENGLISH_STOP_WORDS]

In [32]:
#Using the CountVectorizer to get a bag of words using our tokeniser above
count_vectoriser = CountVectorizer(tokenizer=my_tokeniser) #Create a new object capable of computing word counts from a set of documents, specifying that we want to apply my_tokeniser to the document first


#fit_transform first identifies the full vocabulary to be used (i.e. "fits" the documents), then computes the word (token) count for every document, for every word in the vocabulary (i.e., "transforms").
#Here we are calling this on an array of documents containing just one element, our book document:
bag_of_words = count_vectoriser.fit_transform([book]) 

#bag_of_words is now a matrix, with one row per document and one column per token in the vocabulary. 
#But it's stored as a special type of object (for efficiency purposes, since it could be big and sparse) and if we want to work with it like a regular matrix/2D array, we need to call .todense() first

#Let's print out the shape of our matrix: We should see that it's got 1 row (because we've got 1 document) and 11,770 terms:
print(bag_of_words.todense().shape)

(1, 11770)




In [33]:
# We can get a human-readable list of words/terms in our vocabulary from count_vectoriser like this:
vocab = count_vectoriser.get_feature_names_out()

#Now we're going to use a special data structure called a DataFrame, from the pandas library, which we've imported as pd above
#DataFrames allow us a really human-friendly representation of matrices, where we can give names to rows and columns, print them on screen, grab certain rows/columns/elements by name or index, and more!
# This code creates a DataFrame from our bag_of_words matrix, specifying that the column names come from the vocab (in the same order as they appear in our vocab)
bag_of_words_df = pd.DataFrame(bag_of_words.todense(), columns = vocab)
bag_of_words_df

Unnamed: 0,Unnamed: 1,"""","""'","""/bin/login""'","""abcdef""","""addict""","""another","""anthrax","""asdfgh""","""because",...,zionist',zip,zone,zone',zonked,zoo,zoom,zoomed,zx81,~daemon
0,1,15,6,1,1,1,1,1,1,1,...,1,1,1,1,2,2,2,3,1,1


In [34]:
#Show the 20 most common words (we've already removed STOP WORDS)

#How does this work?
# .iloc[0] grabs row 0 from the dataframe, which corresponds to our (only) book document
# .sort_values() sorts the values in this row from lowest to highest
# [-20:] will grab the last 20 elements (i.e., the 20 highest values) in this list
# And these get printed to the screen because it's the last line in this cell.

bag_of_words_df.iloc[0].sort_values()[-20:]

machine      249
people       277
phone        289
network      297
worm         298
hacking      301
day          303
mendax       307
police       323
anthrax      338
phoenix      361
just         361
like         367
didn't       378
electron     383
time         439
par          501
hacker       708
'            823
computer    1020
Name: 0, dtype: int64

### TF Seems OK?

Consider our book and some of its most common words

- computer 
- hacking
- security 
- police
- network

### Normalised Term Frequency

These words seem to represent key topics of the book quite well. However, what about **mother**? This appears 113  times across the book, out of a vocabulary of  approx. 13,000 words. Compare this to a WhatsApp conversation that me and my sister had about our family Christmas that has the word **mother** 5 times in a conversation of about 50 words. When we compare just **term frequency**, it seems like the hacking book is far, far more (~20 times) about mothers than this text message chain. But thats not really the case. 

We use **normalised term frequency** to account for this, where the length of the document is used alongside the count to adjust for this.

In [35]:
#In a DataFrame, we can grab a column using its name, which is handy!
# So this is the column corresponding to the term mother: bag_of_words_df["mother"]
# Remember our DataFrame is a matrix, so we're going to get an array/list back from this, and we want the first element in this list, i.e. corresponding to our first/only row for our document
# So then we can use [0] to specify we wantthe first element:
bag_of_words_df["mother"][0]  #This now gives us the total number of counts of "mother" in our book

113

In [36]:
#Divide term frequency by total number of words in the document
book_tf = bag_of_words_df["mother"][0] / bag_of_words.sum()
text_msg_tf = 5.0 / 50.
#Much bigger normalised term frequency for text msgs
print("Book tf is ", book_tf, ", text msg tf is ", text_msg_tf)

Book tf is  0.0014047388180303821 , text msg tf is  0.1


### IDF

**IDF** stands for **I**nverse **D**ocument **F**requency and it tells us how important a word is in a particular document in comparison to the rest of the corpus. Up until this point we've been considering the book as one big document, but now we're going to take each chapter on its own, to see if we can see if we can highlight differences between them.

We can see below that most chapters have the terms **computer** and **hacker** featuring pretty heavily. 

The **IDF** is the ratio of all documents in comparison to how many documents the term appears in. 

It tells us how surprising is it that this word appeared here, given what we know about all the documents. 






## Getting the Vector for each document (chapter)

First, we use a **regex** to split it into chapters, as there is a recognisable formatting to this. This means our corpus is the whole novel, with each chapter considered a new document and we store the whole thing as a 1D array. Each item in the array is a string containing a chapter's worth of text.

### Examining the highest TF values

Looking at the Term Frequency (bag of words) for each chapter shows that each chapter has quite similar high frequency words like ``computer`` and ``hacker``. This isn't particularly useful if we want a representation that highlights the important terms **to that chapter**.

In [37]:
chapters = re.split(r'\s\s\s\s\s\sChapter+', book)

#We can always use "?" to learn about a variable in python
#This will tell us that our regular expression returned us a list with 11 elements:
?chapters

[1;31mType:[0m        list
[1;31mString form:[0m ['The Project Gutenberg EBook of Underground, by Suelette Dreyfus\n\nThis eBook is for the use of <...> oduce our new eBooks, and how to\nsubscribe to our email newsletter to hear about new eBooks.\n']
[1;31mLength:[0m      11
[1;31mDocstring:[0m  
Built-in mutable sequence.

If no argument is given, the constructor creates a new empty list.
The argument must be an iterable if specified.

In [76]:
# We can also print out the list to see more, and we'll find that each element is the full string of text for the corresponding chapter:
chapters

['The Project Gutenberg EBook of Underground, by Suelette Dreyfus\n\nThis eBook is for the use of anyone anywhere at no cost and with\nalmost no restrictions whatsoever.  You may copy it, give it away or\nre-use it under the terms of the Project Gutenberg License included\nwith this eBook or online at www.gutenberg.org\n\n** This is a COPYRIGHTED Project Gutenberg eBook, Details Below **\n**     Please follow the copyright guidelines in this file.     **\n\nTitle: Underground\n\nAuthor: Suelette Dreyfus\n\nRelease Date: August 24, 2012 [EBook #4686]\n\nLanguage: English\n\n\n*** START OF THIS PROJECT GUTENBERG EBOOK UNDERGROUND ***\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n"I have donated my book "Underground" to Project Gutenberg\'s\ncollection in memory of my great aunt, Lucie Palmer. Lucie was an\nexplorer, a naturalist, a keen undersea diver and above all a gifted\npainter. In the last years of her life, she lost her vision due to\nmacular degeneration. She could no longer do her beloved unders

In [39]:
#Let's apply a CountVectorizer the same way as before, except now apply it to all 11 "documents" (chapters!)
count_vectoriser = CountVectorizer(tokenizer=my_tokeniser)
bag_of_words = count_vectoriser.fit_transform(chapters)

#bag_of_words now has 1 row per document, and 1 column per term in the vocab:
print(bag_of_words.todense().shape)

(11, 11770)




In [40]:
#We can put this in a dataframe again, which will still have 1 row per document and 1 column per term, 
# but it will be friendlier to work with as each column will have a name corresponding to the term
bag_of_words_df = pd.DataFrame(bag_of_words.todense(), columns = vocab)

bag_of_words_df

Unnamed: 0,Unnamed: 1,"""","""'","""/bin/login""'","""abcdef""","""addict""","""another","""anthrax","""asdfgh""","""because",...,zionist',zip,zone,zone',zonked,zoo,zoom,zoomed,zx81,~daemon
0,1,1,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,2,2,0,0,0,0,0,0,0,0,...,0,0,1,0,0,0,1,0,0,0
2,2,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0
3,2,0,0,0,0,0,0,0,0,0,...,0,0,0,0,2,1,0,0,0,0
4,2,0,0,0,1,0,0,0,1,0,...,1,0,0,0,0,0,1,1,0,1
5,2,9,5,0,0,0,1,0,0,1,...,0,0,0,0,0,0,0,1,0,0
6,2,0,1,0,0,1,0,0,0,0,...,0,0,0,0,0,1,0,0,0,0
7,2,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
8,2,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
9,2,0,0,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,1,0


In [41]:
# Let's print out the top 10 words in each chapter, by word count:
for i in range(len(bag_of_words_df)):
    print("\nChapter", i)  #The \n character prints a new line before the next chapter
    print(bag_of_words_df.iloc[i].sort_values(ascending = False).head(10)) #An alternative syntax to calling .sort_values()[-10:] - note that this version prints in descending order which can be nice


Chapter 0
book           38
underground    25
hacker         23
computer       21
mountain       19
black          18
par            18
like           13
suelette       13
new            12
Name: 0, dtype: int64

Chapter 1
computer    288
worm        261
nasa        114
account     102
hacker       99
span         95
network      86
people       82
time         74
like         70
Name: 1, dtype: int64

Chapter 2
par         180
network      65
hacker       64
force        61
computer     55
alto         45
like         41
theorem      40
time         39
didn't       38
Name: 2, dtype: int64

Chapter 3
par         261
theorem      48
'            45
didn't       37
agent        37
room         36
secret       36
computer     36
hacker       33
service      33
Name: 3, dtype: int64

Chapter 4
'           226
phoenix     196
electron    178
computer    140
hacker       88
machine      88
zardoz       74
deszip       61
just         56
file         56
Name: 4, dtype: int64

Chapter 5
elec

### Examining the highest TF/IDF values

Now we want to see which words are important to each chapter. 

In [42]:
#Using the TFIDF Vectorizer to get TFIDF vectors with custom tokeniser. 
#The way we apply this is exactly the same as CountVectorizer, except it now gives us a matrix of tfidf values (rows are dcouments, columns are terms) instead of a matrix of word counts
tfidf_vectoriser = TfidfVectorizer(tokenizer=my_tokeniser)
tfidf = tfidf_vectoriser.fit_transform(chapters)
print(tfidf.todense().shape)



(11, 11770)


In [43]:
tfidf_df = pd.DataFrame(tfidf.todense(), columns = vocab)
#print(tfidf_df)
for i in range(len(tfidf_df)):
    print("\nchapter", i)
    print(tfidf_df.iloc[i].sort_values(ascending = False).head(10))


chapter 0
book             0.259165
mountain         0.250175
suelette         0.194637
julian           0.175161
par              0.158727
underground      0.156855
dreyfus          0.149721
`underground'    0.149721
assange          0.149721
hacker           0.144307
Name: 0, dtype: float64

chapter 1
worm        0.455574
computer    0.357677
span        0.247602
nasa        0.198986
mcmahon     0.149069
account     0.126677
w**k        0.126167
bowen       0.125104
hacker      0.122951
network     0.106806
Name: 1, dtype: float64

chapter 2
par         0.630041
theorem     0.168667
force       0.165135
network     0.161879
hacker      0.159388
alto        0.144310
computer    0.136974
citibank    0.135452
defcon      0.124801
machine     0.102871
Name: 2, dtype: float64

chapter 3
par         0.772681
theorem     0.171189
kentucky    0.129372
motel       0.128195
'           0.094788
agent       0.092146
par's       0.087527
nibbler     0.085450
room        0.082428
didn't      0.0

These words are the words that tell us the most about each chapter, in the context of the whole book.

It seems likes names (of people and of viruses?) are important distinctions between chapters. 

We also did lemmatisation instead of stemming and often have the same word, and its possesive version in a chapter (`anthrax` and `anthrax's`). Maybe stemming would be better?

# Comparing Document Vectors
So what we have now is a **vector** (an ordered list of numbers) for each document (in our case, each document is a chapter). This vector represents something about the text in that chapter based on the frequencies that words occur, and how that relates to the corpus as a whole. 

We can use these vectors calculate how similar two documents by calculating the distance between them. 

For this, we will generally use something called **cosine similarity** (also called "dot product") which essentially tells how similar two vectors are. The results go from -1 to 1, where 1 is exactly the same, 0 is nothing in common and -1 is **anti-similar**. (However, this never happens for TFIDF vectors, because word counts can never be negative!)

Let's compute the cosine similarity between each pair of chapters in the document, and display it on screen as a "similarity matrix":"



In [44]:
pip install Jinja2

Note: you may need to restart the kernel to use updated packages.


In [45]:
#Import the cosine similarity method from sklearn
from sklearn.metrics.pairwise import cosine_similarity as cosine
result = cosine(tfidf_df) #gives us a matrix for cosine similarity between all pairs of chapters
#Put the result in a dataframe:
df = pd.DataFrame(result)
#Display the matrix with heatmap style gradients:
df.style.background_gradient(cmap='Greens')

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10
0,1.0,0.256828,0.295226,0.289855,0.18923,0.183745,0.209863,0.176466,0.16495,0.147333,0.302989
1,0.256828,1.0,0.308646,0.195964,0.368159,0.302507,0.328194,0.382442,0.260568,0.238328,0.334303
2,0.295226,0.308646,1.0,0.739294,0.38292,0.316235,0.275956,0.274283,0.21575,0.206762,0.268272
3,0.289855,0.195964,0.739294,1.0,0.282055,0.243258,0.221948,0.161455,0.17828,0.138218,0.20255
4,0.18923,0.368159,0.38292,0.282055,1.0,0.720724,0.412985,0.302526,0.261026,0.216051,0.352627
5,0.183745,0.302507,0.316235,0.243258,0.720724,1.0,0.445504,0.237414,0.314378,0.203667,0.355287
6,0.209863,0.328194,0.275956,0.221948,0.412985,0.445504,1.0,0.264994,0.346665,0.178332,0.342815
7,0.176466,0.382442,0.274283,0.161455,0.302526,0.237414,0.264994,1.0,0.635501,0.225833,0.283895
8,0.16495,0.260568,0.21575,0.17828,0.261026,0.314378,0.346665,0.635501,1.0,0.160936,0.345479
9,0.147333,0.238328,0.206762,0.138218,0.216051,0.203667,0.178332,0.225833,0.160936,1.0,0.580587


(Unsurprisingly, each chapter is most similar to itself!)

What we also see is that we can begin to group documents together by how similar they are. Later in the class we  will teach you some more advanced methods for taking this idea further. 

Interestingly, the first chapter seems to be the most different from the rest, and I think that isn't a Chapter per se, but the preface. Also, consecutive chapters tend to be the most simliar to each other. 

## Using TF/IDF and cosine similarity to do a search

Here we have a collection of **104** Tom Waits lyrics (https://www.kaggle.com/datasets/albertsuarez/azlyrics)

We're going to show how you can use **TF/IDF** similarity to search for songs within it! As each TFIDF vector tells us something about the tokens present, and their importance to each song in relation to the wider catalogue, this will be a better search than simply matching exact strings.

In [46]:
#Load in Tom Waits
tom_waits = pd.read_csv("../data/tom_waits.tsv", delimiter="\t") #tsv is a "tab-delimited" file - you should take a look at it in a text editor to get the idea - you can make these yourself!
songs = tom_waits[["SONG_NAME","LYRICS"]]
songs.columns = ["Title","Lyric"]

In [47]:
#It's always good to look at your data before you do anything else
#Choose 10 songs from the dataframe to display.
songs.sample(10) 

Unnamed: 0,Title,Lyric
52,in between love,"in between love and trying to scheme love, who..."
6,rosie,"well i'm sitting on a windowsill, blowing my h..."
59,so it goes,"if i was a seagull high in a loof, i'd sail to..."
80,lost in the harbour,"over here the ladies all want sweet perfume, b..."
3,old shoes (& picture postcards),"i'm singing this song, it's time it was sung, ..."
28,trouble's braids,"well i pulled on trouble's braids, and i hid i..."
78,poor edward,"did you hear the news about edward?, on the ba..."
81,we're all mad here,"you can hang me in a bottle like a cat, let th..."
48,let me get up on it,c'mon let me get up on it
25,"down, down, down","he went down down down, and the devil called h..."


### The Search

Here's what we want to do:

1. Get TFIDF vectors for all your documents (songs)

2. Use **the same** process to get a TFIDF vector for your query 

3. Calculate cosine similarity between your query vector and all song vectors

4. Return the nearest match(es)

So now we can find songs that have **similar words** which are important in **similar ways**

In [48]:
#Create a new TF-IDF vectorizer using our tokeniser
tfidf_vectoriser = TfidfVectorizer(tokenizer=my_tokeniser)

In [49]:
#Get TFIDF for all songs in our collection. (Only apply this to the lyrics!)
tfidf = tfidf_vectoriser.fit_transform(songs["Lyric"])

#Save list of unique tokens (vocab) for later
vocab = tfidf_vectoriser.get_feature_names_out()

#Let's print out its shape: each row is a song, each column is a term in our vocabulary
print(tfidf.todense().shape)

(104, 2667)




Let's explore more: Print out the most important terms are for each song.

Interesting to see which songs the most important tokens are also the title. Not always the case!

In [50]:
#Make a DataFrame, of course!
tfidf_df = pd.DataFrame(tfidf.todense(), columns = vocab)

#for each song
for i in range(len(tfidf_df)):
    print("\n",songs.iloc[i]["Title"]) #print out a new line character, then the title
    print(tfidf_df.iloc[i].sort_values(ascending = False).head(10)) #print out top 10 tf-ifd terms for this song


 ol' 55
truck      0.335463
freeway    0.292749
car        0.268395
feeling    0.223642
riding     0.215443
sun's      0.197834
went       0.194062
luck       0.185340
lady       0.185340
coming     0.161037
Name: 0, dtype: float64

 i hope that i don't fall in love with you
hope      0.406561
fall      0.310512
love      0.303618
don't     0.267786
look      0.226836
turn      0.186307
chair     0.171597
just      0.133893
think     0.127423
you're    0.108946
Name: 1, dtype: float64

 virginia avenue
catching    0.547334
walking     0.273667
tell        0.256617
i'm         0.194285
avenue      0.182445
let         0.176979
closing     0.170923
dreaming    0.143162
got         0.124384
they're     0.123717
Name: 2, dtype: float64

 old shoes (& picture postcards)
kiss        0.324804
dear        0.275953
farewell    0.275953
anymore     0.275953
bind        0.275953
gone        0.246205
call        0.245008
goodbye     0.245008
eye         0.237111
i'll        0.235342
Name: 3, dtyp

### Conducting a query

In [51]:
#Get the TFIDF vector for your query
query = "fighting in a graveyard with taxi" #make this whatever you want

#We're not going to call .fit_transform() here because we don't want to re-"fit" the vectoriser -- we already know our vocabulary for this set of documents!
#Instead, just call .transform() - produce the tfidf values for a new document (our query)
query = tfidf_vectoriser.transform([query])  #don't forget to put query in [] because .transform() operates on a list of documents. Here, we've only got one, so [query] gives us a list of size 1.

In [52]:
#Get similarity between query and song catalogue
similarity_matrix = cosine(query, tfidf) #this returns a matrix, with 1 row and 104 columns
print("shape is", np.shape(similarity_matrix))

#We just one the one row as a list, so let's grab it out:
similarity = similarity_matrix[0]
print("Shape is now", np.shape(similarity))

shape is (1, 104)
Shape is now (104,)


In [53]:
#Get the closest 5 songs to the query and print
closest = np.argsort(similarity)[-5:] #the last item in this list is now the closest
closest = np.flip(closest) #Now they're descending order, top closest first

print("Closest songs:")
for i in closest:
    print(songs["Title"][i],": ")
    print(songs["Lyric"][i])
    print('\n')

Closest songs:
whistlin' past the graveyard : 
well i come in on a night train, with an arm full of box cars, on the wings of a magpie, cross a hooligan night, and i busted up a chifforobe, way out by the cocomo, cooked up a mess a mulligan, and got into a fight, whistlin' past the graveyard, steppin' on a crack, i'm a mean motherhubbard, papa one eyes jack, you probably seen me sleepin', out by the railroad tracks, go on and ask the prince of darkness, what about all thet smoke, come from the stack, sometimes i kill myself a jackel, suck out all the blood, steal myself a stationwagon, drivin' through the mud, whistlin' past the graveyard, steppin' on a crack, i'm mean motherhubbard, papa one eyed jack, i know you seen my headlights, and the honkin' of my horn, i'm callin' out my bloodhounds, chase the devil through the corn, last night i chugged the mississippi, now that suckers dry as a bone, born in a taxi cab, i'm never comin' home, whistlin' past the graveyard, steppin' on a crack

### LSA

Texts the say the same things, but with different words, will have completely different TFIDF vectors. We can do better!

Topic Modelling approaches attempt to improve this can grouping things together based on similar semantic meaning, not just frequency of terms. 

SVD will group together terms that occur frequently together in the same documents

The first thing we need to do is subtract the mean of each tfidf column from each value (sometimes called "whitening")

In [54]:
from sklearn.decomposition import TruncatedSVD

In [55]:
lyrics = pd.read_csv("../data/lyric_data.tsv", delimiter="\t")
songs = lyrics[["ARTIST_NAME-SONG_NAME","LYRICS"]]
songs.columns = ["Title","Lyric"]

In [56]:
#Print out a bit to investigate
songs

Unnamed: 0,Title,Lyric
0,portishead-mysterons,"inside your pretending, crimes have been swept..."
1,portishead-sour times,"to pretend no one can find, the fallacies of m..."
2,portishead-strangers,"ohh. can anybody see the light, where the morn..."
3,portishead-it could be sweet,"i don't want to hurt you, for no reason have i..."
4,portishead-wandering star,please could you stay awhile to share my grief...
...,...,...
1631,kaiser chiefs-girl of my age,"you were the one in the back, in the mack, i c..."
1632,kaiser chiefs-how do you feel about that?,"i'd take you down another two-year stretch, to..."
1633,kaiser chiefs-i like to fight,"i like to fight, throughout the night, i get a..."
1634,kaiser chiefs-listen to your head,there's a million combinations and this is one...


In [57]:
# just like we did before

#Get TFIDF
tfidf = tfidf_vectoriser.fit_transform(songs["Lyric"])
#Save list of unique tokens (vocab) for later
vocab = tfidf_vectoriser.get_feature_names_out()
tfidf_df = pd.DataFrame(tfidf.todense(), columns = vocab)
print(tfidf.todense().shape) #We have 1636 songs and 12319 terms -- that's a lot!!



(1636, 12319)


In [58]:
#Subtract mean
tfidf_df = tfidf_df - tfidf_df.mean()

In [59]:
#How many topics do you want?
num_topics = 16 #you can change this
pd.options.display.max_columns=num_topics #Make sure we display them all
labels = ['topic{}'.format(i) for i in range(num_topics)] #Give each topic a name: topic 0, topic 1, etc.

In [60]:
#Calculate topics using TruncatedSVD
svd = TruncatedSVD(n_components = num_topics, n_iter = 100) #You can change n_iter: Higher numbers will take longer but may (or may not) give you better results
svd_topic_vectors = svd.fit_transform(tfidf_df.values)

Now we have 16 values for each song (instead of 1000s!)

But there's more!

We can look at the weights LSA has assigned to each word within each topic. `svd.components_` is a variable that gives us these weightings:

In [61]:
#How much does each topic apply to each token?
topic_weights = pd.DataFrame(svd.components_.T, index=vocab, columns=labels)
topic_weights.sample(20) #display a random selection of 20 rows to display

#Here, each row is a term and each column is a topic

Unnamed: 0,topic0,topic1,topic2,topic3,topic4,topic5,topic6,topic7,topic8,topic9,topic10,topic11,topic12,topic13,topic14,topic15
flagstone,-0.000455,-9.8e-05,0.000473,0.000279,-0.000134,0.000171,0.000115,-0.000523,-0.000812,0.000151,0.000824,-0.000117,-0.000115,-0.000144,0.000123,7.6e-05
subtle,-0.002641,0.00039,0.000943,-0.000568,0.000203,-0.000374,0.000352,-0.00068,0.001434,6.2e-05,-0.001335,-0.000189,0.00066,-0.000875,0.001028,-0.000892
holding,0.009746,0.008817,-0.001379,0.00927,-0.00378,-0.000315,0.008478,0.004346,0.000541,0.004947,-0.009221,0.000971,0.005191,-0.003741,0.007634,-0.004392
breathin,-2e-06,-9.9e-05,-0.000337,5e-05,0.000232,0.000102,0.000289,-0.000145,-0.000101,0.000191,0.000133,-0.000386,0.00044,9.8e-05,-6.5e-05,-0.000536
hoo,0.009467,-0.016874,0.023179,0.004137,-0.003811,-0.01,-0.001606,-0.011326,0.002522,-0.004508,-0.002038,0.007177,-0.014836,0.013988,0.016052,-0.002653
oh,0.229426,-0.280968,0.495409,0.513184,-0.348753,-0.019457,-0.105776,0.025769,0.000697,0.068018,-0.058293,-0.253383,0.027174,0.055535,-0.023879,0.123825
coyote,-0.000534,-0.000339,-5.3e-05,0.000353,8.1e-05,-0.000761,-0.000499,-0.001768,-0.00037,-0.000245,-0.001066,-0.000553,0.001444,-0.001017,-0.001702,0.000394
shifting,-0.000676,0.000638,-0.000771,0.000923,8.5e-05,-0.001266,-0.000354,2.4e-05,-0.000338,-0.000547,-0.000977,5.8e-05,-0.000412,0.000836,-0.001815,0.001751
tow,-0.000373,-8.4e-05,0.000289,-5.3e-05,-4.8e-05,0.000232,-8.3e-05,-7.2e-05,0.000229,-5e-06,-6.6e-05,-7e-06,-0.000163,7e-06,-4.1e-05,-0.000147
worker,-0.000302,-0.000103,-0.000415,0.000422,-8e-05,0.000232,-0.000962,0.000998,-0.001119,0.003388,0.000597,0.00125,0.001758,0.002556,0.000878,0.001363


In [62]:
#How much does each topic apply to each song?
svd_topic_vectors_df = pd.DataFrame(svd_topic_vectors, index=songs["Title"].values, columns=labels)
svd_topic_vectors_df.sample(10) #sample a random set of 10 songs to display

Unnamed: 0,topic0,topic1,topic2,topic3,topic4,topic5,topic6,topic7,topic8,topic9,topic10,topic11,topic12,topic13,topic14,topic15
bob dylan-boots of spanish leather,0.105908,0.029379,-0.002764,0.131694,0.020253,-0.054412,-0.076743,0.129609,0.016946,0.007433,-0.034295,-0.031522,-0.015008,0.030277,0.038621,0.042378
"cliff, jimmy-remake the world",-0.081192,0.010487,0.041714,-0.048153,0.007269,0.012963,0.038047,0.00272,0.001306,-0.022577,-0.014063,0.018658,-0.04229,0.016035,-0.001108,0.033179
bob dylan-billy 1,-0.06366,-0.010124,-0.049056,-0.035769,-0.011413,0.009954,0.007739,-0.042217,-0.031598,0.001134,-0.034663,-0.049131,-0.02802,-0.002256,-0.016438,0.041665
britney spears-hold on tight,0.123992,0.007352,-0.164837,-0.019137,-0.175124,-0.12317,0.054345,0.04901,-0.103364,-0.024244,0.006977,-0.007201,0.016471,0.087236,-0.091084,-0.156926
elton john-old '67,-0.073929,0.018312,-0.032719,0.011262,0.014052,0.009937,0.021576,-0.046928,-0.120596,0.021927,0.027814,-0.050005,0.05801,0.018937,0.023389,-0.069234
les miserables cast-drink with me,-0.089548,0.028771,0.003076,-0.04075,-0.073334,-0.052379,0.053853,0.00957,-0.006228,-0.020885,0.037596,0.033291,0.009092,0.002896,0.002776,0.025294
"beatles, the-things we said today",0.219386,0.295014,0.097827,-0.015231,0.091664,0.052376,-0.008519,0.027006,-0.055707,-0.06406,0.075166,-0.014673,-0.003894,0.097456,-0.043939,0.056124
"beatles, the-twist and shout",0.025995,-0.148762,0.049602,-0.127935,0.032591,-0.055798,0.045676,-0.095323,0.053877,-0.014758,0.09223,0.009435,-0.005582,0.006382,0.010449,-0.025076
aerosmith-fall together,0.033297,0.001565,-0.048745,-0.020847,-0.003051,-0.084588,0.013782,-0.105817,-0.049786,0.016073,-0.046017,0.033316,0.037863,-0.022475,-0.047879,-0.043393
aerosmith-girls of summer,0.093953,0.01527,0.151528,0.040086,0.104825,0.142294,0.144682,0.084335,-0.179517,-0.20473,0.169103,0.028885,0.014757,0.165921,-0.094967,0.05443


In [63]:
#This is always helpful to do! 
#Show the most relevant words for each topic
num_terms = 20
for i in range(num_topics):
    print("___topic " + str(i) + "___")
    topicName = "topic" + str(i)
    weightedlist = topic_weights.get(topicName).sort_values()[-num_terms:]
    print(weightedlist.index.values)

___topic 0___
["can't" "i'll" 'feel' 'make' "it's" 'wanna' 'say' 'need' 'tell' 'let'
 'just' "you're" 'want' 'yeah' "i'm" 'oh' 'know' "don't" 'baby' 'love']
___topic 1___
['moon' 'hope' 'away' 'say' 'star' 'summer' 'real' 'tell' 'true' 'life'
 'gone' 'world' "i've" 'eye' 'time' 'heart' 'fall' 'day' "it's" 'love']
___topic 2___
['mm' 'thank' 'bye' 'mmm' "darlin'" 'ooo' 'ho' 'ha' 'ah' 'sweet' 'la'
 'ooh' 'whoa' 'oo' 'girl' 'hey' 'baby' 'yeah' 'love' 'oh']
___topic 3___
['tell' 'long' 'hard' 'like' 'home' 'standing' 'good' 'heart' 'think'
 'alright' 'gone' "i've" "she's" 'time' 'gonna' "it's" 'yeah' 'know' "i'm"
 'oh']
___topic 4___
['old' 'ready' 'pretty' 'honey' "that's" "you're" 'bad' 'blue' 'little'
 'hey' "she's" "ain't" 'love' 'good' 'yeah' 'got' 'gonna' 'girl' 'baby'
 "i'm"]
___topic 5___
['woman' 'round' 'really' 'talk' 'know' "ain't" 'dance' 'gotta' 'gonna'
 'love' "she's" 'ah' 'boy' 'girl' 'got' 'la' 'wanna' 'yeah' 'hey' "don't"]
___topic 6___
["we're" 'world' 'win' 'home' 'come

Makes me think that "yeah" should be a stop word for song lyrics?

In [64]:
#Query some strings and see how they relate to topics
df = topic_weights.T["ooh whoa christmas".split()]
df.style.background_gradient(cmap='Greens')

Unnamed: 0,ooh,whoa,christmas
topic0,0.062398,0.016103,-0.028641
topic1,-0.081349,0.01296,0.021802
topic2,0.051036,0.052496,0.011664
topic3,-0.014966,0.023576,-0.005431
topic4,-0.013499,-0.02198,-0.009351
topic5,-0.017662,0.013119,-0.047153
topic6,0.009704,0.004545,-0.010108
topic7,-0.031302,0.015441,0.041362
topic8,0.036416,0.00831,-0.008629
topic9,0.039404,0.008226,-0.023128


In [65]:
#Finally, we can look at which songs are most strongly weighted for a given topic:
svd_topic_vectors_df.sort_values(by=['topic1'], ascending=False) #Which songs are highest?

Unnamed: 0,topic0,topic1,topic2,topic3,topic4,topic5,topic6,topic7,topic8,topic9,topic10,topic11,topic12,topic13,topic14,topic15
"beatles, the-love me do",0.383847,0.503786,0.371335,-0.108278,0.036856,0.094290,-0.003166,0.057399,0.144572,-0.000903,-0.090285,0.037162,0.002295,0.000384,0.097291,-0.015469
manic street preachers-you love us,0.322145,0.485852,0.379032,-0.133513,0.066986,0.098550,0.013152,0.026870,0.158145,0.012582,-0.096766,-0.047847,0.053381,-0.006077,0.052889,0.007916
"waits, tom-in between love",0.261893,0.429269,0.246974,-0.122353,0.076593,0.071784,-0.006318,0.016919,0.104303,0.002063,-0.071590,-0.020965,0.085347,-0.036897,0.005873,-0.051893
smokey robinson-i love your face,0.247896,0.409408,0.217615,-0.157721,0.048383,0.057057,0.043250,-0.019141,0.056378,0.020006,-0.086813,-0.036493,0.014257,-0.003672,0.021245,0.035105
"beatles, the-p.s. i love you",0.179223,0.375680,0.230239,-0.119771,0.034778,0.033505,-0.008645,0.077597,0.117394,-0.020660,-0.061337,0.012410,-0.051726,0.044273,0.086174,0.007794
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
de la soul-baby baby baby baby ooh baby,0.240390,-0.346908,0.133359,-0.231231,0.310972,-0.267159,-0.047183,-0.125258,0.048106,0.083045,0.038211,-0.000384,-0.047224,-0.013051,-0.086759,-0.152297
beach boys-all i want to do,0.299757,-0.351161,0.010388,-0.362811,0.090568,-0.233200,-0.061597,0.074894,-0.075468,-0.113344,-0.150542,0.061722,-0.016193,0.022668,-0.025665,0.097909
smokey robinson-why you wanna see my bad side,0.441604,-0.381771,0.128881,-0.096418,0.006003,-0.113071,0.047417,0.047274,-0.103191,-0.040362,-0.036121,0.079553,0.172180,-0.084624,0.126819,-0.047774
britney spears-baby one more time,0.305410,-0.424502,0.196844,-0.021411,0.095409,-0.257584,-0.065832,-0.110873,0.086339,0.095114,0.034065,0.003390,-0.005761,-0.051515,-0.039779,-0.122033


## LDiA

Now let's apply LDiA to the same data.

In [66]:
#Import the needed module
from sklearn.decomposition import LatentDirichletAllocation

In [67]:
#We calculate LDA on the Bag Of Words, NOT TFIDF!
count_vectoriser = CountVectorizer(tokenizer=my_tokeniser)
bag_of_words = count_vectoriser.fit_transform(songs["Lyric"])
vocab = count_vectoriser.get_feature_names_out()

print(bag_of_words.todense().shape)



(1636, 12319)


In [68]:
lda = LatentDirichletAllocation(n_components=num_topics,  #how many topics?
                                random_state=123,
                                learning_method='batch')

In [69]:
#This may take some time depending on size of dataset!
lda_topics = lda.fit_transform(bag_of_words)

In [70]:
#How much does each topic apply to each song?
lda_topic_vectors_df = pd.DataFrame(lda_topics, index=songs["Title"].values, columns=labels)
lda_topic_vectors_df.sample(10)

Unnamed: 0,topic0,topic1,topic2,topic3,topic4,topic5,topic6,topic7,topic8,topic9,topic10,topic11,topic12,topic13,topic14,topic15
depeche mode-shout,0.000694,0.000694,0.133167,0.213034,0.000694,0.000694,0.000694,0.000694,0.000694,0.000694,0.000694,0.000694,0.000694,0.000694,0.000694,0.644771
aerosmith-hole in my soul,0.000359,0.000359,0.000359,0.000359,0.000359,0.000359,0.644717,0.000359,0.000359,0.000359,0.000359,0.000359,0.000359,0.000359,0.000359,0.350254
aerosmith-rocket 88,0.000607,0.000607,0.000607,0.321941,0.000607,0.000607,0.000607,0.000607,0.000607,0.000607,0.000607,0.387863,0.000607,0.000607,0.000607,0.282308
elton john-can i put you on,0.000568,0.000568,0.000568,0.000568,0.000568,0.000568,0.912049,0.000568,0.000568,0.000568,0.000568,0.000568,0.000568,0.000568,0.000568,0.079996
depeche mode-motherless child,0.002016,0.002016,0.002016,0.002016,0.002016,0.002016,0.002016,0.002016,0.002016,0.002016,0.002016,0.002016,0.002016,0.002016,0.002016,0.969758
britney spears-just luv me,0.000305,0.000305,0.000305,0.000305,0.000305,0.000305,0.000305,0.000305,0.000305,0.995427,0.000305,0.000305,0.000305,0.000305,0.000305,0.000305
de la soul-long island wildin',0.000393,0.000393,0.000393,0.000393,0.000393,0.000393,0.027701,0.10611,0.000393,0.000393,0.000393,0.000393,0.022618,0.838854,0.000393,0.000393
beach boys-car crazy cutie,0.610017,0.000291,0.000291,0.000291,0.000291,0.000291,0.000291,0.000291,0.385913,0.000291,0.000291,0.000291,0.000291,0.000291,0.000291,0.000291
beach boys-wind chimes,0.000644,0.000644,0.000644,0.000644,0.000644,0.990335,0.000644,0.000644,0.000644,0.000644,0.000644,0.000644,0.000644,0.000644,0.000644,0.000644
"waits, tom-shiver me timbers",0.000477,0.000477,0.000477,0.000477,0.022587,0.000477,0.970734,0.000477,0.000477,0.000477,0.000477,0.000477,0.000477,0.000477,0.000477,0.000477


### Comparing distributions of LSA and LDA

How are the distributions of topic scores different for LSA and LDiA? Compare the results for the same song

In [71]:
song_name = 'portishead-mysterons'

In [72]:
#We can print out the topic vectors from each technique
#This alone won't tell us much, as topic0 in SVD/LSA does not correspond at all to topic0 in LDiA
#However, we can immediately see that LDiA gives us very little weight on most topics, and stronger weight on just one topic

svd_topic_vectors_df.loc[song_name]

topic0    -0.033149
topic1    -0.086843
topic2    -0.019210
topic3    -0.144541
topic4    -0.010099
topic5     0.040664
topic6    -0.185396
topic7     0.258922
topic8    -0.105410
topic9    -0.066661
topic10   -0.092530
topic11    0.001251
topic12    0.058855
topic13   -0.166626
topic14    0.093101
topic15   -0.017662
Name: portishead-mysterons, dtype: float64

In [73]:
lda_topic_vectors_df.loc[song_name] 

topic0     0.001389
topic1     0.001389
topic2     0.001389
topic3     0.001389
topic4     0.001389
topic5     0.001389
topic6     0.001389
topic7     0.001389
topic8     0.001389
topic9     0.001389
topic10    0.001389
topic11    0.001389
topic12    0.001389
topic13    0.001389
topic14    0.001389
topic15    0.979167
Name: portishead-mysterons, dtype: float64

In [74]:
#Highest scoring songs for each LDiA topic
for topic in labels:
    print(topic)
    #Get last songs 10 in list (highest scoring for each topic)
    print(songs["Title"].values[lda_topic_vectors_df[topic].argsort().values[-10:]])

topic0
['elton john-queen of cities (el dorado ii)' 'britney spears-better'
 'dusty springfield-mockingbird' 'beach boys-help me, rhonda'
 'beatles, the-i am the walrus' 'waits, tom-diamonds on my windshield'
 'waits, tom-murder in the red barn' 'britney spears-liar'
 'waits, tom-hell broke luce'
 'britney spears-till the world ends (alex suarez club remix)']
topic1
['depeche mode-lilian' 'elton john-take me to the pilot'
 'manic street preachers-dead martyrs'
 'kaiser chiefs-dead or in serious trouble' 'bob dylan-paths of victory'
 'aerosmith-cheese cake' 'beach boys-hushabye' "beatles, the-searchin'"
 'aerosmith-lightning strikes' 'beach boys-pitter patter']
topic2
['manic street preachers-tsunami'
 'manic street preachers-another invented disease'
 'elton john-my quicksand' 'waits, tom-in the colosseum'
 'elton john-the new fever waltz' 'dusty springfield-heartbeat'
 'bruce springsteen-breakaway' 'beach boys-palisades park'
 'basement jaxx-get me off' 'de la soul-property of spitkic

In [75]:
#Most relevant tokens for each topic
for i, topic in enumerate(lda.components_):
    print("topic " + str(i) + ":")
    #Get last n tokens (highest values)
    print(vocab[topic.argsort()[-num_terms:]])

topic 0:
['just' 'said' "don't" 'wo' 'rhonda' "she's" 'way' 'dorado' 'el' "it's"
 'come' 'gonna' 'let' 'bye' 'home' "i'm" 'like' 'help' 'know' 'oh']
topic 1:
['fall' 'road' 'look' 'like' 'cuckoo' "i'm" 'gonna' 'dead' 'fun' 'just'
 'pitter' "what's" 'got' 'star' 'patter' 'light' 'oh' 'yeah' 'ooo' 'ah']
topic 2:
['waiting' 'come' "we've" 'night' 'cold' 'mind' "won't" 'got' 'yeah'
 'like' 'went' "i'm" 'just' 'body' 'need' 'oh' 'baby' 'kiss' 'heart'
 "it's"]
topic 3:
['like' 'woman' 'little' 'shake' 'bad' 'street' "it's" 'blue' 'right'
 'gonna' 'gotta' 'love' "you're" "ain't" 'come' 'good' "i'm" 'baby' 'got'
 'yeah']
topic 4:
['stop' 'bring' 'love' 'talk' 'better' 'day' 'know' "can't" 'boy' 'god'
 'time' 'little' 'come' 'yeah' 'oh' 'say' "don't" 'let' "i'm" 'hey']
topic 5:
['like' "it's" 'wind' 'tell' 'eye' 'look' 'lonely' "you'll" 'come' 'live'
 'time' 'day' 'run' 'world' 'life' 'just' "you're" 'away' 'know' "don't"]
topic 6:
["we're" 'thing' 'love' 'lost' 'number' 'just' 'play' 'time' "t