In [28]:
#Import necessary librararies and set directory

import os
from nltk.tokenize import word_tokenize
from nltk.probability import FreqDist
import nltk
import re

nltk.download('punkt')

folderPath = "projectFiles"

[nltk_data] Downloading package punkt to
[nltk_data]     /Users/alexanderfisher/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [33]:
#Common filler words (To be removed from lyrics before analysis)
fillerWords = {
    'the', 'is', 'a', 'an', 'and', 'or', 'but', 'for', 'nor', 'yet', 'so',
    'in', 'to', 'of', 'on', 'with', 'at', 'by', 'from', 'about', 'as',
    'it', 'this', 'that', 'these', 'those', 'he', 'she', 'they', 'we', 'i', 'you',
    'what', 'like', 'ooh', 'oh', 'ah', 'la', 'yeah', 'na', 'woo', 'baby', 'uh', 'whoa'
}

#Regex for Chord Validation
chord_regex = re.compile(r"^[A-G][#b]?(m|maj|min|dim|aug|sus)?\d*$", re.IGNORECASE)


#Chord Validator Function
def isChord(text):
    return bool(chord_regex.match(text))
#Create Output File
output_path = os.path.join(folderPath, "all_song_analysis.txt")
with open(output_path, 'w', encoding='utf-8') as output_file:
    
    #Process files
    for filename in os.listdir(folderPath):
        if filename.endswith(".txt"):
            filePath = os.path.join(folderPath, filename)
    
            #Set variables to hold chords and lyrics
            chords = []
            lyrics = []
    
            # Read the file
            with open(filePath, 'r', encoding='utf-8') as f:
                lines = f.readlines()
    
            # Loop through each line individually
            for line in lines:
                # Split line into tokens
                lineTokens = line.strip().split()
    
                # Extract chords from current line
                lineChords = [token.capitalize() for token in lineTokens if isChord(token)]
                chords.extend(lineChords)
    
                # Remove chords, leaving lyrics
                lyricTokens = [token for token in lineTokens if not isChord(token)]
    
                # Tokenize lyrics and filter out filler words
                tokens = word_tokenize(' '.join(lyricTokens).lower())
                filteredTokens = [word for word in tokens if word.isalpha() and word not in fillerWords]
                lyrics.extend(filteredTokens)
                
            # Frequency distributions
            chord_fdist = FreqDist(chords)
            lyric_fdist = FreqDist(lyrics)
            
            # Output Results to Terminal
            print(f"\n--- Analysis of {filename} ---")
            
            print("\nMost Common Chords:")
            for chord, count in chord_fdist.most_common(10):
                print(f"{chord}: {count}")
            
            print("\nMost Common Words (excluding filler words):")
            for word, count in lyric_fdist.most_common(10):
                print(f"{word}: {count}")
    
            #Output Results to File
            output_file.write(f"\n--- Analysis of {filename} ---\n\n")
    
            output_file.write("Most Common Chords:\n")
            for chord, count in chord_fdist.most_common(10):
                output_file.write(f"{chord}: {count}\n")
    
            output_file.write("\nMost Common Words (excluding filler words):\n")
            for word, count in lyric_fdist.most_common(10):
                output_file.write(f"{word}: {count}\n")
    



--- Analysis of all_song_analysis.txt ---

Most Common Chords:

Most Common Words (excluding filler words):

--- Analysis of song-8.txt ---

Most Common Chords:
C: 38
D: 36
Em: 16
G: 16
Bm: 14
Am: 9
B: 8
A: 2

Most Common Words (excluding filler words):
stand: 9
up: 8
cold: 8
black: 5
smoke: 5
fires: 5
will: 5
rises: 4
been: 4
told: 4

--- Analysis of song-2.txt ---

Most Common Chords:
A: 36
D: 36
G: 20
F#: 18
E: 6
Am: 6

Most Common Words (excluding filler words):
my: 11
love: 8
bleeding: 7
got: 6
heart: 6
army: 6
apart: 6
all: 5
brothers: 5
stand: 5

--- Analysis of song-3.txt ---

Most Common Chords:
A: 55
D: 50
G: 21
F: 2

Most Common Words (excluding filler words):
ma: 52
outta: 7
sight: 7
alright: 6
our: 4
verse: 3
day: 3
love: 3
night: 3
are: 3

--- Analysis of song-1.txt ---

Most Common Chords:
A: 42
G: 42
D: 42

Most Common Words (excluding filler words):
your: 22
mama: 18
heart: 14
do: 13
love: 12
when: 9
all: 9
gon: 8
give: 7
wo: 6

--- Analysis of song-4.txt ---

Most Co