In [1]:
import re
import string
import os
import shutil


def clean_songs(song):
    """Tell whether ``song`` contains only accepted characters.

    Accepted characters are the ASCII letters and digits, anything the regex
    whitespace class matches, and the symbols of ``string.punctuation``.

    Parameters
    ----------
    song : str
        Lyrics to inspect.

    Returns
    -------
    bool
        ``True`` when every character is accepted (an empty string included),
        ``False`` as soon as one other character is present.
    """
    # Negated character class: it matches exactly the characters we reject.
    pattern = r"[^a-zA-Z0-9\s" + re.escape(string.punctuation) + r"]"
    first_offender = re.search(pattern, song)
    return first_offender is None
    
def read_file(file_path):
    """Return the whole content of the text file at ``file_path``.

    The file is opened in text mode and decoded as UTF-8.
    """
    with open(file_path, mode='r', encoding="utf-8") as handle:
        return handle.read()


def clean_dir(path):
    """Delete everything inside ``path`` while keeping ``path`` itself.

    Regular files are removed and sub-directories are emptied recursively and
    then removed. Symbolic links are unlinked and never followed, so data
    living outside of ``path`` cannot be deleted through a link.

    Parameters
    ----------
    path : str
        Directory to empty. It must exist; ``os.listdir`` raises otherwise.
    """
    for item in os.listdir(path):
        item_path = os.path.join(path, item)
        # Check for links first: os.path.isdir() follows them, which would make
        # the recursion wipe the link's target and then fail in os.rmdir().
        if os.path.islink(item_path) or os.path.isfile(item_path):
            os.remove(item_path)
        elif os.path.isdir(item_path):
            clean_dir(item_path)  # Recursively clean subdirectories
            os.rmdir(item_path)  # Remove the now-empty directory

In [2]:
def _repeat_count(annotation):
    """Return the repeat count written in ``annotation``, or ``None`` if absent.

    A count is a whole-word token shaped ``<digits>x`` or ``x<digits>``
    (for instance ``2x``, ``x3`` or ``12x``).
    """
    token = re.search(r'\b(?:\d+x|x\d+)\b', annotation)
    if token is None:
        return None
    # Use every digit of the token, not just one character, so that counts of
    # ten or more are honoured.
    return int(token.group().strip('x'))


def expand_text(text):
    """Expand repeat annotations such as ``[Chorus 2x]`` or ``[x3]``.

    ``text`` is split into paragraphs on blank lines (``"\\n\\n"``). Only a
    paragraph holding exactly one ``[...]`` annotation that contains a repeat
    count is modified:

    * if the paragraph starts with ``[``, the bracketed annotations are removed
      and the paragraph, followed by a newline, is repeated ``count`` times;
    * otherwise the first line containing ``[`` has its annotations removed and
      is replaced by ``count`` copies of itself, each followed by a newline.

    Parameters
    ----------
    text : str
        Lyrics of one song.

    Returns
    -------
    str
        The lyrics with the annotated repetitions written out.
    """
    paragraphs = text.split("\n\n")

    for i, p in enumerate(paragraphs):
        exp = re.findall(r'\[([^\[\]]*)\]', p)
        if len(exp) != 1:  # only one comment per paragraph is supported
            continue
        num = _repeat_count(exp[0])
        if num is None:  # the comment holds no repeat count
            continue

        if p[0] == "[":
            # The annotation opens the paragraph: repeat the whole paragraph.
            p = re.sub(r'\[.*?\]', '', p)
            paragraphs[i] = (p + '\n') * num
        else:
            # The annotation sits on a line: repeat only that line.
            p_lines = p.split("\n")
            index = next((j for j, line in enumerate(p_lines) if "[" in line), 0)
            line = re.sub(r'\[.*?\]', '', p_lines[index])
            p_lines[index] = (line + '\n') * num
            paragraphs[i] = "\n".join(p_lines)

    return "\n\n".join(paragraphs)

In [3]:
def featurings(artists_list, song_artist, text):
    """Tell whether ``text`` credits an artist other than ``song_artist``.

    ``text`` is split into paragraphs on blank lines. Only paragraphs holding
    exactly one ``[...]`` annotation are inspected; the annotation is compared
    case-insensitively, by substring, against every name of ``artists_list``
    except ``song_artist`` itself.

    Parameters
    ----------
    artists_list : iterable of str
        Names of all known artists.
    song_artist : str
        Artist the song belongs to; never counted as a featuring.
    text : str
        Lyrics of one song.

    Returns
    -------
    bool
        ``True`` if another known artist is named in such an annotation,
        ``False`` otherwise.
    """
    own_name = song_artist.lower()
    # Lower-case once up front instead of once per paragraph and per artist.
    other_artists = [
        name.lower() for name in artists_list if name.lower() != own_name
    ]

    for p in text.split("\n\n"):
        exp = re.findall(r'\[([^\[\]]*)\]', p)
        if len(exp) != 1:  # only one comment per paragraph is supported
            continue
        comment = exp[0].lower()
        if any(name in comment for name in other_artists):
            return True
    return False


In [4]:
def remove_non_solo_instrumental(text):
    """Strip ``[...]`` annotations unless they mention a solo or an instrumental.

    An annotation is kept verbatim when it contains the whole word ``solo`` or
    ``instrumental`` (case-insensitive); every other annotation is replaced by
    an empty string. Text outside the brackets is left untouched.
    """
    def keep_or_drop(found):
        annotation = found.group()
        if re.search(r'\bsolo\b|\binstrumental\b', annotation, flags=re.IGNORECASE):
            return annotation
        return ''

    return re.sub(r'\[[^\[\]]*\]', keep_or_drop, text)

In [13]:
# Show the working directory: the data paths used in this notebook are built
# from os.getcwd(), so the notebook must be run from the project root.
current_dir = os.getcwd()
print("Current working directory:", current_dir)

Current working directory: C:\Users\Antonia\Documents\Ecole Polytechnique\sem4\ML project\Text-Shazam


In [14]:
##TO BE RUN ONLY WHEN NEW SONGS
def create_Albums_Songs_Dirs(songs_root=None, artist_names=None):
    """Create the ``Albums``/``Songs`` layout inside every artist folder.

    For each artist whose ``Albums`` folder does not exist yet, ``Albums`` and
    ``Songs`` are created and every regular file found directly in the artist
    folder is moved into ``Albums``. Artists that already have an ``Albums``
    folder are left untouched.

    Parameters
    ----------
    songs_root : str, optional
        Folder holding one sub-folder per artist. Defaults to the module-level
        ``path_to_songs``.
    artist_names : iterable of str, optional
        Artist folder names to process. Defaults to the module-level
        ``artists``.
    """
    if songs_root is None:
        songs_root = path_to_songs
    if artist_names is None:
        artist_names = artists

    for art in artist_names:
        path_to_artist = os.path.join(songs_root, art)
        path_to_artist_albums = os.path.join(path_to_artist, "Albums")
        # Must not be named `path_to_songs`: assigning that name here would
        # make it local to the function and break the lookup of the root path.
        path_to_artist_songs = os.path.join(path_to_artist, "Songs")

        if not os.path.exists(path_to_artist_albums):
            os.makedirs(path_to_artist_albums)
            # Songs may already be there even when Albums is missing.
            os.makedirs(path_to_artist_songs, exist_ok=True)

            for filename in os.listdir(path_to_artist):
                source = os.path.join(path_to_artist, filename)
                if os.path.isfile(source):
                    shutil.move(source, path_to_artist_albums)


Number of artists : 227


In [117]:
# Rebuild every artist's "Songs" folder from the raw album files.
# Each album file is split on "//"; after an optional leading "Album" header
# the remaining segments are read as alternating (title, lyrics) pairs.
path_to_songs = os.path.join(os.getcwd(), "Raw_Songs_DataBase")

artists = os.listdir(path_to_songs)
artists = [art for art in artists if art[0] != "."]  # skip hidden entries

print(f'Number of artists : {len(artists)}')


for art in artists:

    path_to_artist = os.path.join(path_to_songs, art)
    path_to_artist_songs = os.path.join(path_to_artist, "Songs")
    path_to_artist_albums = os.path.join(path_to_artist, "Albums")

    # Start from an empty Songs folder so stale files never survive a re-run.
    clean_dir(path_to_artist_songs)

    albums = os.listdir(path_to_artist_albums)
    for alb in albums:
        if alb[0] == ".":  # skip hidden files
            continue
        path = os.path.join(path_to_artist_albums, alb)
        album = read_file(path)
        album = album.split("//")
        album = [song for song in album if len(song) != 0]
        # Guard against an empty album file before looking at the header.
        if album and "Album" in album[0]:
            album.pop(0)
        # Stopping at len(album) - 1 skips a trailing segment that has no
        # lyrics partner (e.g. a final newline) instead of raising IndexError.
        for i in range(0, len(album) - 1, 2):
            title = (album[i].replace("/", "-")).strip()
            song = album[i+1]
            title = title.replace('À', 'A') #to make sure to not have two times the same pattern
            title = title.replace('à', 'a')
            song = expand_text(song)
            # Songs crediting another known artist are discarded.
            if featurings(artists, art, song) == False:
                song = remove_non_solo_instrumental(song)
                filepath = os.path.join(path_to_artist_songs, title)
                if clean_songs(song):
                    # `with` closes the file even if the write fails.
                    with open(filepath, 'w', encoding="utf-8") as file:
                        file.write(song)

    songs = os.listdir(path_to_artist_songs)
    print(f'{art} has {len(songs)} song')
                 

Number of artists : 15
Lana Del Rey has 113 song
Megadeth has 103 song
Beyoncé has 96 song
Snoop Dog has 54 song
Arctic Monkeys has 114 song
Ariana Grande has 69 song
Dolly Parton has 122 song
Dr Dre has 79 song
Dua Lipa has 95 song
Florence + the Machine has 105 song
Taylor Swift has 108 song
Vampire Weekend has 58 song
Waylon Jennings has 74 song
Ed Sheeran has 85 song
ASAP Rocky has 68 song


In [99]:
# Empty both the "Songs" and the "Albums" folders of Kendrick Lamar.
# WARNING: destructive, the files are deleted for good.
path_to_songs = os.path.join(os.getcwd(), "Raw_Songs_DataBase")
path_to_artist = os.path.join(path_to_songs, "Kendrick Lamar")
path_to_artist_songs, path_to_artist_albums = (
    os.path.join(path_to_artist, sub) for sub in ("Songs", "Albums")
)

for target in (path_to_artist_songs, path_to_artist_albums):
    clean_dir(target)
    

In [None]:
from nltk.stem import PorterStemmer

# Build "rare-filtered.txt": one Porter stem per line, for every line of
# "rare.txt" that, once lower-cased, consists only of the letters a-z.
path = os.path.join(os.getcwd(), "rare.txt")
ps = PorterStemmer()
rare = [
    ps.stem(entry)
    for entry in (line.lower() for line in read_file(path).split("\n"))
    if re.match(r'^[a-z]+$', entry)
]

path = os.path.join(os.getcwd(), "rare-filtered.txt")
with open(path, 'w') as file:
    file.writelines(stem + '\n' for stem in rare)
        
        

In [None]:
# Build "slang-filtered.txt": the distinct Porter stems of the words found in
# "slang.txt", minus every stem that also appears in "stdfile.txt".
path = os.path.join(os.getcwd(), "slang.txt")
slang = read_file(path)
slang = slang.lower()

to_replace = ['"', '*', '#', '(', ')', '[', ']', ';', "'", '_', ".", '”', "?", "<", ">", "~", "!"]

for char in to_replace:
    slang = slang.replace(char, ' ')

# str.split() without argument cuts on any run of whitespace (newlines
# included), which yields every word of every expression, already stripped.
slang = slang.split()

# keep words of 2 to 19 characters made of letters only
slang = [w for w in slang if 2 <= len(w) < 20 and re.match(r'^[a-z]+$', w)]

ps = PorterStemmer()
# get the stems, remove duplicates; sorted so the output file is identical
# from one run to the next (a bare set has no stable order)
slang = sorted({ps.stem(word) for word in slang})

path = os.path.join(os.getcwd(), "stdfile.txt")
std = read_file(path)
std = std.split("\n")

std = [w.lower().strip() for w in std] #all lowercase, remove surrounding spaces
std = [ps.stem(word) for word in std] #get the stem

# set membership keeps the filtering linear instead of quadratic
std_stems = set(std)
slang = [w for w in slang if w not in std_stems]

path = os.path.join(os.getcwd(), "slang-filtered.txt")
with open(path, 'w') as file:
    for word in slang:
        file.write(word + '\n')
        

In [None]:
# Empty the "Albums" folder of one artist (set `art` to the artist's folder name).
# WARNING: destructive, the raw album files are deleted for good.
art = "Ed Sheeran"
path_to_songs = os.path.join(os.getcwd(), "Raw_Songs_DataBase")
path_to_artist = os.path.join(path_to_songs, art)
path_to_artist_songs, path_to_artist_albums = (
    os.path.join(path_to_artist, sub) for sub in ("Songs", "Albums")
)

clean_dir(path_to_artist_albums)
