In [53]:
import re
import string
import os
import shutil
from nltk.stem import PorterStemmer
import matplotlib.pyplot as plt


def clean_songs(song):
    """Tell whether a song's text contains only "usual" characters.

    A character is usual when it is an ASCII letter, an ASCII digit, a
    whitespace character, or one of the characters in ``string.punctuation``.

    Args:
        song: Text of the song to check.

    Returns:
        True when no unusual character is present (an empty string therefore
        passes), False when at least one unusual character is found.
    """
    allowed = r"a-zA-Z0-9\s" + re.escape(string.punctuation)
    has_unusual_char = re.search(r"[^" + allowed + r"]", song) is not None
    return not has_unusual_char
    
def read_file(file_path):
    """Read a whole text file, decoded as UTF-8.

    Args:
        file_path: Path of the file to read.

    Returns:
        The complete content of the file as a single string.
    """
    with open(file_path, mode='r', encoding="utf-8") as handle:
        return handle.read()


def clean_dir(path):
    """Delete everything inside a directory while keeping the directory itself.

    Real subdirectories are emptied recursively and then removed. Every other
    entry (regular file or symbolic link) is unlinked. A symbolic link is never
    followed, so the content of a directory that is merely linked from ``path``
    is left untouched.

    Args:
        path: Directory to empty.

    Raises:
        OSError: If ``path`` cannot be listed or an entry cannot be removed
            (for example ``FileNotFoundError`` when ``path`` does not exist).
    """
    # Take a snapshot first so entries are not deleted while the directory
    # is still being scanned.
    with os.scandir(path) as scan:
        entries = list(scan)

    for entry in entries:
        if entry.is_dir(follow_symlinks=False):
            clean_dir(entry.path)  # Recursively clean the real subdirectory
            os.rmdir(entry.path)  # Remove the now-empty directory
        else:
            # Files and links (including links that point at a directory) are
            # removed themselves; the link target is not modified.
            os.remove(entry.path)

In [54]:
# Repeat marker inside a bracketed comment: "<digits>x" or "x<digits>".
_REPEAT_MARKER = re.compile(r'\b(?:(\d+)x|x(\d+))\b')


def _repeat_count(comment):
    """Return the repeat count written in a bracketed comment, or None if there is none."""
    match = _REPEAT_MARKER.search(comment)
    if match is None:
        return None
    # Exactly one of the two groups is filled, depending on which form matched.
    return int(match.group(1) or match.group(2))


def expand_text(text):
    """Expand repeat markers such as ``[Chorus x2]`` or ``[3x]`` into repeated text.

    The text is processed paragraph by paragraph (paragraphs are separated by
    a blank line). Only paragraphs holding exactly one bracketed comment are
    considered, and only when that comment contains a repeat marker:

    * if the paragraph starts with the comment, the comment is removed and the
      whole paragraph is repeated;
    * otherwise the comment is removed from the first line containing ``[``
      and only that line is repeated.

    Every repetition is followed by a newline character. The full number is
    used as the repeat count, so ``[10x]`` repeats ten times and ``[x12]``
    twelve times.

    Args:
        text: Lyrics of one song.

    Returns:
        The lyrics with the repeat markers expanded; all other paragraphs are
        returned unchanged.
    """
    paragraphs = text.split("\n\n")

    for i, p in enumerate(paragraphs):
        comments = re.findall(r'\[([^\[\]]*)\]', p)
        if len(comments) != 1:  # only one comment per paragraph is supported
            continue

        num = _repeat_count(comments[0])
        if num is None:  # the comment carries no repeat marker
            continue

        if p[0] == "[":
            # The comment opens the paragraph: repeat the whole paragraph.
            body = re.sub(r'\[.*?\]', '', p)
            paragraphs[i] = (body + '\n') * num
        else:
            # The comment sits on one line: repeat only that line.
            p_lines = p.split("\n")
            index = next((j for j, line in enumerate(p_lines) if "[" in line), 0)
            line = re.sub(r'\[.*?\]', '', p_lines[index])
            p_lines[index] = (line + '\n') * num
            paragraphs[i] = "\n".join(p_lines)

    return "\n\n".join(paragraphs)

In [55]:
def featurings(artists_list, song_artist, text):
    """Tell whether a bracketed comment in the lyrics names one of the given artists.

    The text is split into paragraphs on blank lines. Only paragraphs holding
    exactly one bracketed comment are inspected; the comparison is a
    case-insensitive substring test.

    Args:
        artists_list: Artist names to look for.
        song_artist: Artist of the song. Not used by the check, so a comment
            naming the song's own artist also counts when that artist is in
            ``artists_list``. NOTE(review): confirm whether it should be
            excluded.
        text: Lyrics of one song.

    Returns:
        True as soon as one inspected comment contains one of the names,
        False otherwise.
    """
    for paragraph in text.split("\n\n"):
        comments = re.findall(r'\[([^\[\]]*)\]', paragraph)
        if len(comments) != 1:  # only one comment per paragraph is inspected
            continue
        comment = comments[0].lower()
        if any(name.lower() in comment for name in artists_list):
            return True
    return False


In [56]:
def remove_non_solo_instrumental(text):
    """Strip bracketed comments, except those mentioning a solo or an instrumental.

    A comment is a ``[...]`` span with no nested brackets. It is kept as is
    when it contains the whole word "solo" or "instrumental" (any letter
    case); every other comment is replaced by an empty string.

    Args:
        text: Lyrics of one song.

    Returns:
        The lyrics without the removed comments.
    """
    def keep_or_drop(match):
        comment = match.group()
        if re.search(r'\bsolo\b|\binstrumental\b', comment, flags=re.IGNORECASE):
            return comment
        return ''

    return re.sub(r'\[[^\[\]]*\]', keep_or_drop, text)

In [57]:
# Report the working directory: the data folders used in the other cells are
# built from os.getcwd(), so the notebook must be started from the right place.
current_dir = os.getcwd()
print("Current working directory:", current_dir)

Current working directory: C:\Users\Antonia\Documents\Ecole Polytechnique\sem4\ML project\Text-Shazam


In [58]:
# Root folder of the raw lyrics database. It has to be assigned before it is
# listed, otherwise this cell raises NameError on a fresh kernel.
path_to_songs = os.path.join(os.getcwd(), "Raw_Songs_DataBase2")

# One entry per artist folder; hidden entries (names starting with ".") are ignored.
artists = os.listdir(path_to_songs)
artists = [art for art in artists if art[0] != "."]

In [59]:
##TO BE RUN ONLY WHEN NEW SONGS
def create_Albums_Songs_Dirs():
    """Give every artist folder an "Albums" and a "Songs" sub-folder.

    Works on each name of the module-level ``artists`` list, inside
    "Raw_Songs_DataBase2" in the current working directory:

    * the "Songs" folder is created when it is missing;
    * when the "Albums" folder does not exist yet, it is created and every
      regular file found directly in the artist folder is moved into it.
      An artist whose "Albums" folder already exists gets no file moved.

    Meant to be run only after new raw song files have been added.
    """
    database_root = os.path.join(os.getcwd(), "Raw_Songs_DataBase2")

    for art in artists:
        path_to_artist = os.path.join(database_root, art)
        path_to_albums = os.path.join(path_to_artist, "Albums")
        path_to_artist_songs = os.path.join(path_to_artist, "Songs")

        # exist_ok avoids a FileExistsError when only "Songs" is already there.
        os.makedirs(path_to_artist_songs, exist_ok=True)

        if os.path.exists(path_to_albums):
            continue  # this artist has already been organised

        os.makedirs(path_to_albums)
        for filename in os.listdir(path_to_artist):
            source = os.path.join(path_to_artist, filename)
            if os.path.isfile(source):  # sub-folders stay where they are
                shutil.move(source, path_to_albums)


In [60]:
#create_Albums_Songs_Dirs()

In [52]:
def is_title_legal(title):
    """Tell whether a song title is free of the characters treated as illegal.

    The rejected characters are ``/ \\ : * ? " < > |``.
    NOTE(review): presumably because the title is used as a file name — confirm.

    Args:
        title: Title to check.

    Returns:
        True when none of the rejected characters occurs in ``title`` (an empty
        title therefore passes), False otherwise.
    """
    illegal_characters = ('/', '\\', ':', '*', '?', '"', '<', '>', '|')
    return not any(char in title for char in illegal_characters)

In [63]:

# Rebuild every artist's "Songs" folder from the album files stored in "Albums".
print(f'Number of artists : {len(artists)}')

total_songs = 0

for art in artists:

    path_to_artist = os.path.join(path_to_songs, art)
    path_to_artist_songs = os.path.join(path_to_artist, "Songs")
    path_to_artist_albums = os.path.join(path_to_artist, "Albums")

    # Start from an empty "Songs" folder so files of a previous run do not linger.
    clean_dir(path_to_artist_songs)

    albums = os.listdir(path_to_artist_albums)
    for alb in albums:
        # Hidden files and readme files are not albums.
        if alb[0] == "." or "readme" in alb.lower():
            continue

        path = os.path.join(path_to_artist_albums, alb)
        album = read_file(path)
        # Let a file that starts with "//" split like one starting with "\n//".
        if album.startswith("/"):
            album = '\n' + album
        # "//" at the start or at the end of a line delimits the segments.
        album = re.split(r'\n//|//\n', album)
        album = [song for song in album if len(song) != 0]
        if not album:  # empty album file: nothing to extract
            continue
        if "album" in album[0].lower():
            album.pop(0)  # drop the leading album header segment

        # The remaining segments alternate: title, lyrics, title, lyrics, ...
        # Stopping at len - 1 ignores a trailing title that has no lyrics.
        for i in range(0, len(album) - 1, 2):
            title = (album[i].replace("/", "-")).strip()
            if len(title) <= 1:  # empty or one-character titles are skipped
                continue
            song = album[i+1]
            title = title.replace('À', 'A') #to make sure to not have two times the same pattern
            title = title.replace('à', 'a')
            if title[-1] == "?":
                title = title[:-1]
            if not is_title_legal(title):
                continue

            song = expand_text(song)
            if featurings(artists, art, song):
                continue  # songs whose comments name an artist of the list are discarded

            song = remove_non_solo_instrumental(song)
            if clean_songs(song):
                filepath = os.path.join(path_to_artist_songs, title)
                # The context manager closes the file even if the write fails.
                with open(filepath, 'w', encoding="utf-8") as file:
                    file.write(song)

    songs = os.listdir(path_to_artist_songs)
    print(f'{art} has {len(songs)} song')
    total_songs += len(songs)

print(f'\nTotal number of songs : {total_songs}')
                 

Number of artists : 232
ABBA has 113 song
Adele has 69 song
Al Green has 201 song
Amii Stewart has 61 song
Andrew Bird has 161 song
Arcade Fire has 98 song
Arch Enemy has 124 song
Arctic Monkeys has 126 song
Aretha Franklin has 198 song
Ariana Grande has 144 song
ASAP Rocky has 79 song
Ashley McBryde has 58 song
avenged sevenfold has 90 song
Average White Band has 116 song
Avicii has 63 song
B.B. King has 201 song
Baroness has 57 song
Barry White has 143 song
Beabadoobee has 65 song
Beach House has 101 song
Bee Gees has 199 song
Behemoth has 135 song
Between the Buried and Me has 78 song
Big Thief has 80 song
Billie Eilish has 60 song
Billie Holiday has 193 song
Bob Marley has 115 song
Bon Iver has 49 song
Bonnie Raitt has 200 song
Brandi Carlile has 103 song
BROCKHAMPTON has 114 song
Brothers Johnson has 32 song
Bruno Mars has 75 song
Built to Spill has 115 song
Calvin Harris has 80 song
Cameo has 64 song
Camila Cabello has 74 song
Cannibal Corpse has 173 song
Car Seat Headrest has 14

In [71]:
# Recount the stored songs of every artist (hidden entries are ignored).
artists = sorted(art for art in os.listdir(path_to_songs) if art[0] != ".")

songs_per_artist = dict.fromkeys(artists, 0)

for art in artists:
    path_to_artist = os.path.join(path_to_songs, art)
    path_to_artist_songs = os.path.join(path_to_artist, "Songs")

    songs = os.listdir(path_to_artist_songs)
    for song in songs:
        if not song.startswith("."):
            songs_per_artist[art] = songs_per_artist[art] + 1

keys = list(songs_per_artist)
# NOTE(review): every count is halved here; the reason is not visible in this
# notebook — confirm before relying on these values.
values = [count / 2 for count in songs_per_artist.values()]

In [99]:
# One-off cleanup: empty both the "Songs" and the "Albums" folder of a single
# artist of "Raw_Songs_DataBase". Note that this rebinds path_to_songs.
path_to_songs = os.path.join(os.getcwd(), "Raw_Songs_DataBase")
path_to_artist = os.path.join(path_to_songs, "Kendrick Lamar")
path_to_artist_songs, path_to_artist_albums = (
    os.path.join(path_to_artist, sub_folder) for sub_folder in ("Songs", "Albums")
)

for folder in (path_to_artist_songs, path_to_artist_albums):
    clean_dir(folder)
    

In [55]:
# Folder named "Dictionaries" inside the current working directory; the
# word-list files of the following cells are read from and written to it.
path_to_dict = os.path.join(os.getcwd(), "Dictionaries")

In [56]:
# Build "rare-filtered.txt": the stems of the purely alphabetic words of "rare.txt".
path = os.path.join(path_to_dict, "rare.txt")
ps = PorterStemmer()

rare = [
    ps.stem(w)
    for w in (line.lower() for line in read_file(path).split("\n"))
    if re.match(r'^[a-z]+$', w)  # keep only words made of the letters a-z
]

# One stem per line, in the order of the source file (duplicates are kept).
path = os.path.join(path_to_dict, "rare-filtered.txt")
with open(path, 'w') as file:
    file.writelines(word + '\n' for word in rare)
        

In [65]:
# Build "slang-filtered.txt": the stems of the words of "slang.txt" that are
# not stems of a word of "stdfile.txt".
path = os.path.join(path_to_dict, "slang.txt")
slang = read_file(path).lower()

# Punctuation that is turned into a separator before the words are split.
to_replace = ['"', '*', '#', '(', ')', '[', ']', ';', "'", '_', ".", '”', "?", "<", ">", "~", "!"]

for char in to_replace:
    slang = slang.replace(char, ' ')

ps = PorterStemmer()

# split() cuts on any whitespace, newlines included, so every word of every
# expression is obtained in one pass. Only words of 2 to 19 letters a-z are
# kept; the set removes duplicate stems.
slang = {
    ps.stem(w)
    for w in slang.split()
    if 2 <= len(w) < 20 and re.match(r'^[a-z]+$', w)
}

path = os.path.join(path_to_dict, "stdfile.txt")
std = read_file(path).split("\n")
std = [ps.stem(w.lower().strip()) for w in std] #all lowercase, no surrounding spaces, stemmed

# A set gives constant-time lookups instead of scanning the whole list for
# every slang stem; sorting makes the output file identical from run to run.
std_stems = set(std)
slang = sorted(w for w in slang if w not in std_stems)

path = os.path.join(path_to_dict, "slang-filtered.txt")
with open(path, 'w') as file:
    for word in slang:
        file.write(word + '\n')
        

In [48]:
#to clean an artist
def clean_artist(art):
    """Empty the "Albums" folder of one artist of "Raw_Songs_DataBase".

    Args:
        art: Name of the artist's folder inside "Raw_Songs_DataBase", which is
            looked up in the current working directory.

    NOTE(review): ``path_to_artist_songs`` is computed but not used in the
    lines visible here, so the "Songs" folder is left untouched — confirm
    whether that is intended.
    """
    path_to_songs = os.path.join(os.getcwd(), "Raw_Songs_DataBase")
    path_to_artist = os.path.join(path_to_songs, art)
    path_to_artist_songs = os.path.join(path_to_artist, "Songs")
    path_to_artist_albums = os.path.join(path_to_artist, "Albums")

    # Removes every entry of the artist's "Albums" folder; the folder itself is kept.
    clean_dir(path_to_artist_albums)
