In [None]:
# Root folder of the music library to analyse; the only setting you need to edit.
# The name is misspelled ("misuc"), but every later cell refers to it as written.
# Generated files (Full_Data.json, CSV reports) are written under this folder;
# with the empty default their paths start with "/".
misuc_lib_path = ""

In [None]:
import json
import math
import os
import subprocess
import sys
from datetime import datetime
from itertools import combinations

import matplotlib.pyplot as plt
import pandas as pd
import requests as req
import sklearn as sk # sklearn.linear_model.LogisticRegression
import sklearn.linear_model  # `import sklearn` alone does not load submodules; this makes sk.linear_model resolvable
from pyquery import PyQuery
from scipy.io import wavfile

### Note: one step is not pure Python. MP3-to-WAV conversion is done by the Linux command-line tool "mpg321", called from the function convert_mp3_to_wav. Everything else is Python.

In [None]:
# General functions and variables
# Progress checkpoints: 1x and 5x of every power of ten, from 10 up to 50,000,000,000.
gradual_steps = [
    multiplier * 10 ** exponent
    for exponent in range(1, 11)
    for multiplier in (1, 5)
]

def large_int_print(num):
    """Return `num` as text with a comma between every group of three digits.

    Args:
        num: an integer (anything whose str() is an optionally signed digit string).

    Returns:
        str: e.g. 1234567 -> "1,234,567", -1234 -> "-1,234".
    """
    text = str(num)
    # Keep the sign out of the grouping; it used to be counted as a digit,
    # which produced output such as "-,123".
    sign = "-" if text.startswith("-") else ""
    digits = text[len(sign):]
    groups = []
    while digits:
        groups.append(digits[-3:])
        digits = digits[:-3]
    return sign + ",".join(reversed(groups))

def get_time_left(start_time, total_items, done_items):
    """Estimate the remaining time from the average processing rate so far.

    Args:
        start_time: datetime at which processing started.
        total_items: number of items that will be processed in total.
        done_items: number of items processed so far.

    Returns:
        The formatted estimate produced by print_time(), or "?" when less than
        one second has elapsed or nothing has been processed yet.
    """
    # total_seconds() includes whole days; timedelta.seconds wraps around
    # every 24 hours and made long runs look as if they had just started.
    elapsed = (datetime.now() - start_time).total_seconds()
    if elapsed < 1:
        return "?"
    rate_ps = done_items / elapsed
    if rate_ps == 0:
        return "?"
    return print_time(sec_num=round((total_items - done_items) / rate_ps, 2))

def print_time(sec_num=None, time_dif=None):
    """Format a duration as "Ns", "NmNs" or "NhNmNs".

    Args:
        sec_num: duration in seconds (int or float). Takes precedence when given.
        time_dif: timedelta used when `sec_num` is None.

    Returns:
        str with the formatted duration, or None (after printing a notice)
        when neither argument is provided.
    """
    # Compare against None so that a legitimate duration of 0 is not
    # mistaken for "no value given".
    if sec_num is None:
        if time_dif is None:
            print("No values provided for time prointing.")
            return None
        # total_seconds() keeps whole days; .seconds alone drops them.
        sec_num = time_dif.total_seconds()
    # Round once up front so that parts can never show "60s" or "60m".
    total = int(round(sec_num))
    if total < 60:
        return str(total) + "s"
    minutes, seconds = divmod(total, 60)
    if minutes < 60:
        return str(minutes) + "m" + str(seconds) + "s"
    hours, minutes = divmod(minutes, 60)
    return str(hours) + "h" + str(minutes) + "m" + str(seconds) + "s"

def print_dict(d, sample):
    """Print at most `sample` entries of `d`, one "key : value" pair per line.

    Args:
        d: mapping to preview.
        sample: maximum number of entries to print.
    """
    for i, (k, v) in enumerate(d.items()):
        # Check the limit before printing: checking afterwards emitted
        # sample + 1 entries (and one entry even for sample=0).
        if i >= sample:
            break
        print(k, ":", v)
            
def get_pretty_volume(bytes_num):
    """Render a byte count as a short human-readable string.

    The unit is picked from the number of characters in str(bytes_num)
    divided by three; the value is then divided by the matching power of
    1024 and rounded to two decimals. Small values are returned as
    "<n> bytes".
    """
    magnitude = len(str(bytes_num)) / 3
    units = ((5, " pb"), (4, " tb"), (3, " gb"), (2, " mb"), (1, " kb"))
    for power, suffix in units:
        if magnitude > power:
            return str(round(bytes_num / (1024 ** power), 2)) + suffix
    return str(bytes_num) + " bytes"
    
def remove_files(lib_path, ext_filter, limit_num):
    """Delete files with the given extension found under `lib_path`.

    Args:
        lib_path: folder that is searched recursively (via get_all_files).
        ext_filter: file extension to match, e.g. "wav".
        limit_num: maximum number of candidates to handle; a falsy value
            means "all of them".

    Progress and the amount of freed space are reported through log().
    """
    log("Removing",ext_filter,"files from",lib_path)
    files = get_all_files(lib_path, ext_filter=ext_filter)
    if limit_num:
        # Never announce more removals than there are candidates.
        to_remove_num = max(0, min(limit_num, len(files)))
        log("Found",len(files),"will remove",to_remove_num,"of them.")
    else:
        to_remove_num = len(files)
        log("Found",len(files),"will remove all of them.")
    freed_space = 0
    removed_num = 0  # files actually deleted, as opposed to candidates visited
    for file in files[:to_remove_num]:
        if os.path.isfile(file):
            freed_space += os.path.getsize(file)
            os.remove(file)
            removed_num += 1
            if removed_num in (10, 50) or removed_num % 100 == 0:
                log("Removed", removed_num, "files. Freed up", get_pretty_volume(freed_space))
        else:
            log(file, "is not a file.")
    # Report the real number of deletions; the planned number was reported
    # before even when some candidates were skipped.
    log("Finished cleaning library.\n\tRemoved", removed_num,
        "files. Freed up", get_pretty_volume(freed_space))

In [None]:
def get_all_files(path, ext_filter="MP3"):
    """Recursively list files under `path` whose extension matches `ext_filter`.

    The extension is whatever follows the last "." of the joined path and is
    compared case-insensitively. Returned paths are built as
    "<folder>/<file name>" in os.walk order.
    """
    return [
        folder + "/" + file_name
        for folder, _subfolders, file_names in os.walk(path)
        for file_name in file_names
        if (folder + "/" + file_name).split(".")[-1].upper() == ext_filter.upper()
    ]

def convert_mp3_to_wav(filename_mp3, filename_wav=None):
    """Convert an mp3 file to wav with the external `mpg321` tool.

    Args:
        filename_mp3: path of the source file; a bare file name (no "/") is
            resolved against the current working directory.
        filename_wav: destination path. When omitted, the wav is written
            next to the mp3 with the extension replaced by ".wav".

    Returns:
        str: path of the wav file that mpg321 was asked to write.
    """
    if "/" not in filename_mp3:
        # If filename doesn't contain path, use current directory
        filename_mp3 = os.getcwd() + "/" + filename_mp3
    # Honour the caller's destination; it used to be overwritten unconditionally.
    if filename_wav is None:
        # splitext only strips a real extension, so a dot in a folder name
        # no longer truncates the path.
        filename_wav = os.path.splitext(filename_mp3)[0] + ".wav"
    subprocess.run(["mpg321", "-w", filename_wav, filename_mp3])
    return filename_wav

def get_wav_data_old(filepath, bucket_width=100):
    """
    Earlier aggregation of a wav file into a normalised amplitude histogram.

    For every sample row the maximum across columns is taken and divided by
    `bucket_width` (truncated to int) to get a bucket. The most populated
    bucket is dropped and the remaining bucket sizes are divided by the
    largest remaining size.
    """
    _sample_rate, samples = wavfile.read(filepath)
    frame = pd.DataFrame(samples)
    frame['max'] = frame.max(axis=1)
    frame['buckets'] = pd.Series(frame['max'] / bucket_width).astype(int)
    counts = frame.groupby('buckets').size()
    busiest_bucket = counts.idxmax()
    counts = counts.drop(busiest_bucket)
    return counts / max(counts)

In [None]:
def get_full_wav_data(filepath, **kwargs):
    """Read a wav file into a DataFrame and add a per-row 'max' column.

    Extra keyword arguments are accepted and ignored. Any failure is
    reported with the file path and then re-raised.
    """
    try:
        _sample_rate, samples = wavfile.read(filepath)
        frame = pd.DataFrame(samples)
        frame['max'] = frame.max(axis=1)
        return frame
    except Exception:
        print("Failed to read",filepath)
        raise

def get_experimental_wav_data(filepath, buckets_num=100):
    """Aggregate a wav file into a normalised histogram over a fixed number of buckets.

    For every sample row the maximum across columns is shifted by 32768 and
    scaled so that the signed 16-bit range maps onto buckets 0..buckets_num.
    The most populated bucket is dropped and the remaining counts are divided
    by the largest remaining count.

    Args:
        filepath: path of the wav file.
        buckets_num: number of buckets the 16-bit range is divided into.

    Returns:
        pandas.Series named 'max', indexed by bucket number.

    Raises:
        Re-raises any exception after printing the offending file path.
    """
    try:
        fs, data = wavfile.read(filepath)
        df_data = pd.DataFrame(data)
        # Widen before shifting: 32768 does not fit into int16, so adding it to
        # 16-bit samples either wraps around or raises, depending on the NumPy version.
        df_data['max'] = df_data.max(axis=1).astype('float64')
        if df_data['max'].max() > 32767 or df_data['max'].min() < -32768:
            print("!!! Something wrong with scale !!!")
        df_data['bucket'] = ((df_data['max'] + 32768) / ((32768 + 32767) / buckets_num)).astype('int')
        bucket_sizes = df_data.groupby('bucket')['max'].count()
        # Drop the dominant bucket so that it does not flatten everything else.
        bucket_sizes = bucket_sizes.drop(bucket_sizes.idxmax())
        bucket_sizes = bucket_sizes / bucket_sizes.max()
    except Exception:
        print("Failed to read",filepath)
        raise
    return pd.Series(bucket_sizes)

def get_wav_data(filepath, bucket_width=100):
    """ 
    Data of wav file is very much narrowed down to what I want to analyse. 
    So if you want to analyse in your own way, this is the function to change.

    For every sample row the maximum across columns is divided by
    `bucket_width` (truncated to int) to get a bucket. The most populated
    bucket is dropped and the remaining counts are divided by the largest
    remaining count.

    Returns:
        pandas.Series of normalised counts indexed by bucket number.

    Raises:
        Re-raises any exception after printing the offending file path.
    """
    try:
        fs, data = wavfile.read(filepath)
        df_data = pd.DataFrame(data)
        df_data['max'] = df_data.max(axis=1)
        df_data['buckets'] = (df_data['max']/bucket_width).astype('int')
        bucket_sizes = df_data.groupby('buckets')['max'].count()
        bucket_sizes = bucket_sizes.drop(bucket_sizes.idxmax())
    except Exception:
        print("Failed to read",filepath)
        raise
    # Normalise the counts themselves. The previous expression divided the
    # bucket labels (the index) by the largest count and discarded the counts.
    return bucket_sizes / bucket_sizes.max()

In [None]:
def log(*args):
    """Print the current timestamp followed by all arguments joined with single spaces."""
    stamp = datetime.now()
    message = " ".join(map(str, args))
    print(stamp, message)
    
def process_library(
        lib_path, 
        proc_func=None, 
        max_files=None, 
        partial_data=None,
        number_of_buckets=100,
        keep_wav=True):
    """
    For all mp3 files in the lib:
        - Convert to wav, if wav doesn't exists
        - Get aggregated data from wav file

    Args:
        lib_path: root folder that is scanned recursively for mp3 files.
        proc_func: callable(wav_path, buckets_num=...) returning the data
            stored as one row per song. Required.
        max_files: stop after this many files were processed in this call;
            a falsy value means no limit.
        partial_data: DataFrame from an earlier run, indexed by mp3 path.
            Files already present in its index are skipped. The frame is
            extended in place.
        number_of_buckets: bucket count passed on to `proc_func`; the result
            frame gets number_of_buckets + 1 columns.
        keep_wav: when False the wav file is deleted after processing.

    Returns:
        (skipped, lib_data): list of mp3 paths that failed, and the DataFrame
        with one row per processed song.

    Raises:
        ValueError: if `proc_func` is not provided.
    """
    if proc_func is None:
        raise ValueError("proc_func is required, e.g. get_experimental_wav_data")
    log("Analyzing library...")
    lib_data = pd.DataFrame(columns=list(range(number_of_buckets+1)))
    # partial_data defaults to None, so check for it before touching .empty.
    if partial_data is not None and not partial_data.empty:
        lib_data = partial_data
        log("Recieved data of",len(lib_data),"processed files.")
    process_counter = 0
    convert_counter = 0
    skipped = []
    log("Begining library scan...")
    files_to_process = get_all_files(lib_path, ext_filter="MP3")
    log("Library is scanned. Found", len(files_to_process), "mp3 files.")
    # Work out what is really left to do. The estimate used to read an
    # unrelated notebook global and subtracted in the wrong direction.
    already_processed = set(lib_data.index)
    pending_files = [f for f in files_to_process if f not in already_processed]
    if max_files:
        total_to_proc = min(max_files, len(pending_files))
    else:
        total_to_proc = len(pending_files)
    log("Files to process:",total_to_proc)
    log("Begin library processing...")
    start_time = datetime.now()
    for mp3_file in pending_files:
        wav_file = None  # keeps the error report below safe if the path cannot be built
        try:
            wav_file = (".".join(mp3_file.split(".")[0:-1]) + ".wav")
            if not os.path.isfile(wav_file):
                # I'm saving wav next to mp3, you can change it here.
                wav_file = convert_mp3_to_wav(mp3_file, wav_file)
                convert_counter+=1
            # Pass the configured bucket count instead of a hard-coded 100.
            wav_data = proc_func(wav_file, buckets_num=number_of_buckets)
            # ? Save the thing into sepate json next to wav and mp3 for each song? 
            if not keep_wav:
                os.remove(wav_file)
            lib_data.loc[mp3_file] = wav_data
            process_counter+=1
            if (process_counter in gradual_steps or process_counter%1000==0):
                log("Processed", process_counter,
                    "files, of them converted", convert_counter, "\n\tTime left:", 
                    get_time_left(start_time, total_to_proc, process_counter),
                    " | Dataframe volume:", 
                    get_pretty_volume(sys.getsizeof(lib_data)))
        except Exception as e:
            # Best effort: report the file, remember it as skipped and carry on.
            print("Couldn't process:\n", mp3_file, "\n", wav_file, str(e))
            skipped.append(mp3_file)
        if max_files: 
            if process_counter>=max_files: 
                break
    log("Finished!\n\tSongs processed:", len(lib_data), "of", len(files_to_process),
        "\n\tDataframe valume:", get_pretty_volume(sys.getsizeof(lib_data)),
        "\n\tSkipped:", len(skipped), "files",
        "\n\tTime spent:", print_time(time_dif=datetime.now()-start_time)
        )
    return skipped, lib_data

In [None]:
# JSON file used to carry processed song data over between runs.
full_data_dump_file_path = misuc_lib_path + "/Full_Data.json" # Your data dump file will be here
# Number of prepare_data() calls so far; prepare_data() increments and prints it.
proc_round = 0

def prepare_data(lib_path, dump_path, file_limit):
    """Run one processing round: load the dump, process more files, save the dump.

    Args:
        lib_path: root folder of the music library.
        dump_path: JSON file read at the start (if it exists) and rewritten
            at the end of the round.
        file_limit: maximum number of files to process in this round.

    Returns:
        DataFrame with one row per processed song, indexed by mp3 path.

    Side effects:
        Increments the module-level `proc_round` counter and writes `dump_path`.
    """
    global proc_round
    proc_round+=1
    print("Round",proc_round)
    
    # Load if json exists
    all_data = pd.DataFrame() # One row per song, indexed by the full mp3 path
    if os.path.isfile(dump_path):
        all_data = pd.read_json(dump_path)

    # Keeping whole file paths to be able to manipulate files later.
    # Use the lib_path argument; the notebook-level variable was used here
    # before, which silently ignored whatever the caller passed in.
    (skipped_files, all_data) = process_library(
        lib_path,
        proc_func=get_experimental_wav_data,
        max_files=file_limit,
        partial_data=all_data,
        keep_wav=True)

    # Save results into json
    all_data.to_json(dump_path)
    print("Data is dumped to",dump_path)
    print("\n\n")
    return all_data

"! Warning ! WAV files takes about 10 times more space then MP3! Beware of free space on your disc."
# Number of prepare_data() rounds to run; every round ends with a JSON dump.
cycles_to_run = 1
# Passed to prepare_data() as file_limit: the per-round cap on processed files.
nuber_of_files_per_cycle = 6

# `all_data_long` is overwritten each round and holds the result of the last one.
for i in range(cycles_to_run): # Iterate to have regular dumps, think this way is more reliable
    all_data_long = prepare_data(
        misuc_lib_path, 
        full_data_dump_file_path, 
        nuber_of_files_per_cycle)
log("Finished processing.")
    
# Clean names (shorten for convinience and memory optimization), saves as a separate dataframe
# Keeping 2 data structures for experimentation convinience, feel free to use only one and remove another!
#all_data_clean_dict = {k[len(misuc_lib_path):]:v for k, v in all_data.items()}
#all_data_df = pd.DataFrame([(k[len(misuc_lib_path):], v) for k, v in all_data.items()])
#print("Lib path is removed from file names, dictionary is converted to DataFrame")

In [None]:
def compare_all_songs(buckets_df, sample_size=None):
    """Compute the distance between every unordered pair of songs.

    The distance of a pair is the sum of absolute differences between the two
    songs' rows of `buckets_df`.

    Args:
        buckets_df: DataFrame with one row per song (index = song name).
        sample_size: only the first `sample_size` songs are compared;
            None compares all of them.

    Returns:
        DataFrame with columns "Song1", "Song2" and a numeric "Distance",
        one row per pair.
    """
    log("Begining comparison...")
    song_names = list(buckets_df.index)[:sample_size]
    pairs = list(combinations(song_names, 2))
    total_to_proc = len(song_names)
    log("Of",total_to_proc,"of songs, number of unique pairs:",
        large_int_print(len(pairs)))
    start_time = datetime.now()
    # Read from the frame that was passed in (a notebook global was used before)
    # and look every song up once instead of twice per pair.
    song_rows = {name: buckets_df.loc[name] for name in song_names}
    # Collect distances in a list: writing through `frame.iloc[i]["Distance"]`
    # is chained assignment and may modify a temporary copy only.
    distances = []
    for index, (song1_name, song2_name) in enumerate(pairs):
        if index+1 in gradual_steps[4:] or (index+1)%1000000==0: 
            log("Processed", large_int_print(index+1), 
                "rows. Approximate time left:",
                get_time_left(start_time, len(pairs), index+1))
        if song1_name == song2_name:
            print("Comparing to itself :(")
        distances.append(abs(song_rows[song1_name] - song_rows[song2_name]).sum())
    all_combinations = pd.DataFrame(pairs, columns=["Song1", "Song2"])
    all_combinations["Distance"] = pd.Series(distances, dtype="float64")
    log("Completed comparison of",total_to_proc,"songs.",
       "Time spent:", print_time(time_dif=datetime.now()-start_time))
    return all_combinations

#comb = compare_all_songs(all_data_long, sample_size=10)


In [None]:
# Pairwise distances for at most the first 1000 songs of all_data_long.
# The cells below assign `comb` again with smaller samples.
comb = compare_all_songs(all_data_long, sample_size=1000)

In [None]:
# Recompute on a 10-song sample; this replaces any `comb` assigned earlier.
comb = compare_all_songs(all_data_long, sample_size=10)
# "Song1" column of the pair table, kept in `s` for inspection.
s = comb["Song1"]
#comb["Song1"] = comb["Song1"][len("/media/shad/DATA/MuzAnalysis/"):]
#comb.iloc[1]

In [None]:
# Compare the first 100 songs, then aggregate the distances per pair of
# top-level library folders.
comb = compare_all_songs(all_data_long, sample_size=100)
for i in range(1,3):
    song_column = "Song" + str(i)
    # Strip the library prefix and keep only the first path component
    # (presumably the artist/band folder - confirm against the library layout).
    # `.str.split("/", expand=True)` returns one column per path component,
    # which cannot be assigned to a single column.
    comb[song_column] = (
        comb[song_column]
        .str.slice(start=len(misuc_lib_path))
        .str.lstrip("/")
        .str.split("/")
        .str[0])

pair_distances = comb.groupby(["Song1","Song2"])["Distance"]
gr = pd.DataFrame(pair_distances.mean())
gr["Number of pairs"] = pair_distances.count()
gr

In [None]:
# Save the grouped comparison table as CSV inside the library folder.
gr.to_csv(misuc_lib_path + "/comaprison_result.csv")

In [None]:
# Save the pair-level comparison frame as JSON inside the library folder.
comb.to_json(misuc_lib_path + "/comaprison_data.json")

In [None]:
# Logistic-regression model with its settings spelled out.
# The original cell could not run: the string arguments used typographic
# quotes (a SyntaxError), and 'warn' is not an accepted value for `solver` /
# `multi_class` in current scikit-learn. 'lbfgs' is the documented default
# solver; `multi_class` and `l1_ratio` are left at their defaults.
# Relies on `import sklearn.linear_model` in the imports cell.
logreg = sk.linear_model.LogisticRegression(
    penalty='l2',
    dual=False,
    tol=0.0001,
    C=1.0,
    fit_intercept=True,
    intercept_scaling=1,
    class_weight=None,
    random_state=None,
    solver='lbfgs',
    max_iter=100,
    verbose=0,
    warm_start=False,
    n_jobs=None,
    )

# Experiments start here
### Everything that follows is in a very raw state

In [None]:
chart_sample = 3 # Limit sample of songs to build charts
# No cell defines `all_data` at notebook level; the processed songs are in
# `all_data_long` (one row per song), so chart its first rows.
for song_path, bucket_profile in all_data_long.head(chart_sample).iterrows():
    print(song_path)
    plt.plot(bucket_profile.astype(float))
    plt.show()

In [None]:
def make_combinations(data_dict):
    """Return every unordered pair of keys of `data_dict` as a list of 2-tuples.

    Also prints how many pairs there are.
    """
    key_pairs = [pair for pair in combinations(list(data_dict.keys()), 2)]
    print("Number of possible comparisons:", len(key_pairs))
    return key_pairs

#comb = make_combinations(all_data)

In [None]:
import pprint
# Check out all combination pairs
# NOTE(review): the make_combinations() call above is commented out, so `comb`
# is whatever an earlier cell assigned - presumably the compare_all_songs() frame; confirm.
pp = pprint.PrettyPrinter(indent=4)
pp.pprint(comb)

In [None]:
def compare_songs(prodj, data_dict):
    """Compute the distance for each listed pair of songs.

    The distance of a pair is the sum of absolute differences of the two
    series after an outer join on their index, with gaps filled with 0.
    Pairs with a distance of exactly 0 are left out of the result.

    Args:
        prodj: sequence of (song1_name, song2_name) pairs.
        data_dict: mapping of song name to pandas Series.

    Returns:
        DataFrame with columns 'song1', 'song2' and a float 'dif'. If a pair
        fails, the error is printed and the pairs handled so far are returned.
    """
    print("Pairs to compare: " + str(len(prodj)))
    progress_marks = {10, 50, 100, 500, 1000, 10000, 50000}
    # Rows are collected in a list and turned into a frame once at the end;
    # appending to a DataFrame row by row gets slower with every row.
    records = []

    comparing_counter = 0
    for song1_name, song2_name in prodj:
        try:
            if song1_name == song2_name:
                print("Comparing to itself :(")

            result = pd.concat([
                data_dict[song1_name],
                data_dict[song2_name]
                ],
                keys=[0,1],
                axis=1, join='outer').fillna(0)
            dif = float(abs(result[0] - result[1]).astype('float').sum())
            if dif != 0:
                records.append((song1_name, song2_name, dif))
        except Exception as e:
            # Name both songs of the failing pair (song1 was printed twice before).
            print("Issue with", song1_name, "or", song2_name)
            print("Error:", str(e))
            break
        comparing_counter+=1
        if comparing_counter%100000==0 or comparing_counter in progress_marks:
            print(str(datetime.now()) + " Compared " + str(comparing_counter) + " songs.")

    print(str(datetime.now()) + " Compared " + str(comparing_counter) + " pairs of songs.")
    print("Finished!")
    comparing_data = pd.DataFrame(records, columns=['song1','song2','dif'])
    comparing_data["dif"] = comparing_data["dif"].astype('float64')
    return comparing_data

# Should rewrite to use only one data structure
# compare_songs() expects a {song path: Series} mapping. No cell defines the
# `all_data` dict that was read here, so the mapping is built from the rows of
# `all_data_long` (one row per song) instead.
sample_size = None  # set to an int to compare only the first N songs
songs_frame = all_data_long.head(sample_size) if sample_size else all_data_long
data_sample = {
    song_path: bucket_profile.astype(float)
    for song_path, bucket_profile in songs_frame.iterrows()
}

comp = compare_songs(make_combinations(data_sample), data_sample)
print("\nThe lower value, the closer style of songs.")
#print(comp[0:3])

In [None]:
# This slice is not the last expression of the cell, so nothing is displayed for it.
comp[0:3]
#dif_bands = comp.filter(comp['song1'].split("/")[0] != comp['song1'].split("/")[1]) 
# Sort by 'dif', keep the first 100 rows and save them as CSV inside the library folder.
comp.sort_values('dif').head(100).to_csv(path_or_buf=misuc_lib_path + "/top100.csv")

In [None]:
import networkx as nx
# Draw the compared songs as a graph: one node per song, one edge per compared
# pair, with the distance stored in the edge attribute "dif".
# `comparing_data` exists only inside compare_songs(); its result is `comp`.
# That frame is an edge list (song1, song2, dif), not an adjacency matrix,
# so it is loaded with from_pandas_edgelist.
G = nx.from_pandas_edgelist(comp, source="song1", target="song2", edge_attr="dif")
nx.draw(G)

## Product join of all elements

In [None]:
from pprint import PrettyPrinter
pp = PrettyPrinter(indent=4)


# Histogram of pair distances in steps of 10.
cut_path = len(misuc_lib_path)
# `comparing_data` exists only inside compare_songs(); its result is `comp`.
# Iterating a DataFrame directly yields column labels, so the rows are read
# as plain (song1, song2, dif) tuples instead.
cd = [(song1[cut_path:], song2[cut_path:], dif)
      for song1, song2, dif in comp.itertuples(index=False, name=None)]
sizes = [c[2] for c in cd]
size_buckets = {}
for s in sizes:
    ns = int(s/10)
    size_buckets[ns] = size_buckets.get(ns, 0) + 1

pp.pprint(size_buckets)

In [None]:
# NOTE(review): `top_simil` is not assigned in any visible cell, so this cell
# raises NameError on a fresh kernel - confirm which frame was meant.
# The len() call is not the last expression of the cell, so it displays nothing.
len(top_simil)
# Write the frame, including its index, to Comparing.csv inside the library folder.
top_simil.to_csv(index=True, path_or_buf=(misuc_lib_path + "/Comparing.csv"))

# Web parser

In [None]:

# Download one example artist page from the lyrics wiki.
# NOTE(review): no timeout argument is passed to the request - consider adding one.
r = req.get('https://lyrics.fandom.com/wiki/3_Doors_Down')


In [None]:
# Dump the raw response body of the last request.
print(r.content)

In [None]:
# Getting genres
# NOTE(review): `bands` is not defined in any visible cell - confirm where it comes from.
b1 = bands[0]
# Wiki page URL: the band name with spaces replaced by underscores.
url = "https://lyrics.fandom.com/wiki/" + "_".join(b1.split(" "))
r = req.get(url)
print(r.content)

In [None]:
# Parsing out genres: load the downloaded page into PyQuery and print the parsed document.
pq = PyQuery(r.content)
print(pq)

</div></div><div class="css-table-cell">
<p class="highlight"><b>Genres:</b></p><div>
<ul><li><a href="/wiki/Category:Genre/Alternative_Rock" title="Category:Genre/Alternative Rock">Alternative Rock</a>
</li><li><a href="/wiki/Category:Genre/Pop_Rock" title="Category:Genre/Pop Rock">Pop Rock</a>
</li><li><a href="/wiki/Category:Genre/Post-Grunge" title="Category:Genre/Post-Grunge">Post-Grunge</a>
</li></ul>
</div>