In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

# Notes
### Import the notes file

In [None]:
# The file is read as header-less: column names are supplied explicitly and
# fields are separated by ';'.
df = pd.read_csv(
    "./notes_total.csv",
    sep=";",
    header=None,
    names=["note", "count", "title"],
)
# Show the rows ordered from highest to lowest count (the sorted copy is only
# displayed, not stored).
df.sort_values(by="count", ascending=False)

### Create the plots

In [None]:
# Total count per note name, largest first. Selecting "count" before summing
# keeps the string "title" column out of the aggregation (pandas >= 2.0 would
# otherwise concatenate every title per group).
grouped = (
    df.groupby("note")["count"]
    .sum()
    .sort_values(ascending=False)
    .reset_index()
)

# Boolean masks over `grouped`, one per family of note names.
natural = grouped["note"].isin(['A', 'B', 'C', 'D', 'E', 'F', 'G'])
sharp = grouped["note"].isin(['A#', 'B#', 'C#', 'D#', 'E#', 'F#', 'G#'])
flat = grouped["note"].isin(['Ab', 'Bb', 'Cb', 'Db', 'Eb', 'Fb', 'Gb'])
# Everything that is not a plain, sharp or flat note name.
rest = ~natural & ~sharp & ~flat

# One stacked panel per family, drawn on explicit axes.
fig, axes = plt.subplots(nrows=4, ncols=1, figsize=(15, 40))
panels = [
    ('Natural Notes', natural),
    ('Sharp Notes', sharp),
    ('Flat Notes', flat),
    ('Other Notes', rest),
]
for ax, (panel_title, mask) in zip(axes, panels):
    subset = grouped[mask]
    ax.bar(x=subset["note"], height=subset["count"])
    ax.set_title(panel_title)
    ax.set_ylabel("count")

plt.show()

# Chords
### Import the chords file

In [None]:
# The file is read as header-less: column names are supplied explicitly and
# fields are separated by ';'.
df = pd.read_csv(
    "./chords_total.csv",
    sep=";",
    header=None,
    names=["comb", "count", "title"],
)
# Show the rows ordered from highest to lowest count (the sorted copy is only
# displayed, not stored).
df.sort_values(by="count", ascending=False)

### Group by chord

In [None]:
# Total count per chord combination, largest first. Selecting "count" before
# summing keeps the string "title" column out of the aggregation (pandas >= 2.0
# would otherwise concatenate every title per group before it is discarded).
gb = (
    df.groupby("comb")["count"]
    .sum()
    .sort_values(ascending=False)
    .reset_index()
)
gb

### Fit a curve to the data

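Fit a Zipf–Mandelbrot curve to the ranked data:

\begin{equation*}
f(k,q,s) = \frac{C}{(k+q)^s}
\end{equation*}

Here $k$ is the rank of a chord combination, $q$ and $s$ are the shape parameters of the Zipf–Mandelbrot law, and $C$ is a scale constant. For a normalised distribution, $1/C$ is a generalised harmonic number (a truncated Hurwitz zeta sum); in the fit below $C$ is treated as a free parameter.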

In [None]:
from scipy.optimize import curve_fit

def func(x, a, b, c):
    """Evaluate the shifted power law ``a / (x + b) ** c``.

    Parameters
    ----------
    x : scalar or array-like
        Point(s) at which to evaluate the curve.
    a : scalar
        Numerator (overall scale).
    b : scalar
        Offset added to ``x`` before exponentiation.
    c : scalar
        Exponent applied to ``x + b``.

    Returns
    -------
    Same shape as ``x``.
    """
    shifted = x + b
    return a / shifted**c


# Ranks 1..n paired with the per-chord totals in `gb` row order.
xdata = np.arange(1, gb.shape[0]+1)
ydata = gb["count"]

# Start the optimiser at the data's scale instead of curve_fit's default
# all-ones guess: with b = c = 1 the curve equals max(ydata) at rank 1 when
# a = 2 * max(ydata).
initial_guess = [2.0 * ydata.max(), 1.0, 1.0]
# Non-negative bounds keep x + b > 0 for every rank, so the power can never
# evaluate to NaN while the optimiser explores.
popt, pcov = curve_fit(func, xdata, ydata, p0=initial_guess, bounds=(0, np.inf))
# Store the fitted values next to the observations for plotting.
gb["fit"] = func(xdata, *popt)

### Create the plot

In [None]:
fig, ax = plt.subplots(figsize=(15, 10))

# Observed values as a scatter series.
line1 = ax.scatter(
    xdata,
    ydata,
    c="tomato",
    linewidth=1.5,
    label='Total Duration',
)

# Inward-pointing ticks on both major and minor marks; both axes logarithmic.
ax.tick_params(which="both", direction="in")
ax.set_xscale('log')
ax.set_yscale('log')

# Fitted curve drawn with a custom dash pattern.
line2 = ax.plot(
    xdata,
    gb["fit"],
    c="black",
    linewidth=1.5,
    dashes=[3, 3, 10, 3],
    label='Fit',
)

ax.legend()

# Clusters
### Import the files and merge them into one dataframe

In [None]:
# Header-less, ';'-separated file: one row per piece.
df = pd.read_csv("./strings_total.csv", names=["title", "string"], sep=";")
# Join key: the text before the first "_" in the title, minus its first three
# characters. The result is always a string.
df["num"] = df["title"].str.split("_").str[0].str[3:]

# Read "num" as text so it has the same dtype as the key derived above; with
# dtype inference an all-numeric column becomes int64 and the merge on a
# string column then fails or matches nothing.
keys = pd.read_csv("./keys.csv", names=["num", "key"], sep=",", dtype={"num": str})
# Inner join: rows without a matching entry in keys.csv are dropped.
df = df.merge(keys, on="num")
documents = df["string"]

### Train several models to determine the optimal k for k-means clustering

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.cluster import KMeans
from sklearn.metrics import adjusted_rand_score
from sklearn.metrics import silhouette_score


# TF-IDF over whitespace-delimited tokens of at least two characters,
# case preserved.
vectorizer = TfidfVectorizer(sublinear_tf=True, token_pattern=r"(?u)\S\S+", lowercase=False)
X = vectorizer.fit_transform(documents)

sil = []        # silhouette score for each k
distances = []  # inertia (within-cluster sum of squared distances) for each k
K = range(2,20)
for k in K:
    # Fixed seed and explicit n_init: the curves used to pick k are then the
    # same on every run and do not depend on the library's version-specific
    # n_init default.
    km = KMeans(n_clusters=k, n_init=10, random_state=42)
    km = km.fit(X)
    labels = km.labels_
    distances.append(km.inertia_)
    sil.append(silhouette_score(X, labels, metric = 'euclidean'))


### Use elbow and silhouette methods to determine optimal k

Looking at the plots below, there is no clear elbow, but the slope of the sum-of-squared-distances curve changes at $k=8$. The silhouette plot supports the same choice.

In [None]:
# One figure per diagnostic: (values over K, y-axis label, figure title).
diagnostics = (
    (distances, 'Sum of square distances', 'Elbow Method'),
    (sil, 'Silhouette Score', 'Silhouette Method'),
)
for scores, y_label, heading in diagnostics:
    plt.plot(K, scores, 'bo-')
    plt.tick_params(direction="in")
    plt.xlabel('k')
    plt.ylabel(y_label)
    plt.title(heading)
    plt.show()

### Get clusters for $k=8$

In [None]:
true_k = 8
# Fixed seed so cluster ids and sizes are the same on every run; without it
# each execution can number and split the clusters differently.
model = KMeans(n_clusters=true_k, init='k-means++', max_iter=300, n_init=20, random_state=42)
model.fit(X)

# Assign every document to its nearest centroid and keep the id on the frame.
prediction = model.predict(X)
df["cluster"] = prediction
print(df["cluster"].value_counts())

### Get the top terms of each cluster

In [None]:
print("Top terms per cluster:")
# For each centroid, feature indices ordered from largest to smallest weight.
order_centroids = model.cluster_centers_.argsort()[:, ::-1]
# get_feature_names() was removed in scikit-learn 1.2; use its replacement when
# available and fall back to the old name on older releases.
if hasattr(vectorizer, "get_feature_names_out"):
    terms = vectorizer.get_feature_names_out()
else:
    terms = vectorizer.get_feature_names()
for i in range(true_k):
    print("Cluster %d:" % i)
    # Distribution of the "key" column among the documents in this cluster.
    print(df[df["cluster"]==i]["key"].value_counts())
    # The 15 highest-weighted terms of the centroid.
    for ind in order_centroids[i, :15]:
        print(' %s' % terms[ind])