<a href="https://colab.research.google.com/github/amanteur/TDA_Cover_detection/blob/main/TDA_CoverDetection_Data_1.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# Install GUDHI (not preinstalled on Colab). The recorded run below resolved to
# gudhi 3.4.1.post1; install that version explicitly to reproduce the original environment.
!pip install gudhi

Collecting gudhi
[?25l  Downloading https://files.pythonhosted.org/packages/4e/02/84538da083305b7634886149331150399d6b5d9e4043852e4bfee3256468/gudhi-3.4.1.post1-cp37-cp37m-manylinux2014_x86_64.whl (28.2MB)
[K     |████████████████████████████████| 28.2MB 152kB/s 
Installing collected packages: gudhi
Successfully installed gudhi-3.4.1.post1


# Libraries

In [None]:
import gudhi as gd
import numpy as np
import matplotlib.pyplot as plt
import gudhi.representations
import os
import pandas as pd

In [None]:
# Mount Google Drive (Colab only): the dataset paths used below live under /content/drive.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


# Extracting data

Build a DataFrame with one row per recording: `name` is the clique the recording belongs to (taken from its folder name) and `data` is the recording's point cloud.

In [None]:
def get_data(cloud_points_type, short_size_datasets=False,
             base_dir="/content/drive/MyDrive/Colab Notebooks/CourseWork/cloud_points_dataset/"):
  """Load every point-cloud file of one feature type into a DataFrame.

  The files are read from
  ``<base_dir>/cloud_points_<cloud_points_type>/<variant>/<clique>/<file>``,
  where ``<variant>`` is ``_dur_60__off_10`` for the short datasets and
  ``_dur_None__off_0`` otherwise.

  params:
    cloud_points_type : str
      suffix of the ``cloud_points_*`` folder (the notebook uses 'all',
      'tonnetz' and 'mfcc')
    short_size_datasets : bool
      if True read the ``_dur_60__off_10`` variant, else ``_dur_None__off_0``
    base_dir : str
      folder that contains the ``cloud_points_*`` folders

  returns:
    pd.DataFrame with columns ``name`` (name of the folder the file was found
    in) and ``data`` (the file content, transposed, as a 2-D array). Rows are
    ordered by folder name, then by file name. If nothing is found the frame is
    empty but still has both columns.
  """
  variant = '_dur_60__off_10' if short_size_datasets else '_dur_None__off_0'
  root_dir = os.path.join(base_dir, 'cloud_points_' + cloud_points_type, variant)

  records = []
  for address, dirs, files in os.walk(root_dir):
    # Sorting in place fixes the order in which os.walk descends into the
    # sub-folders, so the row order no longer depends on the file system.
    dirs.sort()
    if address == root_dir:
      # Files directly in the dataset root do not belong to any clique.
      continue
    clique = os.path.basename(address)
    for file in sorted(files):
      # ndmin=2 keeps single-row / single-column files two-dimensional so the
      # transpose below always swaps the two axes.
      data = np.loadtxt(os.path.join(address, file), delimiter=' ', ndmin=2)
      records.append({'name': clique, 'data': data.T})

  # One record per file; building the frame once avoids the dict-of-dicts + transpose.
  return pd.DataFrame(records, columns=['name', 'data'])

In [None]:
# Point clouds built from all features, full-length songs (df_af = "all, full")
df_af = get_data('all')

In [None]:
# Tonnetz point clouds, full-length songs (df_tf = "tonnetz, full")
df_tf = get_data('tonnetz')

In [None]:
# MFCC point clouds, full-length songs (df_mf = "mfcc, full")
df_mf = get_data('mfcc')

In [None]:
# Point clouds built from all features, 60 s excerpts (df_as = "all, short")
df_as = get_data('all', short_size_datasets=True)

In [None]:
# Tonnetz point clouds, 60 s excerpts (df_ts = "tonnetz, short")
df_ts = get_data('tonnetz', short_size_datasets=True)

In [None]:
# MFCC point clouds, 60 s excerpts (df_ms = "mfcc, short")
df_ms = get_data('mfcc', short_size_datasets=True)

Example

In [None]:
# Preview the first rows of the full-length MFCC dataset.
df_mf.head()

Unnamed: 0,name,data
0,All_Tomorrow_s_Parties,"[[-3.103133201599121, 1.4269647598266602, 0.04..."
1,All_Tomorrow_s_Parties,"[[-3.2100024223327637, 0.8784309029579163, 0.0..."
2,Addicted_To_Love,"[[-3.0177416801452637, 1.3071744441986084, -0...."
3,Addicted_To_Love,"[[-2.3204026222229004, 1.7416863441467285, -1...."
4,All_Along_The_Watchtower,"[[-2.4056568145751953, 1.6976637840270996, -1...."


In [None]:
# Point-cloud shape of the row at position 159 in each full-length dataset.
df_af.iloc[159].data.shape, df_tf.iloc[159].data.shape, df_mf.iloc[159].data.shape

((286, 30), (188, 6), (347, 12))

In [None]:
# Point-cloud shape of the row at position 159 in each 60 s dataset.
df_as.iloc[159].data.shape, df_ts.iloc[159].data.shape, df_ms.iloc[159].data.shape

((58, 30), (58, 6), (58, 12))

# Reduce data
Reduce each full-length point cloud: keep its 120 farthest points, then project them onto 2 dimensions with PCA. **Applied to the full-length audio only.**

In [None]:
import gudhi.subsampling as gds
from sklearn.decomposition import PCA

def reduce_data(df, n_points=120, n_components=2, starting_point=0):
  """Subsample every point cloud and project it with PCA.

  For each row, the cloud stored in the ``data`` column is subsampled with
  ``gds.choose_n_farthest_points`` and the subsample is projected with a PCA
  fitted on that subsample alone.

  params:
    df : pd.DataFrame
      frame with a ``data`` column holding one point cloud per row
    n_points : int
      number of points requested from the farthest-point subsampling
    n_components : int
      number of PCA components kept
    starting_point : int or None
      index of the point the subsampling starts from; a fixed index makes the
      result reproducible. Pass None to leave the choice to GUDHI.

  returns:
    copy of ``df`` with a ``sparse_data`` column holding the reduced clouds.
    An existing ``sparse_data`` column is replaced, so the call can be
    repeated on its own output.
  """
  df_size = df.shape[0]
  # Object array filled cell by cell: every cell holds one 2-D array.
  sparse_clouds = np.empty(df_size, dtype=object)

  # Progress is counted by position, so it works for any index labels.
  for position, (_, row) in enumerate(df.iterrows()):
    if (position + 1) % 40 == 0:
      print('Progress: {}/{}'.format(position + 1, df_size))

    song_data = np.array(row['data'])
    sampling_kwargs = {'nb_points': n_points}
    if starting_point is not None:
      sampling_kwargs['starting_point'] = starting_point
    sparse_song_data = np.array(gds.choose_n_farthest_points(song_data, **sampling_kwargs))

    # A fresh PCA per song: every cloud gets its own projection.
    sparse_clouds[position] = PCA(n_components=n_components).fit_transform(sparse_song_data)

  # Assigning the column (instead of joining) overwrites a previous result
  # rather than raising on the overlapping column name.
  reduced_df = df.copy()
  reduced_df['sparse_data'] = sparse_clouds
  print('Done!\n')
  return reduced_df

In [None]:
# Add the reduced `sparse_data` column to each full-length dataset; each name is
# rebound to the frame returned by reduce_data.
df_af = reduce_data(df_af)
df_tf = reduce_data(df_tf)
df_mf = reduce_data(df_mf)

Progress: 40/164
Progress: 80/164
Progress: 120/164
Progress: 160/164
Done!

Progress: 40/164
Progress: 80/164
Progress: 120/164
Progress: 160/164
Done!

Progress: 40/164
Progress: 80/164
Progress: 120/164
Progress: 160/164
Done!



Example

In [None]:
# Preview the full-length "all features" dataset with its new sparse_data column.
df_af.head()

Unnamed: 0,name,data,sparse_data
0,Blue_Collar_Man,"[[-4.582876612859176, 1.6642811155546193, -1.7...","[[1.8349494300835367, -0.37629420484752213], [..."
1,Blue_Collar_Man,"[[-4.508941001611552, 1.9314511059462562, -1.4...","[[-0.209837885003587, -0.22554664053336668], [..."
2,Addicted_To_Love,"[[-4.890550431132267, 1.8891033676869158, -0.4...","[[1.3351606727019654, -0.18788883166044545], [..."
3,Addicted_To_Love,"[[-3.7661504426205292, 2.629159154705121, -1.6...","[[0.16023083771694593, -0.42337137601792635], ..."
4,All_Tomorrow_s_Parties,"[[-5.055543636065286, 1.9339179479153203, -0.2...","[[0.27576475672799483, -0.03030909073657632], ..."


In [None]:
# Shape of the reduced cloud of the row at position 159 in each full-length dataset.
df_af.iloc[159].sparse_data.shape, df_tf.iloc[159].sparse_data.shape, df_mf.iloc[159].sparse_data.shape

((120, 2), (120, 2), (120, 2))

# Optimization
**Applied to the tonnetz point clouds only.**

In [None]:
# Install TensorFlow Addons (used for the pairwise distance matrix). The recorded run
# below resolved to tensorflow-addons 0.13.0.
!pip install tensorflow_addons

Collecting tensorflow_addons
[?25l  Downloading https://files.pythonhosted.org/packages/66/4b/e893d194e626c24b3df2253066aa418f46a432fdb68250cde14bf9bb0700/tensorflow_addons-0.13.0-cp37-cp37m-manylinux2010_x86_64.whl (679kB)
[K     |████████████████████████████████| 686kB 5.2MB/s 
Installing collected packages: tensorflow-addons
Successfully installed tensorflow-addons-0.13.0


In [None]:
import tensorflow as tf
import tensorflow_addons as tfa

In [None]:
def _longest_edge(DX, simplex):
    # Return the two vertices of `simplex` that are farthest apart according to DX.
    verts = np.array(simplex)
    sub = DX[verts, :][:, verts]
    row, col = np.unravel_index(np.argmax(sub), sub.shape)
    return [simplex[row], simplex[col]]


def Rips(DX, mel, dim, card):
    """Return the distance-matrix indices that realise a Rips persistence diagram.

    Parameters:
        DX   : distance matrix (2-D numpy array)
        mel  : maximum edge length for the Rips filtration
        dim  : homological dimension
        card : number of persistence diagram points to return

    Returns:
        A list of ``4*card`` np.int32 values. Each group of four is
        ``[b0, b1, d0, d1]``: the vertex pair of the longest edge of the birth
        simplex followed by the vertex pair of the longest edge of the death
        simplex, so that ``DX[b0, b1]`` and ``DX[d0, d1]`` are the birth and
        death values. Groups are ordered by decreasing persistence; if fewer
        than ``card`` pairs exist the list is padded with zeros.

    Pairs without a death simplex (essential classes) are skipped: they have
    no finite death value and no vertices to index with.
    """
    # Compute the persistence pairs with Gudhi
    rc = gd.RipsComplex(distance_matrix=DX, max_edge_length=mel)
    st = rc.create_simplex_tree(max_dimension=dim+1)
    # persistence() has to run before persistence_pairs(); its result is not needed here.
    st.persistence()

    # Retrieve vertices v_a and v_b by picking the ones achieving the maximal
    # distance among all pairwise distances between the simplex vertices
    indices, pers = [], []
    for s1, s2 in st.persistence_pairs():
        if len(s1) != dim+1 or len(s2) == 0:
            continue
        indices += _longest_edge(DX, s1)
        indices += _longest_edge(DX, s2)
        pers.append(st.filtration(s2) - st.filtration(s1))

    # Sort points by decreasing persistence (distance to the diagonal)
    order = np.argsort(pers)[::-1]
    quads = np.reshape(np.array(indices, dtype=np.int32), [-1, 4])[order]

    # Truncate to `card` points and pad with zeros up to exactly 4*card entries
    kept = quads.flatten()[:4*card]
    padded = np.zeros(4*card, dtype=np.int32)
    padded[:len(kept)] = kept
    return list(padded)

In [None]:
class RipsModel(tf.keras.Model):
    """Keras model that maps a point cloud to its Rips persistence diagram.

    X    : point cloud (the optimisation code passes a trainable tf.Variable)
    mel  : maximum edge length for the Rips filtration
    dim  : homological dimension
    card : number of diagram points returned
    """

    def __init__(self, X, mel=12, dim=1, card=50):
        super().__init__()
        self.card = card
        self.dim = dim
        self.mel = mel
        self.X = X

    def call(self):
        """Return the persistence diagram as a tensor of shape [card, 2]."""
        max_edge, hom_dim, n_points = self.mel, self.dim, self.card
        n_indices = 4 * n_points

        # Pairwise distance matrix of the current point cloud, plus a copy with
        # a leading batch axis of size 1 for tf.map_fn.
        DX = tfa.losses.metric_learning.pairwise_distance(self.X)
        batched_DX = tf.reshape(DX, [1, DX.shape[0], DX.shape[1]])

        def rips_indices(dist_matrix):
            # Run the numpy implementation `Rips` as a TensorFlow op.
            return tf.numpy_function(
                Rips,
                [dist_matrix, max_edge, hom_dim, n_points],
                [tf.int32] * n_indices,
            )

        # Vertices of the positive and negative simplices; the index
        # computation is excluded from the gradient.
        raw_ids = tf.map_fn(rips_indices, batched_DX, dtype=[tf.int32] * n_indices)
        ids = tf.nest.map_structure(tf.stop_gradient, raw_ids)

        # The diagram is read directly from the distance matrix: every index
        # pair selects one entry, two entries (birth, death) per point.
        index_pairs = tf.reshape(ids, [2 * n_points, 2])
        return tf.reshape(tf.gather_nd(DX, index_pairs), [n_points, 2])

In [None]:
def optimize_point_clouds(df, column, n_pts=300, card=50, hom=1, ml=12., n_epochs=40):
  """Optimize every point cloud of a column by gradient descent on its Rips diagram.

  For each row, the cloud is moved with Adam (learning rate 1e-2) so as to
  minimise the negative sum of squared distances of the diagram points to the
  diagonal, i.e. to increase the total persistence.

  params:
    df : pd.DataFrame
      initial dataframe
    column : str
      name of the column holding the point clouds
    n_pts : int
      currently unused; kept so existing calls keep working
    card  : int
      max number of points in the diagrams
    hom   : int
      homological dimension
    ml    : float
      max edge length of the Rips filtration
    n_epochs : int
      number of optimization steps (exactly this many gradient updates)

  returns:
    copy of ``df`` with a ``<column>_optimized`` column holding the optimized
    clouds as float32 arrays. An existing column of that name is replaced, so
    the call can be repeated on its own output.
  """
  df_size = df.shape[0]
  # Object array filled cell by cell: every cell holds one 2-D array.
  optimized_clouds = np.empty(df_size, dtype=object)

  # Progress is counted by position, so it works for any index labels.
  for position, (_, row) in enumerate(df.iterrows()):
    if (position + 1) % 20 == 0:
      print('Progress: {}/{}'.format(position + 1, df_size))

    # asarray also accepts list-valued cells, not only numpy arrays.
    song_data = np.asarray(row[column], dtype=np.float32)

    # tensorflow part: a fresh variable, model and optimizer for every song
    X = tf.Variable(initial_value=song_data, trainable=True)
    model = RipsModel(X=X, mel=ml, dim=hom, card=card)
    optimizer = tf.keras.optimizers.Adam(learning_rate=1e-2)

    for _ in range(n_epochs):

      with tf.GradientTape() as tape:
        # Compute persistence diagram
        dgm = model.call()

        # Loss is minus the sum of squared distances to the diagonal
        loss = -tf.math.reduce_sum(tf.square(.5*(dgm[:,1]-dgm[:,0])))

      # Compute and apply gradients
      gradients = tape.gradient(loss, model.trainable_variables)
      optimizer.apply_gradients(zip(gradients, model.trainable_variables))

    optimized_clouds[position] = model.X.numpy()

  # Assigning the column (instead of joining) overwrites a previous result
  # rather than raising on the overlapping column name.
  joined_df = df.copy()
  joined_df[column + '_optimized'] = optimized_clouds

  print('Done!')

  return joined_df

In [None]:
# Optimize the 60 s tonnetz clouds directly from the `data` column.
df_ts = optimize_point_clouds(df_ts, column='data')

In [None]:
# Optimize the full-length tonnetz clouds from their reduced `sparse_data` column.
df_tf = optimize_point_clouds(df_tf, column='sparse_data')

Progress: 20/164
Progress: 40/164
Progress: 60/164
Progress: 80/164
Progress: 100/164
Progress: 120/164
Progress: 140/164
Progress: 160/164
Done!


In [None]:
# NOTE(review): this previews df_af, which was not passed to optimize_point_clouds;
# the optimized clouds are in df_ts / df_tf — confirm which frame was meant to be shown.
df_af.head()

Unnamed: 0,name,data,sparse_data
0,Blue_Collar_Man,"[[-4.582876612859176, 1.6642811155546193, -1.7...","[[1.8349494300835367, -0.37629420484752213], [..."
1,Blue_Collar_Man,"[[-4.508941001611552, 1.9314511059462562, -1.4...","[[-0.209837885003587, -0.22554664053336668], [..."
2,Addicted_To_Love,"[[-4.890550431132267, 1.8891033676869158, -0.4...","[[1.3351606727019654, -0.18788883166044545], [..."
3,Addicted_To_Love,"[[-3.7661504426205292, 2.629159154705121, -1.6...","[[0.16023083771694593, -0.42337137601792635], ..."
4,All_Tomorrow_s_Parties,"[[-5.055543636065286, 1.9339179479153203, -0.2...","[[0.27576475672799483, -0.03030909073657632], ..."


# Saving datasets

In [None]:
# Output folder on the mounted Drive; the save cells below prepend it to each file name.
root_dir = '/content/drive/MyDrive/Colab Notebooks/CourseWork/data_datasets/'

In [None]:
# Save the three 60 s datasets as JSON, one file per feature type.
df_as.to_json(root_dir + 'data_all_short.json')
df_ts.to_json(root_dir + 'data_tonnetz_short.json')
df_ms.to_json(root_dir + 'data_mfcc_short.json')

In [None]:
# Save the three full-length datasets as JSON, one file per feature type.
df_af.to_json(root_dir + 'data_all_full.json')
df_tf.to_json(root_dir + 'data_tonnetz_full.json')
df_mf.to_json(root_dir + 'data_mfcc_full.json')