In [None]:
import pandas as pd
import numpy as np
import pretty_midi
import matplotlib.pyplot as plt
import math
import jdc

In [None]:
class Sonify:
  """Prepares solar wind data and an audio corpus for sonification.

  Attributes created by the methods below:
    df: DataFrame loaded by read_data; its measurement columns are cleaned in
      place and "cumulative_secs", "kp_equiv" and "kp_diff" columns are added.
    density_max, speed_max: column maxima used to scale values into 0-1.
    grains_data: DataFrame built by granulate, one row per grain.

  The row-wise helpers address rows with .at[i, ...] where i is the row label,
  so they rely on the default 0..n-1 index that pd.read_csv produces.
  """

  def read_data(self, path):
    """Read the data from the given csv file path, clean it and add derived columns.

    Args:
      path: Path (str) to the csv file. Must end in ".csv".

    Raises:
      FileNotFoundError: If path does not end in ".csv".
    """
    if not path.endswith(".csv"): # If provided path is not a valid csv file
      print("Invalid file path. Must be .csv file.")
      raise FileNotFoundError(f"Not a .csv file: {path}")
    self.df = pd.read_csv(path) # Loads csv file

    # Constants used to add cumulative seconds values in function inside loop
    entries_per_day = 12*24 # 12 samples an hour (every 5 mins), 24 hours in a day
    seconds_per_day = 60*60*24

    # Constants used to create Kp comparison value
    self.density_max = np.max(self.df["proton_density"])
    self.speed_max = np.max(self.df["speed"])

    # (column, threshold) pairs: values below the threshold are treated as error data
    columns_to_clean = (
      ("proton_density", 0),
      ("speed", 0),
      ("ion_temp", 0),
      ("bz", -10),
      ("phi_angle", 0),
    )

    for i, row in self.df.iterrows():
      # Data cleanup (must happen before the derived columns below, which read the cleaned values)
      for column_title, threshold in columns_to_clean:
        self.__cleanup_column(column_title, row, i, threshold=threshold)

      # Adding column for seconds since start of dataset
      self.__cumulative_seconds(i, entries_per_day, seconds_per_day)

      # Adds a column for our own synthetic variable and a column for the difference between that variable and the Kp index.
      self.__kp_comparison(i, row) # Not working fully yet

    # Just for testing...
    debug_plots = (
      ("kp_index", "KP Index over time"),
      ("kp_equiv", "KP Equiv over time"),
      ("kp_diff", "KP difference over time"),
    )
    for column_title, plot_title in debug_plots:
      plt.plot(self.df["cumulative_secs"], self.df[column_title])
      plt.title(plot_title)
      plt.show()

    print(self.df)
    # TODO check if path is ok, file format is ok, load data into dataframe

  def __cumulative_seconds(self, i, entries_per_day, seconds_per_day):
    """Adds a value for the amount of seconds passed since the start of the dataset for the current index.

    Args:
      i: Row label of the current entry (also used as its position in the dataset).
      entries_per_day: Number of rows that make up one day.
      seconds_per_day: Number of seconds in one day.
    """
    current_day_in_dataset = math.floor(i / entries_per_day) # Whole days elapsed before the current entry
    # Start of the current day plus the seconds elapsed within the current day.
    self.df.at[i, "cumulative_secs"] = (current_day_in_dataset*seconds_per_day) + self.df.at[i, "sec_of_day"]

  def __kp_comparison(self, i, row):
    """Generates a synthetic variable based on raw data which is compared to the kp index, the difference can
    be a variable we can map to something (maybe a filter parameter?)

    Writes "kp_equiv" and "kp_diff" for row i. Only density and speed contribute so far;
    no phi angle term is included.

    Args:
      i: Row label of the current entry.
      row: The row as yielded by iterrows; only its "kp_index" value is read.
    """
    density_scaled = self.df.at[i, "proton_density"]/self.density_max # The density of the current entry scaled 0-1
    speed_scaled = self.df.at[i, "speed"]/self.speed_max # The speed of the current entry scaled 0-1

    kp_equiv = density_scaled + speed_scaled * 9
    self.df.at[i, "kp_equiv"] = kp_equiv
    self.df.at[i, "kp_diff"] = row["kp_index"] - kp_equiv

  def __cleanup_column(self, column_title, row, i, threshold=0):
    """Replaces an erroneous cell with the average of its nearest valid neighbours.

    A cell is an error when its value is below threshold. It is set to the mean
    (rounded to 1 decimal) of the previous cell and the next non-error cell. If
    only one of those exists (error in the first row, or no valid value after
    it), that single value is used. If neither exists the cell is left unchanged.

    Args:
      column_title: Name of the column to clean.
      row: The row as yielded by iterrows, used to test the current value.
      i: Row label of the current entry.
      threshold: Values below this are treated as error data.
    """
    if not row[column_title] < threshold: # Not an error, nothing to do
      return

    neighbours = []
    if i > 0: # The first row has no previous cell
      previous_value = self.df.at[i-1, column_title]
      if previous_value >= threshold: # Skips a previous cell that is itself invalid (or NaN)
        neighbours.append(previous_value)

    next_valid_value = self.__find_next_non_error_cell(i, column_title, threshold)
    if next_valid_value is not None:
      neighbours.append(next_valid_value)

    if neighbours:
      self.df.at[i, column_title] = round(sum(neighbours) / len(neighbours), 1)

  def __find_next_non_error_cell(self, i, column_title, threshold):
    """Finds the next value in a column that is not below the given threshold.

    Scans forward iteratively, so long runs of error data cannot exhaust the
    recursion limit, and stops at the end of the column.

    Args:
      i: Row label to start after.
      column_title: Name of the column to search.
      threshold: Values below this are treated as error data.

    Returns:
      The first value after row i that is >= threshold, or None if there is none.
    """
    for j in range(i + 1, len(self.df.index)):
      candidate = self.df.at[j, column_title]
      if candidate >= threshold: # Same validity rule as __cleanup_column (NaN compares False and is skipped)
        return candidate
    return None

  def read_corpus(self, path):
    """Placeholder for loading the corpus; currently does nothing."""
    pass
    # read midi and audio files, path should contain all files in folder
    # TODO check if path is ok, file formats are ok, ...

  def granulate(self, corpus):
    """Splits every song into equally sized grains and tabulates them in self.grains_data.

    Each song is divided into as many grains as there are rows in self.df.
    Grains are referenced by start and end sample indices into the song's
    audio array, so features (e.g. from Librosa) can later be stored alongside
    each grain in the same table.

    Args:
      corpus: Iterable of audio arrays; each item must expose a .size giving
        its length in samples.

    Side effects:
      Sets self.grains_data to a DataFrame with columns "song_no" (position of
      the song in corpus), "grain_in_song" (grain number within that song) and
      "grain_pos" ([start, end] sample indices, end exclusive of the next grain's start).
    """
    grain_columns = ["song_no", "grain_in_song", "grain_pos"]
    total_grains_in_song = len(self.df.index) # Number of grains in song (just the length of the dataset)

    # Collect plain records and build the DataFrame once at the end.
    grain_records = []
    if total_grains_in_song > 0: # An empty dataset yields no grains (and would otherwise divide by zero)
      for song_no, song in enumerate(corpus):
        song_len_samp = song.size # Length of current song in samples
        grain_len_samp = song_len_samp // total_grains_in_song # The length of each grain in samples

        for grain_no in range(total_grains_in_song):
          grain_start_index = grain_no * grain_len_samp # The index for the start sample of the current grain
          grain_end_index = grain_start_index + grain_len_samp # The index for the end sample of the current grain
          grain_records.append({
            "song_no": song_no,
            "grain_in_song": grain_no,
            "grain_pos": [grain_start_index, grain_end_index],
            # Any Librosa features could go here
          })

    self.grains_data = pd.DataFrame(grain_records, columns=grain_columns)
      

In [None]:
# Use this syntax to add methods to Sonify class in other cells

%%add_to Sonify
def function(self):
  pass

In [None]:
# Locations of the input files on the mounted Drive.
solar_wind_csv = "/content/drive/MyDrive/Python Assignment 4 Depot/solar_wind_data_2003-10-27 - 2003-11-02_ACTUAL.csv"
corpus_dir = "/content/drive/MyDrive/Python Assignment 4 Depot/Corpus"

# Build the sonifier, then load the dataset followed by the corpus.
sonify = Sonify()
sonify.read_data(solar_wind_csv)
sonify.read_corpus(corpus_dir)