In [42]:
# Path of the tokenised tunes dataset read by trim_small_lines / process_dataset.
DATASET = "swedish_tunes_int.csv"
# File whose line count is used as the vocabulary size (num_tokens).
TRANSLATION_FILE = "translation"
# Sliding-window length handed to process_dataset.
INPUT_SIZE = 5
# NOTE(review): this value is passed as PredictRNN's hidden_size further down;
# 0 leaves the RNN without hidden units - confirm the intended value.
HIDDEN_LAYERS = 0
# NOTE(review): not referenced by any cell visible in this notebook.
NUM_LAYERS = 0
# Default random_state used by split_dataset.
SEED = 0
# Learning rate for training; presumably meant as train_network's `lr` - confirm.
LEARNING_RATE = 0.1

In [41]:
# Count the lines of the translation file; that count is used as the
# vocabulary size (presumably one token per line - confirm against the file).
# The context manager closes the file even if reading fails.
with open(TRANSLATION_FILE, 'r') as translation_p:
    translation_lines = translation_p.readlines()
num_tokens = len(translation_lines)

num_tokens

1177

In [25]:
import pandas
from torch.nn import RNN, Linear, LogSoftmax, Module
import torch
import argparse
import math

In [None]:
# Recurrent classifier used by the training cells below
class PredictRNN(Module):
    """RNN followed by a linear read-out and a log-softmax.

    Args:
        input_size: number of features per time step fed to the RNN.
        hidden_size: width of the RNN hidden state.
        output_size: number of scores produced by the linear read-out.
    """

    def __init__(self, input_size, hidden_size, output_size):
        super().__init__()

        self.rnn = RNN(input_size, hidden_size)
        self.h2o = Linear(hidden_size, output_size)
        # Normalises over dim 1, so the read-out must be 2-D (batch, output_size).
        self.softmax = LogSoftmax(dim=1)

    def forward(self, line_tensor):
        """Return log-probabilities computed from the RNN's final hidden state."""
        # The per-step outputs are not needed; only the final hidden state is used.
        _, final_hidden = self.rnn(line_tensor)
        # Index 0 selects the first entry along the hidden state's leading axis.
        scores = self.h2o(final_hidden[0])
        return self.softmax(scores)

In [50]:
# Read a dataset file and keep only the lines that are long enough
def trim_small_lines(filepath, input_size):
    """Return the lines of `filepath` that have at least `input_size` fields.

    A line's length is the number of comma-separated fields it contains
    (``len(line.split(','))``), so a trailing ", \\n" counts as one field.

    Args:
        filepath: path of the text file to read.
        input_size: minimum number of comma-separated fields a line must have.

    Returns:
        list[str]: the kept lines, in file order, line endings included.
    """
    # Filtering into a new list avoids removing from a list while iterating
    # over it, which skips the element after every removal and lets
    # consecutive short lines through.
    with open(filepath, 'r') as file_p:
        return [line for line in file_p if len(line.split(',')) >= input_size]

# Preview the first five lines that survive the length filter.
trim_small_lines(DATASET, INPUT_SIZE)[:5]

['0, 1, 2, 3, 4, 5, 3, 6, 3, 5, 3, 7, 5, 5, 5, 8, 5, 3, 5, 4, 7, 3, 4, 5, 3, 6, 3, 5, 3, 7, 5, 5, 5, 8, 9, 10, 11, 12, 13, 14, 13, 7, 5, 5, 5, 15, 15, 3, 16, 3, 7, 12, 13, 14, 13, 7, 5, 5, 5, 8, 9, 17, 18, \n',
 '0, 1, 2, 19, 20, 21, 22, 23, 6, 7, 24, 3, 15, 24, 12, 7, 6, 6, 23, 25, 26, 3, 12, 27, 7, 28, 23, 23, 22, 19, 7, 13, 24, 15, 28, 29, 7, 30, 20, 23, 24, 12, 31, 32, 33, 12, 12, 7, 6, 21, 30, 20, 34, 30, 20, 21, 7, 19, 20, 21, 22, 23, 6, 7, 24, 3, 15, 24, 12, 17, 18, \n',
 '0, 1, 2, 23, 6, 24, 15, 13, 19, 7, 34, 30, 20, 23, 35, 7, 6, 6, 23, 3, 15, 3, 16, 13, 36, 7, 25, 12, 6, 24, 6, 21, 7, 23, 6, 24, 15, 13, 19, 7, 34, 30, 20, 23, 35, 31, 20, 37, 38, 15, 6, 20, 34, 7, 39, 40, 40, 30, 34, 7, 41, 34, 30, 20, 27, 39, 7, 12, 21, 30, 42, 17, 18, \n',
 '0, 1, 2, 25, 4, 26, 13, 3, 26, 7, 3, 5, 8, 24, 9, 7, 6, 6, 30, 4, 8, 26, 5, 3, 3, 3, 3, 7, 8, 26, 5, 15, 3, 6, 6, 6, 7, 8, 26, 5, 3, 26, 3, 3, 3, 7, 3, 5, 8, 24, 9, 31, 12, 12, 32, 33, 7, 12, 43, 44, 7, 25, 4, 26, 13, 3, 26, 7, 3, 5, 8,

In [None]:
# calculates the accuracy of the predictions of a neural network
# For classification tasks
# NOTE: X and y should already be PyTorch tensors
def calculate_accuracy(network, X, y, z_score=2.39):
    """Compute classification accuracy with a normal-approximation interval.

    Args:
        network: callable mapping `X` to a 2-D tensor of per-class scores.
        X: input tensor.
        y: tensor of integer class labels, one per row of the network output.
        z_score: number of standard errors added/subtracted for the bounds
            (default 2.39, the value previously hard-coded).

    Returns:
        tuple[float, float, float]: ``(accuracy, upper_bound, lower_bound)``.
        All three are plain Python floats; the bounds are not clamped to [0, 1].

    Raises:
        ValueError: if the network produces no predictions.
    """
    # Put the data on the device the network lives on. Networks without
    # parameters keep the old rule: use the first GPU when one is available.
    parameters = getattr(network, "parameters", None)
    first_param = next(iter(parameters()), None) if callable(parameters) else None
    if first_param is not None:
        X = X.to(first_param.device)
        y = y.to(first_param.device)
    elif torch.cuda.is_available():
        device = torch.device("cuda:0")
        X = X.to(device)
        y = y.to(device)

    # make predictions for the given X; no gradients are needed for scoring
    with torch.no_grad():
        probs = network(X)
    predictions = torch.argmax(probs, dim=1)

    total = len(predictions)
    if total == 0:
        raise ValueError("cannot compute accuracy on an empty prediction set")

    # Tensor-level sum instead of Python's built-in sum, which iterates
    # element by element.
    accuracy = (predictions == y).sum().item() / total
    variance = accuracy * (1 - accuracy)
    std_err = math.sqrt(variance / total)
    margin = z_score * std_err
    return accuracy, accuracy + margin, accuracy - margin


In [None]:
# carves a validation set off the front of a training set
def create_validation(training_X, training_y, valid_percentage):
    """Split positionally into (train_X, train_y, valid_X, valid_y).

    The first ``int(valid_percentage * n_rows)`` rows become the validation
    set and the remaining rows stay in the training set. Both inputs are
    sliced with ``.iloc``, so they must be pandas objects.
    """
    split_at = int(valid_percentage * training_X.shape[0])
    validation_rows = slice(None, split_at)
    remaining_rows = slice(split_at, None)

    return (
        training_X.iloc[remaining_rows],
        training_y.iloc[remaining_rows],
        training_X.iloc[validation_rows],
        training_y.iloc[validation_rows],
    )


In [None]:
# trains a neural network with given training data
def train_network(network, training_X, training_y, lr, num_epochs=1000, valid_percentage=0.2):
    """Train `network` with full-batch Adam and record per-epoch metrics.

    Args:
        network: the torch module to train; it is called on the whole training
            tensor every epoch and must return per-class scores.
        training_X: pandas object of features (split with ``.iloc`` and
            converted through ``.values``).
        training_y: pandas object of integer class labels.
        lr: learning rate passed to Adam.
        num_epochs: maximum number of epochs (default 1000).
        valid_percentage: fraction of the leading rows held out for
            validation (default 0.2).

    Returns:
        pandas.DataFrame: one row per completed epoch with the columns
        ``epoch``, ``train_loss``, ``valid_loss``, ``train_acc`` and
        ``valid_acc``. The accuracy columns hold scalar accuracies. Training
        stops early once validation accuracy reaches exactly 1.
    """
    # split the training data into train and validation
    train_X, train_y, valid_X, valid_y = create_validation(training_X, training_y, valid_percentage)

    # convert our data to PyTorch objects
    # NOTE(review): features are cast to integer tensors; a network whose first
    # layer has floating-point weights will need float inputs - confirm.
    train_X = torch.from_numpy(train_X.values).long()
    train_y = torch.from_numpy(train_y.values).long()
    valid_X = torch.from_numpy(valid_X.values).long()
    valid_y = torch.from_numpy(valid_y.values).long()

    # move the data and model to the GPU if possible
    if torch.cuda.is_available():
        device = torch.device('cuda:0')
        print(f"Using device {torch.cuda.get_device_name(device)}")
        train_X = train_X.to(device)
        train_y = train_y.to(device)
        valid_X = valid_X.to(device)
        valid_y = valid_y.to(device)
        network = network.to(device)

    # create the algorithm that learns the weight for the network
    optimizer = torch.optim.Adam(network.parameters(), lr=lr)

    # create the loss function that tells optimizer how much error it has in its predictions
    # here we use cross entropy since we have a classification task with more than two possible labels
    loss_function = torch.nn.CrossEntropyLoss()

    train_loss_values = []
    valid_loss_values = []
    train_acc_values = []
    valid_acc_values = []
    for epoch in range(num_epochs):
        # forward pass on the training set (this is what gets backpropagated)
        train_predictions = network(train_X)
        train_loss = loss_function(train_predictions, train_y)

        # the validation loss is only reported, so skip building its autograd graph
        with torch.no_grad():
            valid_loss = loss_function(network(valid_X), valid_y)

        # calculate_accuracy returns (accuracy, upper, lower); keep the scalar
        # accuracy so the columns are numeric and the check below can succeed
        train_acc = calculate_accuracy(network, train_X, train_y)[0]
        valid_acc = calculate_accuracy(network, valid_X, valid_y)[0]

        train_loss_values.append(train_loss.item())
        valid_loss_values.append(valid_loss.item())
        train_acc_values.append(train_acc)
        valid_acc_values.append(valid_acc)

        # Early stop for a perfect fit on the validation set
        if valid_acc == 1:
            break

        # perform backpropagation
        optimizer.zero_grad()
        train_loss.backward()
        optimizer.step()

    # convert the training progress data to a Pandas DataFrame; the epoch
    # column covers only the epochs that actually ran
    progress = {
        "epoch": range(len(train_loss_values)),
        "train_loss": train_loss_values,
        "valid_loss": valid_loss_values,
        "train_acc": train_acc_values,
        "valid_acc": valid_acc_values
    }

    return pandas.DataFrame(progress)


In [47]:
def process_dataset(dataset_filepath, input_size):
    """Turn each tune into sliding windows of `input_size` integer tokens.

    Every line kept by `trim_small_lines` is split on commas. The first three
    tokens are taken as `timesig`, `key` and `style`; the remaining tokens are
    cut into every contiguous window of length `input_size`. Each window
    becomes one row that repeats the three header tokens.

    Args:
        dataset_filepath: path of the comma-separated token file.
        input_size: window length, and the number of integer-named columns.

    Returns:
        pandas.DataFrame: columns ``timesig``, ``key``, ``style`` followed by
        ``0 .. input_size - 1``, all holding ints.

    Raises:
        ValueError: if a non-blank token is not an integer.
    """
    lines = trim_small_lines(dataset_filepath, input_size)
    # We need to create a dataframe with the proper number of columns
    # The first three tokens of every line are stored in front of each instance
    dataframe_dict = {
        'timesig': [],
        'key': [],
        'style': []
    }

    for i in range(input_size):
        dataframe_dict[i] = []

    for line in lines:
        # Parse to ints and drop blank fields (such as the one left by a
        # trailing ", \n"), so the frame is numeric and can go straight into
        # torch.from_numpy downstream.
        tokens = [int(token) for token in line.split(',') if token.strip()]
        if len(tokens) < 3:
            continue
        timesig, key, style = tokens[:3]
        remainder = tokens[3:]
        # With blank fields gone, "+ 1" keeps the final full window.
        for start in range(len(remainder) - input_size + 1):
            window = remainder[start:start + input_size]
            dataframe_dict['timesig'].append(timesig)
            dataframe_dict['key'].append(key)
            dataframe_dict['style'].append(style)
            for ind, token in enumerate(window):
                dataframe_dict[ind].append(token)

    return pandas.DataFrame(dataframe_dict)

# Display the windowed dataset built from the full file.
process_dataset(DATASET, INPUT_SIZE)

Unnamed: 0,timesig,key,style,0,1,2,3,4
0,0,1,2,3,4,5,3,6
1,0,1,2,4,5,3,6,3
2,0,1,2,5,3,6,3,5
3,0,1,2,3,6,3,5,3
4,0,1,2,6,3,5,3,7
...,...,...,...,...,...,...,...,...
914069,0,1,801,7,44,28,28,7
914070,0,1,801,44,28,28,7,77
914071,0,1,801,28,28,7,77,122
914072,0,1,801,28,7,77,122,63


In [49]:
# Randomly partitions a dataset into training and test features/labels
def split_dataset(dataset, train_data_ratio, y_label, seed=SEED):
    """Sample a training fraction and separate the label column.

    Args:
        dataset: DataFrame to split.
        train_data_ratio: fraction of rows sampled into the training set.
        y_label: name of the label column.
        seed: random_state used for the sampling.

    Returns:
        (training_X, training_y, test_X, test_y); the test set is every row
        whose index was not sampled for training.
    """
    train_part = dataset.sample(frac=train_data_ratio, random_state=seed)
    test_part = dataset.drop(train_part.index)

    def _features_and_labels(frame):
        # Features are everything except the label column.
        return frame.drop(columns=y_label), frame[y_label]

    training_X, training_y = _features_and_labels(train_part)
    test_X, test_y = _features_and_labels(test_part)
    return training_X, training_y, test_X, test_y

# Build the windowed dataset, then sample 90% of the rows for training with
# seed 0. Column 4 is the label; with INPUT_SIZE = 5 that is the last token
# of each window.
dataset = process_dataset(DATASET, INPUT_SIZE)
training_X, training_y, test_X, test_y = split_dataset(dataset, 0.9, 4, 0)
training_X


Unnamed: 0,timesig,key,style,0,1,2,3
95331,129,1,516,54,66,441,7
284510,129,1,173,101,642,51,50
763572,129,1,1021,19,3,15,73
755451,0,1,702,3,62,122,4
571943,129,1,726,16,44,44,7
...,...,...,...,...,...,...,...
479845,129,180,173,5,16,188,7
897033,129,1,1120,3,20,19,27
715963,129,180,173,7,86,49,68
184943,0,1,700,19,7,30,20


In [37]:
# Performs one-hot encoding on dataset AFTER process_dataset
def onehots(dataframe, n_tokens):
    """One-hot encode every column of `dataframe` over ids ``0 .. n_tokens - 1``.

    For each input column ``c`` the result has the columns ``c_0`` to
    ``c_{n_tokens-1}``. A cell is 1 where the input value equals that id and 0
    otherwise, so values outside the id range encode as all zeros.

    Args:
        dataframe: frame of token ids.
        n_tokens: number of distinct token ids to encode.

    Returns:
        pandas.DataFrame: 0/1 integers, one row per input row, with a fresh
        0..n-1 index. An input without columns gives an empty frame.
    """
    # NOTE: the result has len(dataframe) * n_columns * n_tokens cells, so
    # encoding a large dataset in one go may not fit in memory.
    encoded_blocks = []
    for column_name, column in dataframe.items():
        # One vectorised comparison per token id replaces the per-cell Python
        # loop (and its leftover debug print).
        encoded_blocks.append(pandas.DataFrame({
            str(column_name) + '_' + str(ix): (column == ix).to_numpy().astype('int64')
            for ix in range(n_tokens)
        }))

    if not encoded_blocks:
        return pandas.DataFrame()
    # Concatenate once at the end instead of growing the frame inside the loop.
    return pandas.concat(encoded_blocks, axis=1)

In [39]:
# Build and train the model.
# - One score per vocabulary entry: the labels are token ids, so a single
#   output unit cannot represent them.
# - train_network requires a learning rate; use the configured one.
# NOTE(review): HIDDEN_LAYERS is used as the hidden size here and is
# currently 0 in the config cell - set it to a positive width before running.
rnn = PredictRNN(INPUT_SIZE+2, HIDDEN_LAYERS, num_tokens)
results = train_network(rnn, training_X, training_y, LEARNING_RATE)

Unnamed: 0,timesig,key,style,0,1,2,3,4
0,0,1,2,3,4,5,3,6
1,0,1,2,4,5,3,6,3
2,0,1,2,5,3,6,3,5
3,0,1,2,3,6,3,5,3
4,0,1,2,6,3,5,3,7
