# Song_Data_Preprocessing
## Overview
This module will take in a text file that has over 2000 4-chord progressions. It will then manufacture a training data set out of it for predicting a fourth chord (an integer from 1-7) based on the preceeding 3. This will be achieved using first a scikit-learn model, then with a PyTorch model. 

## Details
The original dataset file is a text file with 2000 4-chord progressions separated by periods.

First, each 4-chord progression will be copied 3-4 times in a row to get across the repeatability/looping aspect of each chord progression. Then all of the 3-4 repeated progressions will be appended. The dataset will finally be constructed by taking the first 3 as the X and the fourth as the Y. The first value will be popped and the process will be repeated.

The input to the model will be a 3-long vector and the output will be a 1-hot 7-long vector (or an integer depending on the model).

In [2]:
# Import box
import csv
from random import shuffle

import numpy as np

# Torch stuff
import torch
import torch.nn as nn
import matplotlib.pyplot as plt

from torch.autograd import Variable

In [5]:
# Getting PSV (period separated file) in
csvIn = csv.reader(open('../dataset/ChordProgressionsAll.csv'))
all_data = []
for row in csvIn:
    all_data.append(row)
    
    
# a = all_data[200]
# b = all_data[201]
# a += b
# print(a)

shuffle(all_data)

for i in range(len(all_data)):
    tmp = all_data[i]
    all_data[i] += tmp
    all_data[i] += tmp
    
super_list = []

for i in range(len(all_data)):
    super_list += all_data[i]
    
for i in range(len(super_list)):
    super_list[i] = int(super_list[i])

# print(super_list)

train_x_list = [];
train_y_list = [];

while(len(super_list) > 4):
    train_x_list += [super_list[0:4]]
#     train_y_list.append(super_list[3])
    super_list.pop(0)
    
shuffle(train_x_list)


# with open('../exports/processed.csv', 'w') as myfile:
#     wr = csv.writer(myfile, quoting=csv.QUOTE_ALL)
#     wr.writerow(train_x_list)

with open("../exports/processed.csv", "w") as f:
    writer = csv.writer(f)
    writer.writerows(train_x_list)

train_y_list = train_x_list[:][3]
    
# print(len(train_x_list))
# print(len(train_y_list))

# total_entries = len(train_x_list)


In [207]:
class PredNet(nn.Module):
    def __init__(self, input_size, hidden1_size, hidden2_size, num_classes):
        super(PredNet, self).__init__()
        self.fc1 = nn.Linear(input_size, hidden1_size)
        self.relu1 = nn.ReLU()
        self.fc2 = nn.Linear(hidden1_size, hidden2_size)
        self.relu2 = nn.ReLU()
        self.fc3 = nn.Linear(hidden2_size, num_classes)
    
    def forward(self, x): 
        out = self.fc1(x)
        out = self.relu1(out)
        out = self.fc2(out)
        out = self.relu2(out)
        out = self.fc3(out)
        _,out1 = out.max(0)
        print("TYPE OF OUT1 iS: ",type(out1))
        out1 = torch.FloatTensor(out)
        return out1


In [208]:
# def accuracy(out, labels):
#     outputs = np.argmax(out, axis=1)
#     return np.sum(outputs==labels)/float(labels.size)

In [209]:
model = PredNet(3, 40, 40, 7)

In [213]:
# Making np arrays out of lists
train_x_np = np.array(train_x_list[0:25000])
train_y_np = np.array(train_y_list[0:25000])

val_x_np = np.array(train_x_list[25001:len(train_x_list)-1])
val_y_np = np.array(train_y_list[25001:len(train_y_list)-1])

print(train_x_np)
print(train_y_np)

[[4 5 6]
 [5 6 2]
 [6 2 4]
 ...
 [4 1 2]
 [1 2 6]
 [2 6 4]]
[2 4 5 ... 6 4 1]


In [217]:
# Training scikit NN models
from sklearn.neural_network import MLPClassifier
clf = MLPClassifier(solver='lbfgs', alpha=1e-5, hidden_layer_sizes=(6), random_state=112)
clf.fit(train_x_np, train_y_np)

skOut = clf.predict(val_x_np)

correct = sum(skOut == val_y_np)
print(correct/len(skOut))

0.1278150633855332


In [212]:
from sklearn.linear_model import LinearRegression
reg = LinearRegression().fit(train_x_np, train_y_np)
reg.score(train_x_np, train_y_np)

0.000247041143298099

In [186]:
# Making tensors out of np arrays
train_x_tensor = torch.from_numpy(train_x_np)
train_y_tensor = torch.from_numpy(train_y_np)
# train_y_tensor = torch.zeros([train_x_tensor.size()[0]+1, 7], dtype=torch.int32)

print(train_y_tensor)

val_x_tensor = torch.from_numpy(val_x_np)
val_y_tensor = torch.from_numpy(val_y_np)

tensor([3, 4, 4,  ..., 7, 5, 7])


In [187]:
# Making training Variables
Variable_X = Variable(train_x_tensor);
Variable_Y = Variable(train_y_tensor);

In [188]:
# Learning info and parameters
loss_fn = nn.MSELoss()
learning_rate = 0.001
optimizer = torch.optim.SGD(model.parameters(), lr = learning_rate, nesterov=True, momentum = 0.9, dampening = 0)
num_epochs = 1

train_loss = []
test_loss = []
train_accuracy = []
test_accuracy = []

In [189]:
for epoch in range(num_epochs):
    model.train()
    optimizer.zero_grad()
    
    print(train_x_tensor.size())
    
    pred_y = model(train_x_tensor.float())
    
    print(pred_y.size())
    
    print(type(pred_y))
    print(pred_y)
    loss = loss_fn(pred_y, train_y_tensor)
    
    loss.backward()
    optimizer.step()
    
    model.eval()
    train_loss.append(float(loss.data))
    print((loss))

torch.Size([25000, 3])
TYPE OF OUT1 iS:  <class 'torch.Tensor'>
torch.Size([25000, 7])
<class 'torch.Tensor'>
tensor([[ 0.1311, -0.0520,  0.8611,  ..., -0.3829, -0.5027, -0.2835],
        [ 0.3176, -0.0618,  0.6982,  ..., -0.3394, -0.3013, -0.2981],
        [ 0.2970,  0.0757,  0.6429,  ..., -0.3033, -0.2301, -0.2643],
        ...,
        [ 0.0901, -0.0474,  1.4953,  ..., -0.5803, -0.7415, -0.3428],
        [ 0.6206,  0.0101,  1.4023,  ..., -0.6212, -0.5321, -0.6825],
        [ 0.2255,  0.1101,  1.4520,  ..., -0.5934, -0.4968, -0.3325]],
       grad_fn=<SliceBackward>)


RuntimeError: The size of tensor a (7) must match the size of tensor b (25000) at non-singleton dimension 1