In [3]:
import pandas as pd
import numpy as np
import keras
import csv
from keras.models import Sequential
from keras.layers import Dense
from keras.layers import Dropout
from sklearn.feature_extraction.text import CountVectorizer

In [4]:
def audioProcess(path):
    """Load the audio feature columns of a CSV file and z-score normalise them.

    Parameters
    ----------
    path : str
        Path of a CSV file that contains the columns "vect_1" and "vect_148".

    Returns
    -------
    list of list of float
        One inner list per CSV row. Every selected column has had its mean
        subtracted and has been divided by its sample standard deviation.

    Notes
    -----
    The mean and standard deviation are computed from the file being loaded,
    so each file passed to this function is normalised with its own statistics.
    """
    data = pd.read_csv(path)

    # Label-based slice: all columns positioned from "vect_1" up to and
    # including "vect_148". Slice from "loudness" instead to also take the
    # columns that start at "loudness".
    audioData = data.loc[:, "vect_1":"vect_148"]

    mean = audioData.mean()
    std = audioData.std()
    # A constant column has std 0 and a single-row file has std NaN; dividing by
    # either would fill the column with NaN. Divide by 1 instead so that such
    # columns come out as 0.
    std = std.where(std > 0, 1.0)

    audioData = (audioData - mean) / std

    return audioData.values.tolist()

In [5]:
def textProcess(path):
    """Read the CSV file at `path` and return its "tags" column as a Python list."""
    frame = pd.read_csv(path)
    tagColumn = frame["tags"]
    return tagColumn.tolist()

In [6]:
def labelProcess(path):
    """Read the CSV file at `path` and return its "genre" column as a Python list."""
    return pd.read_csv(path)["genre"].tolist()

In [7]:
def toOneHot(classes, labels):
    """Encode each label as a 0/1 list with a single 1 at the label's position in `classes`.

    classes -- list of the known class values; its order fixes the column order.
    labels  -- iterable of labels, each of which must occur in `classes`.

    Returns a list with one encoded row per label. A label that is missing
    from `classes` raises ValueError (from `list.index`).
    """
    width = len(classes)
    encoded = []
    for label in labels:
        hot = classes.index(label)
        encoded.append([1 if position == hot else 0 for position in range(width)])
    return encoded

In [8]:
def train(trainData, trainLabel, model):
    """Fit `model` on the given data and return ``(history, model)``.

    Training runs for at most 50 epochs with batches of 32 samples, passes
    ``validation_split=0.2`` to ``model.fit`` and attaches an EarlyStopping
    callback that monitors ``val_loss`` with a patience of 3.

    Note that the return value is a 2-tuple: the object returned by
    ``model.fit`` followed by the very same `model` that was passed in.
    """
    # Stop as soon as val_loss is no longer improving.
    early_stop = keras.callbacks.EarlyStopping(monitor='val_loss', patience=3, verbose=1)

    fit_options = dict(
        validation_split=0.2,
        epochs=50,        # upper bound on passes over the training data
        batch_size=32,    # samples fed to the model per batch
        verbose=1,
        callbacks=[early_stop],
    )
    history = model.fit(trainData, trainLabel, **fit_options)

    return history, model

In [9]:
def MLP(trainData, trainLabel):
    """Build and compile a fully connected softmax classifier.

    Layer stack: one ReLU layer as wide as the input, then ReLU layers of
    1000, 500 and 100 units, each of these four followed by Dropout(0.5),
    and finally a softmax layer with one unit per class.

    trainData  -- array; only its ``shape`` is read, to size the first layer.
    trainLabel -- label rows; the length of the first row is the class count.

    Returns the compiled model (no training happens here).
    """
    n_outputs = len(trainLabel[0])
    hidden = {"activation": "relu", "kernel_initializer": "random_uniform"}

    network = Sequential()
    network.add(Dense(trainData.shape[1], input_shape=trainData.shape[1:], **hidden))
    network.add(Dropout(0.5))
    for width in (1000, 500, 100):
        network.add(Dense(width, **hidden))
        network.add(Dropout(0.5))
    network.add(Dense(n_outputs, activation="softmax", kernel_initializer="random_uniform"))

    # Loss, optimizer and reported metric
    network.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])
    return network

In [10]:
def predict(testData, model):
    """Return whatever ``model.predict`` produces for `testData`, unchanged."""
    return model.predict(testData)

In [11]:
def evaluate(test_label, prediction):
    """Return the fraction of predictions that exactly match the true labels.

    Parameters
    ----------
    test_label : array-like
        True labels, either 1-D (one label per sample) or 2-D (one indicator
        row per sample).
    prediction : array-like
        Predicted labels, same shape as `test_label`.

    Returns
    -------
    float
        Share of samples whose prediction equals the true label. For 2-D
        input a sample only counts when its entire row matches.

    Raises
    ------
    ValueError
        If the two inputs differ in shape or contain no samples.
    """
    # The name `metrics` was never imported in this notebook, so the previous
    # `metrics.accuracy_score(...)` call could only raise NameError. numpy is
    # already imported and is enough to compute the same score.
    y_true = np.atleast_1d(np.asarray(test_label))
    y_pred = np.atleast_1d(np.asarray(prediction))

    if y_true.shape != y_pred.shape:
        raise ValueError(
            "test_label and prediction must have the same shape, got %s and %s"
            % (y_true.shape, y_pred.shape)
        )
    if y_true.size == 0:
        raise ValueError("cannot compute accuracy of an empty set of labels")

    matches = y_true == y_pred
    if matches.ndim > 1:
        # Indicator rows: a sample is correct only if every entry agrees.
        matches = matches.reshape(matches.shape[0], -1).all(axis=1)

    accuracy = float(matches.mean())
    return accuracy

Data Preprocessing

In [12]:
# Load the "tags" column of the train / valid / test feature files as NumPy arrays.
trainText, validText, testText = (
    np.array(textProcess(csvPath))
    for csvPath in (
        r"D:\Unimelb\Introduction to Machine Learning\assignment2\dataset\dataset\train_features.csv",
        r"D:\Unimelb\Introduction to Machine Learning\assignment2\dataset\dataset\valid_features.csv",
        r"D:\Unimelb\Introduction to Machine Learning\assignment2\dataset\dataset\test_features.csv",
    )
)

In [13]:
#Create BOW
# Turn every comma into a space instead of deleting it: deleting would glue two
# neighbouring tags into one token whenever no blank follows the comma, while
# the extra whitespace is ignored by CountVectorizer's tokenizer.
trainTxt = [tags.replace(",", " ") for tags in trainText]
validTxt = [tags.replace(",", " ") for tags in validText]
testTxt = [tags.replace(",", " ") for tags in testText]

# The vocabulary is learned from the training split only and then reused, so
# all three matrices share the same columns.
vectorizer = CountVectorizer()
trainVectors = vectorizer.fit_transform(trainTxt)
validVectors = vectorizer.transform(validTxt)
testVectors = vectorizer.transform(testTxt)

In [15]:
# Load the normalised audio feature rows of the train / valid / test files as NumPy arrays.
trainAudio, validAudio, testAudio = (
    np.array(audioProcess(csvPath))
    for csvPath in (
        r"D:\Unimelb\Introduction to Machine Learning\assignment2\dataset\dataset\train_features.csv",
        r"D:\Unimelb\Introduction to Machine Learning\assignment2\dataset\dataset\valid_features.csv",
        r"D:\Unimelb\Introduction to Machine Learning\assignment2\dataset\dataset\test_features.csv",
    )
)

In [16]:
# Final feature matrix per split: densified bag-of-words counts on the left,
# audio features appended on the right.
trainData, validData, testData = (
    np.column_stack((bow.toarray(), audio))
    for bow, audio in (
        (trainVectors, trainAudio),
        (validVectors, validAudio),
        (testVectors, testAudio),
    )
)

In [17]:
trainLabel = labelProcess(r"D:\Unimelb\Introduction to Machine Learning\assignment2\dataset\dataset\train_labels.csv")
validLabel = labelProcess(r"D:\Unimelb\Introduction to Machine Learning\assignment2\dataset\dataset\valid_labels.csv")

# Distinct training genres in sorted order. Iterating a set of strings has no
# stable order between interpreter runs, so an unsorted list would assign the
# genres to different one-hot columns every time the kernel is restarted.
classes = sorted(set(trainLabel))

oneHotTrain = np.array(toOneHot(classes, trainLabel))
oneHotvalid = np.array(toOneHot(classes, validLabel))

Training

In [18]:
# Build the network sized for the stacked features and the one-hot labels,
# then print its layer table.
model = MLP(trainData=trainData, trainLabel=oneHotTrain)
model.summary()

Instructions for updating:
Colocations handled automatically by placer.
Instructions for updating:
Please use `rate` instead of `keep_prob`. Rate should be set to `rate = 1 - keep_prob`.
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
dense_1 (Dense)              (None, 4975)              24755600  
_________________________________________________________________
dropout_1 (Dropout)          (None, 4975)              0         
_________________________________________________________________
dense_2 (Dense)              (None, 1000)              4976000   
_________________________________________________________________
dropout_2 (Dropout)          (None, 1000)              0         
_________________________________________________________________
dense_3 (Dense)              (None, 500)               500500    
_________________________________________________________________
dropout_3 (Dropout)  

In [19]:
# train() returns a (history, model) pair; unpack it so that `history` is the
# object returned by model.fit rather than the whole tuple. The returned model
# is the same object that was passed in.
history, model = train(trainData, oneHotTrain, model)
print("Finish training")

Instructions for updating:
Use tf.cast instead.
Train on 6142 samples, validate on 1536 samples
Epoch 1/50
Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 00005: early stopping
Finish training


In [20]:
# Score the trained model on the separate validation split.
loss, accuracy = model.evaluate(x=validData, y=oneHotvalid)
print("loss, accuracy:", loss, accuracy)

loss, accuracy: 1.193007116317749 0.6333333333333333


Predict

In [21]:
# Track identifiers of the test file, in file order, to pair with the predictions.
trackID = pd.read_csv(
    r"D:\Unimelb\Introduction to Machine Learning\assignment2\dataset\dataset\test_features.csv"
)["trackID"].tolist()

prediction = predict(testData, model)

In [22]:
# Map every row of class scores to the name of its highest-scoring class.
# `prediction` itself is left untouched so this cell can be re-run safely;
# np.argmax picks the first maximum, like list.index(max(...)) would.
results_name = [classes[int(np.argmax(scores))] for scores in np.asarray(prediction)]

# (trackID, predicted genre) pairs, in test-file order
result = list(zip(trackID, results_name))

In [24]:
# Output file for this run; "all_MLP_outPut.csv" was the name used for the
# alternative run.
file = "allNoMeta_MLP_outPut.csv"

# newline='' lets the csv module control the line endings itself.
with open(file, 'w', newline='') as csvFile:
    writer = csv.writer(csvFile)
    # Header row first, then one (trackID, genre) row per prediction.
    writer.writerows([("trackID", "genre"), *result])