In [None]:
import pandas as pd
import nltk
import numpy as np
import pickle as pkl

from gensim.models.fasttext import load_facebook_vectors
from nltk.tokenize import word_tokenize

from sklearn.model_selection import GridSearchCV

from sklearn.svm import SVC 

In [None]:
# Load the single-label song dataset from the shared public-data folder.
songs = pd.read_csv(
    filepath_or_buffer="../resource/asnlib/publicdata/SingleLabel.csv"
)

# Echo the frame as this cell's output.
songs

In [None]:
# Load the pre-computed word-embedding subset.
# NOTE(security): pickle.load can execute arbitrary code while deserializing,
# so only ever point this at a pickle file from a source you trust.
with open("../resource/asnlib/publicdata/subset.pkl", "rb") as f:
    embeddings = pkl.load(f)

# Show how many entries were loaded instead of echoing every stored vector,
# which would flood the notebook output.
len(embeddings)

In [None]:
from nltk.corpus import stopwords

# Raw lyrics, one entry per song.
lyrics = songs["lyrics"].to_numpy()

# Tokenize every song and lowercase each token (stopwords still present).
song_words_unfiltered = [
    [token.lower() for token in word_tokenize(text)]
    for text in lyrics
]

# English stopwords as a set so each membership check is a hash lookup.
stop_words = set(stopwords.words('english'))

# Same token lists with the stopwords dropped; order of songs is preserved.
song_words = [
    [token for token in tokens if token not in stop_words]
    for tokens in song_words_unfiltered
]

In [None]:
def avg(song, embeddings, dim=None):
    """Return the mean embedding vector of the words in one song.

    Words that are missing from ``embeddings`` contribute a zero vector, so
    they still count towards the denominator of the mean.

    Parameters
    ----------
    song : iterable of str
        Tokens of a single song.
    embeddings : mapping
        Maps a word to its embedding vector.
    dim : int, optional
        Length of the zero vector used for unknown words and for empty
        songs. When omitted it is taken from the first vector stored in
        ``embeddings``, falling back to 300 if the mapping is empty.

    Returns
    -------
    numpy.ndarray
        1-D array of length ``dim``. A song with no tokens yields an all-zero
        vector rather than NaN, so stacking the per-song results always
        gives a rectangular, finite matrix.
    """
    if dim is None:
        # Infer the vector size from the data instead of assuming 300.
        sample = next(iter(embeddings.values()), None)
        dim = 300 if sample is None else len(sample)

    zero_vec = np.zeros(dim)
    word_vecs = [
        embeddings[word] if word in embeddings else zero_vec
        for word in song
    ]

    # np.mean over an empty list returns a scalar NaN (with a warning);
    # give empty songs a well-defined zero vector instead.
    if not word_vecs:
        return zero_vec

    return np.mean(word_vecs, axis=0)

# One averaged embedding per song, in the same order as song_words.
song_vectors = [avg(tokens, embeddings) for tokens in song_words]

# Feature matrix for the classifier: one row per song.
X = np.array(song_vectors)

In [None]:
from sklearn.metrics import accuracy_score

# Target mood labels, one per song (same row order as X).
y = songs["label"]

# Hyperparameter grid explored by the search: regularisation strength C,
# kernel coefficient gamma, and the kernel type.
parameters = {
    'C': [.00001, .0001, .001, .01, .1, 1],
    'gamma': [0.00001, 0.00005, 0.0001, 0.0005, 0.001, 0.005],
    'kernel': ['poly', 'rbf']
}

# 5-fold cross-validated grid search over SVC, selecting by accuracy.
svm = SVC()
clf = GridSearchCV(svm, parameters, cv=5, scoring='accuracy', verbose=1)

# With the default refit=True, fit() retrains the best configuration on all
# of (X, y); predict() below therefore uses that refit model.
clf.fit(X, y)

# NOTE: this accuracy is measured on the same data the model was trained on,
# so it is a training accuracy and will overstate generalisation performance.
y_pred = clf.predict(X)
accuracy = accuracy_score(y, y_pred)
n_vecs = clf.best_estimator_.n_support_.sum()

print("Accuracy:", accuracy)
print("Number of support vectors:", n_vecs)

# Also report what the search selected and its mean held-out-fold accuracy,
# which is the less optimistic estimate of how the model generalises.
print("Best parameters:", clf.best_params_)
print("Cross-validated accuracy:", clf.best_score_)