In [None]:
import pandas as pd
import nltk
import numpy as np
import pickle as pkl

from gensim.models.fasttext import load_facebook_vectors
from nltk.tokenize import word_tokenize

from sklearn.model_selection import GridSearchCV

from sklearn.svm import SVC 

In [None]:
# Read the song table from the course data folder. Later cells read its
# 'lyrics' and 'label' columns. The bare name on the last line renders the
# frame as this cell's output.
songs = pd.read_csv(
    filepath_or_buffer="../resource/asnlib/publicdata/SingleLabel.csv"
)
songs

In [None]:
# NOTE(review): pickle.load can execute arbitrary code while unpickling, so
# this file must come from a trusted source.
with open("../resource/asnlib/publicdata/subset.pkl", "rb") as f:
    embeddings = pkl.load(f)

# Show a compact summary (container type and number of entries) instead of
# echoing the whole unpickled object, which would flood the cell output.
type(embeddings).__name__, len(embeddings)

In [None]:
from nltk.corpus import stopwords

# Fetch the NLTK resources the preprocessing below relies on.
# 'punkt' holds the tokenizer models used by older NLTK releases.
nltk.download('punkt')
# Newer NLTK releases load the word_tokenize models from 'punkt_tab' instead
# of the pickled 'punkt' package; without it word_tokenize raises LookupError.
nltk.download('punkt_tab')
# Stop-word lists read through nltk.corpus.stopwords.
nltk.download('stopwords')

# Cache for stop-word sets so the corpus file is read once, not once per song.
_STOP_WORD_CACHE = {}


def _english_stop_words():
    """Return NLTK's English stop words as a frozenset, loading them only once."""
    if 'english' not in _STOP_WORD_CACHE:
        _STOP_WORD_CACHE['english'] = frozenset(stopwords.words('english'))
    return _STOP_WORD_CACHE['english']


def preprocess_text(text, stop_words=None):
    """Tokenize ``text``, lowercase every token, and drop stop words.

    Args:
        text: String to split into tokens with NLTK's ``word_tokenize``.
        stop_words: Optional container of lowercase words to remove. When
            omitted, NLTK's English stop-word list is used; it is loaded on
            the first call and reused afterwards instead of being rebuilt
            for every call.

    Returns:
        A list of lowercase tokens, in their original order, with every
        token found in ``stop_words`` removed.
    """
    if stop_words is None:
        stop_words = _english_stop_words()
    # Lowercase before filtering so the stop-word check is case-insensitive.
    lowered = (token.lower() for token in word_tokenize(text))
    return [token for token in lowered if token not in stop_words]

# Keep the raw lyric strings the first time this cell runs. Overwriting
# songs['lyrics'] directly made the cell non-re-runnable: a second run would
# pass token lists (not strings) to word_tokenize and fail.
if 'lyrics_raw' not in songs.columns:
    songs['lyrics_raw'] = songs['lyrics']

# Always tokenize from the preserved raw text so re-running is idempotent.
songs['lyrics'] = songs['lyrics_raw'].apply(preprocess_text)
# One token list per song, in row order.
song_words = songs['lyrics'].tolist()

In [None]:
# Reuse the object already unpickled from
# ../resource/asnlib/publicdata/subset.pkl into `embeddings` earlier in the
# notebook. The previous code re-opened a bare 'subset.pkl', which resolves
# against the working directory rather than the data folder used by every
# other load, and unpickled the same-named file a second time.
word_embeddings = embeddings

def calculate_avg_word_vector(lyrics, word_embeddings):
    """Average the embedding vectors of the tokens in ``lyrics``.

    Args:
        lyrics: Iterable of tokens for one song.
        word_embeddings: Mapping from token to its embedding vector. The
            output dimensionality is the length of the first value in it.

    Returns:
        A 1-D NumPy array holding the element-wise mean of the vectors of
        all tokens present in ``word_embeddings`` (repeated tokens count
        each time). If no token is present, an all-zero vector is returned.
    """
    dimension = len(next(iter(word_embeddings.values())))

    # Gather the vectors of known tokens, keeping duplicates and order.
    known_vectors = [
        word_embeddings[token] for token in lyrics if token in word_embeddings
    ]

    total = np.zeros(dimension)
    for vector in known_vectors:
        total += vector

    # Leave the zero vector untouched when nothing matched (avoids 0/0).
    if known_vectors:
        total /= len(known_vectors)

    return total

# Turn each song's token list into one averaged embedding vector, then stack
# the per-song vectors into the feature matrix (one row per song).
song_embeddings = [
    calculate_avg_word_vector(lyrics, word_embeddings) for lyrics in song_words
]

X = np.array(song_embeddings)

In [None]:
# Estimator to tune; every setting not listed in the grid keeps SVC's default.
svm = SVC()

# Candidate hyperparameters: 6 values of C x 6 values of gamma x 2 kernels.
param_grid = {
    'C': [0.00001, 0.0001, 0.001, 0.01, 0.1, 1],
    'gamma': [0.00001, 0.00005, 0.0001, 0.0005, 0.001, 0.005],
    'kernel': ['poly', 'rbf'],
}

# Exhaustive search scored by accuracy with 5-fold cross-validation, using
# all available cores (n_jobs=-1) and per-fit progress logging (verbose=2).
grid_search = GridSearchCV(
    estimator=svm,
    param_grid=param_grid,
    cv=5,
    scoring='accuracy',
    verbose=2,
    n_jobs=-1,
)
grid_search.fit(X, songs['label'])

# Pull the results out of the fitted search before reporting them.
best_model = grid_search.best_estimator_
# best_score_ is the cross-validated score of the best parameter combination.
accuracy = grid_search.best_score_
# n_support_ holds the support-vector count per class; sum for the total.
n_vecs = best_model.n_support_.sum()

print("Best Parameters:", grid_search.best_params_)

print("Accuracy of the best model:", accuracy)

print("Number of support vectors:", n_vecs)
