# Finding the Feeling: Identifying the Key Emotions of a song

In [2]:
import pandas as pd
import numpy as np

## Lyrics Preprocessing: To lower, 

In [8]:
import nltk
import os
import string
import re
# create new folder to hold the preprocessed data
# create new text files with the preprocessed data 
# create three functions to do upper to lower, 

rawTextDataDir = "dataset/Lyrics/"
preprocessedDataDir = "preprocessedData/Lyrics/"

#iterate over all files in rawText and create an associated proccessed file in preprocessed dir

# to lower
for fileName in os.listdir(rawTextDataDir):
    
    with open(rawTextDataDir + fileName, 'r') as data:
        newFileName = "processed" + fileName
        
        with open (preprocessedDataDir + newFileName, 'a') as newData:
            for line in data:
                newData.write(line.lower())
            newData.close();
    data.close()




In [9]:
# punctuatuon removal 

for fileName in os.listdir(preprocessedDataDir):
    with open(preprocessedDataDir + fileName, 'r') as file:
        data = file.read()
    data = data.translate(str.maketrans("","", string.punctuation))
    with open(preprocessedDataDir + fileName, 'w') as file:
        file.write(data)
            


In [10]:
#stop word removal
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize


nltk.download('stopwords')
nltk.download('punkt')

stop_words = set(stopwords.words('english'))
for fileName in os.listdir(preprocessedDataDir):
    with open(preprocessedDataDir + fileName) as file:
        data = file.read()
    tokens = nltk.word_tokenize(data)
    filtered_tokens = [word for word in tokens if not word.lower() in stop_words]
    filtered_text = ' '.join(filtered_tokens)
    with open(preprocessedDataDir + fileName, 'w') as file:
            file.write(filtered_text)

[nltk_data] Downloading package stopwords to
[nltk_data]     /home/markpolkhovskiy/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     /home/markpolkhovskiy/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [11]:
# Lemmatization 
from nltk.stem import WordNetLemmatizer
from nltk.stem import PorterStemmer

nltk.download('wordnet')
nltk.download('omw-1.4')
lemmatizer = WordNetLemmatizer()
stemmer = PorterStemmer()


for fileName in os.listdir(preprocessedDataDir):
    with open(preprocessedDataDir + fileName) as file:
        data = file.read()
    tokens = nltk.word_tokenize(data)
    for (index,word) in enumerate(tokens):
        tokens[index] = lemmatizer.lemmatize(word)
        #tokens[index] = stemmer.stem(word)
    lemmatized_tokens = ' '.join(tokens)
    with open(preprocessedDataDir + fileName, 'w') as file:
        file.write(lemmatized_tokens);


[nltk_data] Downloading package wordnet to
[nltk_data]     /home/markpolkhovskiy/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package omw-1.4 to
[nltk_data]     /home/markpolkhovskiy/nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!


## Audio processing

In [70]:
import librosa

### Define functions to exatract features and combine them

In [71]:
def extractLoudness(signal):
    df_loudness = pd.DataFrame()
    S, phase = librosa.magphase(librosa.stft(signal))
    rms = librosa.feature.rms(S=S)
    df_loudness['Loudness'] = rms[0]
    return df_loudness

def extractMFCCS(signal, sample_rate):
    df_mfccs = pd.DataFrame()
    mfccs = librosa.feature.mfcc(y=signal, sr=sample_rate, n_mfcc=12)
    for n_mfcc in range(len(mfccs)):
        df_mfccs['MFCC_%d'%(n_mfcc+1)] = mfccs.T[n_mfcc]
    return df_mfccs

def extractZeroCrossingRate(signal):
    df_zero_crossing_rate = pd.DataFrame()
    zcr = librosa.feature.zero_crossing_rate(y=signal)
    df_zero_crossing_rate['ZCR'] = zcr[0]
    return df_zero_crossing_rate

def extractChroma(signal, sample_rate):
    df_chroma = pd.DataFrame()
    chromagram = librosa.feature.chroma_stft(y=signal, sr=sample_rate)
    for n_chroma in range(len(chromagram)):
        df_chroma['Chroma_%d'%(n_chroma+1)] = chromagram.T[n_chroma]
    return df_chroma

def extractMelSpectrogram(signal, sample_rate):
    df_mel_spectrogram = pd.DataFrame()
    mel_spectrogram = librosa.feature.melspectrogram(y=signal, sr=sample_rate, n_mels=12)
    for n_mel in range(len(mel_spectrogram)):
        df_mel_spectrogram['Mel_Spectrogram_%d'%(n_mel+1)] = mel_spectrogram.T[n_mel]
    return df_mel_spectrogram

def matrixToVector(matrixFeatures):
    vector = []
    for label in matrixFeatures.columns:
        if label == 'Loudness' or label == 'ZCR':
            vector.append(np.mean(matrixFeatures[label].dropna().to_numpy()))
        else:
            vector.extend(matrixFeatures[label].dropna().to_numpy())
    return vector

### Extract features and flatten it

In [72]:
audioDir = 'dataset/Audio'
features_vectors = []

for file in os.listdir(audioDir):
    signal, rate = librosa.load(f'{audioDir}/{file}')
    matrix = pd.concat([
        extractLoudness(signal), 
        extractMFCCS(signal, rate), 
        extractZeroCrossingRate(signal),
        extractChroma(signal, rate),
        extractMelSpectrogram(signal, rate)])
    features_vectors.append(matrixToVector(matrix))

KeyboardInterrupt: 

### Export data into a csv

In [15]:
features_vectors = np.asarray(features_vectors)
np.savetxt('audio_features.csv', features_vectors, delimiter=',')

### Load audio feature data from csv

In [73]:
import csv
reader = csv.reader(open("audio_features.csv", "r"), delimiter=",")
x = list(reader)
features = np.array(x).astype("float")

# Post-Processing + Model Building + Model Eval

In [355]:
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import ConfusionMatrixDisplay, confusion_matrix, roc_curve, auc, classification_report, accuracy_score

### Read Audio Features to DataFrame from CSV

In [356]:
audioFeatures = pd.read_csv('audio_features.csv')

### Get Clusters 

In [357]:
clusters = []
with open('./dataset/clusters.txt') as file:
    for line in file:
        clusters.append(line.split()[1])
combined = pd.DataFrame({"clusters":clusters, 'text':np.nan})

### Get Text Data

In [358]:
import os
path = "./preprocessedData/Lyrics/"
text = pd.DataFrame()
combined['text'] = combined['text'].astype('string')
for i in os.listdir(path):
    textfile = open(path+i)
    combined.at[int(i[9:12])-1,'text'] = "".join(textfile.readlines())#.split(" ")
    textfile.close()

### Combine and fill na cells

In [359]:
combined_final = pd.concat([combined, audioFeatures.set_axis(combined.index)], axis=1)
combined_final = combined_final.fillna("")

### Scale the audio data by column (feature) and average by row (file)

In [361]:
from sklearn.preprocessing import MinMaxScaler
scaler = MinMaxScaler(feature_range=(-1,1))

for i in range (0, 433):
    combined_final['a'+str(i)] = (scaler.fit_transform(combined_final['a'+str(i)].to_numpy().reshape(-1,1)))

cols = combined_final.columns.drop(['clusters','text'])
combined_final['a0'] = combined_final.loc[:, cols].mean(axis=1)

combined_final = combined_final.loc[:, ["clusters","text", "a0"]]

### Split Data

In [365]:
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
scale = StandardScaler() #from sklearn

x = combined_final.drop('clusters', axis=1)
y = combined_final.loc[:, ['clusters']]

#split dataset into training dataset and test dataset
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size = 0.3, random_state=8, shuffle=True)

### Getting TF*IDF from text pre-processing

In [366]:
from sklearn.feature_extraction.text import TfidfVectorizer
tfidf = TfidfVectorizer()

#train
tfidf_vector = tfidf.fit_transform(x_train.loc[:,'text'])
tfidf_array = tfidf_vector.toarray()

#test
tfidf_vector_test = tfidf.transform(x_test.loc[:,'text'])
tfidf_array_test = tfidf_vector_test.toarray()

#to dataframes
tfdf = pd.DataFrame(tfidf_array)
tfdf_test = pd.DataFrame(tfidf_array_test)

### Getting word2Vec from text pre-processing

In [367]:
import gensim
from gensim.models import Word2Vec

numpy_train = x_train.loc[:,'text'].to_numpy()
numpy_test= x_test.loc[:,'text'].to_numpy()

trained_w2v = [sentence.split() for sentence in numpy_train]
w2v = gensim.models.Word2Vec(trained_w2v, min_count = 40, vector_size = 100, window = 5, sg=0, workers=4)

#vectorize the data
def vectorize(sentence):
    words = sentence.split()
    words_vecs = [w2v.wv[word] for word in words if word in w2v.wv]
    if len(words_vecs) == 0:
        return np.zeros(100)
    words_vecs = np.array(words_vecs)
    return words_vecs.mean(axis=0)
    
x_train_w2v_vect = pd.DataFrame(np.array([vectorize(sentence) for sentence in numpy_train]))
x_test_w2v_vect = pd.DataFrame(np.array([vectorize(sentence) for sentence in numpy_test]))

### concat all features 

In [368]:
x_train = pd.concat([x_train, tfdf.set_axis(x_train.index), x_train_w2v_vect.set_axis(x_train.index)], axis=1)
x_train = x_train.drop('text', axis=1)
#x_train = x_train.rename({'0': 'audio'}, axis=1)
x_train.columns = x_train.columns.astype(str)

x_test = pd.concat([x_test, tfdf_test.set_axis(x_test.index), x_test_w2v_vect.set_axis(x_test.index)], axis=1)
x_test = x_test.drop('text', axis=1)
#x_test = x_test.rename({'0': 'audio'}, axis=1)
x_test.columns = x_test.columns.astype(str)

### Model Training and Testing

In [369]:
from sklearn.linear_model import LinearRegression
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import MultinomialNB
from sklearn.svm import SVC

In [370]:
# modelSelector holds all of the different models we will use to train the data
modelSelector = {
    'logistic regression' : LogisticRegression(),
    'Decision Tree': DecisionTreeClassifier(),
    'KNN': KNeighborsClassifier(),
    'Naive Bayes': GaussianNB(),
    'Random Forest': RandomForestClassifier(),
    'Support Vector Classifier': SVC(kernel='linear')
}

for modelName, model in modelSelector.items():
    model.fit(x_train, y_train) 
    y_pred = model.predict(x_test)
    accuracy = accuracy_score(y_test, y_pred)
    print(f"{modelName} Accuracy: {accuracy:.2f}")  # helps with analysis, we may wanna move this to further modulize implementation
    print(classification_report(y_test, y_pred, target_names=model.classes_))
    print()

  y = column_or_1d(y, warn=True)
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


logistic regression Accuracy: 0.37
              precision    recall  f1-score   support

           1       0.19      0.15      0.17        48
           2       0.42      0.22      0.29        51
           3       0.41      0.75      0.53        55
           4       0.45      0.30      0.36        63
           5       0.34      0.43      0.38        54

    accuracy                           0.37       271
   macro avg       0.36      0.37      0.34       271
weighted avg       0.37      0.37      0.35       271


Decision Tree Accuracy: 0.28
              precision    recall  f1-score   support

           1       0.19      0.21      0.20        48
           2       0.21      0.16      0.18        51
           3       0.36      0.51      0.42        55
           4       0.32      0.29      0.30        63
           5       0.27      0.22      0.24        54

    accuracy                           0.28       271
   macro avg       0.27      0.28      0.27       271
weighted avg

  return self._fit(X, y)


KNN Accuracy: 0.31
              precision    recall  f1-score   support

           1       0.21      0.29      0.25        48
           2       0.31      0.29      0.30        51
           3       0.32      0.44      0.37        55
           4       0.38      0.32      0.35        63
           5       0.37      0.20      0.26        54

    accuracy                           0.31       271
   macro avg       0.32      0.31      0.31       271
weighted avg       0.32      0.31      0.31       271




  y = column_or_1d(y, warn=True)


Naive Bayes Accuracy: 0.28
              precision    recall  f1-score   support

           1       0.18      0.19      0.18        48
           2       0.18      0.25      0.21        51
           3       0.40      0.35      0.37        55
           4       0.34      0.41      0.37        63
           5       0.35      0.17      0.23        54

    accuracy                           0.28       271
   macro avg       0.29      0.27      0.27       271
weighted avg       0.30      0.28      0.28       271




  model.fit(x_train, y_train)


Random Forest Accuracy: 0.35
              precision    recall  f1-score   support

           1       0.33      0.31      0.32        48
           2       0.30      0.14      0.19        51
           3       0.34      0.78      0.48        55
           4       0.33      0.27      0.30        63
           5       0.44      0.22      0.30        54

    accuracy                           0.35       271
   macro avg       0.35      0.34      0.32       271
weighted avg       0.35      0.35      0.32       271




  y = column_or_1d(y, warn=True)


Support Vector Classifier Accuracy: 0.35
              precision    recall  f1-score   support

           1       0.20      0.19      0.19        48
           2       0.37      0.20      0.26        51
           3       0.38      0.60      0.46        55
           4       0.41      0.27      0.33        63
           5       0.36      0.46      0.40        54

    accuracy                           0.35       271
   macro avg       0.34      0.34      0.33       271
weighted avg       0.35      0.35      0.33       271


