# Finding the Feeling: Identifying the Key Emotions of a song

In [7]:
import pandas as pd
import numpy as np

## Lyrics Preprocessing: lowercasing, punctuation removal, stop-word removal, and lemmatization

In [8]:
import nltk
import os
import string
import re
# Lower-casing step of the lyrics preprocessing.
# For every file in the raw lyrics folder, write a lower-cased copy named
# "processed<original name>" into the preprocessed folder.

rawTextDataDir = "dataset/Lyrics/"
preprocessedDataDir = "preprocessedData/Lyrics/"

# Create the output folder up front so the cell works on a fresh checkout.
os.makedirs(preprocessedDataDir, exist_ok=True)

# Iterate over all files in the raw folder and create the associated processed file.
for fileName in os.listdir(rawTextDataDir):
    newFileName = "processed" + fileName

    # Open the target in 'w' mode (not 'a'): the processed files are rewritten
    # in place by other cells, so appending on a re-run would tack a second,
    # raw lower-cased copy of the lyrics onto already-processed text.
    # The `with` statement closes both files; no explicit close() is needed.
    with open(rawTextDataDir + fileName, 'r') as data, \
            open(preprocessedDataDir + newFileName, 'w') as newData:
        for line in data:
            newData.write(line.lower())




In [9]:
# Punctuation removal: rewrite every preprocessed lyrics file in place with
# all characters from string.punctuation deleted.

# The deletion table does not depend on the file, so build it once.
punctuationTable = str.maketrans("", "", string.punctuation)

for fileName in os.listdir(preprocessedDataDir):
    lyricsPath = preprocessedDataDir + fileName
    # Read the whole file first, then reopen it for writing, so the original
    # content is fully in memory before the file is truncated.
    with open(lyricsPath, 'r') as file:
        data = file.read()
    data = data.translate(punctuationTable)
    with open(lyricsPath, 'w') as file:
        file.write(data)
            


In [10]:
#stop word removal
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize


# Fetch the NLTK resources this cell uses: the stop-word lists and the
# 'punkt' tokenizer data.
nltk.download('stopwords')
nltk.download('punkt')

# A set gives constant-time membership checks while filtering.
stop_words = set(stopwords.words('english'))

# NOTE(review): the punctuation-removal cell strips every character in
# string.punctuation (apostrophes included) from these files, so contractions
# presumably arrive here as e.g. "dont" -- confirm they are still filtered
# as intended.
for fileName in os.listdir(preprocessedDataDir):
    lyricsPath = preprocessedDataDir + fileName
    with open(lyricsPath) as file:
        data = file.read()

    # Tokenize, drop tokens whose lower-cased form is an English stop word,
    # and re-join the survivors with single spaces.
    tokens = nltk.word_tokenize(data)
    filtered_tokens = list(
        filter(lambda token: token.lower() not in stop_words, tokens)
    )
    filtered_text = ' '.join(filtered_tokens)

    # Overwrite the file with the filtered text.
    with open(lyricsPath, 'w') as file:
        file.write(filtered_text)

[nltk_data] Downloading package stopwords to
[nltk_data]     /home/markpolkhovskiy/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     /home/markpolkhovskiy/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [11]:
# Lemmatization 
from nltk.stem import WordNetLemmatizer
from nltk.stem import PorterStemmer

# Fetch the NLTK resources requested by this cell before lemmatizing.
nltk.download('wordnet')
nltk.download('omw-1.4')

lemmatizer = WordNetLemmatizer()
# The stemmer is instantiated but not used by this cell; only lemmatization
# is applied below.
stemmer = PorterStemmer()

for fileName in os.listdir(preprocessedDataDir):
    lyricsPath = preprocessedDataDir + fileName
    with open(lyricsPath) as file:
        data = file.read()

    # Lemmatize every token (default lemmatize() arguments) and re-join the
    # result with single spaces.
    tokens = [lemmatizer.lemmatize(word) for word in nltk.word_tokenize(data)]
    lemmatized_tokens = ' '.join(tokens)

    # Overwrite the file with the lemmatized text.
    with open(lyricsPath, 'w') as file:
        file.write(lemmatized_tokens)


[nltk_data] Downloading package wordnet to
[nltk_data]     /home/markpolkhovskiy/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package omw-1.4 to
[nltk_data]     /home/markpolkhovskiy/nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!


## Audio processing

In [12]:
import librosa

### Define functions to extract features and combine them

In [13]:
def extractLoudness(signal):
    """Return a one-column DataFrame ('Loudness') of RMS values for ``signal``.

    The RMS is computed by ``librosa.feature.rms`` from the magnitude part of
    the signal's STFT; the phase part is discarded. Only the first row of the
    RMS output is kept.
    """
    magnitude, _phase = librosa.magphase(librosa.stft(signal))
    rms_rows = librosa.feature.rms(S=magnitude)
    return pd.DataFrame({'Loudness': rms_rows[0]})

def extractMFCCS(signal, sample_rate):
    """Return a DataFrame with columns 'MFCC_1' .. 'MFCC_N' built from librosa's MFCCs.

    ``librosa.feature.mfcc`` is called with ``n_mfcc=12``. One column is
    created per entry along the first axis of its output, and column
    'MFCC_k' holds row ``k-1`` of the *transposed* output, i.e. the values at
    position ``k-1`` along the second axis.

    NOTE(review): librosa documents the output as (n_mfcc, n_frames). If so,
    each column holds all coefficients of one of the first 12 frames rather
    than one coefficient over time, and a signal with fewer frames than
    coefficients raises IndexError. Downstream flattening relies on the
    resulting fixed size -- confirm this is intended before changing it.
    """
    coefficients = librosa.feature.mfcc(y=signal, sr=sample_rate, n_mfcc=12)
    transposed = coefficients.T
    return pd.DataFrame({
        'MFCC_%d' % (position + 1): transposed[position]
        for position in range(len(coefficients))
    })

def extractZeroCrossingRate(signal):
    """Return a one-column DataFrame ('ZCR') with the zero-crossing rate of ``signal``.

    Only the first row of ``librosa.feature.zero_crossing_rate``'s output is
    kept.
    """
    zcr_rows = librosa.feature.zero_crossing_rate(y=signal)
    return pd.DataFrame({'ZCR': zcr_rows[0]})

def extractChroma(signal, sample_rate):
    """Return a DataFrame with columns 'Chroma_1' .. 'Chroma_N' from a chromagram.

    One column is created per entry along the first axis of
    ``librosa.feature.chroma_stft``'s output, and column 'Chroma_k' holds row
    ``k-1`` of the *transposed* chromagram, i.e. the values at position
    ``k-1`` along the second axis.

    NOTE(review): librosa documents the output as (n_chroma, n_frames). If so,
    each column holds all chroma bins of one early frame rather than one bin
    over time. Downstream flattening relies on the resulting fixed size --
    confirm this is intended before changing it.
    """
    chromagram = librosa.feature.chroma_stft(y=signal, sr=sample_rate)
    transposed = chromagram.T
    columns = {}
    for position in range(len(chromagram)):
        columns['Chroma_%d' % (position + 1)] = transposed[position]
    return pd.DataFrame(columns)

def extractMelSpectrogram(signal, sample_rate):
    """Return a DataFrame with columns 'Mel_Spectrogram_1' .. 'Mel_Spectrogram_N'.

    ``librosa.feature.melspectrogram`` is called with ``n_mels=12``. One
    column is created per entry along the first axis of its output, and
    column 'Mel_Spectrogram_k' holds row ``k-1`` of the *transposed* output,
    i.e. the values at position ``k-1`` along the second axis.

    NOTE(review): librosa documents the output as (n_mels, n_frames). If so,
    each column holds all mel bands of one early frame rather than one band
    over time. Downstream flattening relies on the resulting fixed size --
    confirm this is intended before changing it.
    """
    mel = librosa.feature.melspectrogram(y=signal, sr=sample_rate, n_mels=12)
    transposed = mel.T
    names = ['Mel_Spectrogram_%d' % (position + 1) for position in range(len(mel))]
    return pd.DataFrame({name: transposed[i] for i, name in enumerate(names)})

def matrixToVector(matrixFeatures):
    """Flatten a feature DataFrame into a single Python list.

    Columns are visited in DataFrame order and NaNs are dropped from each
    column first. The 'Loudness' and 'ZCR' columns contribute one value each
    (their mean); every other column contributes all of its remaining values.

    Parameters:
        matrixFeatures: DataFrame of feature columns, possibly containing NaNs.

    Returns:
        list of numeric values.
    """
    averaged_labels = ('Loudness', 'ZCR')
    flattened = []
    for label in matrixFeatures.columns:
        values = matrixFeatures[label].dropna().to_numpy()
        if label in averaged_labels:
            # Summarise these columns by a single mean value.
            flattened.append(np.mean(values))
            continue
        # All other columns are kept value-by-value.
        flattened.extend(values)
    return flattened

### Extract features and flatten them

In [14]:
audioDir = 'dataset/Audio'
features_vectors = []

# Build one flattened feature vector per file in the audio folder.
# NOTE(review): os.listdir returns entries in arbitrary order, so the row
# order of features_vectors is not tied to file names -- confirm that
# whatever pairs these rows with labels uses the same ordering.
for file in os.listdir(audioDir):
    signal, rate = librosa.load(f'{audioDir}/{file}')

    # Each extractor returns its own DataFrame; stacking them row-wise leaves
    # NaNs in the columns a given extractor did not produce, which
    # matrixToVector drops per column.
    extracted = [
        extractLoudness(signal),
        extractMFCCS(signal, rate),
        extractZeroCrossingRate(signal),
        extractChroma(signal, rate),
        extractMelSpectrogram(signal, rate),
    ]
    matrix = pd.concat(extracted)
    features_vectors.append(matrixToVector(matrix))

### Export data into a csv

In [15]:
# Convert the list of per-file vectors to an array and write it out as
# comma-separated text, one row per vector.
features_vectors = np.asarray(features_vectors)
np.savetxt(fname='audio_features.csv', X=features_vectors, delimiter=',')

### Load audio feature data from csv

In [21]:
import csv
# Read the exported feature file back in and convert it to a float array.
# The file is opened in a `with` block so the handle is closed as soon as the
# rows have been read (a bare open() left it open for the rest of the kernel
# session). newline="" is what the csv module recommends for files passed to
# csv.reader.
with open("audio_features.csv", "r", newline="") as csv_file:
    reader = csv.reader(csv_file, delimiter=",")
    x = list(reader)  # materialise every row while the file is still open
features = np.array(x).astype("float")