# Data Analysis Steps (to be packaged)

## 1. Imports

In [34]:
# GENERAL
import pandas as pd
import numpy as np

# Sklearn 
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer, TfidfVectorizer
from sklearn.model_selection import train_test_split, cross_validate
from sklearn.pipeline import Pipeline, make_pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import MinMaxScaler, OneHotEncoder
from sklearn.svm import SVC
from sklearn.preprocessing import FunctionTransformer

# Language processing
import nltk
from langdetect import detect
import string
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer

# Tensorflow
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras import layers, Sequential
from tensorflow.keras.callbacks import EarlyStopping

## 2. Dataset

In [35]:
# df1 = pd.read_csv('/home/anais/code/anaisdangeot/mood_detector/raw_data/dataset_enriched_s1.csv')
# df2 = pd.read_csv('/home/anais/code/anaisdangeot/mood_detector/raw_data/dataset_enriched_s2.csv')
# df3 = pd.read_csv('/home/anais/code/anaisdangeot/mood_detector/raw_data/dataset_enriched_s3.csv')
# df4 = pd.read_csv('/home/anais/code/anaisdangeot/mood_detector/raw_data/dataset_enriched_s4.csv')
# df5 = pd.read_csv('/home/anais/code/anaisdangeot/mood_detector/raw_data/dataset_enriched_s5.csv')
# df6 = pd.read_csv('/home/anais/code/anaisdangeot/mood_detector/raw_data/dataset_enriched_s6.csv')
# df7 = pd.read_csv('/home/anais/code/anaisdangeot/mood_detector/raw_data/dataset_enriched_s7.csv')
# df8 = pd.read_csv('/home/anais/code/anaisdangeot/mood_detector/raw_data/dataset_enriched_s8.csv')
# df9 = pd.read_csv('/home/anais/code/anaisdangeot/mood_detector/raw_data/dataset_enriched_s9.csv')
# df10 = pd.read_csv('/home/anais/code/anaisdangeot/mood_detector/raw_data/dataset_enriched_s10.csv')
# df11 = pd.read_csv('/home/anais/code/anaisdangeot/mood_detector/raw_data/dataset_enriched_s11.csv')
# frames = [df1, df2, df3, df4, df5, df6, df7, df8, df9, df10, df11]
# data = pd.concat(frames)
data = pd.read_csv('/home/anais/code/anaisdangeot/mood_detector/raw_data/dataset_enriched_total.csv')
data.shape

(114000, 24)

In [36]:
data = data[data['lyrics_extracted']!='999']
data.shape

(39710, 24)

In [37]:
data = data[data['lyrics_language']=='en'] # we will have to decide whether to translate non english or use only english songs

In [5]:
data.shape

(22833, 24)

## 3. Preprocessing steps and feature engineering

In [7]:
# TEXT PREPROCESSING
import unicodedata
import re 

def cleaning(sentence):
    
    # Basic cleaning
    sentence = sentence.strip() ## remove whitespaces
    sentence = sentence.lower() ## lowercase 
    sentence = ''.join(char for char in sentence if not char.isdigit()) ## remove numbers
    
    # Advanced cleaning
    for punctuation in string.punctuation:
        sentence = sentence.replace(punctuation, '') ## remove punctuation
    
    # function to remove accented characters
    def remove_accented_chars(txt):
        new_text = unicodedata.normalize('NFKD', txt).encode('ascii', 'ignore').decode('utf-8', 'ignore')
        return new_text
    sentence = remove_accented_chars(sentence)
    
    tokenized_sentence = nltk.word_tokenize(sentence) ## tokenize 
    stop_words = set(stopwords.words('english')) ## define stopwords
    
    tokenized_sentence_cleaned = [ ## remove stopwords
        w for w in tokenized_sentence if not w in stop_words
    ]
    
    lemmatized = [
        WordNetLemmatizer().lemmatize(word, pos = "v") 
        for word in tokenized_sentence_cleaned
    ]
    
    cleaned_sentence = ' '.join(word for word in lemmatized)
    
    return cleaned_sentence

In [8]:
data['cleaned_lyrics'] = data["lyrics_extracted"].apply(cleaning) # add signs (musical note) removal
data.head()

Unnamed: 0.2,Unnamed: 0.1,Unnamed: 0,track_id,artists,album_name,track_name,popularity,duration_ms,explicit,danceability,...,acousticness,instrumentalness,liveness,valence,tempo,time_signature,track_genre,lyrics_extracted,lyrics_language,cleaned_lyrics
1,1,1,4qPNDBW1i3p13qLCt0Ki3A,Ben Woodward,Ghost (Acoustic),Ghost - Acoustic,55,149610,False,0.42,...,0.924,6e-06,0.101,0.267,77.489,4,acoustic,Youngblood thinks there's always tomorrow I mi...,en,youngblood think theres always tomorrow miss t...
2,2,2,1iJBSr7s7jYXzM8EGcbK5b,Ingrid Michaelson;ZAYN,To Begin Again,To Begin Again,57,210826,False,0.438,...,0.21,0.0,0.117,0.12,76.332,4,acoustic,"When the world was ending, I'd hold you in my ...",en,world end id hold arm talk place wed never wor...
3,3,3,6lfxq3CG4xtTiEg7opyCyx,Kina Grannis,Crazy Rich Asians (Original Motion Picture Sou...,Can't Help Falling In Love,71,201933,False,0.266,...,0.905,7.1e-05,0.132,0.143,181.74,3,acoustic,Wise men say ♪ Only fools rush in ♪ But I can'...,en,wise men say fool rush cant help fall love sha...
4,4,4,5vjLSffimiIP26QG5WcN2K,Chord Overstreet,Hold On,Hold On,82,198853,False,0.618,...,0.469,0.0,0.0829,0.167,119.949,4,acoustic,"Loving and fighting, accusing, denying I can't...",en,love fight accuse deny cant imagine world go j...
5,5,5,01MVOl9KtVTNfFiBU9I7dc,Tyrone Wells,Days I Will Remember,Days I Will Remember,58,214240,False,0.688,...,0.289,0.0,0.189,0.666,98.017,4,acoustic,These are the days I will remember These are t...,en,days remember face need everythin change ill k...


In [10]:
vectorizer = TfidfVectorizer(ngram_range=(1,2), max_df=0.35, max_features=50) # might be good to increase max_feat to improve score

In [11]:

text_vectors = pd.DataFrame(vectorizer.fit_transform(data['cleaned_lyrics']).toarray(),
                       columns = vectorizer.get_feature_names_out())
text_vectors

Unnamed: 0,always,away,baby,back,believe,cant,cause,could,day,every,...,us,wan,wan na,want,way,well,wont,world,would,yeah
0,0.171728,0.000000,0.000000,0.000000,0.0,0.530493,0.000000,0.081974,0.000000,0.000000,...,0.000000,0.000000,0.000000,0.137212,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000
1,0.000000,0.112709,0.000000,0.201592,0.0,0.000000,0.096199,0.000000,0.353658,0.000000,...,0.259149,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.503495,0.123739,0.000000
2,0.000000,0.000000,0.000000,0.000000,0.0,0.535017,0.000000,0.000000,0.000000,0.000000,...,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.138845,0.000000
3,0.000000,0.061068,0.067760,0.218451,0.0,0.103337,0.000000,0.000000,0.000000,0.000000,...,0.000000,0.197820,0.200710,0.160369,0.000000,0.000000,0.000000,0.068200,0.000000,0.059057
4,0.000000,0.000000,0.000000,0.000000,0.0,0.000000,0.000000,0.000000,0.000000,0.000000,...,0.000000,0.164198,0.166597,0.199669,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
995,0.000000,0.000000,0.000000,0.000000,0.0,0.000000,0.000000,0.000000,0.000000,0.000000,...,0.000000,0.000000,0.000000,0.000000,0.221009,0.000000,0.000000,0.000000,0.000000,0.000000
996,0.000000,0.000000,0.000000,0.000000,0.0,0.000000,0.000000,0.000000,0.000000,0.143016,...,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000
997,0.337226,0.000000,0.227697,0.458794,0.0,0.000000,0.000000,0.643899,0.107317,0.000000,...,0.000000,0.000000,0.000000,0.000000,0.187299,0.000000,0.114588,0.000000,0.000000,0.000000
998,0.000000,0.000000,0.000000,0.067870,0.0,0.064211,0.064775,0.000000,0.000000,0.079687,...,0.087248,0.081947,0.083144,0.066433,0.000000,0.075348,0.084756,0.000000,0.000000,0.073393


### 3.2 Non text features

In [12]:
# We create two categories that correspond to positive mood (1) and 0(negative mood)
def cat_valence(row):
    if row >= 0.5:
        return 1
    elif row <0.5:
        return 0
    else:
        return None

# applying the function to the valence column
data['mood'] = data['valence'].apply(lambda x:cat_valence(x))
y = data['mood']

In [13]:
'''We are dropping :
- descriptive variables: 'Unnamed: 0','track_id','artists','album_name','track_name'
- valence/ mood which will be our target
- acousticness and loudness that are highly correlated to energy (which we keep)
- track_genre as it doesn't bring extra information
'''
feat_drop=['valence', 'mood', 'Unnamed: 0.1', 'Unnamed: 0','track_id','artists','album_name','track_name','loudness','acousticness', 'track_genre', 'lyrics_extracted', 'lyrics_language', 'cleaned_lyrics']

# Our features
X = data.drop(columns=feat_drop)

In [14]:
# X_sub = X.iloc[:1000,:]
# y_sub = y.iloc[:1000]

In [15]:
# Impute then scale numerical values: 
num_transformer = Pipeline([('min_max_scaler', MinMaxScaler())
])

vectorizer_params = dict(ngram_range=(1, 2), min_df=5, max_df=0.8)

# text_transformer = Pipeline(steps=[
#     ("squeez", FunctionTransformer(lambda x: x.squeeze())),
#     ("vect", CountVectorizer(**vectorizer_params)),
#     ("tfidf", TfidfTransformer()),
#     ("toarray", FunctionTransformer(lambda x: x.toarray())),
# ])
# Encode categorical values
cat_transformer = OneHotEncoder(handle_unknown='ignore')

# Parallelize "num_transformer" and "cat_transfomer"
preprocessor = ColumnTransformer([
    ('num_transformer', num_transformer, ['popularity', 'duration_ms','danceability','energy','speechiness','instrumentalness','liveness','tempo']),
    ('cat_transformer', cat_transformer, ['explicit', 'key','mode','time_signature'])
    #,('text_transformer', text_transformer, ['cleaned_lyrics'])
])

X_transformed = preprocessor.fit_transform(X)

non_text_features =pd.DataFrame(X_transformed,columns=preprocessor.get_feature_names_out())

In [16]:
X_combined = pd.concat([non_text_features, text_vectors], axis=1)

In [17]:
X_combined.shape

(1000, 78)

## 4. Model training

In [18]:
X_train, X_test, y_train, y_test = train_test_split(X_combined, y_sub, test_size = 0.20)
X_train.shape, X_test.shape, y_train.shape, y_test.shape

((800, 78), (200, 78), (800,), (200,))

In [30]:
from sklearn.linear_model import LogisticRegression

model_logR = LogisticRegression(max_iter=1000)

model_logR.fit(X_train, y_train)

In [31]:
model_logR.score(X_test, y_test)

0.72

### 4.1 SVM

In [None]:
from sklearn.svm import SVC

model_SVC = SVC(kernel='rbf')

model_SVC.fit(X_train, y_train)

model_SVC.score(X_test, y_test)

### 4.2 RNN

In [22]:
# Size of your embedding space = size of the vector representing each word
embedding_size = 200

model_rnn = Sequential()
model_rnn.add(layers.Embedding(
    input_dim=X_train.shape[1], 
    output_dim=embedding_size, # 100
    mask_zero=False, # Built-in masking layer :)
))

model_rnn.add(layers.LSTM(20))
model_rnn.add(layers.Dense(1, activation="sigmoid"))
model_rnn.summary()

Model: "sequential_1"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding_1 (Embedding)     (None, None, 200)         15600     
                                                                 
 lstm_1 (LSTM)               (None, 20)                17680     
                                                                 
 dense_1 (Dense)             (None, 1)                 21        
                                                                 
Total params: 33,301
Trainable params: 33,301
Non-trainable params: 0
_________________________________________________________________


In [23]:
es = EarlyStopping(monitor='loss', patience=10)
model_rnn.compile(loss='binary_crossentropy',
              optimizer='rmsprop',
              metrics=['accuracy'])
model_rnn.fit(X_train, y_train, epochs=100, validation_split=0.2, batch_size=32, verbose=1, callbacks=[es])

Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100
Epoch 13/100
Epoch 14/100
Epoch 15/100
Epoch 16/100
Epoch 17/100
Epoch 18/100
Epoch 19/100
Epoch 20/100
Epoch 21/100
Epoch 22/100
Epoch 23/100
Epoch 24/100
Epoch 25/100
Epoch 26/100
Epoch 27/100
Epoch 28/100
Epoch 29/100
Epoch 30/100
Epoch 31/100
Epoch 32/100
Epoch 33/100
Epoch 34/100
Epoch 35/100


<keras.callbacks.History at 0x7f6e842e8ac0>

#### ***RNN model evaluation***

In [24]:
model_rnn.evaluate(X_test,y_test)



[0.6499391198158264, 0.6549999713897705]

### 4.3 CNN

In [29]:
X_train

Unnamed: 0,num_transformer__popularity,num_transformer__duration_ms,num_transformer__danceability,num_transformer__energy,num_transformer__speechiness,num_transformer__instrumentalness,num_transformer__liveness,num_transformer__tempo,cat_transformer__explicit_False,cat_transformer__explicit_True,...,us,wan,wan na,want,way,well,wont,world,would,yeah
8,0.795699,0.227306,0.660256,0.360435,0.015794,0.000000,0.071724,0.303660,1.0,0.0,...,0.0,0.000000,0.000000,0.351266,0.000000,0.000000,0.000000,0.000000,0.055069,0.000000
865,0.000000,0.227711,0.610256,0.857304,0.124770,0.000000,0.298655,0.436410,1.0,0.0,...,0.0,0.000000,0.000000,0.073473,0.076609,0.333333,0.000000,0.000000,0.000000,0.000000
390,0.516129,0.139312,0.721795,0.273175,0.029481,0.000000,0.094184,0.462260,1.0,0.0,...,0.0,0.000000,0.000000,0.000000,0.000000,0.247231,0.000000,0.278099,0.000000,0.000000
618,0.279570,0.168962,0.775641,0.556514,0.055014,0.000000,0.089946,0.489139,1.0,0.0,...,0.0,0.000000,0.000000,0.456732,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000
836,0.021505,0.199739,0.297436,0.943538,0.145301,0.000000,0.069075,0.621421,1.0,0.0,...,0.0,0.000000,0.000000,0.243225,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
431,0.548387,0.216160,0.698718,0.429217,0.021848,0.000255,0.100540,0.251327,1.0,0.0,...,0.0,0.000000,0.000000,0.000000,0.125102,0.000000,0.229609,0.000000,0.300954,0.066275
12,0.623656,0.227552,0.483333,0.563700,0.011319,0.000049,0.163047,0.195223,1.0,0.0,...,0.0,0.000000,0.000000,0.101838,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000
191,0.000000,0.264154,0.737179,0.494918,0.072124,0.000059,0.065261,0.259910,1.0,0.0,...,0.0,0.237914,0.241388,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000
312,0.602151,0.086616,0.656410,0.330664,0.042116,0.000003,0.093124,0.686135,1.0,0.0,...,0.0,0.000000,0.000000,0.000000,0.435210,0.157803,0.000000,0.000000,0.000000,0.153707


In [28]:
embedding_size = 200
model_cnn = Sequential([
    layers.Embedding(input_dim=X_train.shape[1], output_dim=embedding_size, mask_zero=False),
    layers.Conv1D(20, kernel_size=3),
    layers.Flatten(),
    layers.Dense(1, activation="sigmoid"),
])
model_cnn.summary()

ValueError: The last dimension of the inputs to a Dense layer should be defined. Found None. Full input shape received: (None, None)

In [None]:
es = EarlyStopping(monitor='loss', patience=1)
model_cnn.compile(loss='binary_crossentropy',
              optimizer='rmsprop',
              metrics=['accuracy'])


model_cnn.fit(X_train, y_train, epochs=100, validation_split=0.2, batch_size=64, verbose=1, callbacks=[es])

In [None]:
model_cnn.evaluate(X_test,y_test)