# Data Analysis Steps (to be packaged)

## 1. Imports

In [1]:
# GENERAL
import pandas as pd
import numpy as np

# Sklearn 
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer, TfidfVectorizer
from sklearn.model_selection import train_test_split, cross_validate
from sklearn.pipeline import Pipeline, make_pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import MinMaxScaler, OneHotEncoder
from sklearn.svm import SVC
from sklearn.preprocessing import FunctionTransformer

# Language processing
import nltk
from langdetect import detect
import string
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer

# Tensorflow
# from tensorflow.keras.preprocessing.sequence import pad_sequences
# from tensorflow.keras.preprocessing.text import Tokenizer
# from tensorflow.keras import layers, Sequential
# from tensorflow.keras.callbacks import EarlyStopping

## 2. Dataset

In [2]:
# Load the lyrics-enriched track dataset and show its (rows, columns) shape.
raw_csv_path = '/home/anais/code/anaisdangeot/mood_detector/raw_data/dataset_enriched_total.csv'
data = pd.read_csv(raw_csv_path)
data.shape

(22833, 25)

In [3]:
# Preview the first rows to check the raw columns.
data.head()

Unnamed: 0.3,Unnamed: 0.2,Unnamed: 0.1,Unnamed: 0,track_id,artists,album_name,track_name,popularity,duration_ms,explicit,...,speechiness,acousticness,instrumentalness,liveness,valence,tempo,time_signature,track_genre,lyrics_extracted,lyrics_language
0,1,1,1,4qPNDBW1i3p13qLCt0Ki3A,Ben Woodward,Ghost (Acoustic),Ghost - Acoustic,55,149610,False,...,0.0763,0.924,6e-06,0.101,0.267,77.489,4,acoustic,Youngblood thinks there's always tomorrow I mi...,en
1,2,2,2,1iJBSr7s7jYXzM8EGcbK5b,Ingrid Michaelson;ZAYN,To Begin Again,To Begin Again,57,210826,False,...,0.0557,0.21,0.0,0.117,0.12,76.332,4,acoustic,"When the world was ending, I'd hold you in my ...",en
2,3,3,3,6lfxq3CG4xtTiEg7opyCyx,Kina Grannis,Crazy Rich Asians (Original Motion Picture Sou...,Can't Help Falling In Love,71,201933,False,...,0.0363,0.905,7.1e-05,0.132,0.143,181.74,3,acoustic,Wise men say ♪ Only fools rush in ♪ But I can'...,en
3,4,4,4,5vjLSffimiIP26QG5WcN2K,Chord Overstreet,Hold On,Hold On,82,198853,False,...,0.0526,0.469,0.0,0.0829,0.167,119.949,4,acoustic,"Loving and fighting, accusing, denying I can't...",en
4,5,5,5,01MVOl9KtVTNfFiBU9I7dc,Tyrone Wells,Days I Will Remember,Days I Will Remember,58,214240,False,...,0.105,0.289,0.0,0.189,0.666,98.017,4,acoustic,These are the days I will remember These are t...,en


In [4]:
# Count the rows whose `artists` value is exactly 'Peabo Bryson'.
# (The series holds the `artists` column even though it is named `track_name`.)
track_name = pd.Series(data['artists'])
int(track_name.isin(['Peabo Bryson']).sum())

0

## 3. Preprocessing steps and feature engineering

### 3.1 Text features preprocessing

In [5]:
# TEXT PREPROCESSING
import unicodedata
import re 
from functools import lru_cache


@lru_cache(maxsize=1)
def _english_stop_words():
    """Return the NLTK English stop words as a frozenset, built only once."""
    return frozenset(stopwords.words('english'))


@lru_cache(maxsize=1)
def _verb_lemmatizer():
    """Return one shared WordNetLemmatizer instead of one instance per word."""
    return WordNetLemmatizer()


def cleaning(sentence):
    """Normalise one lyric into a space-separated string of lemmatised tokens.

    Steps: fold to ASCII, strip and lowercase, drop digits and ASCII
    punctuation, tokenize with NLTK, remove English stop words and lemmatise
    every remaining token as a verb.

    Parameters
    ----------
    sentence : str
        Raw lyric text. Any non-string value (e.g. NaN or None for a track
        without lyrics) is treated as empty text.

    Returns
    -------
    str
        The cleaned text; '' when nothing is left after cleaning.
    """
    # Missing lyrics reach `.apply` as float NaN / None, which have no string
    # methods; treat them as empty text instead of raising AttributeError.
    if not isinstance(sentence, str):
        return ''

    # Fold to ASCII *before* removing digits and punctuation: NFKD expands
    # compatibility characters into ASCII ones (e.g. the ellipsis character
    # becomes '...', the one-half sign becomes '12'), which would otherwise
    # survive the filters below. Characters with no ASCII form (accents,
    # musical notes, emoji) are dropped by the encode/decode round trip.
    sentence = (
        unicodedata.normalize('NFKD', sentence)
        .encode('ascii', 'ignore')
        .decode('utf-8', 'ignore')
    )

    # Basic cleaning: surrounding whitespace, case, digits.
    sentence = sentence.strip().lower()
    sentence = ''.join(char for char in sentence if not char.isdigit())

    # Remove every ASCII punctuation character in a single pass.
    # NOTE(review): because punctuation goes first, contractions such as
    # "you're" become "youre" and are not caught by the stop-word filter.
    sentence = sentence.translate(str.maketrans('', '', string.punctuation))

    # Nothing left to tokenize.
    if not sentence.strip():
        return ''

    stop_words = _english_stop_words()
    lemmatizer = _verb_lemmatizer()

    lemmatized = [
        lemmatizer.lemmatize(word, pos="v")
        for word in nltk.word_tokenize(sentence)
        if word not in stop_words
    ]

    return ' '.join(lemmatized)

In [6]:
# Clean every lyric with `cleaning` and store the result in a new column.
# Musical-note signs need no extra step: the ASCII fold inside `cleaning`
# already discards them.
data['cleaned_lyrics'] = data["lyrics_extracted"].apply(cleaning)
data.head()

Unnamed: 0.3,Unnamed: 0.2,Unnamed: 0.1,Unnamed: 0,track_id,artists,album_name,track_name,popularity,duration_ms,explicit,...,acousticness,instrumentalness,liveness,valence,tempo,time_signature,track_genre,lyrics_extracted,lyrics_language,cleaned_lyrics
0,1,1,1,4qPNDBW1i3p13qLCt0Ki3A,Ben Woodward,Ghost (Acoustic),Ghost - Acoustic,55,149610,False,...,0.924,6e-06,0.101,0.267,77.489,4,acoustic,Youngblood thinks there's always tomorrow I mi...,en,youngblood think theres always tomorrow miss t...
1,2,2,2,1iJBSr7s7jYXzM8EGcbK5b,Ingrid Michaelson;ZAYN,To Begin Again,To Begin Again,57,210826,False,...,0.21,0.0,0.117,0.12,76.332,4,acoustic,"When the world was ending, I'd hold you in my ...",en,world end id hold arm talk place wed never wor...
2,3,3,3,6lfxq3CG4xtTiEg7opyCyx,Kina Grannis,Crazy Rich Asians (Original Motion Picture Sou...,Can't Help Falling In Love,71,201933,False,...,0.905,7.1e-05,0.132,0.143,181.74,3,acoustic,Wise men say ♪ Only fools rush in ♪ But I can'...,en,wise men say fool rush cant help fall love sha...
3,4,4,4,5vjLSffimiIP26QG5WcN2K,Chord Overstreet,Hold On,Hold On,82,198853,False,...,0.469,0.0,0.0829,0.167,119.949,4,acoustic,"Loving and fighting, accusing, denying I can't...",en,love fight accuse deny cant imagine world go j...
4,5,5,5,01MVOl9KtVTNfFiBU9I7dc,Tyrone Wells,Days I Will Remember,Days I Will Remember,58,214240,False,...,0.289,0.0,0.189,0.666,98.017,4,acoustic,These are the days I will remember These are t...,en,days remember face need everythin change ill k...


In [9]:
import pickle

# TF-IDF over unigrams and bigrams of the cleaned lyrics. Terms present in more
# than 35% of the lyrics are ignored and the vocabulary is capped at 300 terms.
vectorizer = TfidfVectorizer(ngram_range=(1,2), max_df=0.35, max_features=300)

# Only the fitted state is needed here, so fit without building the matrix.
vectorizer.fit(data['cleaned_lyrics'])

# Persist the fitted vectorizer; the context manager guarantees the file is
# flushed and closed even if pickling fails.
with open('/home/anais/code/anaisdangeot/mood_detector/code_mood/ml_logic/model_pipeline/vectorizer.pickle', 'wb') as vectorizer_file:
    pickle.dump(vectorizer, vectorizer_file)

In [10]:

# Build the dense TF-IDF feature table with the vectorizer that was already
# fitted (and pickled) above. `transform` is used rather than a second
# `fit_transform`, so these features come from exactly the object that was saved.
text_vectors = pd.DataFrame(
    vectorizer.transform(data['cleaned_lyrics']).toarray(),
    columns=vectorizer.get_feature_names_out(),
)
text_vectors

Unnamed: 0,ah,aint,alive,alone,alright,always,another,arm,around,ask,...,ya,yeah,yeah yeah,year,yes,youll,young,youre,youve,yuh
0,0.0,0.0,0.000000,0.000000,0.000000,0.116741,0.000000,0.000000,0.000000,0.0,...,0.0,0.000000,0.000000,0.0,0.000000,0.000000,0.0,0.000000,0.0,0.0
1,0.0,0.0,0.000000,0.000000,0.158021,0.000000,0.135809,0.054313,0.000000,0.0,...,0.0,0.000000,0.000000,0.0,0.000000,0.000000,0.0,0.000000,0.0,0.0
2,0.0,0.0,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.0,...,0.0,0.000000,0.000000,0.0,0.000000,0.000000,0.0,0.000000,0.0,0.0
3,0.0,0.0,0.000000,0.066801,0.000000,0.000000,0.000000,0.000000,0.000000,0.0,...,0.0,0.046896,0.000000,0.0,0.000000,0.000000,0.0,0.044968,0.0,0.0
4,0.0,0.0,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.0,...,0.0,0.000000,0.000000,0.0,0.000000,0.000000,0.0,0.000000,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
22828,0.0,0.0,0.054364,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.0,...,0.0,0.121688,0.103431,0.0,0.000000,0.000000,0.0,0.029171,0.0,0.0
22829,0.0,0.0,0.000000,0.000000,0.000000,0.000000,0.000000,0.033422,0.000000,0.0,...,0.0,0.000000,0.000000,0.0,0.064147,0.000000,0.0,0.123163,0.0,0.0
22830,0.0,0.0,0.000000,0.000000,0.000000,0.000000,0.000000,0.101214,0.000000,0.0,...,0.0,0.000000,0.000000,0.0,0.000000,0.000000,0.0,0.000000,0.0,0.0
22831,0.0,0.0,0.000000,0.024422,0.000000,0.000000,0.000000,0.000000,0.000000,0.0,...,0.0,0.034289,0.029145,0.0,0.000000,0.000000,0.0,0.016440,0.0,0.0


In [11]:
# One row per track, one column per retained TF-IDF term.
text_vectors.shape

(22833, 300)

### 3.2 Non text features

In [12]:
# Valence is binarised into a mood label: 1 = positive mood, 0 = negative mood.
def cat_valence(row):
    """Map a single valence value to a mood class.

    Returns 1 when the value is at least 0.5, 0 when it is below 0.5, and
    None when neither comparison holds (e.g. NaN).
    """
    if row >= 0.5:
        return 1
    return 0 if row < 0.5 else None

# Derive the binary mood column from valence and use it as the target.
data['mood'] = data['valence'].apply(cat_valence)
y = data['mood']

In [13]:
'''We are dropping :
- descriptive variables: 'Unnamed: 0','track_id','artists','album_name','track_name'
- valence/ mood which will be our target
- acousticness and loudness that are highly correlated to energy (which we keep)
- track_genre as it doesn't bring extra information
'''
# The raw lyrics, their language and the cleaned lyrics are dropped as well.
feat_drop = [
    'valence', 'mood',
    'Unnamed: 0.2', 'Unnamed: 0.1', 'Unnamed: 0',
    'track_id', 'artists', 'album_name', 'track_name',
    'loudness', 'acousticness',
    'track_genre',
    'lyrics_extracted', 'lyrics_language', 'cleaned_lyrics',
]

# Feature matrix: every remaining column of the dataset.
X = data.drop(labels=feat_drop, axis='columns')

In [28]:
# Check the dtypes and non-null counts of the retained features.
X.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 22833 entries, 0 to 22832
Data columns (total 12 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   popularity        22833 non-null  int64  
 1   duration_ms       22833 non-null  int64  
 2   explicit          22833 non-null  bool   
 3   danceability      22833 non-null  float64
 4   energy            22833 non-null  float64
 5   key               22833 non-null  int64  
 6   mode              22833 non-null  int64  
 7   speechiness       22833 non-null  float64
 8   instrumentalness  22833 non-null  float64
 9   liveness          22833 non-null  float64
 10  tempo             22833 non-null  float64
 11  time_signature    22833 non-null  int64  
dtypes: bool(1), float64(6), int64(5)
memory usage: 1.9 MB


In [29]:
# Scale the numerical features with min-max scaling. There is no imputation
# step: X.info() reports no missing values in any of these columns.
num_transformer = Pipeline([('min_max_scaler', MinMaxScaler())])

# One-hot encode the categorical features; a category not seen during fit is
# ignored at transform time instead of raising an error.
cat_transformer = OneHotEncoder(handle_unknown='ignore')

# Apply "num_transformer" and "cat_transformer" to their own column groups.
# Columns of X not listed in either group are dropped (ColumnTransformer default).
preprocessor = ColumnTransformer(
    [
        ('num_transformer', num_transformer, ['popularity', 'duration_ms','danceability','energy','speechiness','instrumentalness','liveness','tempo']),
        ('cat_transformer', cat_transformer, ['explicit', 'key','mode','time_signature']),
    ],
    # Always return a dense array: the DataFrame construction below needs one,
    # and the default density heuristic could otherwise switch to a sparse matrix.
    sparse_threshold=0,
)

X_transformed = preprocessor.fit_transform(X)

# Persist the fitted preprocessor; the context manager closes the file even if
# pickling fails.
with open('/home/anais/code/anaisdangeot/mood_detector/code_mood/ml_logic/model_pipeline/pipeline.pickle', 'wb') as preprocessor_file:
    pickle.dump(preprocessor, preprocessor_file)

non_text_features = pd.DataFrame(X_transformed, columns=preprocessor.get_feature_names_out())

In [15]:
# Put the audio/metadata features and the TF-IDF lyric features side by side.
# The column-wise concat aligns rows on the index of the two frames.
X_combined = pd.concat([non_text_features, text_vectors], axis=1)

In [16]:
# Sanity check: the column count should be the sum of both feature sets.
X_combined.shape

(22833, 329)

In [17]:
# Preview the combined feature matrix.
X_combined.head()

Unnamed: 0,num_transformer__popularity,num_transformer__duration_ms,num_transformer__danceability,num_transformer__energy,num_transformer__speechiness,num_transformer__instrumentalness,num_transformer__liveness,num_transformer__tempo,cat_transformer__explicit_False,cat_transformer__explicit_True,...,ya,yeah,yeah yeah,year,yes,youll,young,youre,youve,yuh
0,0.55,0.090293,0.429448,0.165983,0.080231,6e-06,0.090725,0.318397,1.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.57,0.134391,0.447853,0.358987,0.05857,0.0,0.106962,0.313643,1.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.71,0.127985,0.271984,0.059581,0.03817,7.1e-05,0.122184,0.746758,1.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.82,0.125766,0.631902,0.442989,0.05531,0.0,0.072356,0.492863,1.0,0.0,...,0.0,0.046896,0.0,0.0,0.0,0.0,0.0,0.044968,0.0,0.0
4,0.58,0.136851,0.703476,0.48099,0.11041,0.0,0.180028,0.402746,1.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [18]:
# List the column names from position 20 onwards.
X_combined.columns[20:]

Index(['cat_transformer__key_10', 'cat_transformer__key_11',
       'cat_transformer__mode_0', 'cat_transformer__mode_1',
       'cat_transformer__time_signature_0',
       'cat_transformer__time_signature_1',
       'cat_transformer__time_signature_3',
       'cat_transformer__time_signature_4',
       'cat_transformer__time_signature_5', 'ah',
       ...
       'ya', 'yeah', 'yeah yeah', 'year', 'yes', 'youll', 'young', 'youre',
       'youve', 'yuh'],
      dtype='object', length=309)

## 4. Model training

In [19]:
# Hold out 20% of the rows for evaluation. The fixed seed makes the split
# reproducible across kernel restarts, so scores can be compared between runs.
# NOTE(review): the TF-IDF vectorizer and the preprocessor were fitted on the
# full dataset before this split, so the test rows influenced the features.
# Fit them on the training rows only when this is packaged.
X_train, X_test, y_train, y_test = train_test_split(X_combined, y, test_size = 0.20, random_state=42)
X_train.shape, X_test.shape, y_train.shape, y_test.shape

((18266, 329), (4567, 329), (18266,), (4567,))

In [21]:
from sklearn.linear_model import LogisticRegression

# Baseline classifier; the iteration cap is raised to 1000.
model_logR = LogisticRegression(max_iter=1000)

# Train on the training split.
model_logR.fit(X_train, y_train)

In [22]:
# Score the logistic-regression baseline on the held-out test split.
model_logR.score(X_test, y_test)

0.7449091307203853

### 4.1 SVM

In [20]:
# from sklearn.model_selection import GridSearchCV

# # Instantiate model
# model = SVC()

# # Hyperparameter Grid
# grid = {
#     'kernel': ['rbf', 'linear'], 
#     'C': [0.5, 1, 5]
# }

# # Instantiate Grid Search
# search = GridSearchCV(
#     model,
#     grid, 
#     scoring = 'accuracy',
#     cv = 5,
#     n_jobs=-1 # parallelize computation
# ) 

# # Fit data to Grid Search
# search.fit(X_train, y_train);

In [21]:
# # Best score
# search.best_score_

# # Best Params
# search.best_params_

# # Best estimator
# search.best_estimator_

In [23]:
from sklearn.svm import SVC

# RBF-kernel support vector classifier with fixed hyperparameters.
# NOTE(review): C=10 and gamma=1 are not part of the commented-out grid search
# above (C in [0.5, 1, 5], gamma not searched) — confirm where they come from.
model_SVC = SVC(C=10, kernel='rbf', gamma=1)

# Train on the training split.
model_SVC.fit(X_train, y_train)

# Score the SVC on the held-out test split.
model_SVC.score(X_test, y_test)

0.8265820013137727

#### ***Saving model***

In [24]:
import time
import pickle
#from sklearn.externals import joblib

# Timestamp of this run; computed here but not used in the file name below.
timestamp = time.strftime("%Y%m%d-%H%M%S")

# Persist the trained SVC. The context manager guarantees the file is flushed
# and closed even if pickling fails.
# NOTE: the file is a pickle despite its ".h5" extension; the name is kept
# unchanged so that existing loaders still find it.
with open('/home/anais/code/anaisdangeot/mood_detector/code_mood/ml_logic/model_pipeline/modelSVC_bestparams_saved.h5', 'wb') as model_file:
    pickle.dump(model_SVC, model_file)
