# Preprocessing steps for Data Analysis

## 1. Imports

In [1]:
# GENERAL
import pandas as pd
import numpy as np

# Sklearn 
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer, TfidfVectorizer
from sklearn.model_selection import train_test_split, cross_validate
from sklearn.pipeline import Pipeline, make_pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import MinMaxScaler, OneHotEncoder
from sklearn.svm import SVC
from sklearn.preprocessing import FunctionTransformer

# Language processing
import nltk
from langdetect import detect
import string
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer

# Tensorflow
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras import layers, Sequential
from tensorflow.keras.callbacks import EarlyStopping

2023-06-16 12:01:55.797777: I tensorflow/tsl/cuda/cudart_stub.cc:28] Could not find cuda drivers on your machine, GPU will not be used.
2023-06-16 12:01:55.872187: I tensorflow/tsl/cuda/cudart_stub.cc:28] Could not find cuda drivers on your machine, GPU will not be used.
2023-06-16 12:01:55.874182: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


## 2. Dataset

In [2]:
# df1 = pd.read_csv('/home/anais/code/anaisdangeot/mood_detector/raw_data/dataset_enriched_s1.csv')
# df2 = pd.read_csv('/home/anais/code/anaisdangeot/mood_detector/raw_data/dataset_enriched_s2.csv')
# df3 = pd.read_csv('/home/anais/code/anaisdangeot/mood_detector/raw_data/dataset_enriched_s3.csv')
# df4 = pd.read_csv('/home/anais/code/anaisdangeot/mood_detector/raw_data/dataset_enriched_s4.csv')
# df5 = pd.read_csv('/home/anais/code/anaisdangeot/mood_detector/raw_data/dataset_enriched_s5.csv')
# df6 = pd.read_csv('/home/anais/code/anaisdangeot/mood_detector/raw_data/dataset_enriched_s6.csv')
# df7 = pd.read_csv('/home/anais/code/anaisdangeot/mood_detector/raw_data/dataset_enriched_s7.csv')
# df8 = pd.read_csv('/home/anais/code/anaisdangeot/mood_detector/raw_data/dataset_enriched_s8.csv')
# df9 = pd.read_csv('/home/anais/code/anaisdangeot/mood_detector/raw_data/dataset_enriched_s9.csv')
# df10 = pd.read_csv('/home/anais/code/anaisdangeot/mood_detector/raw_data/dataset_enriched_s10.csv')
# df11 = pd.read_csv('/home/anais/code/anaisdangeot/mood_detector/raw_data/dataset_enriched_s11.csv')
# frames = [df1, df2, df3, df4, df5, df6, df7, df8, df9, df10, df11]
# data = pd.concat(frames)
data = pd.read_csv('/home/anais/code/anaisdangeot/mood_detector/raw_data/dataset_enriched_total.csv')
data.columns

Index(['Unnamed: 0.2', 'Unnamed: 0.1', 'Unnamed: 0', 'track_id', 'artists',
       'album_name', 'track_name', 'popularity', 'duration_ms', 'explicit',
       'danceability', 'energy', 'key', 'loudness', 'mode', 'speechiness',
       'acousticness', 'instrumentalness', 'liveness', 'valence', 'tempo',
       'time_signature', 'track_genre', 'lyrics_extracted', 'lyrics_language'],
      dtype='object')

In [3]:
data = data[data['lyrics_extracted']!='999']
data.shape

(22833, 25)

In [4]:
data = data[data['lyrics_language']=='en'] # we will have to decide whether to translate non english or use only english songs

In [5]:
data.shape

(22833, 25)

## 3. Preprocessing steps and feature engineering

In [6]:
# TEXT PREPROCESSING
import unicodedata
import re 

def cleaning(sentence):
    
    # Basic cleaning
    sentence = sentence.strip() ## remove whitespaces
    sentence = sentence.lower() ## lowercase 
    sentence = ''.join(char for char in sentence if not char.isdigit()) ## remove numbers
    
    # Advanced cleaning
    for punctuation in string.punctuation:
        sentence = sentence.replace(punctuation, '') ## remove punctuation
    
    # function to remove accented characters
    def remove_accented_chars(txt):
        new_text = unicodedata.normalize('NFKD', txt).encode('ascii', 'ignore').decode('utf-8', 'ignore')
        return new_text
    sentence = remove_accented_chars(sentence)
    
    tokenized_sentence = nltk.word_tokenize(sentence) ## tokenize 
    stop_words = set(stopwords.words('english')) ## define stopwords
    
    tokenized_sentence_cleaned = [ ## remove stopwords
        w for w in tokenized_sentence if not w in stop_words
    ]
    
    lemmatized = [
        WordNetLemmatizer().lemmatize(word, pos = "v") 
        for word in tokenized_sentence_cleaned
    ]
    
    cleaned_sentence = ' '.join(word for word in lemmatized)
    
    return cleaned_sentence

In [7]:
data['cleaned_lyrics'] = data["lyrics_extracted"].apply(cleaning) # add signs (musical note) removal
data.head()
data.info

<bound method DataFrame.info of        Unnamed: 0.2  Unnamed: 0.1  Unnamed: 0                track_id  \
0                 1             1           1  4qPNDBW1i3p13qLCt0Ki3A   
1                 2             2           2  1iJBSr7s7jYXzM8EGcbK5b   
2                 3             3           3  6lfxq3CG4xtTiEg7opyCyx   
3                 4             4           4  5vjLSffimiIP26QG5WcN2K   
4                 5             5           5  01MVOl9KtVTNfFiBU9I7dc   
...             ...           ...         ...                     ...   
22828          3987        113987      113987  4jDhzTYkEG5GloIWwVeVkc   
22829          3988        113988      113988  6PM55W7WiUmHVPdUebJP55   
22830          3991        113991      113991  0CE0Y6GM75cbrqao8EOAlW   
22831          3992        113992      113992  3FjOBB4EyIXHYUtSgrIdY9   
22832          3993        113993      113993  4OkMK49i3NApR1KsAIsTf6   

                      artists  \
0                Ben Woodward   
1      Ingrid Michaelson;

In [8]:
data.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 22833 entries, 0 to 22832
Data columns (total 26 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   Unnamed: 0.2      22833 non-null  int64  
 1   Unnamed: 0.1      22833 non-null  int64  
 2   Unnamed: 0        22833 non-null  int64  
 3   track_id          22833 non-null  object 
 4   artists           22833 non-null  object 
 5   album_name        22833 non-null  object 
 6   track_name        22833 non-null  object 
 7   popularity        22833 non-null  int64  
 8   duration_ms       22833 non-null  int64  
 9   explicit          22833 non-null  bool   
 10  danceability      22833 non-null  float64
 11  energy            22833 non-null  float64
 12  key               22833 non-null  int64  
 13  loudness          22833 non-null  float64
 14  mode              22833 non-null  int64  
 15  speechiness       22833 non-null  float64
 16  acousticness      22833 non-null  float6

In [10]:
vectorizer = TfidfVectorizer(ngram_range=(1,2), max_df=0.35, max_features=50) # might be good to increase max_feat to improve score

In [11]:

text_vectors = pd.DataFrame(vectorizer.fit_transform(data['cleaned_lyrics']).toarray(),
                       columns = vectorizer.get_feature_names_out())
text_vectors

Unnamed: 0,always,away,baby,back,believe,cant,cause,could,day,every,...,us,wan,wan na,want,way,well,wont,world,would,yeah
0,0.171728,0.000000,0.000000,0.000000,0.0,0.530493,0.000000,0.081974,0.000000,0.000000,...,0.000000,0.000000,0.000000,0.137212,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000
1,0.000000,0.112709,0.000000,0.201592,0.0,0.000000,0.096199,0.000000,0.353658,0.000000,...,0.259149,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.503495,0.123739,0.000000
2,0.000000,0.000000,0.000000,0.000000,0.0,0.535017,0.000000,0.000000,0.000000,0.000000,...,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.138845,0.000000
3,0.000000,0.061068,0.067760,0.218451,0.0,0.103337,0.000000,0.000000,0.000000,0.000000,...,0.000000,0.197820,0.200710,0.160369,0.000000,0.000000,0.000000,0.068200,0.000000,0.059057
4,0.000000,0.000000,0.000000,0.000000,0.0,0.000000,0.000000,0.000000,0.000000,0.000000,...,0.000000,0.164198,0.166597,0.199669,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
995,0.000000,0.000000,0.000000,0.000000,0.0,0.000000,0.000000,0.000000,0.000000,0.000000,...,0.000000,0.000000,0.000000,0.000000,0.221009,0.000000,0.000000,0.000000,0.000000,0.000000
996,0.000000,0.000000,0.000000,0.000000,0.0,0.000000,0.000000,0.000000,0.000000,0.143016,...,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000
997,0.337226,0.000000,0.227697,0.458794,0.0,0.000000,0.000000,0.643899,0.107317,0.000000,...,0.000000,0.000000,0.000000,0.000000,0.187299,0.000000,0.114588,0.000000,0.000000,0.000000
998,0.000000,0.000000,0.000000,0.067870,0.0,0.064211,0.064775,0.000000,0.000000,0.079687,...,0.087248,0.081947,0.083144,0.066433,0.000000,0.075348,0.084756,0.000000,0.000000,0.073393


### 3.2 Non text features

In [12]:
# We create two categories that correspond to positive mood (1) and 0(negative mood)
# def cat_valence(row):
#     if row >= 0.5:
#         return 1
#     elif row <0.5:
#         return 0
#     else:
#         return None

# # applying the function to the valence column
# y= data['mood'] = data['valence'].apply(lambda x:cat_valence(x))
y = data['valence']

In [13]:
'''We are dropping :
- descriptive variables: 'Unnamed: 0','track_id','artists','album_name','track_name'
- valence/ mood which will be our target
- acousticness and loudness that are highly correlated to energy (which we keep)
- track_genre as it doesn't bring extra information
'''
feat_drop=['valence', 'Unnamed: 0.1', 'Unnamed: 0','track_id','artists','album_name','track_name','loudness','acousticness', 'track_genre', 'lyrics_extracted', 'lyrics_language', 'cleaned_lyrics']

# Our features
X = data.drop(columns=feat_drop)

In [14]:
# X_sub = X.iloc[:1000,:]
# y_sub = y.iloc[:1000]

In [15]:
# Impute then scale numerical values: 
num_transformer = Pipeline([('min_max_scaler', MinMaxScaler())
])

vectorizer_params = dict(ngram_range=(1, 2), min_df=5, max_df=0.8)

# text_transformer = Pipeline(steps=[
#     ("squeez", FunctionTransformer(lambda x: x.squeeze())),
#     ("vect", CountVectorizer(**vectorizer_params)),
#     ("tfidf", TfidfTransformer()),
#     ("toarray", FunctionTransformer(lambda x: x.toarray())),
# ])
# Encode categorical values
cat_transformer = OneHotEncoder(handle_unknown='ignore')

# Parallelize "num_transformer" and "cat_transfomer"
preprocessor = ColumnTransformer([
    ('num_transformer', num_transformer, ['popularity', 'duration_ms','danceability','energy','speechiness','instrumentalness','liveness','tempo']),
    ('cat_transformer', cat_transformer, ['explicit', 'key','mode','time_signature'])
    #,('text_transformer', text_transformer, ['cleaned_lyrics'])
])

X_transformed = preprocessor.fit_transform(X)

non_text_features =pd.DataFrame(X_transformed,columns=preprocessor.get_feature_names_out())

In [16]:
X_combined = pd.concat([non_text_features, text_vectors], axis=1)

In [17]:
X_combined.shape

(1000, 78)

## 4. Model training

In [18]:
X_train, X_test, y_train, y_test = train_test_split(X_combined, y_sub, test_size = 0.20)
X_train.shape, X_test.shape, y_train.shape, y_test.shape

((800, 78), (200, 78), (800,), (200,))