In [1]:
import pandas as pd

In [2]:
df = pd.read_excel('dataset.xlsx', header=0, index_col=0)
df.head()

# Remove irrelevant columns
song_df = df.drop(["disc", "episode", "explicit", "mode", "time_signature"], axis=1)
song_df.head()

# Identify duplicates
song_df[song_df.duplicated()].shape

# Remove duplicates and keep the first occurrence
song_df = song_df.drop_duplicates(keep='first')

# Look for missing values
print(song_df.isnull().sum())
# No Null cells

# Reset the index
song_df = song_df.reset_index(drop=True)
# Identify songs without lyrics

sum(song_df['lyrics'] == 'lyrics not available')

df_1 = song_df

id                  0
name                0
artist              0
duration            0
popularity          0
danceability        0
acousticness        0
energy              0
instrumentalness    0
liveness            0
loudness            0
speechiness         0
tempo               0
valence             0
lyrics              0
dtype: int64


In [3]:
df_1.head()

Unnamed: 0,id,name,artist,duration,popularity,danceability,acousticness,energy,instrumentalness,liveness,loudness,speechiness,tempo,valence,lyrics
0,1hA4856JVAa0qqgKg6olJf,"The Messenger - Live from Las Vegas, 2011",Linkin Park,233280,37,0.325,0.726,0.409,1.3e-05,0.188,-5.711,0.0295,109.001,0.187,6 ContributorsThe Messenger (Live from Las Veg...
1,0GgN4MhR5GKn5IcKN0e0rG,Cancer,My Chemical Romance,142973,72,0.457,0.358,0.515,0.0,0.278,-4.31,0.0261,74.984,0.222,93 ContributorsCancer Lyrics\nTurn away\nIf yo...
2,10nyNJ6zNy2YVYLrcwLccB,No Surprises,Radiohead,229120,83,0.255,0.0577,0.393,0.00361,0.113,-10.654,0.0278,76.426,0.118,129 ContributorsTranslationsРусскийDeutschEspa...
3,7lRlq939cDG4SzWOF4VAnd,I'm Not Okay (I Promise),My Chemical Romance,186480,78,0.21,0.00602,0.94,0.0,0.269,-3.427,0.123,179.722,0.255,109 ContributorsTranslationsEspañolI’m Not Oka...
4,5u2FOoFhp495GIj5BJC77J,Hold On Till May,Pierce The Veil,278586,66,0.46,0.000585,0.91,0.00155,0.0735,-3.71,0.0525,73.019,0.327,58 ContributorsHold on Till May Lyrics\nShe si...


In [4]:
# create a lambda function that deletes text before first "Lyrics" word
# exclude the word "Lyrics" from the result
delete_text_before_lyrics = lambda x: x[x.find("Lyrics") + len("Lyrics"):] if x.find("Lyrics") != -1 else x

In [5]:
# apply the lambda function to the lyrics column
df_1['lyrics'] = df_1['lyrics'].apply(delete_text_before_lyrics)

> Remove songs without lyric and select only lyrics

In [6]:
sum(df_1['lyrics'] == 'lyrics not available')

196

In [7]:
df_1 = df_1[df_1['lyrics'] != 'lyrics not available']

In [8]:
df_1.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 806 entries, 0 to 1001
Data columns (total 15 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   id                806 non-null    object 
 1   name              806 non-null    object 
 2   artist            806 non-null    object 
 3   duration          806 non-null    int64  
 4   popularity        806 non-null    int64  
 5   danceability      806 non-null    float64
 6   acousticness      806 non-null    float64
 7   energy            806 non-null    float64
 8   instrumentalness  806 non-null    float64
 9   liveness          806 non-null    float64
 10  loudness          806 non-null    float64
 11  speechiness       806 non-null    float64
 12  tempo             806 non-null    float64
 13  valence           806 non-null    float64
 14  lyrics            806 non-null    object 
dtypes: float64(9), int64(2), object(4)
memory usage: 100.8+ KB


In [9]:
import pandas as pd
from transformers import pipeline
from langdetect import detect  # You may need to install the langdetect library

  from .autonotebook import tqdm as notebook_tqdm


In [10]:


# def analyze_sentiment(text, language, max_sequence_length=512):
#     if len(text) > max_sequence_length:
#         text = text[:max_sequence_length]  # Truncate or preprocess the text to fit the model's maximum sequence length

#     model_name = None  # Define model_name variable

#     if language == "en":
#         model_name = "distilbert-base-uncased"
#     elif language == "es":
#         model_name = "nlptown/bert-base-multilingual-uncased-sentiment"
#     elif language == "fr":
#         model_name = "nlptown/bert-base-multilingual-uncased-sentiment"

#     if model_name is not None:
#         classifier = pipeline("sentiment-analysis", model=model_name)
#         result = classifier(text)
#         return result[0]
#     else:
#         return None 


from transformers import pipeline, AutoTokenizer, AutoModelForSequenceClassification

def analyze_sentiment(text, language):
    model_name = None  # Define model_name variable

    if language == "en":
        model_name = "distilbert-base-uncased"
    elif language == "es":
        model_name = "nlptown/bert-base-multilingual-uncased-sentiment"
    elif language == "fr":
        model_name = "nlptown/bert-base-multilingual-uncased-sentiment"

    if model_name is not None:
        tokenizer = AutoTokenizer.from_pretrained(model_name)
        model = AutoModelForSequenceClassification.from_pretrained(model_name)
        classifier = pipeline("sentiment-analysis", model=model, tokenizer=tokenizer, truncation=True)
        result = classifier(text, return_all_scores=True)
        return result[0]
    else:
        return None


In [11]:
# Analyze sentiment for each song and add results as a new column
df_1['Language'] = df_1['lyrics'].apply(detect)  # Detect the language

df_1 = df_1[df_1['Language'].isin(['en', 'es', 'fr'])]
df_1.Language.value_counts()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_1['Language'] = df_1['lyrics'].apply(detect)  # Detect the language


en    661
es     87
fr     44
Name: Language, dtype: int64

In [12]:
df_1['Sentiment'] = df_1.apply(lambda row: analyze_sentiment(row['lyrics'], row['Language']), axis=1)

You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'pre_classifier.weight', 'pre_classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'pre_classifier.weight', 'pre_classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'pre_class

In [24]:
df_1.Sentiment[0]

[{'label': 'LABEL_0', 'score': 0.44453164935112},
 {'label': 'LABEL_1', 'score': 0.5554683208465576}]

In [25]:
df_1['label_1'] = df_1['Sentiment'].apply(lambda x: x[0]['label'] if x is not None and len(x) > 0 else None)
df_1['score_1'] = df_1['Sentiment'].apply(lambda x: x[0]['score'] if x is not None and len(x) > 0 else None)

df_1['label_2'] = df_1['Sentiment'].apply(lambda x: x[1]['label'] if x is not None and len(x) > 1 else None)
df_1['score_2'] = df_1['Sentiment'].apply(lambda x: x[1]['score'] if x is not None and len(x) > 1 else None)

# Drop the 'Sentiment' column if you no longer need it
df_1.drop('Sentiment', axis=1, inplace=True)


In [26]:
df_1.to_csv('hugging_face_sentiment.csv', index=False)
# df_1.to_csv('hugging_face_sentiment2.csv', index=False)

<a style='text-decoration:none;line-height:16px;display:flex;color:#5B5B62;padding:10px;justify-content:end;' href='https://deepnote.com?utm_source=created-in-deepnote-cell&projectId=5f065cd9-5144-44fc-bc55-723d64b92321' target="_blank">
 </img>
Created in <span style='font-weight:600;margin-left:4px;'>Deepnote</span></a>