In [None]:
import re
import string
from collections import Counter
from functools import reduce

import contractions
import emoji
import nltk
import numpy as np
import pandas as pd
import spacy
import torch
from bs4 import BeautifulSoup
from gensim.models.doc2vec import TaggedDocument, Doc2Vec
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
from nltk.tokenize import word_tokenize
from textblob import TextBlob
from transformers import pipeline, AutoModelForSequenceClassification, AutoTokenizer, AutoModel
# Fetch the NLTK resources used by word_tokenize ('punkt') and the
# stopword list ('stopwords') further down in the notebook.
for resource in ('punkt', 'stopwords'):
    nltk.download(resource)

In [None]:



# spaCy pipeline used for tokenisation, POS tagging and NER; also reused by
# preprocess_text later in the notebook.
nlp = spacy.load('en_core_web_sm')

data = pd.read_csv('data_cut.csv')

# Only rows flagged as English are annotated; every other row ends up with
# NaN in the three new columns.
english_mask = data['language'] == 'en'

# Missing lyrics come back from read_csv as NaN (a float), which spaCy cannot
# parse, so they are replaced by an empty string before processing.
english_lyrics = data.loc[english_mask, 'lyrics'].fillna("").astype(str)

tokenized_lyrics = []
pos_tags = []
named_entities = []

# nlp.pipe processes the texts as a stream and yields one Doc per input, in
# input order, so the three lists stay aligned with english_lyrics.
for doc in nlp.pipe(english_lyrics):
    tokenized_lyrics.append([token.text for token in doc])
    pos_tags.append([f"{token.text} ({token.pos_})" for token in doc])
    named_entities.append([(ent.text, ent.label_) for ent in doc.ents])

# Each per-row list is stored as a single object cell. Wrapping the values in
# a Series indexed like the English rows lets pandas align by index instead of
# trying to interpret a (possibly ragged) list of lists as a 2-D block, which
# is what a direct `.loc[mask, col] = list_of_lists` assignment attempts.
annotations = {
    'tokenized_lyrics': tokenized_lyrics,
    'pos_tags': pos_tags,
    'named_entities': named_entities,
}
for column_name, values in annotations.items():
    data[column_name] = pd.Series(values, index=english_lyrics.index, dtype=object)

data.to_csv('data_process.csv', index=False)


In [None]:



# Stack the de-duplicated dataset and the additional rows vertically.
df1 = pd.read_csv('final_dataset_no_duplicates.csv')
df2 = pd.read_csv('add.csv')
concatenated_df = pd.concat([df1, df2], axis=0)

# Drop the 'Unnamed: 0' column when it is present; errors='ignore' makes this
# a no-op when it is absent.
concatenated_df = concatenated_df.drop(columns='Unnamed: 0', errors='ignore')

concatenated_df.to_csv('concatenated.csv', index=False)


In [None]:



nltk.download('punkt')

# Load the merged dataset; missing lyrics become empty strings so that
# TextBlob and word_tokenize always receive a str.
df = pd.read_csv('concatenated.csv')
df['lyrics'] = df['lyrics'].fillna("")

# TextBlob polarity score of each lyric.
df['sentiment'] = df['lyrics'].map(lambda lyric: TextBlob(lyric).sentiment.polarity)

# One TaggedDocument per lyric; the tag is the row position as a string.
data = df['lyrics'].tolist()
train_corpus = [
    TaggedDocument(words=word_tokenize(lyric.lower()), tags=[str(position)])
    for position, lyric in enumerate(data)
]

# Build the vocabulary from the corpus, train, and persist the model.
model = Doc2Vec(vector_size=50, min_count=2, epochs=40, dm=1)
model.build_vocab(train_corpus)
model.train(
    train_corpus,
    total_examples=model.corpus_count,
    epochs=model.epochs,
)
model.save('trained_model_example')


In [None]:


# Reload the persisted Doc2Vec model.
model = Doc2Vec.load('trained_model_example')

# Look up the learned vector of every document by its string tag; `data` and
# `df` are the lyric list and frame built in the training cell.
document_tags = map(str, range(len(data)))
emb_df = pd.DataFrame([model.dv[tag] for tag in document_tags])

# Place the vector columns next to the original columns.
fe_df = pd.concat([df, emb_df], axis=1)

print(fe_df.head())

fe_df.to_csv('data_with_vectors_and_sentiment.csv', index=False)

In [None]:



def sentiment_analysis(text, sentiment_pipeline, chunk_size=512):
    """Return the most frequent sentiment label over fixed-size chunks of ``text``.

    The text is sliced into consecutive pieces of ``chunk_size`` characters,
    each piece is passed to ``sentiment_pipeline``, and the label that occurs
    most often is returned. When several labels are tied, the one that was
    seen first wins, so the result is reproducible between runs.

    Args:
        text: The string to classify.
        sentiment_pipeline: Callable taking a string and returning a sequence
            whose first item is a mapping with a ``'label'`` key.
        chunk_size: Number of characters per chunk.

    Returns:
        The winning label, or the sentinel string ``"error"`` when ``text`` is
        empty/None or when the pipeline raises.
    """
    # No text means no chunks and therefore nothing to vote on; return the
    # sentinel directly instead of relying on max() failing on an empty set.
    if not text:
        return "error"

    chunks = [text[start:start + chunk_size] for start in range(0, len(text), chunk_size)]

    try:
        labels = [sentiment_pipeline(chunk)[0]['label'] for chunk in chunks]
    except Exception as e:
        # Best-effort: a failing row is reported and marked, not fatal.
        print(f"Error in sentiment analysis: {e}")
        return "error"

    # Counter.most_common orders equal counts by first occurrence, unlike
    # max(set(...)) whose tie-breaking follows set iteration order.
    return Counter(labels).most_common(1)[0][0]


# Build the sentiment classifier from an explicitly loaded tokenizer and model.
sentiment_model_name = "nlptown/bert-base-multilingual-uncased-sentiment"
sentiment_tokenizer = AutoTokenizer.from_pretrained(sentiment_model_name)
sentiment_model = AutoModelForSequenceClassification.from_pretrained(sentiment_model_name)
sentiment_pipeline = pipeline(
    "sentiment-analysis",
    model=sentiment_model,
    tokenizer=sentiment_tokenizer,
)

# Start again from the merged dataset written by the concatenation cell.
df = pd.read_csv('concatenated.csv')


def preprocess_text(text):
    """Clean a raw lyrics string and return its normalised, lemmatised form.

    Steps, in order:
      1. strip HTML markup and replace emoji via ``emoji.demojize``;
      2. remove URLs;
      3. lowercase, expand contractions, and collapse stretched characters
         ("soooo" -> "soo") and hyphenated repeats ("b-b" -> "b");
      4. drop every word that occurs more than 10 times in the text;
      5. remove punctuation tokens;
      6. keep alphanumeric tokens that are not English stopwords;
      7. Porter-stem the tokens;
      8. lemmatise with the module-level spaCy pipeline ``nlp``.

    Args:
        text: Raw text as a ``str``.

    Returns:
        The lemmas produced by step 8, joined by single spaces.
    """
    text = BeautifulSoup(text, "html.parser").get_text()
    text = emoji.demojize(text)

    text = re.sub(r'https?://\S+|www\.\S+', '', text)

    # Applied left to right on the lowercased, contraction-expanded text.
    patterns = [(r"(\w)\1{2,}", r"\1\1"), (r"(\w)-\1+", r"\1"), (r"(\b\w+\b)-\1+", r"\1")]
    text = reduce(lambda doc, pattern: re.sub(pattern[0], pattern[1], doc), patterns, contractions.fix(text.lower()))

    # Frequencies come from the NLTK tokenisation, while the filter walks the
    # whitespace-split words (same as the original logic).
    word_counts = Counter(word_tokenize(text))
    text = ' '.join(word for word in text.split() if word_counts[word] <= 10)

    symbols = set(string.punctuation).union({"`", "“", "”", "‘", "’", "–", "—"})
    text = " ".join(word for word in word_tokenize(text) if word not in symbols)

    tokens = word_tokenize(text)
    stop_words = set(stopwords.words('english'))
    tokens = [token for token in tokens if token.isalnum() and token not in stop_words]

    stemmer = PorterStemmer()
    tokens = [stemmer.stem(token) for token in tokens]

    doc = nlp(" ".join(tokens))
    final_text = " ".join([token.lemma_ for token in doc])

    # Return the fully processed text. The previous version returned `text`,
    # which silently discarded the stopword, stemming and lemmatisation steps.
    return final_text

# Clean every lyric (missing values are treated as empty text) ...
df['lyrics'] = df['lyrics'].fillna("").map(preprocess_text)

# ... then label each cleaned lyric with its majority sentiment.
df['sentiment'] = df['lyrics'].map(
    lambda lyric: sentiment_analysis(lyric, sentiment_pipeline)
)

# Encoder used by get_embeddings to produce the embedding vectors.
embedding_model_name = "xlm-roberta-base"
embedding_tokenizer = AutoTokenizer.from_pretrained(embedding_model_name)
embedding_model = AutoModel.from_pretrained(embedding_model_name)


def get_embeddings(text, tokenizer, model):
    """Embed ``text`` by averaging the model's last hidden state over dim 1.

    Args:
        text: The string to embed.
        tokenizer: Callable invoked as
            ``tokenizer(text, return_tensors="pt", truncation=True, max_length=512)``
            and returning a mapping of keyword arguments for ``model``.
        model: Callable invoked as ``model(**inputs)`` whose result exposes a
            ``last_hidden_state`` tensor.

    Returns:
        A NumPy array holding ``last_hidden_state.mean(dim=1)``
        (presumably one pooled vector per input sequence).
    """
    inputs = tokenizer(text, return_tensors="pt", truncation=True, max_length=512)
    # Inference only: disabling autograd avoids recording a computation graph
    # for every row, which otherwise costs memory and time for no benefit.
    with torch.no_grad():
        outputs = model(**inputs)
    return outputs.last_hidden_state.mean(dim=1).detach().numpy()


# One embedding array per cleaned lyric, stored as an object column.
df['embeddings'] = df['lyrics'].map(
    lambda lyric: get_embeddings(lyric, embedding_tokenizer, embedding_model)
)


def flatten_embeddings(embeddings):
    """Return a list with each item of ``embeddings`` converted to a 1-D array.

    Every item is copied into a NumPy array and flattened; the order of the
    items is preserved.
    """
    flattened = []
    for item in embeddings:
        flattened.append(np.array(item).flatten())
    return flattened


# Store a 1-D version of each embedding next to the original array.
embedding_arrays = df['embeddings'].tolist()
df['flattened_embeddings'] = flatten_embeddings(embedding_arrays)

# Persist the enriched frame; array cells are written as their string form.
df.to_csv('enhanced_data_with_sentiment_and_embeddings.csv', index=False)


In [None]:


# Reload the frame written by the previous cell; the embedding columns come
# back as text and are parsed by convert_string_to_list below.
df = pd.read_csv('enhanced_data_with_sentiment_and_embeddings.csv')


def convert_string_to_list(string):
    """Parse a whitespace-separated, bracketed list of numbers into a float array.

    Args:
        string: Text such as ``"[0.1 -0.2\\n 0.3]"``. Surrounding whitespace and
            square brackets are ignored; values may be separated by any
            whitespace, including newlines.

    Returns:
        A 1-D ``float64`` NumPy array, or ``None`` when ``string`` is not a
        ``str`` (for example a NaN cell) or contains a token that is not a
        valid number.
    """
    # Cells that were empty in the CSV are read back as float NaN, which has
    # no .strip(); treat every non-string as unparseable.
    if not isinstance(string, str):
        return None
    try:
        # float() rejects any malformed token outright, so bad input yields
        # None rather than a silently truncated array.
        tokens = string.strip().strip('[]').split()
        return np.array([float(token) for token in tokens], dtype=float)
    except ValueError:
        return None


# Turn the stored text back into arrays; unparseable cells become None.
df['flattened_embeddings'] = df['flattened_embeddings'].apply(convert_string_to_list)

# Discard rows whose embedding could not be parsed. The surviving rows keep
# their original index labels, so the index may now contain gaps.
df = df.dropna(subset=['flattened_embeddings'])

# One column per embedding dimension. Reusing df.index is essential:
# pd.concat(axis=1) aligns on index labels, and a fresh 0..n-1 index would
# pair embeddings with the wrong rows (and add NaN rows) whenever dropna
# removed anything.
embeddings_df = pd.DataFrame(df['flattened_embeddings'].tolist(), index=df.index)
embeddings_df.columns = [f'embedding_{i}' for i in range(embeddings_df.shape[1])]

final_df = pd.concat([df, embeddings_df], axis=1)

final_df.to_csv('enhanced_data_with_expanded_flattened_embeddings.csv', index=False)
