In [1]:
import pandas as pd

In [2]:
from textblob import TextBlob

In [3]:
# Load the tab-separated song table, decoding the file as Mac OS Roman text.
data = pd.read_csv(
    'clean_song_data3.txt',
    sep='\t',
    encoding='MacRoman',
)

In [6]:
import re
import nltk
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer

def preprocess_text_column(data, column_name):
    """Return a copy of ``data`` whose ``column_name`` text has been normalised.

    Each cell of the column is processed as follows:

    1. every character that is not an ASCII letter or whitespace is removed;
    2. the text is lower-cased and tokenized with ``word_tokenize``;
    3. English stop words are dropped, *except* the negation words, which
       must survive so that step 4 can see them;
    4. a negation word toggles a "negated" state and is itself discarded;
       while the state is on, every following token is prefixed with ``NOT_``;
    5. tokens are lemmatized (before the prefix is attached, so negated
       words are reduced to their root form as well) and joined with spaces.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame holding the text column. It is not modified.
    column_name : str
        Name of the column to clean.

    Returns
    -------
    pandas.DataFrame
        A copy of ``data`` with ``column_name`` replaced by the cleaned text.
        Cells that are not strings (for example missing values) become ``''``.

    Notes
    -----
    The NLTK tokenizer, stop-word and WordNet data must already be available.
    The negated state is never reset inside a text, because punctuation (the
    usual scope boundary) has been stripped in step 1.
    NOTE(review): confirm the downstream sentiment step gives meaning to
    ``NOT_``-prefixed tokens; an analyser with a fixed lexicon may ignore them.
    """
    negation_words = {'not', 'no', 'never', 'none', 'nobody', 'nothing', 'neither', 'nor'}

    # NLTK's English stop list includes some negation words ('not', 'no',
    # 'nor'); take them out so stop-word filtering cannot erase the negations
    # before they are handled. Built once here rather than once per row.
    stop_words = set(stopwords.words('english')) - negation_words
    lemmatizer = WordNetLemmatizer()

    def preprocess_text(text):
        """Clean a single text value and return it as one space-joined string."""
        # Missing values arrive as float NaN; there is nothing to clean.
        if not isinstance(text, str):
            return ''

        # Keep only letters and whitespace, then normalise case.
        cleaned_text = re.sub(r'[^a-zA-Z\s]', '', text).lower()

        tokens = [word for word in word_tokenize(cleaned_text) if word not in stop_words]

        negated = False
        result = []
        for word in tokens:
            if word in negation_words:
                # The negation word itself is dropped; it only flips the state.
                negated = not negated
                continue
            # Lemmatize first: a 'NOT_'-prefixed token is not a dictionary
            # word, so lemmatizing afterwards would leave it untouched.
            lemma = lemmatizer.lemmatize(word)
            result.append("NOT_" + lemma if negated else lemma)

        return ' '.join(result)

    # Work on a copy so the caller's frame keeps its original text.
    cleaned = data.copy()
    cleaned[column_name] = cleaned[column_name].apply(preprocess_text)

    return cleaned

# Run the text preprocessing on the 'lyrics' column of the loaded `data` frame
# and keep the returned DataFrame as `data_cleaned`.
data_cleaned = preprocess_text_column(data, column_name='lyrics')

In [7]:
# Score every cleaned lyric with TextBlob's default sentiment analyser.
data_cleaned['polarity'] = data_cleaned['lyrics'].apply(lambda x: TextBlob(x).sentiment.polarity)

# Attach the fresh polarity scores to `data`. Any 'polarity' column already on
# `data` is dropped first (it can be there if `data` and `data_cleaned` are the
# same object, or if the file was already rewritten by an earlier run), so the
# result never ends up with two columns of the same name.
data_with_polarity = pd.concat(
    [data.drop(columns=['polarity'], errors='ignore'), data_cleaned['polarity']],
    axis=1,
)

# Overwrite the source file with the augmented table. The encoding matches the
# one used to read the file, so non-ASCII text survives a reload, and pandas is
# given the path directly so it controls line endings itself.
# NOTE(review): if preprocess_text_column modified `data` in place, the lyrics
# written here are the cleaned ones - confirm that is intended before
# overwriting the source file.
data_with_polarity.to_csv('clean_song_data3.txt', sep='\t', index=False, encoding='MacRoman')