In [14]:

import pandas as pd
import spacy
import re

In [15]:

# Load the small English spaCy pipeline; `nlp` is what spacy_preprocess() calls.
nlp = spacy.load("en_core_web_sm")

# Peek at the first five physical lines of the raw file before parsing it.
# readline() returns "" once the file is exhausted, so short files are fine.
sample_lines = []
with open("spotify_data.csv", 'r', encoding='utf-8') as csv_file:
    for _ in range(5):
        sample_lines.append(csv_file.readline())

print("Sample lines from the CSV file:")
for line_number, raw_line in enumerate(sample_lines, start=1):
    print(f"Line {line_number}: {raw_line.strip()}")

# Try each candidate delimiter and keep the parse that yields the most columns.
# Ties keep the earliest candidate, so ',' wins when nothing does better.
# NOTE: each attempt reads the whole file, so this costs up to four full passes.
potential_delimiters = [',', ';', '\t', '|']
best_delimiter = ','
max_columns = 0
df = None  # stays None if no candidate parses; avoids reusing a stale `df` from an earlier run

for delimiter in potential_delimiters:
    try:
        temp_df = pd.read_csv("spotify_data.csv", delimiter=delimiter, on_bad_lines='error')
    except Exception as e:
        # Best-effort probing: report the failure and move on to the next candidate.
        print(f"Failed with delimiter '{delimiter}': {e}")
        continue
    if len(temp_df.columns) > max_columns:
        max_columns = len(temp_df.columns)
        best_delimiter = delimiter
        df = temp_df

# Fail with a clear message instead of a NameError on `df` further down.
if df is None:
    raise ValueError(
        "Could not parse 'spotify_data.csv' with any of the delimiters "
        f"{potential_delimiters}"
    )

# Print information about the better-parsed DataFrame
print(f"\nBest delimiter found: '{best_delimiter}'")
print(f"Columns after better parsing: {df.columns.tolist()}")
print(f"Number of rows: {len(df)}")

# The parsed header is ['artist', 'song', 'text'], so the lyrics column is
# set by hand rather than detected.
lyrics_column = 'text'

# Keep only the songs that actually have lyrics.
df = df.dropna(subset=[lyrics_column], how='any')

# Report how many songs remain after dropping the rows without lyrics.
num_songs = df.shape[0]
print(f"\nTotal number of songs in the dataset: {num_songs}")

def clean_text(text):
    """Lowercase text, remove punctuation and collapse whitespace.

    Args:
        text: Value to clean. Non-string values are converted with ``str``.
            ``None`` and float NaN are treated as missing.

    Returns:
        str: The cleaned text, or ``""`` for a missing value. Apostrophes are
        removed like any other punctuation, so ``"it's"`` becomes ``"its"``.
    """
    # Missing values would otherwise be stringified into the words "none" / "nan".
    if text is None or (isinstance(text, float) and text != text):
        return ""
    text = str(text).lower()
    # \w treats "_" as a word character, so it has to be removed explicitly.
    text = re.sub(r"[^\w\s]|_", "", text)  # remove punctuation
    text = re.sub(r"\s+", " ", text).strip()  # normalize whitespace
    return text

def spacy_preprocess(text):
    """Return the surface forms and lemmas of the content words in ``text``.

    The text is first passed through ``clean_text`` and then through the
    module-level spaCy pipeline ``nlp``. Tokens flagged as stop words or
    punctuation are left out of both results.

    Returns:
        tuple: ``(tokens, lemmas)``, two lists of strings of equal length,
        aligned position by position.
    """
    doc = nlp(clean_text(text))
    # Filter once, then project the surviving tokens twice.
    kept = [tok for tok in doc if not (tok.is_stop or tok.is_punct)]
    tokens = [tok.text for tok in kept]
    lemmas = [tok.lemma_ for tok in kept]
    return tokens, lemmas

# Demo: run the preprocessing on the first song's lyrics, if there is one.
if len(df.index) > 0:
    sample_text = df[lyrics_column].iat[0]
    print(f"\nSample text from first song (first 100 chars): {sample_text[:100]}...")

    # Only the tokens are reported here; the lemmas are computed but not printed.
    tokens, lemmas = spacy_preprocess(sample_text)
    print(f"Number of tokens (excluding stopwords): {len(tokens)}")
    print(f"First 10 tokens: {tokens[:10]}")


Sample lines from the CSV file:
Line 1: artist,song,text
Line 2: ABBA,She's My Kind Of Girl,"Look at her face, it's a wonderful face
Line 3: And it means something special to me
Line 4: Look at the way that she smiles when she sees me
Line 5: How lucky can one fellow be?
Failed with delimiter ';': Error tokenizing data. C error: EOF inside string starting at row 2370067
Failed with delimiter '	': Error tokenizing data. C error: EOF inside string starting at row 2370067
Failed with delimiter '|': Error tokenizing data. C error: EOF inside string starting at row 2370067

Best delimiter found: ','
Columns after better parsing: ['artist', 'song', 'text']
Number of rows: 57650

Total number of songs in the dataset: 57650

Sample text from first song (first 100 chars): Look at her face, it's a wonderful face  
And it means something special to me  
Look at the way t...
Number of tokens (excluding stopwords): 48
First 10 tokens: ['look', 'face', 'wonderful', 'face', 'means', 'special', 'look'