In [36]:
%pip install deep_translator
%pip install nltk
%pip install sentence_transformers
%pip install textblob
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from sentence_transformers import SentenceTransformer
from textblob import TextBlob
import duckdb
import pandas as pd
import re
import unicodedata
import time
from deep_translator import GoogleTranslator



Collecting sentence_transformers
  Downloading sentence_transformers-5.1.2-py3-none-any.whl.metadata (16 kB)
Collecting transformers<5.0.0,>=4.41.0 (from sentence_transformers)
  Downloading transformers-4.57.1-py3-none-any.whl.metadata (43 kB)
Collecting torch>=1.11.0 (from sentence_transformers)
  Downloading torch-2.9.1-cp310-cp310-win_amd64.whl.metadata (30 kB)
Collecting huggingface-hub>=0.20.0 (from sentence_transformers)
  Downloading huggingface_hub-1.1.2-py3-none-any.whl.metadata (13 kB)
Collecting hf-xet<2.0.0,>=1.2.0 (from huggingface-hub>=0.20.0->sentence_transformers)
  Downloading hf_xet-1.2.0-cp37-abi3-win_amd64.whl.metadata (5.0 kB)
Collecting shellingham (from huggingface-hub>=0.20.0->sentence_transformers)
  Downloading shellingham-1.5.4-py2.py3-none-any.whl.metadata (3.5 kB)
Collecting typer-slim (from huggingface-hub>=0.20.0->sentence_transformers)
  Downloading typer_slim-0.20.0-py3-none-any.whl.metadata (16 kB)
Collecting sympy>=1.13.3 (from torch>=1.11.0->sente

  from .autonotebook import tqdm as notebook_tqdm


In [20]:
lyrics = pd.read_parquet("../data/song_lyrics.parquet")
print(lyrics.shape)
print(lyrics.head())


(5134856, 11)
               title  tag     artist  year   views  \
0          Killa Cam  rap    Cam'ron  2004  173166   
1         Can I Live  rap      JAY-Z  1996  468624   
2  Forgive Me Father  rap   Fabolous  2003    4743   
3       Down and Out  rap    Cam'ron  2004  144404   
4             Fly In  rap  Lil Wayne  2005   78271   

                                       features  \
0                   {"Cam\\'ron","Opera Steve"}   
1                                            {}   
2                                            {}   
3  {"Cam\\'ron","Kanye West","Syleena Johnson"}   
4                                            {}   

                                              lyrics  id language_cld3  \
0  [Chorus: Opera Steve & Cam'ron]\nKilla Cam, Ki...   1            en   
1  [Produced by Irv Gotti]\n\n[Intro]\nYeah, hah,...   3            en   
2  Maybe cause I'm eatin\nAnd these bastards fien...   4            en   
3  [Produced by Kanye West and Brian Miller]\n\n[...   5  

In [23]:
# Metrics for each language counts

language_counts = duckdb.query("""
    SELECT language, COUNT(*) AS count
    FROM lyrics
    GROUP BY language
    ORDER BY count DESC
""").to_df()
print(language_counts)


   language    count
0        en  3374198
1        es   275432
2      None   226918
3        fr   189436
4        pt   167947
..      ...      ...
80       mt        5
81       uz        4
82       tg        3
83       bs        1
84       gu        1

[85 rows x 2 columns]


In [24]:
# We only care about indian dialects, chinese, japanese, and korean

lang_codes = ['hi', 'pa', 'ta', 'te', 'ml', 'bn', 'zh','ja', 'ko']

input_path = "../data/song_lyrics.parquet"
output_path = "../data/song_lyrics_asian.parquet"

duckdb.query(f"""
    COPY (
        SELECT *,
             CASE
                WHEN language IN ('hi', 'pa', 'ta', 'te', 'ml', 'bn', 'gu', 'mr') THEN 'Indian'
                WHEN language IN ('zh') THEN 'Chinese'
                WHEN language = 'ja' THEN 'Japanese'
                WHEN language = 'ko' THEN 'Korean'
                ELSE 'Other'
             END AS region_group
        FROM '{input_path}'
        WHERE language IN {lang_codes}
    )
    TO '{output_path}' (FORMAT PARQUET);
""")


asian_language_counts = duckdb.query(f"""
    SELECT language, COUNT(*) AS count
    FROM '{output_path}'
    GROUP BY language
    ORDER BY count DESC
""").to_df()
print(asian_language_counts)



  language  count
0       ja  42637
1       ko  27979
2       zh   8813
3       hi   1609
4       ta    646
5       bn    579
6       pa    131
7       te     92
8       ml     49


In [2]:
asian_lyrics = pd.read_parquet("../data/song_lyrics_asian.parquet")

In [23]:
conn = duckdb.connect(database=':memory:', read_only=False)
#Using asian songs data to extract top 100 songs from each region based on views
conn.register('lyrics_df', asian_lyrics)

#spotify kaggle dataset and loading it into a parquet file
file_path = '../data/song_metadata.parquet'
song_metadata = pd.read_parquet(file_path)

# remove duplicates, ranking on popularity
song_metadata = (
    song_metadata.sort_values('popularity', ascending=False)
             .drop_duplicates(subset=['track_name', 'artists'], keep='first')
)
print(f"Training DataFrame shape: {song_metadata.shape}")

# normalize text - remove punctuation, lowercase, remove unnecessary spaces
def normalize_text(s):
    if pd.isnull(s):
        return ''
    # Lowercase and trim spaces
    s = s.lower().strip()
    # Normalize Unicode to standard forms (e.g., full-width → half-width)
    s = unicodedata.normalize('NFKC', s)
    # Keep:
    # - English letters and digits (\w)
    # - Spaces (\s)
    # - Chinese characters (\u4e00-\u9fff, \u3400-\u4dbf)
    # - Japanese Hiragana + Katakana (\u3040-\u30ff)
    # - Hindi / Devanagari (\u0900-\u097F)
    allowed = r'[^\w\s\u4e00-\u9fff\u3400-\u4dbf\u3040-\u30ff\u0900-\u097F]'
    s = re.sub(allowed, '', s)
    # Collapse multiple spaces
    s = re.sub(r'\s+', ' ', s)
    return s.strip()

# create a rough join key for title/artist
asian_lyrics['join_key'] = (asian_lyrics['title'].fillna('') + ' ' + asian_lyrics['artist'].fillna('')).apply(normalize_text)
song_metadata['join_key'] = (song_metadata['track_name'].fillna('') + ' ' + song_metadata['artists'].fillna('')).apply(normalize_text)

conn = duckdb.connect(database=':memory:', read_only=False)
conn.register('lyrics_data', asian_lyrics)
conn.register('train_data', song_metadata)

# joining them both together and now this is our final csv
joined_df = conn.execute("""
    SELECT
        t1.*,
        t2.popularity,
        t2.duration_ms,
        t2.acousticness,
        t2.explicit,
        t2.energy,
        t2.danceability,
        t2.key,
        t2.loudness,
        t2.mode,
        t2.speechiness,
        t2.instrumentalness,
        t2.liveness,
        t2.valence,
        t2.tempo
    FROM lyrics_data AS t1
    LEFT JOIN train_data AS t2
    ON t1.join_key = t2.join_key
    WHERE t2.popularity is NOT NULL
""").fetchdf()

conn.close()

print("First 5 rows of the joined DataFrame:")
print(joined_df.head())
print(f"Joined DataFrame shape: {joined_df.shape}")
output_file_path = '../data/asian_lyrics_w_metadata.parquet'
joined_df.to_parquet(output_file_path, engine='fastparquet', index=False)
print("Joined df size:", len(joined_df))


Training DataFrame shape: (81344, 21)
First 5 rows of the joined DataFrame:
              title   tag           artist  year   views  \
0    Lovesick Girls   pop        BLACKPINK  2020  845243   
1           FiNALLY  rock             BiSH  2019      11   
2          NO SWEET  rock             BiSH  2019       4   
3  Its OK To Be Sad   pop   (Janice Vidal)  2021    6064   
4        Next Level   pop         aespa ()  2021   92350   

                features                                             lyrics  \
0                     {}  [블랙핑크 "Lovesick Girls" 가사]\n\n[Intro: All]\nLo...   
1                     {}  [BiSH「FiNALLY」歌詞]\n\n動く日常には\n目と目を合わせることばっか\n遠い...   
2                     {}  [BiSH「NO SWEET」歌詞]\n\n始まりは突然\n遠ざかる幽霊たち\nなけなしのお...   
3  {"衛蘭 (Janice Vidal)"}  [衞蘭「It's OK To Be Sad」歌詞]\n\n[主歌一]\n你想見他 想見他\n...   
4       {"​aespa (에스파)"}  [에스파 "Next Level" 가사]\n\n[Verse 1: Karina, Gis...   

        id language_cld3 language_ft  ... energy danceability key  loudness  \
0  590935

In [20]:
# helper functions to clean and translate lyrics

def remove_square_brackets(text):
    if isinstance(text, str):
        # Remove anything inside [...] including nested cases
        return re.sub(r'\[.*?\]', '', text).strip()
    return text

joined_df['clean_lyrics'] = joined_df['lyrics'].apply(remove_square_brackets)

In [21]:
def translate_to_english(text, retries=3, delay=0.5):
    if not isinstance(text, str) or text.strip() == "":
        return ""
    for attempt in range(retries):
        try:
            time.sleep(delay)
            return GoogleTranslator(source='auto', target='en').translate(text)
        except Exception as e:
            print(f"Attempt {attempt+1} failed: {e}")
            time.sleep(delay * (attempt + 1))  # exponential backoff
    return text  # fallback if all retries fail

N = 50  # choose however many you want
sample_df = joined_df.groupby('language', group_keys=False).apply(lambda x: x.sample(min(len(x), N)))
print(f"Number of rows in sample_df: {len(sample_df)}")
sample_df['lyrics_translated'] = sample_df['clean_lyrics'].apply(translate_to_english)

  sample_df = joined_df.groupby('language', group_keys=False).apply(lambda x: x.sample(min(len(x), N)))


Number of rows in sample_df: 186


KeyboardInterrupt: 

In [18]:
sample_df.to_parquet("../data/asian_songs_translated_w_metadata.parquet", index=False)

In [42]:
# now, extract features from the translated lyrics
translated_lyrics = pd.read_parquet("../data/asian_songs_translated_w_metadata.parquet")

In [33]:
nltk.download('punkt')
nltk.download('punkt_tab')
nltk.download('stopwords')

stop_words = set(stopwords.words('english'))

def preprocess_text(text):
    if not isinstance(text, str):
        return []
    text = text.lower()
    text = re.sub(r'[^a-z\s]', '', text)
    tokens = word_tokenize(text)
    tokens = [t for t in tokens if t not in stop_words]
    return tokens

def extract_text_features(text):
    tokens = preprocess_text(text)
    if len(tokens) == 0:
        return {
            'word_count': 0,
            'unique_words': 0,
            'repetition_ratio': 0.0,
            'lexical_diversity': 0.0
        }
    unique = set(tokens)
    word_count = len(tokens)
    unique_words = len(unique)
    repetition_ratio = 1 - (unique_words / word_count)
    lexical_diversity = unique_words / word_count
    return {
        'word_count': word_count,
        'unique_words': unique_words,
        'repetition_ratio': repetition_ratio,
        'lexical_diversity': lexical_diversity
    }


[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\22che\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package punkt_tab to
[nltk_data]     C:\Users\22che\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\22che\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [43]:
text_features = translated_lyrics['lyrics_translated'].apply(extract_text_features).apply(pd.Series)
translated_lyrics[['word_count', 'unique_words', 'repetition_ratio', 'lexical_diversity']] = text_features


In [44]:
# semantic features

model = SentenceTransformer('all-MiniLM-L6-v2')  # small, fast model
embeddings = model.encode(translated_lyrics['lyrics_translated'].tolist(), show_progress_bar=True)


translated_lyrics['sentiment_polarity'] = translated_lyrics['lyrics_translated'].apply(
    lambda x: TextBlob(x).sentiment.polarity if isinstance(x, str) else 0
)


Batches: 100%|██████████| 6/6 [00:06<00:00,  1.06s/it]


In [46]:
translated_lyrics.to_parquet("../data/asian_songs_translated_w_metadata_lyric_features.parquet", index=False)