<a href="https://colab.research.google.com/github/alexngocvu/Heart-Disease-and-Lifestyle/blob/main/LU_Score_and_Regression_Analysis.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [4]:
import pandas as pd
import numpy as np
import re
from nltk.tokenize import word_tokenize
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.preprocessing import MinMaxScaler
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
import nltk

# --- 1. SETUP & DATA LOADING ---

# Install and download necessary libraries for Colab environment
# Running pip install inside the notebook is necessary
# print("Installing required libraries...")
!pip install -q pandas numpy scikit-learn nltk

# Download NLTK data (run once)
try:
    nltk.data.find('tokenizers/punkt')
except LookupError:
    nltk.download('punkt')

nltk.download('punkt', quiet=True)


# IMPORTANT: You need to replace this block with code to load your actual dataset.
# If your dataset is a CSV, use the following template after uploading it to Colab:
from google.colab import files
uploaded = files.upload()
df = pd.read_csv('hot100_kaggle_195808_20211106.csv')

Saving hot100_kaggle_195808_20211106.csv to hot100_kaggle_195808_20211106 (1).csv


In [8]:
# Preview the first rows of the loaded frame to sanity-check the columns.
display(df.head())

Unnamed: 0,date,rank,song,artist,last-week,peak-rank,weeks-on-board
0,11/6/2021,1,Easy On Me,Adele,1.0,1,3
1,11/6/2021,2,Stay,The Kid LAROI & Justin Bieber,2.0,1,16
2,11/6/2021,3,Industry Baby,Lil Nas X & Jack Harlow,3.0,1,14
3,11/6/2021,4,Fancy Like,Walker Hayes,4.0,3,19
4,11/6/2021,5,Bad Habits,Ed Sheeran,5.0,2,18


In [24]:
# --- 1.3 DATA FILTERING BY YEAR (2012-2021) ---

# Inclusive bounds of the study period and the column that holds the chart date.
YEAR_START = 2012
YEAR_END = 2021
DATE_COL = 'date'


def extract_year(dates):
    """Extract a calendar year from each value of a date-like Series.

    Values are parsed with `pd.to_datetime`. Any value that fails to parse
    falls back to the first four consecutive digits of its string form, so a
    column that mixes date formats or bare year strings still yields a year
    for every row that contains one.

    Args:
        dates: pandas Series of date strings, datetimes, or year strings.

    Returns:
        A float Series aligned with `dates`; NaN where no year was found.
    """
    parsed = pd.to_datetime(dates, errors='coerce').dt.year.astype(float)
    # expand=False returns the single capture group as a Series, not a
    # one-column DataFrame.
    digits = dates.astype(str).str.extract(r'(\d{4})', expand=False).astype(float)
    # Fill row by row: a partial parse failure must not silently drop rows.
    return parsed.fillna(digits)


# NOTE: this cell rebinds `df` to the filtered frame. Re-run the data-loading
# cell first if you need to filter the full dataset again.
if DATE_COL in df.columns:
    initial_row_count = len(df)

    try:
        print(f"\nDEBUG: Found date column '{DATE_COL}'. First 5 original values:")
        print(df[DATE_COL].head(5).to_markdown(index=False))

        # Build the result on new frames so that a failure part-way through
        # leaves `df` exactly as it was.
        df_cleaned = df.assign(year=extract_year(df[DATE_COL])).dropna(subset=['year'])

        if df_cleaned.empty:
            print("FATAL WARNING: All rows were dropped because the 'date' column could not be parsed into a valid year. Please verify the format of the values printed above!")
            df_filtered = df_cleaned
        else:
            df_cleaned = df_cleaned.astype({'year': int})
            print("DEBUG: Year extraction successful. First 5 extracted years:")
            print(df_cleaned['year'].head(5).to_markdown(index=False))

            # Keep only the target period; `between` is inclusive on both ends.
            in_period = df_cleaned['year'].between(YEAR_START, YEAR_END)
            df_filtered = df_cleaned[in_period].copy()

            if df_filtered.empty:
                print(f"FATAL WARNING: After filtering, the dataset for {YEAR_START}-{YEAR_END} is empty. All rows were outside the target range.")
            else:
                print(f"Filtered dataset from {initial_row_count} rows down to {len(df_filtered)} rows (Years {YEAR_START}-{YEAR_END}).")

    except Exception as e:
        # Best effort: report the problem and keep the unfiltered `df`.
        print(f"CRITICAL ERROR: Failed during date processing. Please check the 'date' column format: {e}")
    else:
        df = df_filtered

else:
    print("WARNING: The required 'date' column was not found (or renamed). Skipping time period filtering.")



DEBUG: Found date column 'date'. First 5 original values:
| date   |
|--------|


In [17]:
import requests # Added for LRCLIB scraping

In [None]:
# -*- coding: utf-8 -*-
"""
Colab Ready Python Script for Billboard Methodology Implementation

This script implements the three LU Score metrics (Explicit Density, Structural Deviation,
and Semantic Deviation), normalizes them, calculates the final LU Score, and runs
a Multiple Regression Analysis to predict commercial success metrics.

NOTE: Replace the MOCK DATA section with your actual data loading code.
"""

import pandas as pd
import numpy as np
import re
from nltk.tokenize import word_tokenize
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.preprocessing import MinMaxScaler
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
import nltk

# --- 1. SETUP & DATA LOADING ---

# Install and download necessary libraries for Colab environment
# Running pip install inside the notebook is necessary
# print("Installing required libraries...")
!pip install -q pandas numpy scikit-learn nltk

# Download NLTK data (run once)
try:
    nltk.data.find('tokenizers/punkt')
except nltk.downloader.DownloadError:
    nltk.download('punkt')

# --- MOCK DATA SECTION: REPLACE WITH YOUR KAGGLE DATA LOADING ---

# True  -> build the six-row demonstration frame below.
# False -> upload a real CSV through the Colab file picker instead.
# NOTE(review): the Hot 100 CSV previewed earlier in this notebook names its
# columns 'song', 'last-week', 'peak-rank', 'weeks-on-board' and has no
# 'lyrics' column - rename/merge accordingly before switching this off.
USE_MOCK_DATA = True

if USE_MOCK_DATA:
    print("Generating mock data (Please replace this with your real data loading)...")

    # Mock data creation to simulate the required columns and structure
    data = {
        'date': pd.to_datetime(['2018-01-01', '2019-05-15', '2020-11-20', '2021-03-10', '2017-08-25', '2016-04-12']),
        'rank': [1, 5, 20, 3, 10, 15],
        'song_name': ['Hit Song A', 'Deep Cut B', 'Viral Track C', 'Pop Anthem D', 'Indie Vibe E', 'Standard F'],
        'artist': ['Superstar 1', 'Niche Artist 2', 'Tiktok Sensation 3', 'Superstar 1', 'Indie Band 4', 'Veteran 5'],
        'last_week': [1, 7, 15, 5, 12, 18],
        'peak_rank': [1, 5, 1, 3, 8, 15],
        'weeks_on_board': [45, 12, 50, 22, 18, 10], # Commercial Success Metric (Longevity)
        'artist_popularity': [0.9, 0.3, 0.7, 0.9, 0.4, 0.6], # Mock Control Variable
        'tempo': [120, 75, 140, 130, 95, 110], # Mock Control Variable
        'lyrics': [
            "I love you, you love me, we are a happy family. Let's party all night. This is a very conventional pop song.",
            "The sky is grey and the world is slow. The darkness wraps around, the shadows grow. I don't give a damn about your rules.",
            "Oh yeah, gotta go, gotta flow, gotta make that dough. Money, cash, cars. Repetitive chorus, repetitive chorus.",
            "We broke up, but I'm fine now. I'm strong, I'm beautiful. Party all the time. Standard love song lyrics.",
            "Existential dread and the void calls back. The ephemeral nature of being is a cruel, cruel joke. Fuck the establishment.",
            "Every single day is a struggle, but we manage. Simple words are the best. Keep it clean and easy to understand."
        ]
    }
    df = pd.DataFrame(data)
else:
    import io
    from google.colab import files

    uploaded = files.upload()
    if not uploaded:
        raise FileNotFoundError("No file was uploaded; upload a CSV or set USE_MOCK_DATA = True.")
    # Parse the uploaded bytes directly so the result does not depend on the
    # name Colab chose when saving the file.
    df = pd.read_csv(io.BytesIO(next(iter(uploaded.values()))))

# Ensure all text is treated as strings; missing lyrics become '' rather than
# the literal word 'nan', which would otherwise be counted as a lyric token.
df['lyrics'] = df['lyrics'].fillna('').astype(str)

print(f"Loaded {len(df)} songs for analysis.")


# --- 2. LU SCORE METRIC 1: EXPLICIT DENSITY (M1) ---

# Taboo Word List (A simplified, non-exhaustive list for demonstration)
TABOO_WORDS = {'damn', 'hell', 'fuck', 'shit', 'ass', 'bitch', 'crap'}

def calculate_explicit_density(lyrics):
    """Calculates M1: ratio of taboo words to total words.

    Args:
        lyrics: Lyric text of one song. Anything that is not a string
            (e.g. None or NaN for a song without lyrics) is treated as empty.

    Returns:
        float in [0, 1]: the share of alphabetic tokens found in TABOO_WORDS,
        or 0.0 when there are no alphabetic tokens.
    """
    # Missing lyrics would make word_tokenize raise; score them as 0.0 instead.
    if not isinstance(lyrics, str):
        return 0.0

    # Simple preprocessing: tokenize, keep purely alphabetic tokens, lowercase.
    words = [w.lower() for w in word_tokenize(lyrics) if w.isalpha()]
    if not words:
        return 0.0

    taboo_count = sum(1 for word in words if word in TABOO_WORDS)
    return taboo_count / len(words)

# Score every song: store the per-song taboo-word ratio in a new column.
df['M1_Explicit_Density'] = df['lyrics'].apply(calculate_explicit_density)


# --- 3. LU SCORE METRIC 2: STRUCTURAL DEVIATION (TTR) (M2) ---

def calculate_ttr(lyrics):
    """Calculates M2: Type-Token Ratio (unique words / total words).

    Args:
        lyrics: Lyric text of one song. Anything that is not a string
            (e.g. None or NaN for a song without lyrics) is treated as empty.

    Returns:
        float in [0, 1]: distinct alphabetic tokens divided by all alphabetic
        tokens (case-insensitive), or 0.0 when there are no such tokens.
    """
    # Missing lyrics would make word_tokenize raise; score them as 0.0 instead.
    if not isinstance(lyrics, str):
        return 0.0

    # Simple preprocessing: tokenize, keep purely alphabetic tokens, lowercase.
    words = [w.lower() for w in word_tokenize(lyrics) if w.isalpha()]
    if not words:
        return 0.0

    return len(set(words)) / len(words)

# Score every song: store the per-song type-token ratio in a new column.
df['M2_TTR'] = df['lyrics'].apply(calculate_ttr)


# --- 4. LU SCORE METRIC 3: SEMANTIC DEVIATION (M3) ---

# Step 1: Define a "Conventional Pop Corpus"
# In your actual research, you should manually curate 50-100 known "conventional" pop songs
# from the same era to build a robust Conventional Pop Vector.
# NOTE(review): the five entries below are placeholders that mix lyric-like
# phrases with descriptions of the song (e.g. "Classic pop themes.").
# Presumably every word here feeds the reference vector - confirm, and swap in
# real lyrics before interpreting M3.
conventional_pop_corpus = [
    "Love you, party all the time, money on my mind, driving fast, feeling good. Classic pop themes.",
    "Heartbreak, tears, I miss you, but I'm going out tonight. Dancing to forget. Very common pop topic.",
    "Everything is perfect, feeling the vibe, the sun is out, life is great. Upbeat and simple. Very conventional.",
    "I got money, I got cars, I got fame. Look at me, I'm the best. Bragging and luxury themes.",
    "We are going to dance, dance, dance, the night is young. Repeat chorus. Simple, repetitive party song.",
]

def calculate_semantic_deviation(df, pop_corpus):
    """Calculates M3: Cosine Similarity distance from a '