In [None]:
# Spotify 2024 Analysis - Initial Exploration

# This notebook covers:
# 1. Loading the data
# 2. Initial exploration
# 3. Basic visualizations

In [2]:
# 1. Import Libraries

import pandas as pd
import matplotlib.pyplot as plt
import os
import seaborn as sns
from pathlib import Path

print("Libraries imported successfully")

# Plot appearance: seaborn-flavoured matplotlib style, default figure size and font size
plt.style.use('seaborn-v0_8')
plt.rcParams.update({
    'figure.figsize': (12, 8),
    'font.size': 10,
})
print("Visualization style set")

# pandas display: show every column, cap printed rows at 100, two-decimal floats
display_options = {
    'display.max_columns': None,
    'display.max_rows': 100,
    'display.float_format': '{:.2f}'.format,
}
for option_name, option_value in display_options.items():
    pd.set_option(option_name, option_value)
print("Format Columns and Rows set")

Libraries imported successfully
Visualization style set
Format Columns and Rows set


In [3]:
# 2. Load Data   

# 2.1. Find the project root by searching upward for the data file

def find_project_root(marker_file="Most Streamed Spotify Songs 2024.csv"):
    """Locate the project root by walking upward from the working directory.

    A directory is accepted as the project root when it contains either
    ``Data/<marker_file>`` or ``<marker_file>`` directly. The working
    directory itself is checked first, then each parent in turn.

    Parameters
    ----------
    marker_file : str
        File name of the data file that identifies the project.

    Returns
    -------
    pathlib.Path or None
        The nearest matching directory, or ``None`` when no directory up to
        the filesystem root matches.
    """
    current_path = Path.cwd()

    # Search upward through parent folders, nearest first
    for candidate in (current_path, *current_path.parents):
        in_data_folder = (candidate / "Data" / marker_file).exists()
        directly_inside = (candidate / marker_file).exists()
        if in_data_folder or directly_inside:
            print(f" Found project root: {candidate}")
            return candidate

    # Not found. `anchor` is the filesystem root of the working directory; it
    # is used instead of `parents[-1]`, which raises IndexError when the
    # working directory already is the root (empty `parents`) and on Python
    # versions without negative indexing on `parents`.
    print(" Could not locate project root containing your data file")
    print(f"Searching for: {marker_file}")
    print(f"Search path: {current_path} -> {current_path.anchor}")
    return None

# Usage: locate the project and make it the working directory so that the
# relative "Data/..." paths used afterwards resolve from the project root
project_root = find_project_root()
if project_root is not None:
    os.chdir(project_root)  # Change to project root
    print(f" Working directory set to: {project_root}")

 Found project root: c:\ARCHIVOS\DOCUMENTS\Mary\Alvaro\Portfolio\Python\Project 1
 Working directory set to: c:\ARCHIVOS\DOCUMENTS\Mary\Alvaro\Portfolio\Python\Project 1


In [6]:
# 2.2. Load Data

def load_data(data_path=None, encodings=None):
    """Read the Spotify CSV into a DataFrame, trying several text encodings.

    Parameters
    ----------
    data_path : str or pathlib.Path, optional
        CSV file to read. Defaults to
        ``Data/Most Streamed Spotify Songs 2024.csv`` relative to the
        current working directory.
    encodings : sequence of str, optional
        Encodings to try, in order. The default goes from strictest to most
        permissive.

    Returns
    -------
    pandas.DataFrame or None
        The loaded frame, or ``None`` when the file does not exist or no
        encoding succeeds (troubleshooting hints are printed in that case).
    """
    if data_path is None:
        data_path = Path("Data") / "Most Streamed Spotify Songs 2024.csv"
    data_path = Path(data_path)

    if encodings is None:
        # Order matters: latin-1 maps every possible byte to a character and
        # therefore never fails, so it has to be the last resort. Putting it
        # first would make the other entries unreachable and silently garble
        # UTF-8 files. utf-8-sig also reads plain UTF-8 (it only strips an
        # optional BOM), so a separate 'utf-8' entry is not needed.
        encodings = ['utf-8-sig', 'cp1252', 'latin-1']

    # Check existence once; a missing file cannot be fixed by another encoding
    if data_path.exists():
        for encoding in encodings:
            try:
                df = pd.read_csv(data_path, encoding=encoding)
            except Exception as e:
                print(f" Failed with {encoding}: {str(e)[:100]}...")
            else:
                print(f" Successfully loaded with {encoding} encoding")
                return df

    print("\n Try these solutions:")
    print("1. Verify the file exists in the Data folder")
    print("2. Check for file corruption")
    print(f"3. Current directory: {Path.cwd()}")
    return None

print("=== Starting Data Load ===")

# Bind `df` before anything can fail: the cells below test `df is not None`,
# which raises NameError if `df` was never assigned (e.g. no project root).
df = None

project_root = find_project_root()
if project_root is None:
    print("\n Cannot proceed without project root")
else:
    # Relative "Data/..." paths inside load_data() resolve from the project root
    os.chdir(project_root)

    df = load_data()

    if df is None:
        print("\n Failed to load DataFrame")
    else:
        print("\n=== Data Loaded Successfully ===")
        print(f"Shape: {df.shape}")
        print("\nFirst 5 rows:")
        display(df.head(5))

=== Starting Data Load ===
 Found project root: c:\ARCHIVOS\DOCUMENTS\Mary\Alvaro\Portfolio\Python\Project 1
 Successfully loaded with latin-1 encoding

=== Data Loaded Successfully ===
Shape: (4600, 29)

First 5 rows:


Unnamed: 0,Track,Album Name,Artist,Release Date,ISRC,All Time Rank,Track Score,Spotify Streams,Spotify Playlist Count,Spotify Playlist Reach,Spotify Popularity,YouTube Views,YouTube Likes,TikTok Posts,TikTok Likes,TikTok Views,YouTube Playlist Reach,Apple Music Playlist Count,AirPlay Spins,SiriusXM Spins,Deezer Playlist Count,Deezer Playlist Reach,Amazon Playlist Count,Pandora Streams,Pandora Track Stations,Soundcloud Streams,Shazam Counts,TIDAL Popularity,Explicit Track
0,MILLION DOLLAR BABY,Million Dollar Baby - Single,Tommy Richman,4/26/2024,QM24S2402528,1,725.4,390470936,30716,196631588,92.0,84274754,1713126,5767700,651565900.0,5332281936.0,150597040,210.0,40975,684,62.0,17598718,114.0,18004655,22931,4818457.0,2669262,,0
1,Not Like Us,Not Like Us,Kendrick Lamar,5/4/2024,USUG12400910,2,545.9,323703884,28113,174597137,92.0,116347040,3486739,674700,35223547.0,208339025.0,156380351,188.0,40778,3,67.0,10422430,111.0,7780028,28444,6623075.0,1118279,,1
2,i like the way you kiss me,I like the way you kiss me,Artemas,3/19/2024,QZJ842400387,3,538.4,601309283,54331,211607669,92.0,122599116,2228730,3025400,275154237.0,3369120610.0,373784955,190.0,74333,536,136.0,36321847,172.0,5022621,5639,7208651.0,5285340,,0
3,Flowers,Flowers - Single,Miley Cyrus,1/12/2023,USSM12209777,4,444.9,2031280633,269802,136569078,85.0,1096100899,10629796,7189811,1078757968.0,14603725994.0,3351188582,394.0,1474799,2182,264.0,24684248,210.0,190260277,203384,,11822942,,0
4,Houdini,Houdini,Eminem,5/31/2024,USUG12403398,5,423.3,107034922,7223,151469874,88.0,77373957,3670188,16400,,,112763851,182.0,12185,1,82.0,17660624,105.0,4493884,7006,207179.0,457017,,1


In [5]:
# 3. Initial Exploration

if df is not None:
    print("\n=== Data Exploration ===")

    # 3.1 Basic info
    print("\n1. Basic Info:\n")
    print(f"- Total songs: {len(df):,}")
    print(f"- Columns: {list(df.columns)}")

    # 3.2 Data types
    print("\n2. Data Types:\n")
    display(df.dtypes.to_frame('Data Type'))

    # 3.3 Missing values (only columns that actually have some, largest first)
    missing_data = df.isnull().sum()
    total_missing = missing_data.sum()
    print("\n3. Missing values:\n")
    if total_missing == 0:
        print("No missing values found!")
    else:
        print(missing_data[missing_data > 0].sort_values(ascending=False))

    # 3.4 Duplicates
    duplicate_count = df.duplicated().sum()
    print(f"\n4. Duplicate rows: {duplicate_count}")
    if duplicate_count > 0:
        # A bare expression inside an `if` block is never rendered by the
        # notebook, so the rows must be passed to display() explicitly.
        # keep=False lists every member of each duplicate group.
        display(df[df.duplicated(keep=False)])

    # 3.5 Unique Values
    print("\n5. Unique Values per column:\n")
    unique_counts = (df.nunique()
                    .reset_index()
                    .rename(columns={'index': 'Column', 0: 'Unique Values'})
                    .sort_values('Unique Values', ascending=False))
    unique_counts['% Unique'] = (unique_counts['Unique Values'] / len(df) * 100).round(1)
    display(unique_counts)

    # 3.6 Quick stats (describe() summarises the numeric columns only)
    print("\n6. Quick Stats:\n")
    display(df.describe().transpose())


=== Data Exploration ===

1. Basic Info:

- Total songs: 4,600
- Columns: ['Track', 'Album Name', 'Artist', 'Release Date', 'ISRC', 'All Time Rank', 'Track Score', 'Spotify Streams', 'Spotify Playlist Count', 'Spotify Playlist Reach', 'Spotify Popularity', 'YouTube Views', 'YouTube Likes', 'TikTok Posts', 'TikTok Likes', 'TikTok Views', 'YouTube Playlist Reach', 'Apple Music Playlist Count', 'AirPlay Spins', 'SiriusXM Spins', 'Deezer Playlist Count', 'Deezer Playlist Reach', 'Amazon Playlist Count', 'Pandora Streams', 'Pandora Track Stations', 'Soundcloud Streams', 'Shazam Counts', 'TIDAL Popularity', 'Explicit Track']

2. Data Types:



Unnamed: 0,Data Type
Track,object
Album Name,object
Artist,object
Release Date,object
ISRC,object
All Time Rank,object
Track Score,float64
Spotify Streams,object
Spotify Playlist Count,object
Spotify Playlist Reach,object



3. Missing values:

TIDAL Popularity              4600
Soundcloud Streams            3333
SiriusXM Spins                2123
Pandora Track Stations        1268
TikTok Posts                  1173
Pandora Streams               1106
Amazon Playlist Count         1055
YouTube Playlist Reach        1009
TikTok Views                   981
TikTok Likes                   980
Deezer Playlist Reach          928
Deezer Playlist Count          921
Spotify Popularity             804
Shazam Counts                  577
Apple Music Playlist Count     561
AirPlay Spins                  498
YouTube Likes                  315
YouTube Views                  308
Spotify Streams                113
Spotify Playlist Reach          72
Spotify Playlist Count          70
Artist                           5
dtype: int64

4. Duplicate rows: 2

5. Unique Values per column:



Unnamed: 0,Column,Unique Values,% Unique
4,ISRC,4598,100.0
5,All Time Rank,4577,99.5
9,Spotify Playlist Reach,4478,97.3
7,Spotify Streams,4425,96.2
0,Track,4370,95.0
11,YouTube Views,4290,93.3
12,YouTube Likes,4283,93.1
8,Spotify Playlist Count,4207,91.5
1,Album Name,4005,87.1
26,Shazam Counts,4002,87.0



6. Quick Stats:



Unnamed: 0,count,mean,std,min,25%,50%,75%,max
Track Score,4600.0,41.84,38.54,19.4,23.3,29.9,44.42,725.4
Spotify Popularity,3796.0,63.5,16.19,1.0,61.0,67.0,73.0,96.0
Apple Music Playlist Count,4039.0,54.6,71.61,1.0,10.0,28.0,70.0,859.0
Deezer Playlist Count,3679.0,32.31,54.27,1.0,5.0,15.0,37.0,632.0
Amazon Playlist Count,3545.0,25.35,25.99,1.0,8.0,17.0,34.0,210.0
TIDAL Popularity,0.0,,,,,,,
Explicit Track,4600.0,0.36,0.48,0.0,0.0,0.0,1.0,1.0


In [None]:
# 4. Basic Visualizations

def save_visualization(plot_name="artists_popularity.png"):
    """Write the current matplotlib figure into the ``Visualizations`` folder.

    The folder is created relative to the working directory if it does not
    exist yet. Any error while creating the folder or saving is reported on
    stdout rather than raised.

    Parameters
    ----------
    plot_name : str
        File name of the image inside ``Visualizations``.

    Returns
    -------
    bool
        ``True`` when the figure was saved, ``False`` otherwise.
    """
    vis_dir = Path("Visualizations")

    try:
        # No-op when the folder is already there
        vis_dir.mkdir(exist_ok=True)
        save_path = vis_dir / plot_name
        plt.savefig(save_path, dpi=300, bbox_inches='tight', facecolor='white')
        print(f" Visualization saved to:\n{save_path.resolve()}")
    except Exception as e:
        print(f" Failed to save: {str(e)}\nAttempted path: {vis_dir.resolve()/plot_name}")
        return False
    else:
        return True

# Plot the Visualizations
if df is not None:
    # Explicit figure/axes handles instead of pyplot's implicit "current axes",
    # so each call states which panel it draws on.
    fig, (ax_artists, ax_popularity) = plt.subplots(1, 2, figsize=(18, 7))

    # Plot 1: Top Artists (sorted ascending so the largest bar ends up on top)
    top_artists = df['Artist'].value_counts().head(10).sort_values()
    top_artists.plot(kind='barh', ax=ax_artists, color='#1DB954', edgecolor='black')
    ax_artists.set_title("Top 10 Artists by Song Count", fontsize=14, pad=15)
    ax_artists.set_xlabel("Number of Songs", fontsize=12)
    # Value labels, nudged right of each bar by 2% of the longest bar
    max_songs = top_artists.max()
    for i, v in enumerate(top_artists):
        ax_artists.text(v + max_songs*0.02, i, f"{v:,}", va='center', fontsize=11)

    # Plot 2: Spotify Popularity
    # The column contains missing values; drop them once so the histogram does
    # not depend on how NaNs happen to be treated during binning.
    popularity = df['Spotify Popularity'].dropna()
    ax_popularity.hist(popularity, bins=30, color='#191414', edgecolor='#1DB954')

    # Reference lines with auto-positioned labels
    stats = {
        'Mean': (popularity.mean(), '#1DB954', '--'),
        'Median': (popularity.median(), '#FFA500', ':')
    }

    for i, (name, (value, color, linestyle)) in enumerate(stats.items()):
        ax_popularity.axvline(value, color=color, linestyle=linestyle,
                              linewidth=2, ymin=0, ymax=1)
        # Stack the labels: each one sits 10% of the axis height below the previous
        ax_popularity.text(value + 1, ax_popularity.get_ylim()[1]*(0.95 - i*0.1),
                           f"{name}: {value:.1f}", color=color, fontsize=12,
                           bbox=dict(facecolor='white', alpha=0.7))

    # Axis formatting
    ax_popularity.set_title("Spotify Popularity Distribution\n(0-100 scale)", fontsize=14, pad=15)
    ax_popularity.set_xlabel("Popularity Score", fontsize=12)
    ax_popularity.set_ylabel("Number of Songs", fontsize=12)
    ax_popularity.set_xlim(0, 100)  # Fixed x-axis range
    ax_popularity.set_ylim(bottom=0)  # Force y-axis to start at 0

    # Save and Show (save_visualization() writes the current figure, i.e. `fig`)
    fig.tight_layout(pad=3.0)
    if save_visualization():
        plt.show()
    else:
        plt.close(fig)

In [None]:
# 5. Save a copy of the original data

def save_data_backup(df):
    """Write ``df`` to ``Data/spotify_raw_backup.csv`` without the index.

    The ``Data`` folder is created relative to the working directory when it
    is missing. A failure while writing is reported on stdout, not raised.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to store as the raw-data backup.
    """
    backup_dir = Path("Data")
    backup_dir.mkdir(parents=True, exist_ok=True)
    save_path = backup_dir / "spotify_raw_backup.csv"

    try:
        df.to_csv(save_path, index=False)
    except Exception as e:
        print(f" Save failed: {str(e)}")
    else:
        print(f" Data saved to: {save_path.resolve()}")

# Same guard as the exploration and plotting cells: only back up a frame
# that was actually loaded, instead of passing None into the writer.
if df is not None:
    save_data_backup(df)
else:
    print(" No DataFrame loaded - backup skipped")

In [None]:
# 6. Next Steps
# Closing checklist, printed one entry per line
next_steps = (
    "\n Next Steps:",
    "1. Check the Visualizations/ folder for your plots",
    "2. Review the exploration results above",
    "3. In the next notebook we'll clean the data",
)
for step in next_steps:
    print(step)