In [2]:
from pathlib import Path
import os

# Show the kernel's working directory; the relative path below is resolved
# against it.
print("CWD:", os.getcwd())

# List what is inside the raw-data folder.
# Path.iterdir() yields entries in arbitrary, filesystem-dependent order, so
# the names are sorted to keep this output stable across machines and re-runs.
raw = Path('../data/raw')  # go up one level to reach the actual data/raw
print("data/raw contiene:", sorted(f.name for f in raw.iterdir()))


CWD: e:\CDD1\SongReccomender\SongReccomender\notebooks
data/raw contiene: ['dataset.csv', 'spotify200_daily.csv', 'spotify_global_streaming_data.csv']


In [None]:
import pandas as pd

# 1. Read the three raw CSV files into DataFrames
streaming = pd.read_csv(raw / 'spotify_global_streaming_data.csv', low_memory=False)
rankings = pd.read_csv(
    raw / 'spotify200_daily.csv',
    parse_dates=['week'],  # parse the 'week' column as datetimes while loading
    low_memory=False,
)
features = pd.read_csv(raw / 'dataset.csv', low_memory=False)

# 2. Report the shape and the available columns of each frame.
# One print call per frame: the two report lines are joined by sep="\n", and
# the trailing "\n" in the second line leaves a blank line between frames.
for name, df in zip(('streaming', 'rankings', 'features'),
                    (streaming, rankings, features)):
    print(
        f"{name.upper():>10} shape: {df.shape}",
        f"{name.upper():>10} columns: {list(df.columns)}\n",
        sep="\n",
    )


In [4]:
# 3. Quick look at the first rows of each frame.
# end='\n\n' leaves one blank line after each preview without the stray
# trailing space that print(obj, '\n') emits through its default sep=' '.
print(streaming.head(3), end='\n\n')
print(rankings.head(3), end='\n\n')
print(features.head(3), end='\n\n')

# 4. Column dtypes and missing values
for df, name in [(streaming, 'streaming'),
                 (rankings,  'rankings'),
                 (features,  'features')]:
    print(f"--- {name} info ---")
    df.info()  # writes dtypes and non-null counts straight to stdout
    # Fraction of missing values per column (null count / row count); only the
    # five columns with the highest fraction are shown.
    # sep="\n" starts the Series on its own line so its first row is not
    # shifted one column by print's default space separator; the empty string
    # adds the blank line that follows.
    print("Nulos por columna:",
          (df.isna().sum() / len(df)).sort_values(ascending=False).head(),
          "",
          sep="\n")


         Country        Artist                    Album      Genre  \
0        Germany  Taylor Swift  1989 (Taylor's Version)      K-pop   
1         Brazil    The Weeknd              After Hours        R&B   
2  United States   Post Malone                   Austin  Reggaeton   

   Release Year  Monthly Listeners (Millions)  Total Streams (Millions)  \
0          2019                         23.10                   3695.53   
1          2022                         60.60                   2828.16   
2          2023                         42.84                   1425.46   

   Total Hours Streamed (Millions)  Avg Stream Duration (Min) Platform Type  \
0                         14240.35                       4.28          Free   
1                         11120.44                       3.90       Premium   
2                          4177.49                       4.03          Free   

   Streams Last 30 Days (Millions)  Skip Rate (%)  
0                           118.51           2.24

In [5]:
# 5. Descriptive statistics, transposed so each row describes one column.
# sep="\n" puts every table on its own line. With print's default sep=' ' a
# space was inserted after the heading's "\n", which pushed the table's header
# row one column to the right of the data rows. The empty-string argument
# produces the blank line between tables.
print("Streaming describe:", streaming.describe().T, "", sep="\n")
print("Rankings describe:", rankings.describe().T, "", sep="\n")
# `features` may have no numeric columns; include='all' describes every
# column regardless of dtype.
print("Features describe:", features.describe(include='all').T, sep="\n")


Streaming describe:
                                  count        mean          std      min  \
Release Year                     500.0  2020.48800     1.671959  2018.00   
Monthly Listeners (Millions)     500.0    51.04122    28.238010     1.01   
Total Streams (Millions)         500.0  2581.15408  1416.055972    53.56   
Total Hours Streamed (Millions)  500.0  8954.36888  5167.648272   184.30   
Avg Stream Duration (Min)        500.0     3.52064     0.571431     2.51   
Streams Last 30 Days (Millions)  500.0    99.04850    57.533452     2.85   
Skip Rate (%)                    500.0    20.37046    10.597202     1.16   

                                       25%       50%         75%       max  
Release Year                     2019.0000  2020.000   2022.0000   2023.00  
Monthly Listeners (Millions)       27.6725    50.825     75.0950     99.80  
Total Streams (Millions)         1337.3375  2697.355   3798.1975   4985.54  
Total Hours Streamed (Millions)  4322.0975  9053.665  12690.18