In [68]:
import pandas as pd
import numpy as np

# --- Load -------------------------------------------------------------------
# Location of the raw CSV, relative to the notebook's working directory.
nome_real_do_arquivo = 'spotify-2023.csv'
caminho_do_arquivo = f'../data/raw/{nome_real_do_arquivo}'

df = pd.read_csv(
    caminho_do_arquivo,
    sep=',',
    encoding='latin-1',
    low_memory=False,
)

# --- Dtypes -----------------------------------------------------------------
# The percentage features are cast to float.
cols_pct = [
    'danceability_%',
    'valence_%',
    'energy_%',
    'acousticness_%',
    'instrumentalness_%',
    'liveness_%',
    'speechiness_%',
]
df[cols_pct] = df[cols_pct].astype(float)

# `mode` is stored as a categorical to save memory.
df['mode'] = df['mode'].astype('category')

# `key` is also categorical; rows with no key are placed under an explicit
# 'Unknown' category. The category has to be registered with add_categories
# before fillna is allowed to use it on a categorical column.
df['key'] = (
    df['key']
    .astype('category')
    .cat.add_categories('Unknown')
    .fillna('Unknown')
)

# --- Release date -----------------------------------------------------------
# Collapse released_year / released_month / released_day into one datetime
# column named `released_date`. The date is assembled from its numeric
# components (columns named year / month / day), so no string format has to be
# inferred; combinations that do not form a valid date become NaT.
cols_released = ['released_year', 'released_month', 'released_day']
partes_data = df[cols_released].rename(
    columns={
        'released_year': 'year',
        'released_month': 'month',
        'released_day': 'day',
    }
)
df['released_date'] = pd.to_datetime(partes_data, errors='coerce')
df = df.drop(columns=cols_released)

# --- Missing values in in_shazam_charts -------------------------------------
# Textual placeholders are normalised to NaN (only in this column, and without
# mutating the frame in place), then every row lacking a value is discarded.
tokens_ausentes = ['', ' ', 'nan', 'NULL', 'NA']
df['in_shazam_charts'] = df['in_shazam_charts'].replace(tokens_ausentes, np.nan)
df = df.dropna(subset=['in_shazam_charts'])

# Preview of the cleaned frame.
df.head(8)



Unnamed: 0,track_name,artist(s)_name,artist_count,in_spotify_playlists,in_spotify_charts,streams,in_apple_playlists,in_apple_charts,in_deezer_playlists,in_deezer_charts,...,key,mode,danceability_%,valence_%,energy_%,acousticness_%,instrumentalness_%,liveness_%,speechiness_%,released_date
0,Seven (feat. Latto) (Explicit Ver.),"Latto, Jung Kook",2,553,147,141381703,43,263,45,10,...,B,Major,80.0,89.0,83.0,31.0,0.0,8.0,4.0,2023-07-14
1,LALA,Myke Towers,1,1474,48,133716286,48,126,58,14,...,C#,Major,71.0,61.0,74.0,7.0,0.0,10.0,4.0,2023-03-23
2,vampire,Olivia Rodrigo,1,1397,113,140003974,94,207,91,14,...,F,Major,51.0,32.0,53.0,17.0,0.0,31.0,6.0,2023-06-30
3,Cruel Summer,Taylor Swift,1,7858,100,800840817,116,207,125,12,...,A,Major,55.0,58.0,72.0,11.0,0.0,11.0,15.0,2019-08-23
4,WHERE SHE GOES,Bad Bunny,1,3133,50,303236322,84,133,87,15,...,A,Minor,65.0,23.0,80.0,14.0,63.0,11.0,6.0,2023-05-18
5,Sprinter,"Dave, Central Cee",2,2186,91,183706234,67,213,88,17,...,C#,Major,92.0,66.0,58.0,19.0,0.0,8.0,24.0,2023-06-01
6,Ella Baila Sola,"Eslabon Armado, Peso Pluma",2,3090,50,725980112,34,222,43,13,...,F,Minor,67.0,83.0,76.0,48.0,0.0,8.0,3.0,2023-03-16
7,Columbia,Quevedo,1,714,43,58149378,25,89,30,13,...,F,Major,67.0,26.0,71.0,37.0,0.0,11.0,4.0,2023-07-07


In [69]:
# Missing-value report: one row per column of `df`, holding the number of null
# cells ("faltantes") and that number as a percentage of the row count
# ("percentual").
missing = (
    df.isna()
    .sum()
    .to_frame("faltantes")
    .assign(percentual=lambda tabela: (tabela["faltantes"] / len(df)) * 100)
)
missing


Unnamed: 0,faltantes,percentual
track_name,0,0.0
artist(s)_name,0,0.0
artist_count,0,0.0
in_spotify_playlists,0,0.0
in_spotify_charts,0,0.0
streams,0,0.0
in_apple_playlists,0,0.0
in_apple_charts,0,0.0
in_deezer_playlists,0,0.0
in_deezer_charts,0,0.0


In [70]:
# Size of the frame: rows are reported as instances, columns as attributes.
instancias = df.shape[0]
atributos = df.shape[1]

for rotulo, valor in (("Instâncias:", instancias), ("Atributos:", atributos)):
    print(rotulo, valor)

# The time period is a fixed label; it is not computed from the data.
print("Periodo de tempo: 2023")


Instâncias: 903
Atributos: 22
Periodo de tempo: 2023


In [71]:
# Duplicate-row check: `duplicatas` is a boolean mask marking each row that
# repeats an earlier row across all columns; the number of True entries is
# printed.
duplicatas = df.duplicated()
total_duplicatas = duplicatas.sum()
print("Número de linhas duplicadas:", total_duplicatas)





Número de linhas duplicadas: 0
