## Manipulação e tranformação dos dados e arquivos

**Aprimorando a manipulação de arquivos por meio do Apache Spark e do formato Parquet, realizando a conversão de arquivos CSV para o formato Parquet. Essa abordagem não apenas melhora o desempenho, mas também oferece recursos adicionais.**

In [None]:
# listando os arquivos e diretórios
display(dbutils.fs.ls('dbfs:/FileStore/tables'))

path,name,size,modificationTime
dbfs:/FileStore/tables/aula-databricks/,aula-databricks/,0,0
dbfs:/FileStore/tables/dados/,dados/,0,0
dbfs:/FileStore/tables/dados_spotify/,dados_spotify/,0,0
dbfs:/FileStore/tables/dados_spotify_tratados/,dados_spotify_tratados/,0,0
dbfs:/FileStore/tables/data.csv,data.csv,2781,1705440822000
dbfs:/FileStore/tables/explorando_tipos_arquivos/,explorando_tipos_arquivos/,0,0


In [None]:
# listando os arquivos e diretórios
display(dbutils.fs.ls('dbfs:/FileStore/tables/dados_spotify'))

path,name,size,modificationTime
dbfs:/FileStore/tables/dados_spotify/data.csv,data.csv,29654587,1706802690000
dbfs:/FileStore/tables/dados_spotify/data_by_artist.csv,data_by_artist.csv,4315607,1706802680000
dbfs:/FileStore/tables/dados_spotify/data_by_genres.csv,data_by_genres.csv,576456,1706802681000
dbfs:/FileStore/tables/dados_spotify/data_by_year.csv,data_by_year.csv,21194,1706802682000
dbfs:/FileStore/tables/dados_spotify/data_w_genres.csv,data_w_genres.csv,5224673,1706802686000


In [None]:
# realizando leitura inicial de arquivo csv
dbutils.fs.head('dbfs:/FileStore/tables/dados_spotify/data_by_year.csv')

Out[3]: 'mode,year,acousticness,danceability,duration_ms,energy,instrumentalness,liveness,loudness,speechiness,tempo,valence,popularity,key\n1,1921,0.8868960000000005,0.4185973333333336,260537.16666666663,0.23181513333333334,0.34487805886666656,0.20571,-17.04866666666665,0.073662,101.53149333333329,0.37932666666666665,0.6533333333333333,2\n1,1922,0.9385915492957748,0.4820422535211267,165469.74647887325,0.23781535211267596,0.4341948697183099,0.2407197183098592,-19.275281690140844,0.1166549295774648,100.88452112676056,0.5355492957746479,0.14084507042253522,10\n1,1923,0.9572467913513516,0.5773405405405401,177942.36216216214,0.2624064864864865,0.37173272502702703,0.2274621621621621,-14.129210810810811,0.0939486486486487,114.0107297297297,0.6254924324324328,5.389189189189189,0\n1,1924,0.940199860169493,0.5498940677966102,191046.70762711862,0.3443466101694912,0.5817009136440677,0.2352190677966101,-14.231343220338989,0.09208940677966099,120.68957203389822,0.6637254237288139,0.6610169491525424

In [None]:
# salvando caminho do arquivo em uma variável
caminho_data = 'dbfs:/FileStore/tables/dados_spotify/data_by_year.csv'

In [None]:
# salvando os dados no dataframe do spark
df_data_year = spark.read.csv(caminho_data, inferSchema=True, header=True)

In [None]:
# carregando as primeiras linhas
df_data_year.show()

+----+----+------------------+-------------------+------------------+-------------------+-------------------+-------------------+-------------------+-------------------+------------------+-------------------+-------------------+---+
|mode|year|      acousticness|       danceability|       duration_ms|             energy|   instrumentalness|           liveness|           loudness|        speechiness|             tempo|            valence|         popularity|key|
+----+----+------------------+-------------------+------------------+-------------------+-------------------+-------------------+-------------------+-------------------+------------------+-------------------+-------------------+---+
|   1|1921|0.8868960000000005| 0.4185973333333336|260537.16666666663|0.23181513333333334|0.34487805886666656|            0.20571| -17.04866666666665|           0.073662|101.53149333333329|0.37932666666666665| 0.6533333333333333|  2|
|   1|1922|0.9385915492957748| 0.4820422535211267|165469.74647887325

In [None]:
# consultando o tipo de dados do dataframe
type(df_data_year)

Out[8]: pyspark.sql.dataframe.DataFrame

In [None]:
# convertendo o DataFrame do Spark para um DataFrame do Pandas
df_data_year = df_data_year.pandas_api()

In [None]:
# consultando o tipo de dados do dataframe
type(df_data_year)

Out[10]: pyspark.pandas.frame.DataFrame

In [None]:
# realizando a leitura das primeiras linhas do dataframe Pandas otimizado para Spark
df_data_year.head()

Unnamed: 0,mode,year,acousticness,danceability,duration_ms,energy,instrumentalness,liveness,loudness,speechiness,tempo,valence,popularity,key
0,1,1921,0.886896,0.418597,260537.166667,0.231815,0.344878,0.20571,-17.048667,0.073662,101.531493,0.379327,0.653333,2
1,1,1922,0.938592,0.482042,165469.746479,0.237815,0.434195,0.24072,-19.275282,0.116655,100.884521,0.535549,0.140845,10
2,1,1923,0.957247,0.577341,177942.362162,0.262406,0.371733,0.227462,-14.129211,0.093949,114.01073,0.625492,5.389189,0
3,1,1924,0.9402,0.549894,191046.707627,0.344347,0.581701,0.235219,-14.231343,0.092089,120.689572,0.663725,0.661017,10
4,1,1925,0.962607,0.573863,184986.92446,0.278594,0.418297,0.237668,-14.146414,0.111918,115.521921,0.621929,2.604317,5


In [None]:
# retirando uma amostra da base de dados para realizar tentativas de transformação
df_data_year.info()

<class 'pyspark.pandas.frame.DataFrame'>
Int64Index: 100 entries, 0 to 99
Data columns (total 14 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   mode              100 non-null    int32  
 1   year              100 non-null    int32  
 2   acousticness      100 non-null    float64
 3   danceability      100 non-null    float64
 4   duration_ms       100 non-null    float64
 5   energy            100 non-null    float64
 6   instrumentalness  100 non-null    float64
 7   liveness          100 non-null    float64
 8   loudness          100 non-null    float64
 9   speechiness       100 non-null    float64
 10  tempo             100 non-null    float64
 11  valence           100 non-null    float64
 12  popularity        100 non-null    float64
 13  key               100 non-null    int32  
dtypes: float64(11), int32(3)

In [None]:
# listando os arquivos e diretórios
display(dbutils.fs.ls('dbfs:/FileStore/tables/dados_spotify_tratados'))

path,name,size,modificationTime
dbfs:/FileStore/tables/dados_spotify_tratados/data.parquet/,data.parquet/,0,0


In [None]:
# salvando o DataFrame em formato Parquet no diretório especificado
df_data_year.to_parquet('dbfs:/FileStore/tables/dados_spotify_tratados/data_year.parquet')


In [None]:
# listando os arquivos e diretórios
display(dbutils.fs.ls('dbfs:/FileStore/tables/dados_spotify_tratados'))

path,name,size,modificationTime
dbfs:/FileStore/tables/dados_spotify_tratados/data.parquet/,data.parquet/,0,0
dbfs:/FileStore/tables/dados_spotify_tratados/data_year.parquet/,data_year.parquet/,0,0


In [None]:
# realizando leitura inicial de arquivo csv
dbutils.fs.head('dbfs:/FileStore/tables/dados_spotify/data_by_artist.csv')

[Truncated to first 65536 bytes]
Out[1]: 'mode,count,acousticness,artists,danceability,duration_ms,energy,instrumentalness,liveness,loudness,speechiness,tempo,valence,popularity,key\n1,9,0.5901111111111111,"""Cats"" 1981 Original London Cast",0.4672222222222222,250318.5555555556,0.3940033333333333,0.011399851111111107,0.2908333333333333,-14.448,0.21038888888888888,117.51811111111112,0.3895,38.333333333333336,5\n1,26,0.8625384615384617,"""Cats"" 1983 Broadway Cast",0.4417307692307693,287280.0,0.4068076923076923,0.08115826423076923,0.3152153846153846,-10.69,0.17621153846153847,103.04415384615385,0.2688653846153846,30.57692307692308,5\n1,7,0.8565714285714285,"""Fiddler On The Roof” Motion Picture Chorus",0.34828571428571425,328920.0,0.2865714285714285,0.024592948571428568,0.3257857142857143,-15.230714285714285,0.1185142857142857,77.37585714285714,0.3548571428571429,34.857142857142854,0\n1,27,0.884925925925926,"""Fiddler On The Roof” Motion Picture Orchestra",0.4250740740740739,262890.9629

In [None]:
# salvando caminho do arquivo em uma variável
caminho_data = 'dbfs:/FileStore/tables/dados_spotify/data_by_artist.csv'

In [None]:
# salvando os dados no dataframe do spark
df_data_artist = spark.read.csv(caminho_data, inferSchema=True, header=True)

In [None]:
# carregando as primeiras linhas
df_data_artist.show()

+----+-----+------------------+--------------------+-------------------+------------------+-------------------+--------------------+-------------------+-------------------+-------------------+------------------+-------------------+------------------+---+
|mode|count|      acousticness|             artists|       danceability|       duration_ms|             energy|    instrumentalness|           liveness|           loudness|        speechiness|             tempo|            valence|        popularity|key|
+----+-----+------------------+--------------------+-------------------+------------------+-------------------+--------------------+-------------------+-------------------+-------------------+------------------+-------------------+------------------+---+
|   1|    9|0.5901111111111111|"""Cats"" 1981 Or...| 0.4672222222222222| 250318.5555555556| 0.3940033333333333|0.011399851111111107| 0.2908333333333333|            -14.448|0.21038888888888888|117.51811111111112|             0.3895|38.3

In [None]:
# consultando o tipo de dados do dataframe
type(df_data_artist)

Out[5]: pyspark.sql.dataframe.DataFrame

In [None]:
# convertendo o DataFrame do Spark para um DataFrame do Pandas
df_data_artist = df_data_artist.pandas_api()

In [None]:
# consultando o novo tipo de dados do dataframe
type(df_data_artist)

Out[7]: pyspark.pandas.frame.DataFrame

In [None]:
# realizando a leitura das primeiras linhas do dataframe Pandas otimizado para Spark
df_data_artist.head()

Unnamed: 0,mode,count,acousticness,artists,danceability,duration_ms,energy,instrumentalness,liveness,loudness,speechiness,tempo,valence,popularity,key
0,1,9,0.590111,"""""""Cats"""" 1981 Original London Cast""",0.467222,250318.555556,0.394003,0.0114,0.290833,-14.448,0.210389,117.518111,0.3895,38.333333,5
1,1,26,0.862538,"""""""Cats"""" 1983 Broadway Cast""",0.441731,287280.0,0.406808,0.081158,0.315215,-10.69,0.176212,103.044154,0.268865,30.576923,5
2,1,7,0.856571,"""""""Fiddler On The Roof” Motion Picture Chorus""",0.348286,328920.0,0.286571,0.024593,0.325786,-15.230714,0.118514,77.375857,0.354857,34.857143,0
3,1,27,0.884926,"""""""Fiddler On The Roof” Motion Picture Orchestra""",0.425074,262890.962963,0.24577,0.073587,0.275481,-15.63937,0.1232,88.66763,0.37203,34.851852,0
4,1,7,0.510714,"""""""Joseph And The Amazing Technicolor Dreamcoa...",0.467143,270436.142857,0.488286,0.0094,0.195,-10.236714,0.098543,122.835857,0.482286,43.0,5


In [None]:
# retirando uma amostra da base de dados para realizar tentativas de transformação
x_test = df_data_artist.iloc[0:5]
x_test

Unnamed: 0,mode,count,acousticness,artists,danceability,duration_ms,energy,instrumentalness,liveness,loudness,speechiness,tempo,valence,popularity,key
0,1,9,0.590111,"""""""Cats"""" 1981 Original London Cast""",0.467222,250318.555556,0.394003,0.0114,0.290833,-14.448,0.210389,117.518111,0.3895,38.333333,5
1,1,26,0.862538,"""""""Cats"""" 1983 Broadway Cast""",0.441731,287280.0,0.406808,0.081158,0.315215,-10.69,0.176212,103.044154,0.268865,30.576923,5
2,1,7,0.856571,"""""""Fiddler On The Roof” Motion Picture Chorus""",0.348286,328920.0,0.286571,0.024593,0.325786,-15.230714,0.118514,77.375857,0.354857,34.857143,0
3,1,27,0.884926,"""""""Fiddler On The Roof” Motion Picture Orchestra""",0.425074,262890.962963,0.24577,0.073587,0.275481,-15.63937,0.1232,88.66763,0.37203,34.851852,0
4,1,7,0.510714,"""""""Joseph And The Amazing Technicolor Dreamcoa...",0.467143,270436.142857,0.488286,0.0094,0.195,-10.236714,0.098543,122.835857,0.482286,43.0,5


In [None]:
# limpeza dos dados da amostra substituindo todas as aspas duplas por uma string vazia
x_test['artists'] = x_test.artists.str.replace('"', '')
x_test

Unnamed: 0,mode,count,acousticness,artists,danceability,duration_ms,energy,instrumentalness,liveness,loudness,speechiness,tempo,valence,popularity,key
0,1,9,0.590111,Cats 1981 Original London Cast,0.467222,250318.555556,0.394003,0.0114,0.290833,-14.448,0.210389,117.518111,0.3895,38.333333,5
1,1,26,0.862538,Cats 1983 Broadway Cast,0.441731,287280.0,0.406808,0.081158,0.315215,-10.69,0.176212,103.044154,0.268865,30.576923,5
2,1,7,0.856571,Fiddler On The Roof” Motion Picture Chorus,0.348286,328920.0,0.286571,0.024593,0.325786,-15.230714,0.118514,77.375857,0.354857,34.857143,0
3,1,27,0.884926,Fiddler On The Roof” Motion Picture Orchestra,0.425074,262890.962963,0.24577,0.073587,0.275481,-15.63937,0.1232,88.66763,0.37203,34.851852,0
4,1,7,0.510714,Joseph And The Amazing Technicolor Dreamcoat 1...,0.467143,270436.142857,0.488286,0.0094,0.195,-10.236714,0.098543,122.835857,0.482286,43.0,5


In [None]:
# aplicando a transformação no conjunto de dados completo
df_data_artist['artists'] = df_data_artist.artists.str.replace('"', '')
df_data_artist.head()

Unnamed: 0,mode,count,acousticness,artists,danceability,duration_ms,energy,instrumentalness,liveness,loudness,speechiness,tempo,valence,popularity,key
0,1,9,0.590111,Cats 1981 Original London Cast,0.467222,250318.555556,0.394003,0.0114,0.290833,-14.448,0.210389,117.518111,0.3895,38.333333,5
1,1,26,0.862538,Cats 1983 Broadway Cast,0.441731,287280.0,0.406808,0.081158,0.315215,-10.69,0.176212,103.044154,0.268865,30.576923,5
2,1,7,0.856571,Fiddler On The Roof” Motion Picture Chorus,0.348286,328920.0,0.286571,0.024593,0.325786,-15.230714,0.118514,77.375857,0.354857,34.857143,0
3,1,27,0.884926,Fiddler On The Roof” Motion Picture Orchestra,0.425074,262890.962963,0.24577,0.073587,0.275481,-15.63937,0.1232,88.66763,0.37203,34.851852,0
4,1,7,0.510714,Joseph And The Amazing Technicolor Dreamcoat 1...,0.467143,270436.142857,0.488286,0.0094,0.195,-10.236714,0.098543,122.835857,0.482286,43.0,5


In [None]:
# obtendo informações sobre o dataframe, como número de linhas e colunas, tipos de dados e se há valores núlos
df_data_artist.info()

<class 'pyspark.pandas.frame.DataFrame'>
Int64Index: 28680 entries, 0 to 28679
Data columns (total 15 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   mode              28680 non-null  int32  
 1   count             28680 non-null  int32  
 2   acousticness      28680 non-null  float64
 3   artists           28680 non-null  object 
 4   danceability      28680 non-null  float64
 5   duration_ms       28680 non-null  float64
 6   energy            28680 non-null  float64
 7   instrumentalness  28680 non-null  float64
 8   liveness          28680 non-null  float64
 9   loudness          28680 non-null  float64
 10  speechiness       28680 non-null  float64
 11  tempo             28680 non-null  float64
 12  valence           28680 non-null  float64
 13  popularity        28680 non-null  float64
 14  key               28680 non-null  int32  
dtypes: float64(11), int32(3), object(1)

In [None]:
# listando os arquivos e diretórios
display(dbutils.fs.ls('dbfs:/FileStore/tables/dados_spotify_tratados'))

path,name,size,modificationTime
dbfs:/FileStore/tables/dados_spotify_tratados/data.parquet/,data.parquet/,0,0
dbfs:/FileStore/tables/dados_spotify_tratados/data_year.parquet/,data_year.parquet/,0,0


In [None]:
# salvando dataframe no formato .parquet
df_data_artist.to_parquet('dbfs:/FileStore/tables/dados_spotify_tratados/data_artist.parquet')

In [None]:
# validando a criação do novo arquivo .parquet
display(dbutils.fs.ls('dbfs:/FileStore/tables/dados_spotify_tratados'))

path,name,size,modificationTime
dbfs:/FileStore/tables/dados_spotify_tratados/data.parquet/,data.parquet/,0,0
dbfs:/FileStore/tables/dados_spotify_tratados/data_artist.parquet/,data_artist.parquet/,0,0
dbfs:/FileStore/tables/dados_spotify_tratados/data_year.parquet/,data_year.parquet/,0,0


**Repetindo a transformação para o arquivo 'data_by_genres.csv'**

In [None]:
# salvando caminho do arquivo em uma variável
caminho_data = 'dbfs:/FileStore/tables/dados_spotify/data_by_genres.csv'

In [None]:
# salvando os dados no dataframe do spark
df_data_genres = spark.read.csv(caminho_data, inferSchema=True, header=True)

In [None]:
# convertendo o DataFrame do Spark para um DataFrame do Pandas
df_data_genres = df_data_genres.pandas_api()

In [None]:
# leitura das primeiras linhas do dataframe Pandas otimizado para Spark
df_data_genres.head()

Unnamed: 0,mode,genres,acousticness,danceability,duration_ms,energy,instrumentalness,liveness,loudness,speechiness,tempo,valence,popularity,key
0,1,21st century classical,0.979333,0.162883,160297.7,0.071317,0.606834,0.3616,-31.514333,0.040567,75.3365,0.103783,27.833333,6
1,1,432hz,0.49478,0.299333,1048887.0,0.450678,0.477762,0.131,-16.854,0.076817,120.285667,0.22175,52.5,5
2,1,8-bit,0.762,0.712,115177.0,0.818,0.876,0.126,-9.18,0.047,133.444,0.975,48.0,7
3,1,[],0.651417,0.529093,232880.9,0.419146,0.205309,0.218696,-12.288965,0.107872,112.857352,0.513604,20.859882,7
4,1,a cappella,0.676557,0.538961,190628.5,0.316434,0.003003,0.172254,-12.479387,0.082851,112.110362,0.448249,45.820071,7


In [None]:
# obtendo informações sobre o dataframe, como número de linhas e colunas, tipos de dados e se há valores núlos
df_data_genres.info()

<class 'pyspark.pandas.frame.DataFrame'>
Int64Index: 2973 entries, 0 to 2972
Data columns (total 14 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   mode              2973 non-null   int32  
 1   genres            2973 non-null   object 
 2   acousticness      2973 non-null   float64
 3   danceability      2973 non-null   float64
 4   duration_ms       2973 non-null   float64
 5   energy            2973 non-null   float64
 6   instrumentalness  2973 non-null   float64
 7   liveness          2973 non-null   float64
 8   loudness          2973 non-null   float64
 9   speechiness       2973 non-null   float64
 10  tempo             2973 non-null   float64
 11  valence           2973 non-null   float64
 12  popularity        2973 non-null   float64
 13  key               2973 non-null   int32  
dtypes: float64(11), int32(2), object(1)

In [None]:
# salvando o DataFrame em formato Parquet no diretório especificado
df_data_genres.to_parquet('dbfs:/FileStore/tables/dados_spotify_tratados/data_genres.parquet')

**Repetindo a transformação para o arquivo 'data_w_genres.csv'**

In [None]:
# salvando caminho do arquivo em uma variável
caminho_data = 'dbfs:/FileStore/tables/dados_spotify/data_w_genres.csv'

In [None]:
# salvando os dados no dataframe do spark
df_data_w_genres = spark.read.csv(caminho_data, inferSchema=True, header=True)

In [None]:
# convertendo o DataFrame do Spark para um DataFrame do Pandas
df_data_w_genres = df_data_w_genres.pandas_api()

In [None]:
# realizando a leitura das primeiras linhas do dataframe Pandas otimizado para Spark
df_data_w_genres.head()

Unnamed: 0,genres,artists,acousticness,danceability,duration_ms,energy,instrumentalness,liveness,loudness,speechiness,tempo,valence,popularity,key,mode,count
0,['show tunes'],"""""""Cats"""" 1981 Original London Cast""",0.5901111111111111,0.4672222222222222,250318.5555555556,0.3940033333333333,0.0113998511111111,0.2908333333333333,-14.448,0.210389,117.518111,0.3895,38.333333,5.0,1.0,9.0
1,[],"""""""Cats"""" 1983 Broadway Cast""",0.8625384615384617,0.4417307692307693,287280.0,0.4068076923076923,0.0811582642307692,0.3152153846153846,-10.69,0.176212,103.044154,0.268865,30.576923,5.0,1.0,26.0
2,[],"""""""Fiddler On The Roof” Motion Picture Chorus""",0.8565714285714285,0.3482857142857142,328920.0,0.2865714285714285,0.0245929485714285,0.3257857142857143,-15.230714,0.118514,77.375857,0.354857,34.857143,0.0,1.0,7.0
3,[],"""""""Fiddler On The Roof” Motion Picture Orchestra""",0.884925925925926,0.4250740740740739,262890.96296296304,0.2457703703703704,0.0735872792592592,0.2754814814814815,-15.63937,0.1232,88.66763,0.37203,34.851852,0.0,1.0,27.0
4,[],"""""""Joseph And The Amazing Technicolor Dreamcoa...",0.5107142857142857,0.4671428571428572,270436.14285714284,0.4882857142857143,0.0094002914285714,0.195,-10.236714,0.098543,122.835857,0.482286,43.0,5.0,1.0,7.0


In [None]:
# obtendo informações sobre o dataframe, como número de linhas e colunas, tipos de dados e se há valores núlos
df_data_w_genres.info()

<class 'pyspark.pandas.frame.DataFrame'>
Int64Index: 28680 entries, 0 to 28679
Data columns (total 16 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   genres            28680 non-null  object 
 1   artists           28680 non-null  object 
 2   acousticness      28680 non-null  object 
 3   danceability      28680 non-null  object 
 4   duration_ms       28680 non-null  object 
 5   energy            28680 non-null  object 
 6   instrumentalness  28680 non-null  object 
 7   liveness          28680 non-null  object 
 8   loudness          28680 non-null  float64
 9   speechiness       28680 non-null  float64
 10  tempo             28680 non-null  float64
 11  valence           28680 non-null  float64
 12  popularity        28680 non-null  float64
 13  key               28680 non-null  float64
 14  mode              28680 non-null  float64
 15  count             28680 non-null  float64
dtypes: float64(8), object(8)

In [None]:
# aplicando as transformações no conjunto de dados completo
df_data_w_genres['genres'] = df_data_w_genres.genres.str.replace("\[|\]|\'", "")
df_data_w_genres['artists'] = df_data_w_genres.artists.str.replace('"', '')

In [None]:
# consultando dados após transformação
df_data_w_genres.head()

Unnamed: 0,genres,artists,acousticness,danceability,duration_ms,energy,instrumentalness,liveness,loudness,speechiness,tempo,valence,popularity,key,mode,count
0,show tunes,Cats 1981 Original London Cast,0.5901111111111111,0.4672222222222222,250318.5555555556,0.3940033333333333,0.0113998511111111,0.2908333333333333,-14.448,0.210389,117.518111,0.3895,38.333333,5.0,1.0,9.0
1,,Cats 1983 Broadway Cast,0.8625384615384617,0.4417307692307693,287280.0,0.4068076923076923,0.0811582642307692,0.3152153846153846,-10.69,0.176212,103.044154,0.268865,30.576923,5.0,1.0,26.0
2,,Fiddler On The Roof” Motion Picture Chorus,0.8565714285714285,0.3482857142857142,328920.0,0.2865714285714285,0.0245929485714285,0.3257857142857143,-15.230714,0.118514,77.375857,0.354857,34.857143,0.0,1.0,7.0
3,,Fiddler On The Roof” Motion Picture Orchestra,0.884925925925926,0.4250740740740739,262890.96296296304,0.2457703703703704,0.0735872792592592,0.2754814814814815,-15.63937,0.1232,88.66763,0.37203,34.851852,0.0,1.0,27.0
4,,Joseph And The Amazing Technicolor Dreamcoat 1...,0.5107142857142857,0.4671428571428572,270436.14285714284,0.4882857142857143,0.0094002914285714,0.195,-10.236714,0.098543,122.835857,0.482286,43.0,5.0,1.0,7.0


In [None]:
# salvando o DataFrame em formato Parquet no diretório especificado
df_data_w_genres.to_parquet('dbfs:/FileStore/tables/dados_spotify_tratados/data_w_genres.parquet')

In [None]:
# listando os arquivos transformados para o formato .parquet
display(dbutils.fs.ls('dbfs:/FileStore/tables/dados_spotify_tratados'))

path,name,size,modificationTime
dbfs:/FileStore/tables/dados_spotify_tratados/data.parquet/,data.parquet/,0,0
dbfs:/FileStore/tables/dados_spotify_tratados/data_artist.parquet/,data_artist.parquet/,0,0
dbfs:/FileStore/tables/dados_spotify_tratados/data_genres.parquet/,data_genres.parquet/,0,0
dbfs:/FileStore/tables/dados_spotify_tratados/data_w_genres.parquet/,data_w_genres.parquet/,0,0
dbfs:/FileStore/tables/dados_spotify_tratados/data_year.parquet/,data_year.parquet/,0,0
