In [57]:
import pandas as pd
import numpy as np
import re
pd.options.display.max_columns = None

# data input
- mis fuentes:
  
https://www.kaggle.com/datasets/fcpercival/160k-spotify-songs-sorted?resource=download&select=data.csv

https://www.kaggle.com/datasets/asifsadmine/spotify-playlists-dataset

https://developer.spotify.com/documentation/web-api/reference/#/operations/get-audio-features

In [32]:
df = pd.read_csv('datos/data.csv')
df.head()

Unnamed: 0,id,name,artists,duration_ms,release_date,year,acousticness,danceability,energy,instrumentalness,liveness,loudness,speechiness,tempo,valence,mode,key,popularity,explicit
0,0gNNToCW3qjabgTyBSjt3H,!Que Vida! - Mono Version,['Love'],220560,11/1/66,1966,0.525,0.6,0.54,0.00305,0.1,-11.803,0.0328,125.898,0.547,1,9,26,0
1,0tMgFpOrXZR6irEOLNWwJL,"""40""",['U2'],157840,2/28/83,1983,0.228,0.368,0.48,0.707,0.159,-11.605,0.0306,150.166,0.338,1,8,21,0
2,2ZywW3VyVx6rrlrX75n3JB,"""40"" - Live",['U2'],226200,8/20/83,1983,0.0998,0.272,0.684,0.0145,0.946,-9.728,0.0505,143.079,0.279,1,8,41,0
3,6DdWA7D1o5TU2kXWyCLcch,"""40"" - Remastered 2008",['U2'],157667,2/28/83,1983,0.185,0.371,0.545,0.582,0.183,-9.315,0.0307,150.316,0.31,1,8,37,0
4,3vMmwsAiLDCfyc1jl76lQE,"""40"" - Remastered 2008",['U2'],157667,2/28/83,1983,0.185,0.371,0.545,0.582,0.183,-9.315,0.0307,150.316,0.31,1,8,35,0


# análisis exploratorio
mi dataset tiene los siguientes campos:
- `id` -- tipo `object`. id único de cada canción para Spotify.
- `name` -- tipo `object`. nombre de la canción 
- `artists` -- tipo `object`. nombre del intérprete, puede tener caracteres de tipo cilírico
- `duration_ms` -- tipo `np.int64`. campo numérico, indica la longitud de la canción en ms
- `release_date` -- tipo `object`. fecha de publicación del tema. se va a tener que recastear y limpiar para poder homogeneizarlo.
- `acousticness` -- tipo `np.float64`. puntúa del 0 al 1 si la canción es acústica. Cuanto mayor sea, más bailable es el tema.
- `danceability` -- tipo `np.float64`. puntúa del 0 al 1 si la canción es bailable, basándose en criterios de tempo, ritmo estable, potencia del beat y regularidad. Cuanto mayor el valor, más bailable es el tema.
- `energy` -- tipo `np.float64`. puntúa del 0 al 1 si la canción es "energética"; se evalua la intensidad y la actividad de la canción, teniendo en cuenta factores como la intensidad y la rapidez del tempo del tema.
- `instrumentalness` -- tipo `np.float64`. puntúa del 0 al 1 si se trata de una canción con o sin cantantes. Valores por encima de 0.5 consideran que la pisa es instrumental.
- `liveness` -- tipo `np.float64`. puntúa del 0 al 1 si la cancion se trata de una grabación en directo o de estudio. En torno a 0.8 indica que se trata de una pista grabada en directo.
- `loudness` -- tipo `np.float64`. indica en decibelios (db) la intensidad sonora de la pista, puntuando la "potencia" del sonido. El valor de los decibelios es la media de la pista, pudiendo variar entre -60  0.
- `speechiness` -- tipo `np.float64`. puntúa del 0 al 1 la presencia de palabras recitadas en una pista -- según la cantidad de texto hablado que aparece en la pista. A partir del umbral del 0.66 se considera que la pista está compuesta mayormente de palabras recitadas. Si los valores varían entre 0.33 y 0.66 se puede considerar que contiene tanto música como palabras, en secciones o superpuestas, como pueda ser el caso de una canción de rap. Valores por debajo de 0.33 indican que la canción es mayormente instrumental y no-recitada.
- `tempo` -- tipo `np.float64`. estimación del tempo de una pista en beats per minute (BPM). En terminología musical, el tempo indica la velocidad de la canción. Puede variar entre 0 a 245BPM.
- `valence` -- tipo `np.float64`. valora del 0 al 1 la "positividad" que tiene la pista. Canciones con una `valence` alta suenan más positivas (ej. felices, eufóricas, alegres), que canciones con un valor de `valence` inferior que se perciben más negativas (ej. triste, depresivas, enfadada)
- `mode` -- tipo `np.int64`. indica el modo de la canción (mayor = 1, menor = 0)
- `key` -- tipo `np.int64`. tono en el que está la canción, puntuándolo de 0 a 11, siguiendo la notación indicada aquí: https://en.wikipedia.org/wiki/Pitch_class (Ej. 0 = C, 1 = C#//D♭). Si no se detecta el tono, se considerará el valor -1.
- `popularity`  -- tipo `np.int64`. popularidad de la canción
- `explicit` -- tipo `np.int64`. indica si la canción tiene o no letras explícitas.

## primero voy a mirar los tipos de datos que tiene cada columna:

In [33]:
df.dtypes

id                   object
name                 object
artists              object
duration_ms           int64
release_date         object
year                  int64
acousticness        float64
danceability        float64
energy              float64
instrumentalness    float64
liveness            float64
loudness            float64
speechiness         float64
tempo               float64
valence             float64
mode                  int64
key                   int64
popularity            int64
explicit              int64
dtype: object

todos los campos están bien casteados menos el de fecha, que será corregido a continuación

## vamos a ver si el dataframe tuviera algún nulo con el método `info()`

In [34]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 169907 entries, 0 to 169906
Data columns (total 19 columns):
 #   Column            Non-Null Count   Dtype  
---  ------            --------------   -----  
 0   id                169907 non-null  object 
 1   name              169907 non-null  object 
 2   artists           169907 non-null  object 
 3   duration_ms       169907 non-null  int64  
 4   release_date      169907 non-null  object 
 5   year              169907 non-null  int64  
 6   acousticness      169907 non-null  float64
 7   danceability      169907 non-null  float64
 8   energy            169907 non-null  float64
 9   instrumentalness  169907 non-null  float64
 10  liveness          169907 non-null  float64
 11  loudness          169907 non-null  float64
 12  speechiness       169907 non-null  float64
 13  tempo             169907 non-null  float64
 14  valence           169907 non-null  float64
 15  mode              169907 non-null  int64  
 16  key               16

no parece que ninguna de las columnas tenga duplicados.

## check de los valores duplicados de df con `duplicated()`

In [35]:
df.duplicated(subset = df.columns).sum()

0

## vamos a mirar las estadísticas características de cada una de las columnas mediante `describe()`

In [36]:
df.describe(include='all').T

Unnamed: 0,count,unique,top,freq,mean,std,min,25%,50%,75%,max
id,169907.0,169907.0,0gNNToCW3qjabgTyBSjt3H,1.0,,,,,,,
name,169907.0,132938.0,Summertime,62.0,,,,,,,
artists,169907.0,33375.0,['Эрнест Хемингуэй'],1215.0,,,,,,,
duration_ms,169907.0,,,,231407.085988,121322.336392,5108.0,171040.0,208600.0,262966.5,5403500.0
release_date,169907.0,10882.0,1945,1449.0,,,,,,,
year,169907.0,,,,1977.223234,25.593318,1921.0,1957.0,1978.0,1999.0,2020.0
acousticness,169907.0,,,,0.493217,0.376628,0.0,0.0945,0.492,0.888,0.996
danceability,169907.0,,,,0.538147,0.175345,0.0,0.417,0.548,0.667,0.988
energy,169907.0,,,,0.488591,0.267391,0.0,0.263,0.481,0.71,1.0
instrumentalness,169907.0,,,,0.161939,0.30933,0.0,0.0,0.000204,0.0868,1.0


In [37]:
df.describe().T

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
duration_ms,169907.0,231407.085988,121322.336392,5108.0,171040.0,208600.0,262966.5,5403500.0
year,169907.0,1977.223234,25.593318,1921.0,1957.0,1978.0,1999.0,2020.0
acousticness,169907.0,0.493217,0.376628,0.0,0.0945,0.492,0.888,0.996
danceability,169907.0,0.538147,0.175345,0.0,0.417,0.548,0.667,0.988
energy,169907.0,0.488591,0.267391,0.0,0.263,0.481,0.71,1.0
instrumentalness,169907.0,0.161939,0.30933,0.0,0.0,0.000204,0.0868,1.0
liveness,169907.0,0.206692,0.176797,0.0,0.0984,0.135,0.263,1.0
loudness,169907.0,-11.370311,5.666795,-60.0,-14.47,-10.474,-7.118,3.855
speechiness,169907.0,0.094058,0.149938,0.0,0.0349,0.045,0.0754,0.969
tempo,169907.0,116.94785,30.727079,0.0,93.516,114.777,135.712,244.091


In [38]:
df.describe(include='object').T

Unnamed: 0,count,unique,top,freq
id,169907,169907,0gNNToCW3qjabgTyBSjt3H,1
name,169907,132938,Summertime,62
artists,169907,33375,['Эрнест Хемингуэй'],1215
release_date,169907,10882,1945,1449


# cast a las fechas
ya se observa que el campo `release_date` se puede castear como fecha, vamos a ver si todos los campos tienen la misma estructura:

In [39]:
df['release_date'].value_counts()

1945       1449
1949       1254
1935       1123
1948       1052
1/1/30     1047
           ... 
4/16/01       1
2/22/03       1
3/30/15       1
2/21/89       1
4/19/04       1
Name: release_date, Length: 10882, dtype: int64

hay años que no tienen día -> habrá que suponer la fecha, para poder generar un nuevo campo `date` que tenga el formato adecuado. podemos intentar ver posibilidades calculando la longitd de los datos que se almacenan en el campo `release_date`.

In [40]:
df['release_date_length'] = df['release_date'].apply(len)

In [41]:
df['release_date_length'].value_counts()

6    52683
4    50382
7    50212
8    16630
Name: release_date_length, dtype: int64

In [42]:
df_fechas = df['release_date_length'].value_counts().reset_index()
df_fechas['ratio'] = df_fechas['release_date_length'] / df.shape[0]
df_fechas

Unnamed: 0,index,release_date_length,ratio
0,6,52683,0.31007
1,4,50382,0.296527
2,7,50212,0.295526
3,8,16630,0.097877


In [43]:
df[df['release_date_length'] == 4].head(1)

Unnamed: 0,id,name,artists,duration_ms,release_date,year,acousticness,danceability,energy,instrumentalness,liveness,loudness,speechiness,tempo,valence,mode,key,popularity,explicit,release_date_length
6,3mdPsKBeXxC4N6oKKZeBNy,"""8"" Teen",['? & The Mysterians'],166693,1966,1966,0.111,0.657,0.872,0.713,0.0857,-7.279,0.0512,136.512,0.738,1,2,17,0,4


In [44]:
df[df['release_date_length'] == 6].sample(5)

Unnamed: 0,id,name,artists,duration_ms,release_date,year,acousticness,danceability,energy,instrumentalness,liveness,loudness,speechiness,tempo,valence,mode,key,popularity,explicit,release_date_length
56985,0vOkmmJEtjuFZDzrQSFzEE,Hit Me With Your Best Shot,['Pat Benatar'],171267,8/5/80,1980,0.117,0.741,0.58,3.3e-05,0.212,-9.05,0.0304,127.402,0.944,1,4,69,0,6
111631,7rkHACbUWdirRZ3lrHCxET,Quiero Charlar Con La Muerte,['Valentín Elizalde'],184000,1/1/05,2005,0.631,0.751,0.522,1.9e-05,0.231,-6.159,0.0462,105.125,0.968,1,3,56,0,6
80538,2zEuvBRN52zTtaKVVyne20,Lifestyles Of The Not So Rich And Famous,['Tracy Byrd'],171363,1/1/94,1994,0.287,0.715,0.689,0.0,0.0599,-9.716,0.0328,138.127,0.811,1,4,40,0,6
94723,5uattXQqELLUodThUant0Y,My Ship - Mono Version,['Miles Davis'],270760,1/1/57,1957,0.992,0.106,0.0325,0.934,0.207,-21.328,0.0366,72.979,0.04,0,2,12,0,6
33548,30yySOIXHZYx94sLAySy2j,Dil Se Re,"['A.R. Rahman', 'Anuradha', 'Anupama']",405707,7/8/98,1998,0.00882,0.844,0.442,0.000498,0.0879,-16.54,0.0691,105.986,0.436,1,9,53,0,6


In [45]:
df[df['release_date_length'] == 7].sample(5)

Unnamed: 0,id,name,artists,duration_ms,release_date,year,acousticness,danceability,energy,instrumentalness,liveness,loudness,speechiness,tempo,valence,mode,key,popularity,explicit,release_date_length
65450,3OMIRD5ctTKGxtPFswyD2I,I've Been Loving You Too Long - Remastered Liv...,['Otis Redding'],363680,9/15/65,1965,0.479,0.464,0.45,0.0,0.956,-8.311,0.0456,122.321,0.327,1,5,22,0,7
142908,6jdEMl5p7KD9yBfqNm0ACH,The Symphony,"['Marley Marl', 'Big Daddy Kane', 'Craig G', '...",364800,9/28/88,1988,0.00633,0.867,0.597,0.0,0.279,-8.576,0.254,94.659,0.393,1,7,33,1,7
100100,0yy8QwH9OOgeF3CBeUDguR,Odd future,['UVERworld'],225907,7/27/18,2018,0.015,0.694,0.795,0.0,0.114,-4.649,0.0924,128.036,0.556,0,0,64,0,7
162609,3LeNzetyINNwZLOR7O5lNR,Yo Vendo Unos Ojos Negros,['Nat King Cole'],142733,9/13/59,1959,0.475,0.62,0.375,0.0,0.376,-9.863,0.0412,114.219,0.612,1,0,30,0,7
18832,3uleUeZ5eyYCNGiwcc1Exp,Breaking Skin,['Nonpoint'],174547,9/30/14,2014,9.7e-05,0.508,0.926,0.019,0.269,-2.652,0.0487,104.937,0.441,0,5,52,0,7


In [46]:
df[df['release_date_length'] == 8].sample(5)

Unnamed: 0,id,name,artists,duration_ms,release_date,year,acousticness,danceability,energy,instrumentalness,liveness,loudness,speechiness,tempo,valence,mode,key,popularity,explicit,release_date_length
115518,5knuzwU65gJK7IF5yJsuaW,Rockabye (feat. Sean Paul & Anne-Marie),"['Clean Bandit', 'Sean Paul', 'Anne-Marie']",251088,10/21/16,2016,0.406,0.72,0.763,0.0,0.18,-4.068,0.0523,101.965,0.742,0,9,76,0,8
115210,6HU6QqXPLKocjhyqIdZjP6,Robin's Horn,['Candy Johnson'],182146,11/15/48,1948,0.99,0.44,0.471,0.899,0.305,-12.873,0.0746,74.231,0.743,0,5,0,0,8
105999,3CWgv82JBVeL9KX48BNLmb,Perdoname,['Alex Bueno'],254240,10/27/98,1998,0.2,0.939,0.511,0.0,0.0951,-5.429,0.0567,125.031,0.942,1,8,49,0,8
26331,3UhRsQVrELOATEiY7E9X2T,Come Closer (feat. Queen Naija),"['A Boogie Wit da Hoodie', 'Queen Naija']",156512,12/21/18,2018,0.22,0.741,0.556,0.0,0.0981,-6.647,0.0947,140.056,0.616,0,7,66,1,8
154204,7IAa7vUJ11STN7le8XaxsH,WTF (Where They From) [feat. Pharrell Williams],"['Missy Elliott', 'Pharrell Williams']",192773,11/12/15,2015,0.0181,0.932,0.819,7e-06,0.0577,-3.484,0.203,119.941,0.556,0,8,56,1,8


según la longitud:
- `4`: solo tenemos información del año.
- `6`: tenemos información de año, separadores y mes y día. en este caso, los meses y días tienen solo una cifra.
- `7`: tenemos información de año, mes y día. o el mes o el día tienen 2 cifras.
- `8`: tenemos información de año, mes y día. todos tienen 2 cifras.

además hay que comprobar el tipo de delimitador que se usa para la fecha

In [47]:
print((1 - (df.shape[0] - df['release_date'].str.contains('-').sum())/df.shape[0])*100, 
    (1 - (df.shape[0] - df['release_date'].str.contains('/').sum())/df.shape[0])*100)

0.9705309375128701 69.3767767072575


se va a actuar de formas distintas según la longitud del campo `release_date`

In [48]:
df['month'] = ''
df['day'] = ''

In [70]:
conditions = [(df['release_date_length'] == 6) & ('/' in df['release_date'])
    , (df['release_date_length'] == 7) & ('-' in df['release_date'])
    , (df['release_date_length'] == 7) & ('/' in df['release_date'])
    , (df['release_date_length'] == 8) & ('/' in df['release_date'])
    , (df['release_date_length'] == 4) 
    ]
choices_month = [df['release_date'].str.split('/', n=2, expand=True)[0]
    , df['release_date'].str.split('-', n=2, expand=True)[0]
    , df['release_date'].str.split('/', n=2, expand=True)[0]
    , df['release_date'].str.split('/', n=2, expand=True)[0]
    , df['month'].mode()[0]
]
choices_day = [df['release_date'].str.split('/', n=2, expand=True)[1]
    , df['release_date'].str.split('-', n=2, expand=True)[1]
    , df['release_date'].str.split('/', n=2, expand=True)[1]
    , df['release_date'].str.split('/', n=2, expand=True)[1]
    , 1
]
#choices_year = [df['release_date'].str.split('/', n=2, expand=True)[2]
#    , df['release_date'].str.split('-', n=2, expand=True)[2]
#    , df['release_date'].str.split('/', n=2, expand=True)[2]
#    , df['release_date'].str.split('/', n=2, expand=True)[2]
#    , df['release_date']
#]
df['month'] = np.select(conditions, choices_month, 'todo mal')
df['day'] = np.select(conditions, choices_day, 'todo mal')
#df['dacade'] = np.select(conditions, choices_year, 'todo mal')

In [74]:
'12/7/1989'.split('/', n=2, expand=True)[0]

TypeError: split() takes at most 2 arguments (3 given)

In [71]:
df['month'].unique()

array(['todo mal', ''], dtype=object)

In [72]:
df['day'].unique()

array(['todo mal', 1], dtype=object)

In [61]:
for z in range(0, df.shape[0]):
    if df.loc[z,'release_date_length'] == 6:
        if '-' in df.loc[z,'release_date']:
            df.loc[z,['month', 'day', 'decade']] = df['release_date'].str.split('-', expand=True)
            print('6-')
        elif '/' in df.loc[z,'release_date']:
            df.loc[z,['month', 'day', 'decade']] = df['release_date'].str.split('/', expand=True)
            print('6/')
    elif (df.loc[z,'release_date_length'] == 7) :
        if '-' in df.loc[z,'release_date']:
            df.loc[z,['month', 'day', 'decade']] = df['release_date'].str.split('-', expand=True)
            print('7-')
        elif '/' in df.loc[z,'release_date']:
            df.loc[z,['month', 'day', 'decade']] = df['release_date'].str.split('/', expand=True)
            print('7/')
    elif (df.loc[z,'release_date_length'] == 8) :
        if '-' in df.loc[z,'release_date']:
            df.loc[z,['month', 'day', 'decade']] = df['release_date'].str.split('-', expand=True)
            print('8-')
        elif '/' in df.loc[z,'release_date']:
            df.loc[z,['month', 'day', 'decade']] = df['release_date'].str.split('/', expand=True)
            print('8/')
    else:
        moda = df['month'].mode()[0]
        df.loc[z,'month'] = df['month'].mode()[0]
        df.loc[z,'day'] = 1
        df.loc[z,'decade'] = df.loc[z, 'release_date']
        print('4')

7/
7/
7/
7/
7/
6/
4
6/
4
6/
6/
7/
4
4
4
6/
6/
4
4
7/
6/
4
7/
4
7/
4
4
7/
4
4
4
4
6/
4
4
4
4
4
4
4
6/
4
6/
6/
4
6/
7/
7/
7/
7/
4
6/


KeyboardInterrupt: 

LO VIEJO

#### largo del campo es 6
como ya hemos dicho, el largo del campo es 4 -> el campo es directamente el año. forzaremos a que el primer día del mes sea el día de pulicación del disco.

In [63]:
df_06 = df[df['release_date_length'] == 6]
df_06['release_date'].value_counts()

1/1/30    1047
1/1/40    1008
1/1/50     935
1/1/55     770
1/1/61     646
          ... 
1/4/88       1
3/8/60       1
6/3/82       1
2/9/95       1
7/9/17       1
Name: release_date, Length: 2573, dtype: int64

In [64]:
### comprobar cual es el separador de los datos de fecha
print((1 - (df_06.shape[0] - df_06['release_date'].str.contains('-').sum())/df_06.shape[0])*100, 
    (1 - (df_06.shape[0] - df_06['release_date'].str.contains('/').sum())/df_06.shape[0])*100)

0.0 100.0


In [None]:
df_06[['month', 'day', 'decade']] = df_06['release_date'].str.split('/', expand=True)
df_06[['month', 'day', 'decade']].value_counts()

In [None]:
df_06['day'].unique()

In [None]:
df_06['month'].unique()

In [None]:
df_06['decade'].unique()

In [None]:
df_06['date'] = df_06['day'] + '/' + df_06['month'] + '/' + df_06['year'].astype(str)
df_06['date'].value_counts()

### largo del campo es 7
como ya hemos dicho, el largo del campo es 7 -> el campo contiene todos los elementos de fecha. forzaremos a que el primer día del mes sea el día de pulicación del disco.

In [None]:
df_07 = df[df['release_date_length'] == 7]
df_07['release_date'].value_counts()

In [None]:
### comprobar cual es el separador de los datos de fecha
print((1 - (df_07.shape[0] - df_07['release_date'].str.contains('-').sum())/df_07.shape[0])*100, 
    (1 - (df_07.shape[0] - df_07['release_date'].str.contains('/').sum())/df_07.shape[0])*100)

#### separador `-`
aparece el caracter `-` como separador, por lo que tendremos que separar el dataframe

In [None]:
df_07_guion = df_07[df_07['release_date'].str.contains('-')]
print(df_07_guion.shape)
df_07_guion.head(1)

In [None]:
df_07_guion[['decade', 'month']] = df_07_guion['release_date'].str.split('-', expand=True)
df_07_guion[['decade', 'month']] .value_counts()

In [None]:
df_07_guion['month'].unique()

In [None]:
df_07_guion['decade'].unique()

In [None]:
df_07_guion['date'] = '01/' + df_07_guion['month'] + '/' + df_07_guion['year'].astype(str)
df_07_guion['date'].value_counts()

#### separador `/`
ahora trataré los datos que tienene  `/` como separador

In [None]:
df_07_barra = df_07[df_07['release_date'].str.contains('/')]
print(df_07_barra.shape)
df_07_barra.head(1)

In [None]:
df_07_barra[['month', 'day', 'decade']] = df_07_barra['release_date'].str.split('/', expand=True)
df_07_barra[['month', 'day', 'decade']] .value_counts()

In [None]:
df_07_barra['month'].unique() ## el campo de me tiene que tener hasta 12 calores

In [None]:
df_07_barra['day'].unique() ## el campo de día tiene que tener hasta 31 valores

In [None]:
df_07_barra['decade'].unique()

In [None]:
df_07_barra['date'] = df_07_barra['day'] + '/' + df_07_barra['month'] + '/' + df_07_barra['year'].astype(str)
df_07_barra['date'].value_counts()

### largo del campo es 8
como ya hemos dicho, el largo del campo es 4 -> el campo es directamente el año. forzaremos a que el primer día del mes sea el día de pulicación del disco.

In [None]:
df_08 = df[df['release_date_length'] == 8]
df_08['release_date'].value_counts()

In [None]:
### comprobar cual es el separador de los datos de fecha
print((1 - (df_08.shape[0] - df_08['release_date'].str.contains('-').sum())/df_08.shape[0])*100, 
    (1 - (df_08.shape[0] - df_08['release_date'].str.contains('/').sum())/df_08.shape[0])*100)
##no hay valores raros, podemos continuar con el procedimiento habitual

In [None]:
df_08[['month', 'day', 'decade']] = df_08['release_date'].str.split('/', expand=True)
df_08[['month', 'day', 'decade']] .value_counts()

In [None]:
df_08['month'].unique() ## el campo de me tiene que tener hasta 12 calores

In [None]:
df_08['day'].unique() ## el campo de día tiene que tener hasta 31 valores

In [None]:
df_08['decade'].unique()

In [None]:
df_08['date'] = df_08['day'] + '/' + df_08['month'] + '/' + df_08['year'].astype(str)
df_08['date'].value_counts()

### largo del campo es 4
como ya hemos dicho, el largo del campo es 4 -> el campo es directamente el año. forzaremos a que el primer día del mes sea el día de pulicación del disco.

In [None]:
df_04 = df[df['release_date_length'] == 4]
df_04['release_date'].value_counts()

In [None]:
### comprobar cual es el separador de los datos de fecha -- > en este caso tiene que ser nulo
print((1 - (df_04.shape[0] - df_04['release_date'].str.contains('-').sum())/df_04.shape[0])*100, 
    (1 - (df_04.shape[0] - df_04['release_date'].str.contains('/').sum())/df_04.shape[0])*100)

In [None]:
print(df_04.shape)
df_04.sample(5)

en este caso no tenemos valores para `month` o para `day`. podemos suponer que el valor que tomará será la moda de los meses de los datos que ya hemos tratado.

In [None]:
moda = pd.concat([df_06, df_07_barra, df_07_guion, df_08], axis = 0)['month'].mode()[0]
moda

In [None]:
df_04['month'] = moda
df_04['month'].value_counts()

In [None]:
df_04['date'] = '01/' + df_04['month'].astype(str) + '/' + df_04['year'].astype(str)
df_04['date'].value_counts()

## concat values

In [None]:
df_fixed = pd.concat([df_04, df_06, df_07_barra, df_07_guion, df_08])
print(df_fixed.shape)
df_fixed.head(1)

In [None]:
df_fixed['date'] = pd.to_datetime(df_fixed['date'], dayfirst=True)
df_fixed['day'] = df_fixed['date'].dt.day
df_fixed['month'] = df_fixed['date'].dt.month
df_fixed['decade'] = df_fixed['date'].dt.year - (df_fixed['date'].dt.year %10)
df_fixed.dtypes

In [None]:
df_fixed['decade'].value_counts()