# Projeto - NeoEvolution - Exploração de base de dados Kaggle.

Grupo 03:
- Alexsander Vieira
- Bruno Crestani
- Vitor Hugo Gomes
- Mariane Scheffer Nazaro

Importando as bibliotecas

In [None]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt

Importando o csv 

In [None]:
# Dataset: audio statistics of the top 2000 tracks on Spotify from 2000-2019.
# It has about 18 columns, each describing the track and its qualities.
csv_path = '../neo-evolution-project-group-3/songs_normalize.csv'
df = pd.read_csv(csv_path)

# Exploração dos dados

In [None]:
# Preview the first rows of the raw dataset.
df.head()

In [None]:
# Preview the last rows of the raw dataset.
df.tail()

# Descrição das colunas do DataFrame

duration_ms: Duration of the track in milliseconds 
   
explicit: Explicit content  
  
year: Release Year of the track
    
popularity: The higher the value the more popular the song is 
   
danceability: A value of 0.0 is least danceable and 1.0 is most danceable  
   
energy: Represents a perceptual measure of intensity and activity  
  
key: The key the track is in  
  
loudness: The overall loudness of a track in decibels (dB)  
  
mode: The modality (major or minor) of a track  
  
speechiness: Speechiness detects the presence of spoken words in a track  
  
acousticness: A confidence measure from 0.0 to 1.0 of whether the track is acoustic  
  
instrumentalness: Predicts whether a track contains no vocals  
  
liveness: Detects the presence of an audience in the recording  
  
valence: A measure from 0.0 to 1.0 describing the musical positiveness conveyed by a track  
  
tempo: The overall estimated tempo of a track in beats per minute (BPM) 
   
genre: Genre of the track  

In [None]:
# Print the frame summary (columns, non-null counts, dtypes).
df.info()

In [None]:
# Descriptive statistics of the raw dataset.
df.describe()

# Limpeza dos dados

In [None]:
# Number of missing values in each column.
df.isna().sum()

In [None]:
# Number of fully duplicated rows. Summing the boolean flags gives a single
# count, instead of a truncated per-row True/False listing that does not show
# whether any duplicate exists.
df.duplicated().sum()

In [None]:

# Show every row whose (artist, song, year) combination appears more than once;
# keep=False flags all members of each duplicate group, not just the repeats.
dup_keys = ['artist', 'song', 'year']
dup_mask = df.duplicated(subset=dup_keys, keep=False)
df[dup_mask]

In [None]:
# Keep only the first occurrence of each (artist, song, year) combination.
repeated_track = df.duplicated(subset=['artist', 'song', 'year'], keep='first')
df_nao_dup = df[~repeated_track]
df_nao_dup

In [None]:
# Frequency of each genre value in the de-duplicated data.
df_nao_dup['genre'].value_counts()

In [None]:
# Rows whose genre is the literal string 'set()' are split off from the rest:
# df_set_true holds them, df_clean holds everything else.
genre_is_empty_set = df_nao_dup['genre'] == 'set()'
df_set_true = df_nao_dup[genre_is_empty_set]
df_clean = df_nao_dup[~genre_is_empty_set]
df_clean

In [None]:
# Remove the years that have very few samples in the dataset (1998, 1999, 2020),
# one year at a time, keeping both the removed rows and the remaining rows.
is_1998 = df_clean['year'] == 1998
df_1998 = df_clean[is_1998]
df_clean1998 = df_clean[~is_1998]

is_1999 = df_clean1998['year'] == 1999
df_1999 = df_clean1998[is_1999]
df_clean1999 = df_clean1998[~is_1999]

is_2020 = df_clean1999['year'] == 2020
df_2020 = df_clean1999[is_2020]
df_clean2020 = df_clean1999[~is_2020]

# Tracks per remaining year.
df_clean2020['year'].value_counts()

In [None]:
# Normalize.
# NOTE(review): no normalization is applied here; df_normalized is only another
# name bound to the same object as df_clean2020 (not a copy).
df_normalized = df_clean2020

# H1: Há correlações da popularidade com alguma característica específica das músicas?

In [None]:
# Pairwise scatter plots of the audio features against each other and popularity.
pairplot_columns = [
    'danceability',
    'energy',
    'loudness',
    'speechiness',
    'acousticness',
    'instrumentalness',
    'liveness',
    'popularity',
]
sns.pairplot(df_normalized[pairplot_columns])

In [None]:
# Correlation matrix of the numeric and boolean columns.
# The frame also holds text columns (artist, song, genre). Since pandas 2.0
# DataFrame.corr() no longer drops non-numeric columns by default and raises on
# them, so they are filtered out explicitly (works on older pandas too).
df_normalized.select_dtypes(include=['number', 'bool']).corr()

In [None]:
# Annotated heatmap of the correlations between the numeric variables.
plt.figure(figsize=(20,12))
# Text columns (artist, song, genre) are excluded first: since pandas 2.0
# DataFrame.corr() raises on non-numeric columns instead of dropping them.
numeric_corr = df_normalized.select_dtypes(include=['number', 'bool']).corr()
sns.heatmap(numeric_corr, annot=True, cmap="YlGnBu", annot_kws={"size":10})
plt.title('Correlação entre as variáveis numéricas', size = 20)

In [None]:
# Closer look at the danceability / valence pair.
dance_valence = df_normalized[['danceability', 'valence']]
sns.pairplot(dance_valence)

In [None]:
# Number of tracks per genre value in the cleaned data; reused later to pick
# the most frequent genres.
genre = df_normalized['genre'].value_counts()

print(genre)

# H2: A duração média das músicas reduziu ao longo do tempo?

In [None]:
from matplotlib.ticker import MaxNLocator

# Track duration against release year, with the fitted regression line in red.
sns.lmplot(
    data=df_normalized,
    x='year',
    y='duration_ms',
    line_kws={'color': 'red'},
)

# Decorate the current axes, i.e. the ones the plot above was drawn on.
ax = plt.gca()
ax.set_title("Duração da música com o passar dos anos", fontsize=18)
ax.set(xlabel="Anos", ylabel="Duração da música (ms)")
# Restrict x ticks to integer positions so years are not shown as fractions.
ax.xaxis.set_major_locator(MaxNLocator(integer=True))
plt.show()

# H3: Os artistas mais populares possuem consistência no topo?


In [None]:
# Number of tracks per artist in the cleaned data; reused below to pick the
# artists with the most tracks.
artist = df_normalized['artist'].value_counts()
print(artist)

In [None]:
# Labels of the five most frequent artists (the counts are already sorted
# in descending order by value_counts).
top_artists = artist.index[:5]
print(top_artists)

# Labels of the five most frequent genre values.
top_genre = genre.index[:5]
print(top_genre)

In [None]:
# Number of tracks in each release year.
tracks_grouped_by_year = df_normalized.groupby('year')
song_by_year = tracks_grouped_by_year.size()
song_by_year


In [None]:
# Bar chart of the number of tracks per year.
song_by_year.plot.bar()

In [None]:
# Total popularity accumulated by each artist, highest first.
# Only 'popularity' is aggregated. The grouping key 'artist' already becomes the
# index of the result; also selecting it as a value column made sum()
# concatenate the artist names into a meaningless extra column.
artist_popularity = (
    df_normalized.groupby('artist')[['popularity']]
    .sum()
    .sort_values('popularity', ascending=False)
)
artist_popularity.head(5)

In [None]:
# Labels of the ten artists with the most tracks in the cleaned data
# (value_counts sorts in descending order).
top_10_artists = df_normalized['artist'].value_counts().head(10).index
print(top_10_artists)

# All tracks belonging to those ten artists.
list_top_10_artists= df_normalized[df_normalized['artist'].isin(top_10_artists)]

# Same subset restricted to the five most frequent artists. The top-5 plots
# further down read list_top_5_artists, which was never assigned anywhere and
# made those cells fail with a NameError.
list_top_5_artists = df_normalized[df_normalized['artist'].isin(top_10_artists[:5])]

list_top_10_artists

In [None]:
# Popularity against release year with one regression line per artist.
sns.lmplot(
    data=list_top_5_artists,
    x='year',
    y='popularity',
    hue='artist',
)

In [None]:
# One facet per artist, each showing popularity over the years as a line plot.
# The former `for` loop iterated over a single-element list, so it is replaced
# by a direct call.
g = sns.FacetGrid(list_top_5_artists, col="artist")
g.map_dataframe(sns.lineplot, x='year', y='popularity')

In [None]:
# One facet per artist, each showing popularity over the years as a line plot.
# The former `for` loop iterated over a single-element list, so it is replaced
# by a direct call.
g = sns.FacetGrid(list_top_10_artists, col="artist")
g.map_dataframe(sns.lineplot, x='year', y='popularity')