In [None]:
%reload_ext autoreload
%autoreload 2
import numpy as np
import pandas as pd
from tqdm import tqdm_notebook as tqdm
import matplotlib.pyplot as plt

#Our helpers
from map_helpers import *
from data_helpers import *
from helpers import *

from sklearn.feature_extraction import DictVectorizer
from sklearn.decomposition import LatentDirichletAllocation, PCA

import json
import folium
from folium import plugins

#Plot data
import seaborn as sns

%matplotlib inline  

For a better visualisation of this notebook, please read it using the [notebook viewer](https://nbviewer.jupyter.org/github/cgallay/Ada/blob/master/Project/Projet_ADA_M2.ipynb).

# Introduction to the Million Song Dataset

According to the [official website](https://labrosa.ee.columbia.edu/millionsong): "The Million Song Dataset is a freely-available collection of audio features and metadata for a million contemporary popular music tracks.
The core of the dataset is the feature analysis and metadata for one million songs, provided by The Echo Nest. The dataset does not include any audio, only the derived features."

The entire dataset is 280 GB. However, we based our final analysis on a subset of roughly 100,000 songs. We decided not to use Spark, as we could manage a third of the dataset with standard methods, which was a sufficient size for our analysis.

You will see below the first steps of our approach.

### Features description : 
The list of the 54 features present in the dataset can be found here: https://labrosa.ee.columbia.edu/millionsong/pages/example-track-description
In order not to list them all, we chose to describe here only the ones that seem relevant to our project:

 * **artist_name** 
 * **artist_latitude:** latitude, as a float
 * **artist_longitude:** longitude, as a float
 * **artist_terms:** tags from The Echo Nest in float with range : [0:1]
 * **artist_terms_freq:** similar to mbtags_count but from Echo Nest in float with range : [0:1]  
 * **artist_terms_weight:** same shape as the two previous tags in float with range : [0:1]
 * **loudness:** the overall loudness of a track in decibels (dB). Loudness values in the Analyzer are averaged across an entire track and are useful for comparing relative loudness of segments and tracks. Loudness is the quality of a sound that is the primary psychological correlate of physical strength (amplitude).
 * **song_hotttnesss:** according to The Echo Nest, when downloaded (in December 2010) in float with range [0:1]
 * **segments_pitches:** chroma (pitch-class) content of each segment. The related track-level *key* attribute has range [0:11] and corresponds to one of the 12 keys: C, C#, D, etc. up to B. If no key was detected, the value is -1. The *mode* is equal to 0 or 1 for “minor” or “major” and may be -1 in case of no result.
 * **segments_timbre** timbre is the quality of a musical note or sound that distinguishes different types of musical instruments, or voices. It is a complex notion also referred to as sound color, texture, or tone quality, and is derived from the shape of a segment’s spectro-temporal surface, independently of pitch and loudness.
 * **tempo:** tempo in BPM according to The Echo Nest
 * **title**
 * **year:** when this song was released, according to musicbrainz.org

# Importing Data
We proceeded in two steps:

- Firstly, we loaded the dataset in batches and, for each of them, we extracted the useful features. We saved all the data in pickle format for faster loading.

- Secondly, we loaded the corresponding pickles according to our needs.

In [None]:
# Generate every filtered pickle (WARNING: takes ~2 hours).
# One pass per (letter, half) batch: letters 'A'..'F', halves 1 and 2,
# visited in the order A1, A2, B1, B2, ...
for letter, half in [(ltr, hlf) for ltr in 'ABCDEF' for hlf in (1, 2)]:
    df = load_song_data(letter, half)

    # Part-1 frame: df -> filter_year -> select_col(part=1), then rows
    # containing missing values are dropped.
    df1 = select_col(filter_year(df), part=1).dropna()

    # Part-2 frame: df -> filter_hotness(0.001) -> exctract_timbre_features
    # -> select_col(part=2), then rows containing missing values are dropped.
    # NOTE(review): 0.001 is presumably a minimum song hotness — confirm in helpers.
    df2 = select_col(
        exctract_timbre_features(filter_hotness(df, 0.001)),
        part=2,
    ).dropna()

    # Both frames are built before anything is written out.
    save_pickle_filtered(df1, letter, part=1, half=half)
    save_pickle_filtered(df2, letter, part=2, half=half)

    # Drop the references before loading the next batch.
    del df, df1, df2

## Part 1: Discovering Genre through time

In [None]:
# Load the sub-dataset used for the first part and order it by
# 'song_hotttnesss', highest value first.
# NOTE(review): the second argument to merge_pickles is 2 although this cell is
# labelled "first part" — confirm its meaning against the helper.
# reset_index() is called without drop=True, so the pre-sort index is kept as a
# regular column next to the fresh 0..n-1 index.
df_song = (
    merge_pickles(['A', 'B', 'C', 'D', 'E', 'F'], 2)
    .sort_values('song_hotttnesss', ascending=False)
    .reset_index()
)

In [None]:
# Pass the first 30000 rows of df_song to the Deezer preview helper.
# NOTE(review): the path looks like a pickle used by the helper to load/store
# its result — confirm in add_previews_deezer.
df_deezer = add_previews_deezer(
    df_song.iloc[:30000],
    './pickle_data/complete_song_previews_url_Deezer_ABCDEF_0-30000.pkl',
)

In [None]:
# Pass the first 30000 rows of df_song to the Spotify preview helper.
# NOTE(review): the path looks like a pickle used by the helper to load/store
# its result — confirm in add_previews_spotify.
df_spotify = add_previews_spotify(
    df_song.iloc[:30000],
    './pickle_data/complete_song_previews_url_spotify_ABCDEF_0-30000.pkl',
)

In [None]:
# Combine the Deezer and Spotify results into a single frame.
# NOTE(review): presumably fills previews missing from one source with the
# other, using the path as a pickle for the result — confirm in complete_previews.
df = complete_previews(
    df_deezer,
    df_spotify,
    './pickle_data/complete_song_previews_url_completed_ABCDEF_0-30000.pkl',
)