# Exploratory Data Analysis
In this notebook, I explore the dataset and draw insights from it using curiosity-driven questions.

In [1]:
# Import necessary libraries

import pandas as pd
import sqlite3
from lets_plot import *

# Set up lets-plot so that plots render as HTML inside the notebook.
LetsPlot.setup_html()
# Load the `sql` IPython extension; the %sql / %%sql magics used in later cells come from it.
%load_ext sql
# Set the SqlMagic `autocommit` option to True.
%config SqlMagic.autocommit=True


## Helper Functions
Because SQLite cannot store values in their preferred types (for example, dates are stored as TEXT), I define helper functions that convert such columns whenever a new DataFrame is loaded from the database.

In [2]:
def datetime(value, precision):
    """Parse date text into a pandas datetime using the given precision.

    Parameters
    ----------
    value : str or list-like of str
        The date text to parse; it is passed unchanged to ``pd.to_datetime``.
    precision : str
        Granularity of ``value``: ``'day'`` (``%Y-%m-%d``), ``'month'``
        (``%Y-%m``) or ``'year'`` (``%Y``).

    Returns
    -------
    The result of ``pd.to_datetime(value, format=...)`` for the matching format.

    Raises
    ------
    ValueError
        If ``precision`` is not one of the supported values, or if
        ``pd.to_datetime`` cannot parse ``value`` with the selected format.
    """
    # NOTE: the function name shadows the stdlib `datetime` module if that is
    # ever imported in this notebook; the name is kept so existing calls work.
    formats = {
        'day': '%Y-%m-%d',
        'month': '%Y-%m',
        'year': '%Y',
    }
    if precision not in formats:
        raise ValueError(
            f"Unsupported precision {precision!r}; expected one of {sorted(formats)}"
        )
    # The parsed value must be returned; computing it without returning it
    # would leave callers with None.
    return pd.to_datetime(value, format=formats[precision])





In [3]:
# Connect to the database
# Opens the SQLite file through the %sql magic and registers the connection under the alias `db`.
# NOTE(review): the path contains a doubled slash (`data//clean`); presumably a typo for `data/clean` — confirm.
%sql sqlite:///../data//clean/spotify_playlists.db --alias db

### Tracks' popularity & explicit content

In [4]:
%%sql

tab << SELECT is_explicit,
       popularity,
       release_date
FROM songs AS s
LEFT JOIN song_album_map AS sam
    ON s.song_id = sam.song_id
LEFT JOIN albums AS al
    ON sam.album_id = al.album_id


In [5]:
# Convert the result set stored in `tab` by the preceding %%sql cell into a DataFrame.
df = tab.DataFrame()

In [6]:
# Parse the release dates into datetimes and treat the explicit flag as a
# categorical variable, then show the resulting column types.
df = df.assign(
    release_date=pd.to_datetime(df['release_date'], format='ISO8601'),
    is_explicit=df['is_explicit'].astype('category'),
)
df.dtypes

is_explicit                        int64
popularity                         int64
release_date              datetime64[ns]
release_date_precision            object
dtype: object

In [7]:
# Order the rows by release date (ascending, the sort_values default).
df = df.sort_values(by='release_date')

In [25]:
# NOTE(review): this repeats the category conversion already applied to
# `is_explicit` in an earlier cell, and the execution count is out of sequence;
# it looks redundant on a top-to-bottom run — confirm and consider removing.
df['is_explicit'] = df['is_explicit'].astype('category')

In [32]:
# Scatter plot of popularity against release date, coloured by and split into
# one panel per value of the explicit-content flag.
plot = (
    ggplot(df)
    + geom_point(
        aes(x='release_date', y='popularity', color='is_explicit'),
        alpha=0.8,
    )
    + ggtitle('Popularity and Release Date with Explicit Content')
    + ylab('Popularity')
    + xlab('Date of Release')
    + facet_wrap('is_explicit')
    + scale_x_datetime()
    + scale_color_manual(
        values=['black', 'green'],
        name='Explicit Content',
        labels=['No', 'Yes'],
    )
)

plot.show()