# Exploratory Data Analysis
In this notebook, I explore the dataset and draw insights from it using curiosity-driven questions.

In [1]:
# Import necessary libraries

import pandas as pd
import sqlite3
from lets_plot import *

# Load the lets-plot front end so plots render as HTML in the notebook.
LetsPlot.setup_html()
# Register the SQL extension that provides the %sql / %%sql magics used below.
%load_ext sql
# Set the SqlMagic `autocommit` option to True.
%config SqlMagic.autocommit=True


## Helper Functions
Because SQLite cannot store values in their preferred types (for example, dates are stored as TEXT), I define helper functions that are applied whenever a new data frame is obtained from the database.

In [2]:
# Connect to the database
%sql sqlite:///../data//clean/spotify_playlists.db --alias db

### Tracks' popularity & explicit content

In [3]:
%%sql

tab << SELECT is_explicit, popularity, release_date, title, album_name
FROM songs
LEFT JOIN song_album_map
ON songs.song_id = song_album_map.song_id
LEFT JOIN albums
ON song_album_map.album_id = albums.album_id


In [4]:
# Convert the query result captured in `tab` into a DataFrame for analysis.
df = tab.DataFrame()

In [5]:
# SQLite hands dates back as text, so parse the ISO 8601 strings into real timestamps
parsed_release_dates = pd.to_datetime(df['release_date'], format='ISO8601')
df['release_date'] = parsed_release_dates

# Store the explicit flag as a categorical column
explicit_as_category = df['is_explicit'].astype('category')
df['is_explicit'] = explicit_as_category

# Show the resulting column types
df.dtypes

is_explicit           category
popularity               int64
release_date    datetime64[ns]
title                   object
album_name              object
dtype: object

In [6]:
# Order the songs chronologically, oldest release first
df = df.sort_values(by='release_date', ascending=True)

In [7]:
# The categorical dtype produced incorrectly formatted plots, so the flag is
# converted to plain strings before plotting
explicit_as_text = df['is_explicit'].astype(str)
df['is_explicit'] = explicit_as_text

# The Story
Let's inspect how popular each playlist is.

In [34]:
%sql pop << SELECT * FROM playlists

pop = pop.DataFrame()
pop = pop.sort_values('num_followers')
p1 = ggplot(pop, aes(x='name', y='num_followers')) + geom_linerange() + coord_flip() + scale_x_log10()
p1.show()

## First plot
group by decades

In [17]:
# Scatter plot of song popularity against release date, coloured by the
# explicit-content flag.
#
# Tooltips are not an aesthetic in lets-plot, and `df` has no `name` column
# (its columns are is_explicit, popularity, release_date, title, album_name),
# so the hover text is configured with layer_tooltips() using `title` instead.
point_tooltips = (
    layer_tooltips()
    .line('@title')
    .line('album|@album_name')
    .line('popularity|@popularity')
)

plot = (
    ggplot(df)
    + geom_point(
        aes(x='release_date', y='popularity', color='is_explicit'),
        tooltips=point_tooltips,
        alpha=0.6,
    )
    + ggtitle('Songs\' popularity and Release Date with Explicit Content')
    + ylab('Popularity')
    + xlab('Date of Release')
    + scale_x_datetime()
    # NOTE(review): colours and labels are matched to the `is_explicit` values
    # by position; confirm the non-explicit value really comes first, or pin
    # the order with `breaks=`.
    + scale_color_manual(values=['black', 'green'], name='Explicit Content', labels=['No', 'Yes'])
)

plot.show()

## Second Plot
