# Overview

# Extract
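The source data consists of two CSV files stored in the `data/` folder: the Spotify top 50 tracks of 2021 and the Spotify top tracks of 2020. Each file is read into its own pandas dataframe.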

In [1]:
import pandas as pd
from sqlalchemy import create_engine, inspect
from config import db_password

In [None]:
# Relative paths to the raw Spotify CSV exports, one per chart year.
file_2020 = "data/spotify_toptracks_2020.csv"
file_2021 = "data/spotify_top50_2021.csv"

# Transform

The 2020 and 2021 Spotify top 50 tracks datasets were cleaned and combined into one database.

## Cleaning

* A year column was added to both dataframes. 
* In the 2021 dataframe, the 'id' column was renamed to 'rank' and the 'artist_name' column was renamed to 'artist'.
* In the 2020 dataframe, the 'Unnamed: 0' column was renamed to 'rank'.
* The 2020 dataframe rank column was edited so that the first place ranking is 1.

In [None]:
# Load the 2021 chart, tag every row with its chart year, and rename
# 'id' -> 'rank' and 'artist_name' -> 'artist'.
df_2021 = (
    pd.read_csv(file_2021)
    .assign(year=2021)
    .rename(columns={'id': "rank", "artist_name": "artist"})
)
df_2021.head()

In [None]:
# Load the 2020 chart, tag every row with its chart year, and rename the
# 'Unnamed: 0' column to 'rank'.
# NOTE(review): 'Unnamed: 0' is presumably the row index written out by the
# CSV export — confirm against the source file.
df_2020 = (
    pd.read_csv(file_2020)
    .assign(year=2020)
    .rename(columns={'Unnamed: 0': "rank"})
)
df_2020.head()

In [None]:
# The 2020 ranks start at 0; shift them so that first place is rank 1.
# NOTE: this cell is not idempotent — running it again shifts the ranks again.
df_2020['rank'] += 1
df_2020.head()

## Combining Cleaned Dataframes
* The 2020 and 2021 dataframes were combined into one dataframe.

In [None]:
# song data columns:
# danceability, energy, key, loudness, speechiness, acousticness, instrumentalness, liveness, valence, tempo, duration_ms

# Both yearly frames are reduced to the same columns, in the same order,
# so they can be stacked row-wise.
combined_columns = [
    'year', 'rank', 'artist', 'track_name',
    'danceability', 'energy', 'key', 'loudness', 'speechiness',
    'acousticness', 'instrumentalness', 'liveness', 'valence',
    'tempo', 'duration_ms',
]

df_2021 = df_2021[combined_columns].copy()
df_2020 = df_2020[combined_columns].copy()

# 2021 rows come first, followed by the 2020 rows; each frame keeps its own
# row index, so index labels repeat in the combined frame.
df_combined = pd.concat([df_2021, df_2020])
df_combined.head(100)

## Creating the artist dataframe
* The unique values of the artist column in the combined dataframe (`df_combined`) were extracted into a new dataframe.
* A new column named "artist_id" was created for a primary key.

In [None]:
# One row per distinct artist, in order of first appearance in df_combined.
artists = pd.DataFrame({'artist_name': df_combined['artist'].unique()})
# Sequential surrogate key starting at 1, placed as the first column.
artists.insert(loc=0, column='artist_id', value=range(1, len(artists) + 1))
artists.head()

## Creating the tracks dataframe
* The columns of `df_combined` that describe each track were copied into a new dataframe.
* Duplicate track_names were dropped.
* The dataframe was sorted alphabetically by track_name.
* A new column named track_id was added to use as a primary key.

In [None]:
# Build the per-track table: keep the track name, its artist and the audio
# feature columns from the combined chart data.
tracks = df_combined[['track_name','artist','danceability', 'energy', 'key', 'loudness', 
         'speechiness', 'acousticness', 'instrumentalness', 'liveness', 'valence', 'tempo', 'duration_ms']].copy()
# A track can appear in both yearly charts; keep only its first occurrence.
tracks = tracks.drop_duplicates(subset="track_name")
# Store the sorted frame. Previously the sorted result was only displayed and
# then discarded, so the track_id values assigned in the next cell did not
# follow alphabetical track_name order.
tracks = tracks.sort_values('track_name')
tracks.head(25)

In [None]:
# Add a sequential surrogate key (1..N) as the first column of the tracks table.
tracks.insert(loc=0, column='track_id', value=range(1, len(tracks) + 1))
tracks.head()

In [None]:
# Look up each track's artist_id by matching the track's artist name
# against the artists table (default inner join).
tm = pd.merge(tracks, artists, left_on="artist", right_on="artist_name")
tm.head()

In [None]:
# Final tracks table: the artist name columns are replaced by the artist_id
# foreign key, followed by the track name and its audio features.
tracks = tm.loc[:, [
    'track_id', 'artist_id', 'track_name',
    'danceability', 'energy', 'key', 'loudness', 'speechiness',
    'acousticness', 'instrumentalness', 'liveness', 'valence',
    'tempo', 'duration_ms',
]].copy()
tracks.head()

## Creating the chart dataframe
* The "year", "rank", "artist", and "track_name" columns from `df_combined` were copied into a new dataframe.


In [None]:
# Start the chart table from the identifying columns of every charted row.
chart = df_combined.loc[:, ['year', 'rank', 'artist', 'track_name']]
chart.head()

In [None]:
# Attach track data (matched on track name) and then artist data (matched on
# artist name) to every chart row. Both `tracks` and `artists` carry an
# artist_id column, so the second merge suffixes them as artist_id_x
# (from tracks) and artist_id_y (from artists).
cm = (
    chart
    .merge(tracks, on='track_name')
    .merge(artists, left_on='artist', right_on='artist_name')
)
cm.head()

In [None]:
# Reduce the merged frame to the chart table's columns. 'artist_id_y' is the
# artist_id that came from the `artists` table; restore its plain name.
chart = (
    cm[['year', 'rank', 'track_id', 'artist_id_y']]
    .rename(columns={'artist_id_y': 'artist_id'})
)
chart.head()

In [None]:
# Display the chart ordered by year and then rank; the sorted result is only
# shown, not stored back into `chart`.
chart.sort_values(by=['year', 'rank'])

# Load
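The `artists`, `tracks`, and `chart` dataframes were appended to tables of the same names in the `spotify_top50` PostgreSQL database, using a SQLAlchemy engine and `DataFrame.to_sql`.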

In [2]:
# Connection to database
# Change username, password, host to match your config. Password should be imported from config.py
from urllib.parse import quote

protocol = 'postgresql'
username = 'postgres'
password = db_password
host = 'localhost'
port = 5432
database_name = 'spotify_top50'
# Percent-encode the password before placing it in the URL: characters such
# as '@', ':' or '/' would otherwise be read as URL delimiters and the
# connection string would be parsed incorrectly. Passwords made only of
# letters, digits and '_.-~' are unchanged by the encoding.
encoded_password = quote(str(password), safe='')
rds_connection_string = f'{protocol}://{username}:{encoded_password}@{host}:{port}/{database_name}'
engine = create_engine(rds_connection_string)

In [3]:
# Inspect the connected database and list the tables it currently contains.
inspector = inspect(engine)
existing_tables = inspector.get_table_names()
existing_tables

['test']

In [None]:
# Load dataframes into database
artists.to_sql(name='artists', con=engine, if_exists='append', index=False)
tracks.to_sql(name='tracks', con=engine, if_exists='append', index=False)
chart.to_sql(name='chart', con=engine, if_exists='append', index=False)

In [None]:
# Read the artists table back to confirm the load succeeded.
# The original query ('select * artists') was missing the FROM keyword and
# failed with a SQL syntax error.
pd.read_sql_query('SELECT * FROM artists', con=engine).head()