In [1]:
# Clean and Transform Spotify data
import os
from dotenv import load_dotenv
import spotipy
from spotipy.oauth2 import SpotifyClientCredentials
import pandas as pd
import numpy as np
import json
import time

# Relative path of the Spotify export workbook; every clean*Data function in this cell reads its sheets from here.
SpotifyExcelFile = '../Spotify/SpotifyPlaylistData_20230102.xlsx'

def cleanPlaylistData(sheetName):
  """Read one playlist sheet of the Spotify workbook and return its track columns in a fixed order.

  sheetName -- name of the worksheet inside SpotifyExcelFile to load.

  Returns a DataFrame holding only the columns listed in keptColumns, with a
  fresh 0..n-1 index. Raises KeyError if the sheet lacks 'Unnamed: 0',
  'duration_ms' or any of the kept columns.
  """
  keptColumns = [
    'track_uri', 'artist_uri', 'album_uri',
    'track_name', 'artist_name', 'album_name',
    'release_date', 'explicit', 'popularity',
  ]
  rawSheet = pd.read_excel(SpotifyExcelFile, sheet_name=sheetName)

  return (
    rawSheet
    # 'Unnamed: 0' is presumably the index column saved by an earlier export -- TODO confirm.
    .drop(columns=['Unnamed: 0', 'duration_ms'])
    .loc[:, keptColumns]
    .reset_index(drop=True)
  )

def cleanArtistData(sheetName):
  """Read one artist sheet of the Spotify workbook without its 'Unnamed: 0' column.

  sheetName -- name of the worksheet inside SpotifyExcelFile to load.

  Returns the remaining columns unchanged, with a fresh 0..n-1 index.
  """
  rawSheet = pd.read_excel(SpotifyExcelFile, sheet_name=sheetName)
  withoutExportIndex = rawSheet.drop(columns=['Unnamed: 0'])
  return withoutExportIndex.reset_index(drop=True)

def cleanAlbumData(sheetName):
  """Read one album sheet of the Spotify workbook and return its album columns in a fixed order.

  sheetName -- name of the worksheet inside SpotifyExcelFile to load.

  Returns a DataFrame holding only the columns listed in keptColumns, with a
  fresh 0..n-1 index. Raises KeyError if the sheet lacks 'Unnamed: 0',
  'artist_uri' or any of the kept columns.
  """
  keptColumns = [
    'album_uri', 'album_name', 'artist_name', 'release_date',
    'label', 'total_tracks', 'popularity',
  ]
  rawSheet = pd.read_excel(SpotifyExcelFile, sheet_name=sheetName)

  return (
    rawSheet
    .drop(columns=['Unnamed: 0', 'artist_uri'])
    .loc[:, keptColumns]
    .reset_index(drop=True)
  )

def cleanTrackFeaturesData(sheetName):
  """Read one track-features sheet of the Spotify workbook and expose its 'uri' column as 'track_uri'.

  sheetName -- name of the worksheet inside SpotifyExcelFile to load.

  Returns the sheet without 'Unnamed: 0', with 'uri' renamed to 'track_uri'
  and a fresh 0..n-1 index.
  """
  rawSheet = pd.read_excel(SpotifyExcelFile, sheet_name=sheetName)

  return (
    rawSheet
    .drop(columns=['Unnamed: 0'])
    .rename(columns={'uri': 'track_uri'})
    .reset_index(drop=True)
  )


def mergeDataFrames(HHDF, CHHDF):
  """Stack HHDF on top of CHHDF and renumber the rows 0..n-1.

  Returns a new DataFrame; neither input is modified.
  """
  return pd.concat([HHDF, CHHDF], ignore_index=True)

# Playlists: load both genre sheets, tag each with its genre (inserted as the
# eighth column, position 7), then stack them.
HHPlaylistSpotDF, CHHPlaylistSpotDF = (
  cleanPlaylistData(sheet) for sheet in ('HHPlaylist', 'CHHPlaylist')
)
HHPlaylistSpotDF.insert(loc=7, column='genre', value='Hip-Hop')
CHHPlaylistSpotDF.insert(loc=7, column='genre', value='Christian Hip-Hop')
PlaylistSpotDF = mergeDataFrames(HHPlaylistSpotDF, CHHPlaylistSpotDF)

# Artists
HHArtistsSpotDF, CHHArtistsSpotDF = (
  cleanArtistData(sheet) for sheet in ('HHArtists', 'CHHArtists')
)
ArtistsSpotDF = mergeDataFrames(HHArtistsSpotDF, CHHArtistsSpotDF)

# Albums
HHAlbumsSpotDF, CHHAlbumsSpotDF = (
  cleanAlbumData(sheet) for sheet in ('HHAlbums', 'CHHAlbums')
)
AlbumsSpotDF = mergeDataFrames(HHAlbumsSpotDF, CHHAlbumsSpotDF)

# Track audio features
HHTrackFeaturesSpotDF, CHHTrackFeaturesSpotDF = (
  cleanTrackFeaturesData(sheet) for sheet in ('HHTrackFeatures', 'CHHTrackFeatures')
)
TrackFeaturesSpotDF = mergeDataFrames(HHTrackFeaturesSpotDF, CHHTrackFeaturesSpotDF)

# Display the combined album table as the cell output.
AlbumsSpotDF


Unnamed: 0,album_uri,album_name,artist_name,release_date,label,total_tracks,popularity
0,spotify:album:7txGsnDSqVMoRl6RQ9XyZP,HEROES & VILLAINS,Metro Boomin,2022-12-02,Republic Records,15,93
1,spotify:album:4lXLULUDQvqV1yHtLW5fyL,Lil Durk Presents: Loyal Bros 2,Only The Family,2022-12-16,Only The Family / EMPIRE,23,71
2,spotify:album:5MS3MvWHJ3lOZPLiMxzOU6,Her Loss,Drake,2022-11-04,OVO / Republic Records,16,93
3,spotify:album:0TDPqwRP3NuPYnxm45BqPg,My All,Polo G,2022-12-09,Columbia,1,68
4,spotify:album:1POWgdYTzfFt9rhKlXFwsU,Pink Heartz,SoFaygo,2022-11-11,Cactus Jack,16,66
...,...,...,...,...,...,...,...
92,spotify:album:6i0d3hXZYlPkDxSg9THYGe,Glorious Summxr,GLO,2022-09-02,GLO,16,19
93,spotify:album:4OFtsW0U298t0iyKTjKp8I,Too Soon,Ruslan,2022-12-16,Kings Dream Ent,1,28
94,spotify:album:7xQ4UFCpC2Y1PrfnDcZWSL,DAMAGE,350,2022-12-16,Searching for Soulace,1,23
95,spotify:album:43qzlGcPYdkTtIUm9j6aKF,Time,A.I. The Anomaly,2021-11-26,977692 Records DK2,1,4


In [2]:
# Clean and Transform LastFM data
import os
from dotenv import load_dotenv
import spotipy
from spotipy.oauth2 import SpotifyClientCredentials
import pandas as pd
import numpy as np
import json
import time

# Relative path of the LastFM export workbook; every cleanLastFM*Data function in this cell reads its sheets from here.
LastFMExcelFile = '../LastFM/LastFMData_20230102.xlsx'

def cleanLastFMTrackData(sheetName):
  """Read one track sheet of the LastFM workbook and blank out missing artist MBIDs.

  sheetName -- name of the worksheet inside LastFMExcelFile to load.

  Returns the sheet without 'Unnamed: 0', with NaN entries of 'artist_mbid'
  passed through replace(np.nan, None), and a fresh 0..n-1 index.
  """
  rawSheet = pd.read_excel(LastFMExcelFile, sheet_name=sheetName)

  return (
    rawSheet
    .drop(columns=['Unnamed: 0'])
    .assign(artist_mbid=lambda frame: frame['artist_mbid'].replace(np.nan, None))
    .reset_index(drop=True)
  )

def cleanLastFMArtistData(sheetName):
  """Read one artist sheet of the LastFM workbook, keeping the first row per artist name.

  sheetName -- name of the worksheet inside LastFMExcelFile to load.

  Returns the sheet without 'Unnamed: 0', de-duplicated on 'artist_name',
  with NaN entries of 'artist_mbid' passed through replace(np.nan, None),
  and a fresh 0..n-1 index.
  """
  rawSheet = pd.read_excel(LastFMExcelFile, sheet_name=sheetName)

  return (
    rawSheet
    .drop(columns=['Unnamed: 0'])
    .drop_duplicates(['artist_name'])
    .assign(artist_mbid=lambda frame: frame['artist_mbid'].replace(np.nan, None))
    .reset_index(drop=True)
  )

def cleanLastFMAlbumData(sheetName):
  """Read one album sheet of the LastFM workbook, keeping the first row per (MBID, name) pair.

  sheetName -- name of the worksheet inside LastFMExcelFile to load.

  Returns the sheet without 'Unnamed: 0', de-duplicated on the combination of
  'album_mbid' and 'album_name', with NaN entries of 'album_mbid' passed
  through replace(np.nan, None), and a fresh 0..n-1 index.
  """
  rawSheet = pd.read_excel(LastFMExcelFile, sheet_name=sheetName)

  return (
    rawSheet
    .drop(columns=['Unnamed: 0'])
    .drop_duplicates(subset=['album_mbid', 'album_name'])
    .assign(album_mbid=lambda frame: frame['album_mbid'].replace(np.nan, None))
    .reset_index(drop=True)
  )

def mergeDataFrames(HHDF, CHHDF):
  """Concatenate the two frames row-wise (HHDF first) under a new 0..n-1 index."""
  framesInOrder = [HHDF, CHHDF]
  return pd.concat(framesInOrder, ignore_index=True)

# Albums
HHAlbumLastDF, CHHAlbumLastDF = (
  cleanLastFMAlbumData(sheet) for sheet in ('HHAlbumData', 'CHHAlbumData')
)
AlbumsLastDF = mergeDataFrames(HHAlbumLastDF, CHHAlbumLastDF)

# Tracks
HHTrackLastDF, CHHTrackLastDF = (
  cleanLastFMTrackData(sheet) for sheet in ('HHTrackData', 'CHHTrackData')
)
TrackLastDF = mergeDataFrames(HHTrackLastDF, CHHTrackLastDF)

# Artists
HHArtistLastDF, CHHArtistLastDF = (
  cleanLastFMArtistData(sheet) for sheet in ('HHArtistData', 'CHHArtistData')
)
ArtistLastDF = mergeDataFrames(HHArtistLastDF, CHHArtistLastDF)

# Display the combined album table as the cell output.
AlbumsLastDF

Unnamed: 0,album_mbid,album_name,listeners,playcount
0,0ae489aa-064f-4400-8879-e66cd2743329,Heroes & Villains,368246,16373898
1,,Lil Durk Presents: Loyal Bros 2,423,2089
2,,Her Loss,496454,19636371
3,,My All,30086,160436
4,5e33f6f4-9e17-4276-a874-49d169c305e0,PINK HEARTZ,46539,878294
...,...,...,...,...
94,,Glorious Summxr,17,65
95,,Too Soon,144,475
96,,DAMAGE,100,498
97,,Time,16,46


In [3]:
# Join Spotify and LastFM data

def _warnIfRowCountsDiffer(label, spotifyDF, lastFMDF):
  """Print a warning when a positional join would pair frames of different lengths."""
  if len(spotifyDF) != len(lastFMDF):
    print(f'WARNING: {label}: Spotify has {len(spotifyDF)} rows but LastFM has {len(lastFMDF)}; '
          'the positional join may attach LastFM stats to the wrong rows.')


def _albumKey(albumNames):
  """Return a trimmed, case-folded copy of album names so 'HEROES & VILLAINS' matches 'Heroes & Villains'."""
  return albumNames.astype(str).str.strip().str.casefold()


# Playlist: DataFrame.join pairs rows by index, i.e. by position here. This assumes
# TrackLastDF holds one row per playlist track in the same order -- verify against the
# LastFM export; the guard below reports the case where the lengths disagree.
_warnIfRowCountsDiffer('Playlist', PlaylistSpotDF, TrackLastDF)
joinedPlaylisttDF = (
  PlaylistSpotDF
  .join(TrackLastDF, lsuffix='_Spotify', rsuffix='_LastFM')
  .drop(columns=['artist_mbid', 'artist_name_LastFM', 'track_name_LastFM', 'album_name_LastFM'])
  .rename(columns={
    'track_name_Spotify': 'track_name',
    'artist_name_Spotify': 'artist_name',
    'album_name_Spotify': 'album_name',
    'listeners': 'listeners_LastFM',
    'playcount': 'playcount_LastFM',
  })
)

# Artists: same positional pairing and the same caveat as the playlist join.
_warnIfRowCountsDiffer('Artists', ArtistsSpotDF, ArtistLastDF)
joinedArtistsDF = (
  ArtistsSpotDF
  .join(ArtistLastDF, lsuffix='_Spotify', rsuffix='_LastFM')
  .drop(columns=['artist_mbid', 'artist_name_LastFM'])
  .rename(columns={
    'artist_name_Spotify': 'artist_name',
    'listeners': 'listeners_LastFM',
    'playcount': 'playcount_LastFM',
  })
)

# Albums: the two album tables do not have the same number of rows (the LastFM side is
# de-duplicated on MBID + name and ends up longer), so a positional join shifts the LastFM
# stats onto the wrong albums. Match on the album name instead, ignoring case and
# surrounding whitespace. LastFM rows sharing a name collapse to the first one so the
# left merge can never multiply Spotify rows; Spotify albums with no LastFM match get NaN.
albumStatsLastDF = (
  AlbumsLastDF
  .assign(_album_key=_albumKey(AlbumsLastDF['album_name']))
  .drop_duplicates(subset='_album_key')
  .loc[:, ['_album_key', 'listeners', 'playcount']]
  .rename(columns={'listeners': 'listeners_LastFM', 'playcount': 'playcount_LastFM'})
)
joinedAlbumsDF = (
  AlbumsSpotDF
  .assign(_album_key=_albumKey(AlbumsSpotDF['album_name']))
  .merge(albumStatsLastDF, on='_album_key', how='left')
  .drop(columns='_album_key')
)

joinedPlaylisttDF

Unnamed: 0,track_uri,artist_uri,album_uri,track_name,artist_name,album_name,release_date,genre,explicit,popularity,listeners_LastFM,playcount_LastFM
0,spotify:track:4WuOWVnAqvEQxgSRrspBgt,spotify:artist:0iEtIxbK0KxaSlF7G42ZOp,spotify:album:7txGsnDSqVMoRl6RQ9XyZP,Niagara Falls (Foot or 2) [with Travis Scott &...,Metro Boomin,HEROES & VILLAINS,2022-12-02,Hip-Hop,True,87,174102,1300460
1,spotify:track:2i2qDe3dnTl6maUE31FO7c,spotify:artist:3hcs9uc56yIGFCSy9leWe7,spotify:album:4lXLULUDQvqV1yHtLW5fyL,Mad Max,Lil Durk,Lil Durk Presents: Loyal Bros 2,2022-12-16,Hip-Hop,True,77,26128,79681
2,spotify:track:1bDbXMyjaUIooNwFE9wn0N,spotify:artist:3TVXtAsR1Inumwj472S9r4,spotify:album:5MS3MvWHJ3lOZPLiMxzOU6,Rich Flex,Drake,Her Loss,2022-11-04,Hip-Hop,True,96,408685,3159582
3,spotify:track:0vjeOZ3Ft5jvAi9SBFJm1j,spotify:artist:0iEtIxbK0KxaSlF7G42ZOp,spotify:album:7txGsnDSqVMoRl6RQ9XyZP,Superhero (Heroes & Villains) [with Future & C...,Metro Boomin,HEROES & VILLAINS,2022-12-02,Hip-Hop,True,89,211446,1811015
4,spotify:track:2UQ2oUbHiB8wWYCE42JX0k,spotify:artist:6AgTAQt8XS6jRWi4sX7w49,spotify:album:0TDPqwRP3NuPYnxm45BqPg,My All,Polo G,My All,2022-12-09,Hip-Hop,True,80,30383,170947
...,...,...,...,...,...,...,...,...,...,...,...,...
99,spotify:track:4r5NvWCX4QMlm8uUy0BA9V,spotify:artist:5MePjC4n30qT9Mn68OwmuO,spotify:album:6i0d3hXZYlPkDxSg9THYGe,Going,GLO,Glorious Summxr,2022-09-02,Christian Hip-Hop,False,11,2,2
100,spotify:track:6AdmmhqGzdfIPHZRafz0a7,spotify:artist:2GEXrCflKZ5S5ZHBM4LNcV,spotify:album:4OFtsW0U298t0iyKTjKp8I,Too Soon,Ruslan,Too Soon,2022-12-16,Christian Hip-Hop,False,39,144,475
101,spotify:track:4SzujBKu1UJBtbmHvjZ0ex,spotify:artist:7tLS8BRv5KP3jPwm3KdsPl,spotify:album:7xQ4UFCpC2Y1PrfnDcZWSL,DAMAGE,350,DAMAGE,2022-12-16,Christian Hip-Hop,False,35,100,498
102,spotify:track:6WcXtbpCRf5InuLiYN4V4e,spotify:artist:3PoVfuLf8nvX4HLntiLTUa,spotify:album:43qzlGcPYdkTtIUm9j6aKF,Time,A.I. The Anomaly,Time,2021-11-26,Christian Hip-Hop,False,11,19,49


In [4]:
# Push data to excel

import os
from dotenv import load_dotenv
import spotipy
from spotipy.oauth2 import SpotifyClientCredentials
import pandas as pd
import json
import datetime as datetime

# Write each transformed frame to its own sheet of TransformedMusicData.xlsx.
# The with-block closes the writer when it exits, including when one of the
# to_excel calls raises, so the file handle is never left open.
with pd.ExcelWriter('TransformedMusicData.xlsx', engine='xlsxwriter') as xlxWriter:
  joinedPlaylisttDF.to_excel(xlxWriter, sheet_name='Playlist')
  joinedArtistsDF.to_excel(xlxWriter, sheet_name='Artists')
  joinedAlbumsDF.to_excel(xlxWriter, sheet_name='Albums')
  TrackFeaturesSpotDF.to_excel(xlxWriter, sheet_name='TrackFeatures')

In [5]:
# Push data to SQL Server
import pyodbc
import sqlalchemy
from sqlalchemy.engine import URL
from sqlalchemy.types import SmallInteger, Text, String, DateTime, Boolean, Integer, Float
import os
from dotenv import load_dotenv
import pandas as pd
import json

# Pull settings from the local .env file into the process environment.
load_dotenv()

LOCAL_SERVER_NAME = os.getenv('LOCAL_SERVER_NAME')
dbName = 'TrackStarzMusicAnalysis'

# ODBC connection string: Windows-integrated (trusted) login to the local SQL Server instance.
connection_string = ';'.join([
  'Driver={SQL Server Native Client 11.0}',
  f'Server={LOCAL_SERVER_NAME}',
  f'Database={dbName}',
  'Trusted_Connection=yes',
])

# Hand the raw ODBC string to SQLAlchemy through the pyodbc dialect and open
# the connection shared by the rest of this cell.
connection_url = URL.create('mssql+pyodbc', query={'odbc_connect': connection_string})
engine = sqlalchemy.create_engine(connection_url)
conn = engine.connect()

def createSQLTable(dataFrame, tableName, dataTypes):
  """Write dataFrame to the SQL table tableName over the module-level connection `conn`.

  dataFrame -- frame whose rows are written.
  tableName -- target table; an existing table of that name is replaced.
  dataTypes -- mapping of column name to SQLAlchemy type, passed to to_sql as `dtype`.

  The DataFrame index is not written. Returns None.
  """
  writeOptions = {
    'con': conn,
    'if_exists': 'replace',
    'index': False,
    'dtype': dataTypes,
  }
  dataFrame.to_sql(tableName, **writeOptions)

# Load every frame into SQL Server, then add the keys that link the tables.
#
# Raw SQL goes through sqlalchemy.text(): SQLAlchemy 2.x no longer accepts plain strings
# in Connection.execute(), and text() is accepted by 1.x as well.
def _runSQL(statement):
  """Execute one raw SQL statement on the shared connection `conn`."""
  conn.execute(sqlalchemy.text(statement))


def _addPrimaryKey(tableName, columnName):
  """Make columnName a NOT NULL VARCHAR(50) column and the primary key of tableName."""
  _runSQL(f'ALTER TABLE {tableName} ALTER COLUMN {columnName} VARCHAR(50) NOT NULL;')
  _runSQL(f'ALTER TABLE {tableName} ADD PRIMARY KEY ({columnName});')


def _addForeignKey(tableName, columnName, parentTable):
  """Make columnName NOT NULL and point it at the same-named key column of parentTable."""
  _runSQL(f'ALTER TABLE {tableName} ALTER COLUMN {columnName} VARCHAR(50) NOT NULL;')
  _runSQL(f'ALTER TABLE {tableName} ADD FOREIGN KEY ({columnName}) REFERENCES {parentTable}({columnName});')


try:
  # SpotifyPlaylist is replaced first: it is the table that carries the foreign keys,
  # so dropping it before the parent tables lets a re-run replace those as well.
  createSQLTable(joinedPlaylisttDF, 'SpotifyPlaylist', {
      "track_uri": String(50),
      "artist_uri": String(50),
      "album_uri": String(50),
      "track_name": String(100),
      "artist_name": String(100),
      "album_name": String(100),
      "release_date": DateTime(),
      "explicit": SmallInteger(),
      "popularity": SmallInteger(),
      "listeners_LastFM": Integer(),
      "playcount_LastFM": Integer()
  })

  createSQLTable(joinedArtistsDF, 'SpotifyArtists', {
      "artist_uri": String(50),
      "artist_name": String(100),
      "popularity": SmallInteger(),
      "followers": Integer(),
      "listeners_LastFM": Integer(),
      "playcount_LastFM": Integer()
  })
  _addPrimaryKey('SpotifyArtists', 'artist_uri')

  createSQLTable(joinedAlbumsDF, 'SpotifyAlbums', {
      "album_uri": String(50),
      "album_name": String(100),
      "artist_name": String(100),
      "release_date": DateTime(),
      "label": String(100),
      "total_tracks": SmallInteger(),
      "popularity": SmallInteger(),
      "listeners_LastFM": Integer(),
      "playcount_LastFM": Integer()
  })
  _addPrimaryKey('SpotifyAlbums', 'album_uri')

  createSQLTable(TrackFeaturesSpotDF, 'SpotifyTrackFeatures', {
      "track_uri": String(50),
      "track_name": String(100),
      "duration_ms": Integer(),
      "time_signature": SmallInteger(),
      "tempo": Float(),
      "valence": Float(),
      "liveness": Float(),
      "instrumentalness": Float(),
      "acousticness": Float(),
      "speechiness": Float(),
      "mode": SmallInteger(),
      "loudness": Float(),
      "key": SmallInteger(),
      "energy": Float(),
      "danceability": Float()
  })
  _addPrimaryKey('SpotifyTrackFeatures', 'track_uri')

  # The playlist keys come last because the foreign keys need the parent primary keys to exist.
  _addPrimaryKey('SpotifyPlaylist', 'track_uri')
  _addForeignKey('SpotifyPlaylist', 'artist_uri', 'SpotifyArtists')
  _addForeignKey('SpotifyPlaylist', 'album_uri', 'SpotifyAlbums')

  # SQLAlchemy 2.x connections do not autocommit, so without this the work is rolled back
  # on close. Legacy 1.x connections autocommit DDL and have no commit(), hence the guard.
  if hasattr(conn, 'commit'):
    conn.commit()
finally:
  # Release the connection even when a load or an ALTER statement fails.
  conn.close()

Python-dotenv could not parse statement starting at line 3
Python-dotenv could not parse statement starting at line 8


In [None]:
# Read data from SQL Server
import pyodbc
import sqlalchemy
from sqlalchemy.engine import URL
import os
from dotenv import load_dotenv
import pandas as pd
import json
import datetime as datetime

# Pull settings from the local .env file into the process environment.
load_dotenv()

LOCAL_SERVER_NAME = os.getenv('LOCAL_SERVER_NAME')
# LOCAL_USER_NAME = os.getenv('LOCAL_USER_NAME')
dbName = 'TrackStarzMusicAnalysis'

# ODBC connection string: Windows-integrated (trusted) login to the local SQL Server instance.
odbcParts = (
  'Driver={SQL Server Native Client 11.0}',
  f'Server={LOCAL_SERVER_NAME}',
  f'Database={dbName}',
  'Trusted_Connection=yes',
)
connection_string = ';'.join(odbcParts)

# Wrap the raw ODBC string for the pyodbc dialect and build the engine.
connection_url = URL.create('mssql+pyodbc', query={'odbc_connect': connection_string})
engine = sqlalchemy.create_engine(connection_url)

# Read the track-features table back as a sanity check. The with-block closes the
# connection even if the read raises.
# The table name matches the one the "Push data to SQL Server" cell creates
# ('SpotifyTrackFeatures'); no cell in this notebook creates 'HHSpotifyTrackFeatures'.
with engine.connect() as conn:
  # inspectDB = sqlalchemy.inspect(engine)
  # print(inspectDB.get_table_names())
  data = pd.read_sql_table('SpotifyTrackFeatures', conn)

data