In [None]:
# Clean and Transform Spotify data
import os
from dotenv import load_dotenv
import spotipy
from spotipy.oauth2 import SpotifyClientCredentials
import pandas as pd
import numpy as np
import json
import time

SpotifyExcelFile = 'SpotifyPlaylistData_20221226.xlsx'

def cleanPlaylistData(sheetName):
  """Read one playlist sheet from the Spotify workbook and return a tidy frame.

  Args:
    sheetName: Name of the worksheet inside ``SpotifyExcelFile`` to load.

  Returns:
    A DataFrame holding only the URI, name, release date, explicit flag and
    popularity columns (in that order) with a fresh 0..n-1 index.

  Raises:
    KeyError: If the sheet lacks 'Unnamed: 0', 'duration_ms' or any of the
      columns that are kept.
  """
  keep_columns = [
    'track_uri', 'artist_uri', 'album_uri',
    'track_name', 'artist_name', 'album_name',
    'release_date', 'explicit', 'popularity',
  ]

  raw_sheet = pd.read_excel(SpotifyExcelFile, sheet_name=sheetName)

  # The explicit drop runs before the selection so that a sheet missing either
  # of these two columns still fails loudly.
  without_extras = raw_sheet.drop(columns=['Unnamed: 0', 'duration_ms'])

  return without_extras.loc[:, keep_columns].reset_index(drop=True)

def cleanArtistData(sheetName):
  """Read one artist sheet from the Spotify workbook.

  Args:
    sheetName: Name of the worksheet inside ``SpotifyExcelFile`` to load.

  Returns:
    The sheet as a DataFrame without the 'Unnamed: 0' column and with a
    fresh 0..n-1 index.

  Raises:
    KeyError: If the sheet has no 'Unnamed: 0' column.
  """
  artist_sheet = pd.read_excel(SpotifyExcelFile, sheet_name=sheetName)
  return artist_sheet.drop(columns=['Unnamed: 0']).reset_index(drop=True)

def cleanAlbumData(sheetName):
  """Read one album sheet from the Spotify workbook and return a tidy frame.

  Args:
    sheetName: Name of the worksheet inside ``SpotifyExcelFile`` to load.

  Returns:
    A DataFrame with the album URI, names, release date, label, track count
    and popularity columns (in that order) and a fresh 0..n-1 index.

  Raises:
    KeyError: If the sheet lacks 'Unnamed: 0', 'artist_uri' or any of the
      columns that are kept.
  """
  keep_columns = [
    'album_uri', 'album_name', 'artist_name',
    'release_date', 'label', 'total_tracks', 'popularity',
  ]

  album_sheet = pd.read_excel(SpotifyExcelFile, sheet_name=sheetName)
  # Dropped explicitly (not just left out of the selection) so that a sheet
  # without these columns is reported as an error.
  without_extras = album_sheet.drop(columns=['Unnamed: 0', 'artist_uri'])

  return without_extras.loc[:, keep_columns].reset_index(drop=True)

def cleanTrackFeaturesData(sheetName):
  """Read one track-features sheet from the Spotify workbook.

  Args:
    sheetName: Name of the worksheet inside ``SpotifyExcelFile`` to load.

  Returns:
    The sheet as a DataFrame without the 'Unnamed: 0' column, with the
    'uri' column renamed to 'track_uri', and with a fresh 0..n-1 index.

  Raises:
    KeyError: If the sheet has no 'Unnamed: 0' column.
  """
  features_sheet = pd.read_excel(SpotifyExcelFile, sheet_name=sheetName)
  return (
    features_sheet
    .drop(columns=['Unnamed: 0'])
    .rename(columns={'uri': 'track_uri'})
    .reset_index(drop=True)
  )

# Build the cleaned Spotify frames. Each statement loads the HH sheet first and
# the CHH sheet second for one kind of data.
HHPlaylistSpotDF, CHHPlaylistSpotDF = (
  cleanPlaylistData(sheet) for sheet in ('HHPlaylist', 'CHHPlaylist')
)

HHArtistsSpotDF, CHHArtistsSpotDF = (
  cleanArtistData(sheet) for sheet in ('HHArtists', 'CHHArtists')
)

HHAlbumsSpotDF, CHHAlbumsSpotDF = (
  cleanAlbumData(sheet) for sheet in ('HHAlbums', 'CHHAlbums')
)

HHTrackFeaturesSpotDF, CHHTrackFeaturesSpotDF = (
  cleanTrackFeaturesData(sheet) for sheet in ('HHTrackFeatures', 'CHHTrackFeatures')
)

# TODO: join the playlist and track-features frames on 'track_uri'
# (earlier draft used set_index('track_uri').join(..., lsuffix="_PL", rsuffix="_FT")).

# Preview of the cleaned HH playlist; swap in one of the other frames
# (HHArtistsSpotDF, HHAlbumsSpotDF, HHTrackFeaturesSpotDF) to inspect it instead.
HHPlaylistSpotDF.head()

# TODO: the 'explicit' column still needs converting to a SQL boolean.

In [None]:
# Clean and Transform LastFM data
import os
from dotenv import load_dotenv
import spotipy
from spotipy.oauth2 import SpotifyClientCredentials
import pandas as pd
import numpy as np
import json
import time

LastFMExcelFile = 'LastFMData_20221226.xlsx'

def cleanLastFMTrackData(sheetName):
  """Read one track sheet from the LastFM workbook and normalise missing ids.

  Args:
    sheetName: Name of the worksheet inside ``LastFMExcelFile`` to load.

  Returns:
    The sheet as a DataFrame without the 'Unnamed: 0' column, with every
    missing 'artist_mbid' stored as Python ``None`` (object dtype), and with
    a fresh 0..n-1 index.

  Raises:
    KeyError: If the sheet lacks 'Unnamed: 0' or 'artist_mbid'.
  """
  df = pd.read_excel(LastFMExcelFile, sheet_name=sheetName)
  df = df.drop(columns=['Unnamed: 0'])

  # Do not use Series.replace(np.nan, None): older pandas releases read
  # value=None as "no value given" and forward-fill the previous id instead.
  # Casting to object first guarantees None is kept rather than coerced to NaN.
  mbid = df['artist_mbid'].astype(object)
  df = df.assign(artist_mbid=mbid.where(mbid.notna(), None))

  return df.reset_index(drop=True)

def cleanLastFMArtistData(sheetName):
  """Read one artist sheet from the LastFM workbook, one row per artist name.

  Args:
    sheetName: Name of the worksheet inside ``LastFMExcelFile`` to load.

  Returns:
    The sheet as a DataFrame without the 'Unnamed: 0' column, keeping only
    the first row for each 'artist_name', with every missing 'artist_mbid'
    stored as Python ``None`` (object dtype), and with a fresh 0..n-1 index.

  Raises:
    KeyError: If the sheet lacks 'Unnamed: 0', 'artist_name' or
      'artist_mbid'.
  """
  df = pd.read_excel(LastFMExcelFile, sheet_name=sheetName)
  df = df.drop(columns=['Unnamed: 0'])
  df = df.drop_duplicates(['artist_name'])

  # Do not use Series.replace(np.nan, None): older pandas releases read
  # value=None as "no value given" and forward-fill the previous id instead.
  # assign() also avoids writing into the frame returned by drop_duplicates,
  # which can trigger SettingWithCopyWarning.
  mbid = df['artist_mbid'].astype(object)
  df = df.assign(artist_mbid=mbid.where(mbid.notna(), None))

  return df.reset_index(drop=True)

def cleanLastFMAlbumData(sheetName):
  """Read one album sheet from the LastFM workbook, one row per album.

  Args:
    sheetName: Name of the worksheet inside ``LastFMExcelFile`` to load.

  Returns:
    The sheet as a DataFrame without the 'Unnamed: 0' column, keeping only
    the first row for each ('album_mbid', 'album_name') pair, with every
    missing 'album_mbid' stored as Python ``None`` (object dtype), and with
    a fresh 0..n-1 index.

  Raises:
    KeyError: If the sheet lacks 'Unnamed: 0', 'album_mbid' or 'album_name'.
  """
  df = pd.read_excel(LastFMExcelFile, sheet_name=sheetName)
  df = df.drop(columns=['Unnamed: 0'])
  df = df.drop_duplicates(subset=['album_mbid', 'album_name'])

  # Do not use Series.replace(np.nan, None): older pandas releases read
  # value=None as "no value given" and forward-fill the previous id instead.
  # assign() also avoids writing into the frame returned by drop_duplicates,
  # which can trigger SettingWithCopyWarning.
  mbid = df['album_mbid'].astype(object)
  df = df.assign(album_mbid=mbid.where(mbid.notna(), None))

  return df.reset_index(drop=True)

# Build the cleaned LastFM frames. Each statement loads the HH sheet first and
# the CHH sheet second for one kind of data (albums, then tracks, then artists).
HHAlbumLastDF, CHHAlbumLastDF = (
  cleanLastFMAlbumData(sheet) for sheet in ('HHAlbumData', 'CHHAlbumData')
)

HHTrackLastDF, CHHTrackLastDF = (
  cleanLastFMTrackData(sheet) for sheet in ('HHTrackData', 'CHHTrackData')
)

HHArtistLastDF, CHHArtistLastDF = (
  cleanLastFMArtistData(sheet) for sheet in ('HHArtistData', 'CHHArtistData')
)

# To inspect a result, end the cell with e.g. HHArtistLastDF.head(),
# HHAlbumLastDF.head() or HHTrackLastDF.head().

In [None]:
# Push data to Excel

import os
from dotenv import load_dotenv
import spotipy
from spotipy.oauth2 import SpotifyClientCredentials
import pandas as pd
import json
import datetime as datetime

def openFileToDataFrame(fileName):
    """Load a JSON file and return its contents as a DataFrame.

    Args:
        fileName: Path of the JSON file to read.

    Returns:
        ``pd.DataFrame`` built from the decoded JSON document.

    Raises:
        OSError: If the file cannot be opened.
        json.JSONDecodeError: If the file does not contain valid JSON.
    """
    # Read as UTF-8 explicitly: without it open() uses the platform default
    # encoding, which can garble or reject non-ASCII text in the JSON.
    with open(fileName, 'r', encoding='utf-8') as openfile:
        records = json.load(openfile)

    return pd.DataFrame(records)

# NOTE(review): both workbooks are opened and closed without any sheet being
# written in between - the to_excel(...) calls presumably still need adding.
# The context manager closes each writer exactly as the explicit close() did.
with pd.ExcelWriter('TransformedSpotifyData.xlsx', engine='xlsxwriter') as xlxWriter:
    pass

with pd.ExcelWriter('TransformedLastFMData.xlsx', engine='xlsxwriter') as xlxWriter:
    pass

In [96]:
# Push data to SQL Server
import pyodbc
import sqlalchemy
from sqlalchemy.engine import URL
from sqlalchemy.types import SmallInteger, Text, String, DateTime, Boolean
import os
from dotenv import load_dotenv
import pandas as pd
import json

# Read connection settings from the environment (populated from .env).
load_dotenv()

LOCAL_SERVER_NAME = os.getenv('LOCAL_SERVER_NAME')
if not LOCAL_SERVER_NAME:
  # Fail with a clear message instead of trying to connect to "Server=None".
  raise RuntimeError("LOCAL_SERVER_NAME is not set; define it in the environment or the .env file")
dbName = 'SQLTutorial'

# ODBC connection string using Windows authentication (Trusted_Connection).
connection_string = ("Driver={SQL Server Native Client 11.0};"
            f"Server={LOCAL_SERVER_NAME};"
            f"Database={dbName};"
            "Trusted_Connection=yes")

connection_url = URL.create("mssql+pyodbc", query={"odbc_connect": connection_string})

engine = sqlalchemy.create_engine(connection_url)

# The context manager closes the connection even when to_sql raises.
with engine.connect() as conn:
  # if_exists='replace' drops and recreates the table on every run.
  HHPlaylistSpotDF.to_sql('HHSpotifyPlaylist',
    con=conn,
    if_exists='replace',
    index=False,
    dtype={
      "track_uri": String(50),
      "artist_uri": String(50),
      "album_uri": String(50),
      "track_name": String(100),
      "artist_name": String(100),
      "album_name": String(100),
      "release_date": DateTime(),
      # Stored as a SQL boolean rather than a small integer.
      "explicit": Boolean(),
      "popularity": SmallInteger()
    })

Python-dotenv could not parse statement starting at line 3
Python-dotenv could not parse statement starting at line 8


In [None]:
# Read data from SQL Server
import pyodbc
import sqlalchemy
from sqlalchemy.engine import URL
import os
from dotenv import load_dotenv
import pandas as pd
import json
import datetime as datetime

# Read connection settings from the environment (populated from .env).
load_dotenv()

LOCAL_SERVER_NAME = os.getenv('LOCAL_SERVER_NAME')
if not LOCAL_SERVER_NAME:
  # Fail with a clear message instead of trying to connect to "Server=None".
  raise RuntimeError("LOCAL_SERVER_NAME is not set; define it in the environment or the .env file")
dbName = 'SQLTutorial'

# ODBC connection string using Windows authentication (Trusted_Connection).
connection_string = ("Driver={SQL Server Native Client 11.0};"
            f"Server={LOCAL_SERVER_NAME};"
            f"Database={dbName};"
            "Trusted_Connection=yes")

connection_url = URL.create("mssql+pyodbc", query={"odbc_connect": connection_string})

engine = sqlalchemy.create_engine(connection_url)

# The context manager closes the connection even when the read raises.
# To list the available tables: sqlalchemy.inspect(engine).get_table_names()
with engine.connect() as conn:
  data = pd.read_sql_table('HHSpotifyPlaylist', conn)

# Trailing expression so the notebook renders the preview as a rich table.
data.head()