In [2]:
import sqlite3
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt

# Replace 'your_database.db' with the actual path to your SQLite database file.
conn = sqlite3.connect('../data/main.db')

# Load data into pandas DataFrames
songs_df = pd.read_sql_query("SELECT * FROM songs", conn)
playlists_df = pd.read_sql_query("SELECT * FROM playlists", conn)
artists_df = pd.read_sql_query("SELECT * FROM artists", conn)

# Overview of the songs table
print(songs_df.info())
print(songs_df.describe())

# Overview of the playlists table
print(playlists_df.info())
print(playlists_df.describe())

# Overview of the artists table
print(artists_df.info())
print(artists_df.describe())

# Splitting artist_ids into separate rows
songs_exploded_df = songs_df.assign(artist_id=songs_df['artist_ids'].str.split(',')).explode('artist_id')

# Similarly, for playlists_top_artist_ids and playlist_top_genres
playlists_exploded_df = playlists_df.copy()
playlists_exploded_df['playlist_top_artist_ids'] = playlists_df['playlist_top_artist_ids'].str.split(',')
playlists_exploded_df['playlist_top_genres'] = playlists_df['playlist_top_genres'].str.split(',')

# Exploding the lists into multiple rows
playlists_exploded_df = playlists_exploded_df.explode('playlist_top_artist_ids')
playlists_exploded_df = playlists_exploded_df.explode('playlist_top_genres')

# Distribution of danceability in songs
sns.histplot(songs_df['danceability'], kde=True)
plt.title('Distribution of Danceability in Songs')
plt.show()

# Distribution of energy in playlists
sns.histplot(playlists_df['max_energy'], kde=True)
plt.title('Distribution of Maximum Energy in Playlists')
plt.show()

# Correlation matrix for songs
songs_corr = songs_df.corr()

# Plotting the heatmap
plt.figure(figsize=(12, 8))
sns.heatmap(songs_corr, annot=True, fmt=".2f", cmap='coolwarm')
plt.title('Correlation Matrix for Songs')
plt.show()

# Scatter plot between energy and danceability
sns.scatterplot(data=songs_df, x='energy', y='danceability', hue='mode')
plt.title('Energy vs Danceability in Songs')
plt.show()

# Similarly, you can explore other relationships

# Number of genres per artist
artists_df['num_genres'] = artists_df['artist_genres'].apply(lambda x: len(str(x).split(',')))
sns.histplot(artists_df['num_genres'], bins=20, kde=True)
plt.title('Number of Genres per Artist')
plt.show()

# Most common genres
all_genres = artists_df['artist_genres'].str.split(',', expand=True).stack().value_counts()
top_genres = all_genres.head(20)

# Plotting the top genres
sns.barplot(x=top_genres.values, y=top_genres.index)
plt.title('Top 20 Genres')
plt.show()


# Close the connection when done
conn.close()


OperationalError: unable to open database file