In [226]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.cluster import KMeans
import nltk
from wordcloud import WordCloud
import re

"Libraries to help with jupyter notebook usage"
# IPython helpers: `display`/`HTML` inject the CSS below, `Image` is for
# showing images in the notebook.
from IPython.display import HTML, Image, display

# Widen the notebook container to 90% of the available width.
display(HTML("<style>.container { width:90% !important; }</style>"))

### Data Pre-processing
- Removing irrelevant audio features
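- Removing outliers: rows are kept only if every audio feature lies between its 20th and 80th percentile (i.e. the top and bottom 20% of each feature are dropped)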

In [224]:
import pandas as pd

# Read the ten CSV files (named by the ranges 0-999 through 9000-9999) and
# discard the columns that are not analysed, then stack them into one frame.
unused_columns = ['key', 'duration_ms', 'instrumentalness']
chunk_frames = [
    pd.read_csv(
        f'../data/processed_data/audio_features/audio_features-{i*1000}-{(i+1)*1000 - 1}.csv'
    ).drop(columns=unused_columns)
    for i in range(10)
]
combined_df = pd.concat(chunk_frames, ignore_index=True)

# Per-feature 20th and 80th percentiles; values outside this band are treated
# as outliers.
lower_quantile = 0.20
upper_quantile = 0.80
audio_features = combined_df.columns[1:]  # every column except the first
quantiles = combined_df[audio_features].quantile([lower_quantile, upper_quantile])

# Build one boolean row mask: a row survives only if every audio feature lies
# inside its own [lower, upper] band (bounds inclusive).
rows_to_keep = pd.Series(True, index=combined_df.index)
for feature in audio_features:
    lower_threshold = quantiles.loc[lower_quantile, feature]
    upper_threshold = quantiles.loc[upper_quantile, feature]
    feature_values = combined_df[feature]
    rows_to_keep &= (feature_values >= lower_threshold) & (feature_values <= upper_threshold)

filtered_df = combined_df[rows_to_keep]

# Data Visualization
## Scatter Plot 
Hexbin density plots with a fitted regression line, showing how each audio feature varies with every other feature

In [None]:
# Features drawn on the y-axis: every column of filtered_df except the first.
audio_features = filtered_df.columns[1:]

num_rows = 1
num_cols = len(audio_features)

# One square panel per y-axis feature, laid out in a single row.
subplot_size = 5
fig_row_size = subplot_size * num_cols  # total figure width
fig_col_size = subplot_size             # total figure height

# Features drawn on the x-axis; one figure is produced and saved per entry.
audio_features_list = ['danceability', 'energy', 'loudness', 'speechiness', 'acousticness', 'liveness', 'valence', 'tempo']

for af in audio_features_list:
    # squeeze=False makes plt.subplots always return a 2-D array of Axes, so
    # the indexing below also works when there is only a single panel.
    fig, axs = plt.subplots(
        nrows=num_rows,
        ncols=num_cols,
        figsize=(fig_row_size, fig_col_size),
        squeeze=False,
    )
    axs = axs.ravel()

    for i, feature in enumerate(audio_features):
        ax = axs[i]
        ax.hexbin(x=filtered_df[af], y=filtered_df[feature], alpha=1, gridsize=25, cmap='terrain')

        ax.set_xlabel(af)
        ax.set_ylabel(feature)

        # Overlay the least-squares straight line (degree-1 polyfit) of
        # `feature` against `af`, drawn between the min and max of `af`.
        slope, intercept = np.polyfit(filtered_df[af], filtered_df[feature], 1)
        x_values = [filtered_df[af].min(), filtered_df[af].max()]
        y_values = [slope * x + intercept for x in x_values]
        ax.plot(x_values, y_values, color='red', linewidth=2)

    fig.tight_layout()
    fig.savefig(f"../resources/audio_feature_plots/plot_{af}.png", format="png", dpi=30)
    plt.show()
    # Close the figure explicitly so figures do not accumulate across
    # iterations of the loop.
    plt.close(fig)

## Correlation Matrix
Overview of the correlation between audio features (note: correlation does not necessarily imply causation)

In [None]:
import numpy as np
import pandas as pd

# Pairwise correlations between the features named in audio_features_list.
correlation_matrix = filtered_df[audio_features_list].corr()

# Cells flagged True in the mask passed to the heatmap are hidden. Flag
# everything strictly above the diagonal, so the lower triangle and the
# diagonal are the cells that get drawn.
hidden_cells = np.triu(np.ones(correlation_matrix.shape, dtype=bool), k=1)

# Annotated heatmap, colour scale centred on zero.
fig, ax = plt.subplots(figsize=(8, 6))
sns.heatmap(
    correlation_matrix,
    mask=hidden_cells,
    annot=True,
    fmt=".2f",
    cmap='coolwarm',
    center=0,
    ax=ax,
)
ax.set_title("Correlation Matrix of Audio Features")
fig.savefig("../resources/audio_feature_plots/correlation.png", format="png", dpi=50)
plt.show()

## Genre Clustering
Clusters the genres obtained from the Spotify API using the k-means algorithm

In [225]:
from constants import global_genres as definitions

# Represent every genre definition as a TF-IDF vector.
vectorizer = TfidfVectorizer()
tfidf_matrix = vectorizer.fit_transform(definitions)

# Group similar definitions with k-means.
# - random_state is fixed so the cluster assignments (and everything derived
#   from them further down) are the same on every Restart & Run All.
# - n_init is given explicitly because its default differs between
#   scikit-learn releases.
num_clusters = 15
random_seed = 42
kmeans = KMeans(n_clusters=num_clusters, n_init=10, random_state=random_seed)
kmeans.fit(tfidf_matrix)

cluster_labels = kmeans.labels_

# Map each cluster label to the list of cleaned definitions assigned to it.
clusters = {}
for definition, label in zip(definitions, cluster_labels):
    # Drop every whitespace-separated token that starts with '[' (the opening
    # part of a bracketed annotation such as "[Cluster").
    kept_words = [word for word in definition.split() if not word.startswith('[')]
    clean_definition = ' '.join(kept_words)
    # Strip all digit runs and any leftover closing brackets.
    clean_definition = re.sub(r'\d+', '', clean_definition).replace(']', '')
    clusters.setdefault(label, []).append(clean_definition)

### Cluster Based Word Cloud
Clustering genres based on definitions

In [None]:
# Draw and save one word cloud per cluster, built from the distinct words of
# that cluster's definitions.
for label, cluster_definitions in clusters.items():
    all_definitions = ' '.join(cluster_definitions)
    word_list = all_definitions.split()
    # De-duplicate while keeping first-seen order. dict.fromkeys preserves
    # insertion order, whereas iterating a set of strings can yield a
    # different order on each interpreter run, which made the output
    # non-reproducible.
    unique_words = list(dict.fromkeys(word_list))
    if not unique_words:
        # Nothing to draw for this cluster; skip it rather than asking
        # WordCloud to render empty text.
        print(f'Skipping cluster {label}: no words left after cleaning')
        continue
    unique_definitions = ' '.join(unique_words)

    # A fixed random_state keeps the generated cloud identical across runs.
    wordcloud = WordCloud(
        width=800,
        height=400,
        background_color='white',
        random_state=42,
    ).generate(unique_definitions)

    fig = plt.figure(figsize=(10, 5))
    plt.imshow(wordcloud, interpolation='bilinear')
    plt.axis('off')
    plt.title(f'Word Cloud for Cluster {label}')
    plt.savefig(f"../resources/wordclouds/cluster_{label}.png", format="png", dpi=50)
    plt.show()
    # Close the figure explicitly so figures do not accumulate across
    # iterations of the loop.
    plt.close(fig)

## Overall Word Cloud
Word cloud based on how frequently each word appears across all genre definitions

In [None]:
# Tally how often each word occurs across the definitions of all clusters.
word_frequencies = {}
for cluster_definitions in clusters.values():
    for definition in cluster_definitions:
        for word in definition.split():
            word_frequencies[word] = word_frequencies.get(word, 0) + 1

# Build a single word cloud from the tallied frequencies.
wordcloud = WordCloud(width=800, height=400, background_color='white').generate_from_frequencies(word_frequencies)

# Render the cloud without axes and save it next to the per-cluster images.
fig, ax = plt.subplots(figsize=(10, 5))
ax.imshow(wordcloud, interpolation='bilinear')
ax.axis('off')
ax.set_title('Overall Word Cloud')
fig.savefig("../resources/wordclouds/overall.png", format="png", dpi=50)
plt.show()