In [None]:
import json
import os
import re
from collections import defaultdict, Counter

import pandas as pd
import spotipy
from azure.storage.blob import BlobServiceClient
from spotipy.oauth2 import SpotifyOAuth

In [None]:
# Azure Blob Storage connection settings.
# The connection string is a credential, so prefer supplying it through the
# AZURE_STORAGE_CONNECTION_STRING environment variable instead of pasting it
# into the notebook; the placeholder is only used when the variable is unset.
connection_string = os.environ.get("AZURE_STORAGE_CONNECTION_STRING", "YOUR_CONNECTION_STRING")
container_name = "CONTAINER_NAME"

# Clients reused by the download cells below.
blob_service_client = BlobServiceClient.from_connection_string(connection_string)
container_client = blob_service_client.get_container_client(container_name)

In [None]:
# Download the "music_data.csv" blob to a local file of the same name and
# load it into `df`.
blob_name = "music_data.csv"
if container_client.exists():
    blob_client = container_client.get_blob_client(blob_name)

    # Fetch the full payload BEFORE opening the local file: opening in "wb"
    # mode truncates immediately, so downloading inside the `with` block would
    # wipe an existing good copy whenever the download fails.
    blob_data = blob_client.download_blob()
    blob_bytes = blob_data.readall()

    with open(blob_name, "wb") as my_blob:
        my_blob.write(blob_bytes)

    df = pd.read_csv(blob_name)
else:
    # NOTE: `df` is not defined on this path, so later cells that use it
    # cannot run until the container exists.
    print(f"Container '{container_name}' does not exist.")

In [None]:
# Pull the "genres.json" blob into memory and parse it.
blob_name = "genres.json"
blob_client = container_client.get_blob_client(blob_name)
genres_download = blob_client.download_blob()
downloaded_content = genres_download.readall()

# The payload is decoded as UTF-8 text before being handed to the JSON parser.
genres_text = str(downloaded_content, "utf-8")
artist_genres = json.loads(genres_text)

In [None]:
# Show the loaded frame as this cell's output.
df 

In [None]:
# Number of top-level keys in the parsed genres JSON.
len(artist_genres)

In [None]:
# Column groups whose dtypes are normalised after loading.
strings_columns = ['artist_name','artist_id','first_song','first_album','first_album_type', 'last_song','last_album','last_album_type']
datetime_columns = ['first_added','last_added']

# Convert each group in one block-wise assignment instead of a per-column loop.
# NOTE(review): depending on the pandas version, `astype(str)` may turn missing
# values into the literal text "nan" — confirm that is acceptable downstream.
df[strings_columns] = df[strings_columns].astype(str)
df[datetime_columns] = df[datetime_columns].apply(pd.to_datetime)

In [None]:
# Print the frame's summary (column dtypes and non-null counts) to check the
# result of the dtype conversions.
df.info()

In [None]:
def find_most_common_words(strings_list, top_n=1000, token_pattern=r'\b\w+\b'):
    """Rank the words that occur across a collection of strings.

    The strings are joined with single spaces, lower-cased (so counting is
    case-insensitive), split into tokens with ``token_pattern`` and tallied.

    Args:
        strings_list: Iterable of ``str`` values to analyse.
        top_n: Maximum number of ``(word, count)`` pairs to return. Pass
            ``None`` to get every distinct word. Defaults to 1000.
        token_pattern: Regular expression passed to ``re.findall`` to extract
            tokens from the lower-cased text. The default keeps runs of word
            characters only, so punctuation splits a token (``"r&b"`` yields
            ``"r"`` and ``"b"``); pass e.g. ``r"[\\w&]+"`` to keep such words
            whole. The pattern should not contain capture groups, otherwise
            ``re.findall`` returns the groups instead of whole matches.

    Returns:
        A list of ``(word, count)`` tuples ordered from most to least
        frequent. An empty input yields an empty list.
    """
    # Lower-case once on the joined text so "Rock" and "rock" share a count.
    full_text = ' '.join(strings_list).lower()

    words = re.findall(token_pattern, full_text)

    return Counter(words).most_common(top_n)

In [None]:
# Use the keys of the parsed genres JSON as the list of strings to analyse,
# then rank the words those strings are made of.
user_genres = list(artist_genres)
common_user_genres_result = find_most_common_words(user_genres)

# One "word: count" line per ranked entry.
for word, count in common_user_genres_result:
    print(f"{word}: {count}")

In [None]:
# The tokenizer splits "r&b" into the fragments "r" and "b"; drop those two
# fragments and put the combined word back as a single entry.
words_to_remove = {'r', 'b'}

filtered_result = [
    (word, count)
    for word, count in common_user_genres_result
    if word not in words_to_remove
]

# Lower-case every genre once: the ranked words are lower-cased, so matching
# against the raw genre text would silently miss genres containing capitals.
lowered_genres = [(genre, genre.lower()) for genre in user_genres]

# Count "r&b" from the data instead of hard-coding a number that goes stale
# whenever the genre list changes. The lookarounds require that the word is
# not embedded in a longer run of word characters.
rnb_pattern = re.compile(rf'(?<!\w){re.escape("r&b")}(?!\w)')
rnb_count = sum(len(rnb_pattern.findall(lowered)) for _, lowered in lowered_genres)
if rnb_count:
    filtered_result.append(("r&b", rnb_count))

# Group the genres under every word they contain. Whole-word matching keeps
# the grouping consistent with how the words were tokenized: a plain substring
# test would file "trap" under "rap", or nearly every genre under a
# one-letter token.
common_user_genres_dict = defaultdict(list)
for word, count in filtered_result:
    word_pattern = re.compile(rf'(?<!\w){re.escape(word)}(?!\w)')
    for genre, lowered in lowered_genres:
        if word_pattern.search(lowered):
            common_user_genres_dict[word].append(genre)

In [None]:
# Alphabetical report: each word, how many genres were collected for it, and
# the genres themselves.
common_user_genres_keys = sorted(common_user_genres_dict)
for word in common_user_genres_keys:
    collected_genres = common_user_genres_dict[word]
    print(f"{word}: {len(collected_genres)} : {collected_genres}")

In [None]:
# Distinct calendar years present in the `first_added` column, ascending.
# Rows with a missing date (NaT) have no year: they would turn the year values
# into floats and add a NaN, which does not sort reliably. Drop them and cast
# to built-in int so the list is clean and deterministically ordered.
first_added_years = df['first_added'].dt.year.dropna().unique()
years = sorted(int(year) for year in first_added_years)
print(years)

In [None]:
# Keep only the rows whose `amount_of_liked_songs` equals 1, then show them.
has_single_like = df['amount_of_liked_songs'].eq(1)
one_song_liked = df.loc[has_single_like]
one_song_liked

In [None]:
# For each entry in `years`, print how many rows of `one_song_liked` have a
# `first_added` date in that year. The year column is extracted once up front
# instead of on every iteration.
one_song_years = one_song_liked['first_added'].dt.year
for year in years:
    print(f"{year} : {(one_song_years == year).sum()}")