In [None]:
import pandas as pd
import json
import os
from dotenv import load_dotenv

# Load entries from a .env file (if python-dotenv finds one) into the process environment.
# NOTE(review): nothing in the visible part of this cell reads an environment
# variable afterwards — confirm this call is still needed.
load_dotenv()

# Root folder of the project; every data location below is derived from it.
project_path = '/home/me/dev/spotanalysis'

# Shared parent of the input and output folders.
_data_dir = os.path.join(project_path, 'data')

# Input folder (raw export files) and output folder (generated CSVs).
raw_data = os.path.join(_data_dir, 'raw')
processed_data = os.path.join(_data_dir, 'processed')
# Accumulates the records parsed from every matching export file.
all_streams = []

print("---Getting Data---")
# os.listdir() returns names in arbitrary order; sorting makes the load order
# (and therefore the order of records in all_streams) reproducible across runs.
for filename in sorted(os.listdir(raw_data)):
    # Only audio streaming-history JSON exports are of interest; skip anything else.
    if not (filename.startswith('Streaming_History_Audio') and filename.endswith('.json')):
        continue

    file_path = os.path.join(raw_data, filename)

    # Keep the try body limited to the parse so unrelated errors are not
    # mistaken for malformed JSON.
    try:
        with open(file_path, 'r', encoding='utf-8') as f:
            data = json.load(f)
    except json.JSONDecodeError:
        print(f"Error reading JSON from {filename}. Skipping.")
        continue

    # list.extend() accepts any iterable: a top-level JSON object would
    # silently contribute its keys instead of records, so reject non-lists.
    if not isinstance(data, list):
        print(f"Unexpected JSON structure in {filename} (expected a list). Skipping.")
        continue

    all_streams.extend(data)
    print(f"Loaded: {filename}")

# Columns this script treats as sensitive; they are removed before any
# further processing or saving.
drop_columns = [
    'ip_addr',
    'conn_country',
    'platform',
    'incognito_mode',
    'offline_timestamp',
]

# Build one DataFrame row per record collected in all_streams.
df_history = pd.DataFrame(all_streams)
print(f"\nTotal raw streams loaded: {len(df_history)}")

# errors='ignore' tolerates data in which some of these columns are absent.
df_history = df_history.drop(columns=drop_columns, errors='ignore')
print("Sensitive columns scrubbed from data in memory.")

# Fail early with an actionable message when the columns used below are
# absent (e.g. no export files were loaded, leaving df_history empty).
# Indexing a missing column would otherwise raise a bare KeyError that names
# only the column.
missing_columns = [col for col in ('ts', 'ms_played') if col not in df_history.columns]
if missing_columns:
    raise KeyError(
        f"Streaming history is missing required column(s) {missing_columns}; "
        "check that the raw data folder contains Streaming_History_Audio*.json files."
    )

# Parse the 'ts' column into pandas datetimes.
# NOTE(review): the hour/day/date derived below use whatever timezone the
# parsed timestamps carry — confirm whether local time is wanted instead.
df_history['ts'] = pd.to_datetime(df_history['ts'])

# Plays shorter than 30 s (30000 ms) are treated as skips, not streams.
# .copy() yields an independent frame so the column assignments below do not
# operate on a slice of df_history.
df_history_clean = df_history[df_history['ms_played'] >= 30000].copy()
print(f"Total streams after removing skips (<30s): {len(df_history_clean)}")

# Derive calendar features from the parsed timestamp.
df_history_clean['HourOfDay'] = df_history_clean['ts'].dt.hour
df_history_clean['DayOfWeek'] = df_history_clean['ts'].dt.day_name()
df_history_clean['Date'] = df_history_clean['ts'].dt.date

# Distinct (artist, track) pairs among the retained streams.
# NOTE(review): drop_duplicates() keeps one row for records where both names
# are missing, if any exist, so the count may include a non-track entry —
# confirm whether such rows should be dropped.
unique_tracks = (
    df_history_clean[['master_metadata_album_artist_name', 'master_metadata_track_name']]
    .drop_duplicates()
    .reset_index(drop=True)
)

# DataFrame.to_csv() does not create missing parent directories, so make sure
# the output folder exists before writing.
os.makedirs(processed_data, exist_ok=True)

df_history_clean.to_csv(os.path.join(processed_data, 'clean_history.csv'), index=False)
unique_tracks.to_csv(os.path.join(processed_data, 'unique_tracks.csv'), index=False)

print("saved clean history!")
print(f"Saved {len(unique_tracks)} unique tracks")

---Getting Data---
Loaded: Streaming_History_Audio_2021_8.json
Loaded: Streaming_History_Audio_2022_12.json
Loaded: Streaming_History_Audio_2021-2022_10.json
Loaded: Streaming_History_Audio_2023-2024_17.json
Loaded: Streaming_History_Audio_2019_1.json
Loaded: Streaming_History_Audio_2020_4.json
Loaded: Streaming_History_Audio_2020_3.json
Loaded: Streaming_History_Audio_2021_9.json
Loaded: Streaming_History_Audio_2021_6.json
Loaded: Streaming_History_Audio_2025_22.json
Loaded: Streaming_History_Audio_2024_18.json
Loaded: Streaming_History_Audio_2021_7.json
Loaded: Streaming_History_Audio_2020-2021_5.json
Loaded: Streaming_History_Audio_2024-2025_20.json
Loaded: Streaming_History_Audio_2024_19.json
Loaded: Streaming_History_Audio_2022_11.json
Loaded: Streaming_History_Audio_2023_14.json
Loaded: Streaming_History_Audio_2023_16.json
Loaded: Streaming_History_Audio_2019-2020_2.json
Loaded: Streaming_History_Audio_2023_15.json
Loaded: Streaming_History_Audio_2015-2019_0.json
Loaded: Streamin