In [5]:
# Analysing a dataset
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

In [6]:
import glob
import os
import json

# Collect every playlist slice file that sits directly under ./data
slice_pattern = os.path.join('data', 'mpd.slice.*.json')
json_files = glob.glob(slice_pattern)
print(f"Found {len(json_files)} JSON files in the data directory")

# Show at most five names, in plain lexicographic order, then an ellipsis
# when the listing was truncated
preview = sorted(json_files)[:5]
for position, path in enumerate(preview, start=1):
    print(f"{position}. {path}")
if len(json_files) > 5:
    print("...")

Found 1000 JSON files in the data directory
1. data\mpd.slice.0-999.json
2. data\mpd.slice.1000-1999.json
3. data\mpd.slice.10000-10999.json
4. data\mpd.slice.100000-100999.json
5. data\mpd.slice.101000-101999.json
...


In [7]:
from tqdm import tqdm

def process_json_dir_to_csv(input_dir, output_csv, chunk_size=5):
    """Concatenate the ``"playlists"`` lists of every JSON file in a directory into one CSV.

    Files are processed in lexicographic path order. The first file defines the
    CSV header; every later batch is re-aligned to exactly those columns before
    being appended, so a batch whose dict keys appear in a different order (or
    that lacks an optional key) cannot shift values under the wrong header.
    Keys that never occur in the first file are dropped; keys missing from a
    batch are written as empty cells.

    Args:
        input_dir: Directory containing the ``*.json`` files. Each file must be
            a JSON object with a top-level ``"playlists"`` list of dicts.
        output_csv: Path of the CSV to create (overwritten if it exists).
        chunk_size: Number of JSON files loaded into memory per appended batch.

    Returns:
        None. Progress and errors are reported with ``print``.

    Raises:
        Whatever ``open``/``json.load`` raise for the *first* file, because the
        header cannot be established without it. Failures in later files are
        printed and the file is skipped.
    """
    json_files = sorted(
        os.path.join(input_dir, name)
        for name in os.listdir(input_dir)
        if name.endswith('.json')
    )

    if not json_files:
        print(f"No JSON files found in directory: {input_dir}")
        return

    # The first file establishes the column layout of the whole CSV.
    with open(json_files[0], 'r', encoding='utf-8') as f:
        playlists = json.load(f)['playlists']

    df = pd.DataFrame(playlists)
    header_columns = list(df.columns)
    df.to_csv(output_csv, index=False, mode='w')
    print(f"Created CSV file and wrote header with {len(df)} rows")

    # Remaining files are loaded a few at a time to bound memory use.
    for start in tqdm(range(1, len(json_files), chunk_size), desc="Processing files"):
        chunk_playlists = []
        for path in json_files[start:start + chunk_size]:
            try:
                with open(path, 'r', encoding='utf-8') as f:
                    chunk_playlists.extend(json.load(f)['playlists'])
            except Exception as e:
                # Best effort: report the bad file and keep going.
                print(f"Error processing {path}: {str(e)}")

        if chunk_playlists:
            # DataFrame column order follows first appearance of each key, which
            # can differ from batch to batch. Rows are appended without a header,
            # so force the header's column set and order.
            chunk_df = pd.DataFrame(chunk_playlists).reindex(columns=header_columns)
            chunk_df.to_csv(output_csv, index=False, mode='a', header=False)
            print(f"Appended {len(chunk_df)} rows to CSV")

    print(f"Processing complete. Output saved to {output_csv}")

In [5]:
# Use the same relative `data` directory that the glob preview cell scans, rather than a
# machine-specific absolute path, so the notebook runs from any checkout of the project.
process_json_dir_to_csv("data", "playlists.csv")

Created CSV file and wrote header with 1000 rows


Processing files:   0%|          | 1/200 [00:02<08:12,  2.47s/it]

Appended 5000 rows to CSV


Processing files:   1%|          | 2/200 [00:04<07:58,  2.42s/it]

Appended 5000 rows to CSV


Processing files:   2%|▏         | 3/200 [00:07<07:55,  2.41s/it]

Appended 5000 rows to CSV


Processing files:   2%|▏         | 4/200 [00:09<07:51,  2.41s/it]

Appended 5000 rows to CSV


Processing files:   2%|▎         | 5/200 [00:12<07:52,  2.42s/it]

Appended 5000 rows to CSV


Processing files:   3%|▎         | 6/200 [00:14<07:48,  2.41s/it]

Appended 5000 rows to CSV


Processing files:   4%|▎         | 7/200 [00:16<07:45,  2.41s/it]

Appended 5000 rows to CSV


Processing files:   4%|▍         | 8/200 [00:19<07:40,  2.40s/it]

Appended 5000 rows to CSV


Processing files:   4%|▍         | 9/200 [00:21<07:36,  2.39s/it]

Appended 5000 rows to CSV


Processing files:   5%|▌         | 10/200 [00:24<07:35,  2.40s/it]

Appended 5000 rows to CSV


Processing files:   6%|▌         | 11/200 [00:26<07:32,  2.40s/it]

Appended 5000 rows to CSV


Processing files:   6%|▌         | 12/200 [00:28<07:29,  2.39s/it]

Appended 5000 rows to CSV


Processing files:   6%|▋         | 13/200 [00:31<07:26,  2.39s/it]

Appended 5000 rows to CSV


Processing files:   7%|▋         | 14/200 [00:33<07:22,  2.38s/it]

Appended 5000 rows to CSV


Processing files:   8%|▊         | 15/200 [00:35<07:21,  2.39s/it]

Appended 5000 rows to CSV


Processing files:   8%|▊         | 16/200 [00:38<07:20,  2.39s/it]

Appended 5000 rows to CSV


Processing files:   8%|▊         | 17/200 [00:40<07:18,  2.40s/it]

Appended 5000 rows to CSV


Processing files:   9%|▉         | 18/200 [00:43<07:15,  2.39s/it]

Appended 5000 rows to CSV


Processing files:  10%|▉         | 19/200 [00:45<07:12,  2.39s/it]

Appended 5000 rows to CSV


Processing files:  10%|█         | 20/200 [00:47<07:06,  2.37s/it]

Appended 5000 rows to CSV


Processing files:  10%|█         | 21/200 [00:50<07:02,  2.36s/it]

Appended 5000 rows to CSV


Processing files:  11%|█         | 22/200 [00:52<06:58,  2.35s/it]

Appended 5000 rows to CSV


Processing files:  12%|█▏        | 23/200 [00:54<06:53,  2.34s/it]

Appended 5000 rows to CSV


Processing files:  12%|█▏        | 24/200 [00:57<06:51,  2.34s/it]

Appended 5000 rows to CSV


Processing files:  12%|█▎        | 25/200 [00:59<06:48,  2.33s/it]

Appended 5000 rows to CSV


Processing files:  13%|█▎        | 26/200 [01:01<06:48,  2.35s/it]

Appended 5000 rows to CSV


Processing files:  14%|█▎        | 27/200 [01:04<06:48,  2.36s/it]

Appended 5000 rows to CSV


Processing files:  14%|█▍        | 28/200 [01:06<06:45,  2.36s/it]

Appended 5000 rows to CSV


Processing files:  14%|█▍        | 29/200 [01:09<06:44,  2.36s/it]

Appended 5000 rows to CSV


Processing files:  15%|█▌        | 30/200 [01:11<06:40,  2.35s/it]

Appended 5000 rows to CSV


Processing files:  16%|█▌        | 31/200 [01:13<06:39,  2.36s/it]

Appended 5000 rows to CSV


Processing files:  16%|█▌        | 32/200 [01:16<06:39,  2.38s/it]

Appended 5000 rows to CSV


Processing files:  16%|█▋        | 33/200 [01:18<06:41,  2.41s/it]

Appended 5000 rows to CSV


Processing files:  17%|█▋        | 34/200 [01:21<06:41,  2.42s/it]

Appended 5000 rows to CSV


Processing files:  18%|█▊        | 35/200 [01:23<06:39,  2.42s/it]

Appended 5000 rows to CSV


Processing files:  18%|█▊        | 36/200 [01:25<06:37,  2.43s/it]

Appended 5000 rows to CSV


Processing files:  18%|█▊        | 37/200 [01:28<06:38,  2.44s/it]

Appended 5000 rows to CSV


Processing files:  19%|█▉        | 38/200 [01:30<06:38,  2.46s/it]

Appended 5000 rows to CSV


Processing files:  20%|█▉        | 39/200 [01:33<06:33,  2.45s/it]

Appended 5000 rows to CSV


Processing files:  20%|██        | 40/200 [01:35<06:31,  2.45s/it]

Appended 5000 rows to CSV


Processing files:  20%|██        | 41/200 [01:38<06:29,  2.45s/it]

Appended 5000 rows to CSV


Processing files:  21%|██        | 42/200 [01:40<06:26,  2.45s/it]

Appended 5000 rows to CSV


Processing files:  22%|██▏       | 43/200 [01:43<06:26,  2.46s/it]

Appended 5000 rows to CSV


Processing files:  22%|██▏       | 44/200 [01:45<06:21,  2.45s/it]

Appended 5000 rows to CSV


Processing files:  22%|██▎       | 45/200 [01:48<06:20,  2.46s/it]

Appended 5000 rows to CSV


Processing files:  23%|██▎       | 46/200 [01:50<06:17,  2.45s/it]

Appended 5000 rows to CSV


Processing files:  24%|██▎       | 47/200 [01:52<06:13,  2.44s/it]

Appended 5000 rows to CSV


Processing files:  24%|██▍       | 48/200 [01:55<06:07,  2.42s/it]

Appended 5000 rows to CSV


Processing files:  24%|██▍       | 49/200 [01:57<06:08,  2.44s/it]

Appended 5000 rows to CSV


Processing files:  25%|██▌       | 50/200 [02:00<06:06,  2.45s/it]

Appended 5000 rows to CSV


Processing files:  26%|██▌       | 51/200 [02:02<06:03,  2.44s/it]

Appended 5000 rows to CSV


Processing files:  26%|██▌       | 52/200 [02:05<05:58,  2.42s/it]

Appended 5000 rows to CSV


Processing files:  26%|██▋       | 53/200 [02:07<05:58,  2.44s/it]

Appended 5000 rows to CSV


Processing files:  27%|██▋       | 54/200 [02:09<05:51,  2.41s/it]

Appended 5000 rows to CSV


Processing files:  28%|██▊       | 55/200 [02:12<05:51,  2.43s/it]

Appended 5000 rows to CSV


Processing files:  28%|██▊       | 56/200 [02:14<05:51,  2.44s/it]

Appended 5000 rows to CSV


Processing files:  28%|██▊       | 57/200 [02:17<05:47,  2.43s/it]

Appended 5000 rows to CSV


Processing files:  29%|██▉       | 58/200 [02:19<05:46,  2.44s/it]

Appended 5000 rows to CSV


Processing files:  30%|██▉       | 59/200 [02:22<05:43,  2.43s/it]

Appended 5000 rows to CSV


Processing files:  30%|███       | 60/200 [02:24<05:40,  2.44s/it]

Appended 5000 rows to CSV


Processing files:  30%|███       | 61/200 [02:26<05:39,  2.44s/it]

Appended 5000 rows to CSV


Processing files:  31%|███       | 62/200 [02:29<05:33,  2.42s/it]

Appended 5000 rows to CSV


Processing files:  32%|███▏      | 63/200 [02:31<05:28,  2.40s/it]

Appended 5000 rows to CSV


Processing files:  32%|███▏      | 64/200 [02:34<05:25,  2.40s/it]

Appended 5000 rows to CSV


Processing files:  32%|███▎      | 65/200 [02:36<05:25,  2.41s/it]

Appended 5000 rows to CSV


Processing files:  33%|███▎      | 66/200 [02:38<05:22,  2.41s/it]

Appended 5000 rows to CSV


Processing files:  34%|███▎      | 67/200 [02:41<05:21,  2.42s/it]

Appended 5000 rows to CSV


Processing files:  34%|███▍      | 68/200 [02:43<05:19,  2.42s/it]

Appended 5000 rows to CSV


Processing files:  34%|███▍      | 69/200 [02:46<05:17,  2.42s/it]

Appended 5000 rows to CSV


Processing files:  35%|███▌      | 70/200 [02:48<05:17,  2.44s/it]

Appended 5000 rows to CSV


Processing files:  36%|███▌      | 71/200 [02:51<05:15,  2.45s/it]

Appended 5000 rows to CSV


Processing files:  36%|███▌      | 72/200 [02:53<05:11,  2.43s/it]

Appended 5000 rows to CSV


Processing files:  36%|███▋      | 73/200 [02:55<05:09,  2.43s/it]

Appended 5000 rows to CSV


Processing files:  37%|███▋      | 74/200 [02:58<05:05,  2.43s/it]

Appended 5000 rows to CSV


Processing files:  38%|███▊      | 75/200 [03:00<05:00,  2.41s/it]

Appended 5000 rows to CSV


Processing files:  38%|███▊      | 76/200 [03:03<04:59,  2.41s/it]

Appended 5000 rows to CSV


Processing files:  38%|███▊      | 77/200 [03:05<04:55,  2.40s/it]

Appended 5000 rows to CSV


Processing files:  39%|███▉      | 78/200 [03:08<04:53,  2.41s/it]

Appended 5000 rows to CSV


Processing files:  40%|███▉      | 79/200 [03:10<04:49,  2.39s/it]

Appended 5000 rows to CSV


Processing files:  40%|████      | 80/200 [03:12<04:45,  2.38s/it]

Appended 5000 rows to CSV


Processing files:  40%|████      | 81/200 [03:15<04:45,  2.40s/it]

Appended 5000 rows to CSV


Processing files:  41%|████      | 82/200 [03:17<04:45,  2.42s/it]

Appended 5000 rows to CSV


Processing files:  42%|████▏     | 83/200 [03:20<04:42,  2.41s/it]

Appended 5000 rows to CSV


Processing files:  42%|████▏     | 84/200 [03:22<04:40,  2.42s/it]

Appended 5000 rows to CSV


Processing files:  42%|████▎     | 85/200 [03:24<04:36,  2.41s/it]

Appended 5000 rows to CSV


Processing files:  43%|████▎     | 86/200 [03:27<04:35,  2.41s/it]

Appended 5000 rows to CSV


Processing files:  44%|████▎     | 87/200 [03:29<04:33,  2.42s/it]

Appended 5000 rows to CSV


Processing files:  44%|████▍     | 88/200 [03:32<04:31,  2.43s/it]

Appended 5000 rows to CSV


Processing files:  44%|████▍     | 89/200 [03:34<04:29,  2.42s/it]

Appended 5000 rows to CSV


Processing files:  45%|████▌     | 90/200 [03:36<04:25,  2.41s/it]

Appended 5000 rows to CSV


Processing files:  46%|████▌     | 91/200 [03:39<04:25,  2.43s/it]

Appended 5000 rows to CSV


Processing files:  46%|████▌     | 92/200 [03:42<04:27,  2.48s/it]

Appended 5000 rows to CSV


Processing files:  46%|████▋     | 93/200 [03:44<04:21,  2.45s/it]

Appended 5000 rows to CSV


Processing files:  47%|████▋     | 94/200 [03:46<04:17,  2.43s/it]

Appended 5000 rows to CSV


Processing files:  48%|████▊     | 95/200 [03:49<04:15,  2.43s/it]

Appended 5000 rows to CSV


Processing files:  48%|████▊     | 96/200 [03:51<04:12,  2.43s/it]

Appended 5000 rows to CSV


Processing files:  48%|████▊     | 97/200 [03:53<04:07,  2.41s/it]

Appended 5000 rows to CSV


Processing files:  49%|████▉     | 98/200 [03:56<04:05,  2.41s/it]

Appended 5000 rows to CSV


Processing files:  50%|████▉     | 99/200 [03:58<04:02,  2.40s/it]

Appended 5000 rows to CSV


Processing files:  50%|█████     | 100/200 [04:01<04:01,  2.42s/it]

Appended 5000 rows to CSV


Processing files:  50%|█████     | 101/200 [04:03<03:58,  2.41s/it]

Appended 5000 rows to CSV


Processing files:  51%|█████     | 102/200 [04:06<03:56,  2.42s/it]

Appended 5000 rows to CSV


Processing files:  52%|█████▏    | 103/200 [04:08<03:53,  2.40s/it]

Appended 5000 rows to CSV


Processing files:  52%|█████▏    | 104/200 [04:10<03:50,  2.40s/it]

Appended 5000 rows to CSV


Processing files:  52%|█████▎    | 105/200 [04:13<03:47,  2.40s/it]

Appended 5000 rows to CSV


Processing files:  53%|█████▎    | 106/200 [04:15<03:45,  2.39s/it]

Appended 5000 rows to CSV


Processing files:  54%|█████▎    | 107/200 [04:18<03:42,  2.40s/it]

Appended 5000 rows to CSV


Processing files:  54%|█████▍    | 108/200 [04:20<03:41,  2.41s/it]

Appended 5000 rows to CSV


Processing files:  55%|█████▍    | 109/200 [04:22<03:37,  2.39s/it]

Appended 5000 rows to CSV


Processing files:  55%|█████▌    | 110/200 [04:25<03:36,  2.40s/it]

Appended 5000 rows to CSV


Processing files:  56%|█████▌    | 111/200 [04:27<03:33,  2.40s/it]

Appended 5000 rows to CSV


Processing files:  56%|█████▌    | 112/200 [04:30<03:31,  2.41s/it]

Appended 5000 rows to CSV


Processing files:  56%|█████▋    | 113/200 [04:32<03:28,  2.40s/it]

Appended 5000 rows to CSV


Processing files:  57%|█████▋    | 114/200 [04:34<03:27,  2.41s/it]

Appended 5000 rows to CSV


Processing files:  57%|█████▊    | 115/200 [04:37<03:24,  2.41s/it]

Appended 5000 rows to CSV


Processing files:  58%|█████▊    | 116/200 [04:39<03:22,  2.41s/it]

Appended 5000 rows to CSV


Processing files:  58%|█████▊    | 117/200 [04:42<03:20,  2.41s/it]

Appended 5000 rows to CSV


Processing files:  59%|█████▉    | 118/200 [04:44<03:17,  2.40s/it]

Appended 5000 rows to CSV


Processing files:  60%|█████▉    | 119/200 [04:46<03:14,  2.40s/it]

Appended 5000 rows to CSV


Processing files:  60%|██████    | 120/200 [04:49<03:10,  2.38s/it]

Appended 5000 rows to CSV


Processing files:  60%|██████    | 121/200 [04:51<03:08,  2.38s/it]

Appended 5000 rows to CSV


Processing files:  61%|██████    | 122/200 [04:53<03:06,  2.39s/it]

Appended 5000 rows to CSV


Processing files:  62%|██████▏   | 123/200 [04:56<03:05,  2.41s/it]

Appended 5000 rows to CSV


Processing files:  62%|██████▏   | 124/200 [04:58<03:02,  2.41s/it]

Appended 5000 rows to CSV


Processing files:  62%|██████▎   | 125/200 [05:01<03:01,  2.42s/it]

Appended 5000 rows to CSV


Processing files:  63%|██████▎   | 126/200 [05:03<02:58,  2.42s/it]

Appended 5000 rows to CSV


Processing files:  64%|██████▎   | 127/200 [05:06<02:56,  2.42s/it]

Appended 5000 rows to CSV


Processing files:  64%|██████▍   | 128/200 [05:08<02:54,  2.43s/it]

Appended 5000 rows to CSV


Processing files:  64%|██████▍   | 129/200 [05:11<02:53,  2.44s/it]

Appended 5000 rows to CSV


Processing files:  65%|██████▌   | 130/200 [05:13<02:51,  2.45s/it]

Appended 5000 rows to CSV


Processing files:  66%|██████▌   | 131/200 [05:15<02:48,  2.45s/it]

Appended 5000 rows to CSV


Processing files:  66%|██████▌   | 132/200 [05:18<02:46,  2.45s/it]

Appended 5000 rows to CSV


Processing files:  66%|██████▋   | 133/200 [05:20<02:42,  2.43s/it]

Appended 5000 rows to CSV


Processing files:  67%|██████▋   | 134/200 [05:23<02:39,  2.42s/it]

Appended 5000 rows to CSV


Processing files:  68%|██████▊   | 135/200 [05:25<02:38,  2.43s/it]

Appended 5000 rows to CSV


Processing files:  68%|██████▊   | 136/200 [05:28<02:34,  2.41s/it]

Appended 5000 rows to CSV


Processing files:  68%|██████▊   | 137/200 [05:30<02:31,  2.40s/it]

Appended 5000 rows to CSV


Processing files:  69%|██████▉   | 138/200 [05:32<02:28,  2.40s/it]

Appended 5000 rows to CSV


Processing files:  70%|██████▉   | 139/200 [05:35<02:26,  2.40s/it]

Appended 5000 rows to CSV


Processing files:  70%|███████   | 140/200 [05:37<02:25,  2.42s/it]

Appended 5000 rows to CSV


Processing files:  70%|███████   | 141/200 [05:40<02:22,  2.42s/it]

Appended 5000 rows to CSV


Processing files:  71%|███████   | 142/200 [05:42<02:20,  2.43s/it]

Appended 5000 rows to CSV


Processing files:  72%|███████▏  | 143/200 [05:44<02:18,  2.43s/it]

Appended 5000 rows to CSV


Processing files:  72%|███████▏  | 144/200 [05:47<02:16,  2.43s/it]

Appended 5000 rows to CSV


Processing files:  72%|███████▎  | 145/200 [05:49<02:13,  2.43s/it]

Appended 5000 rows to CSV


Processing files:  73%|███████▎  | 146/200 [05:52<02:11,  2.43s/it]

Appended 5000 rows to CSV


Processing files:  74%|███████▎  | 147/200 [05:54<02:09,  2.44s/it]

Appended 5000 rows to CSV


Processing files:  74%|███████▍  | 148/200 [05:57<02:06,  2.43s/it]

Appended 5000 rows to CSV


Processing files:  74%|███████▍  | 149/200 [05:59<02:04,  2.44s/it]

Appended 5000 rows to CSV


Processing files:  75%|███████▌  | 150/200 [06:02<02:02,  2.45s/it]

Appended 5000 rows to CSV


Processing files:  76%|███████▌  | 151/200 [06:04<01:59,  2.44s/it]

Appended 5000 rows to CSV


Processing files:  76%|███████▌  | 152/200 [06:06<01:56,  2.43s/it]

Appended 5000 rows to CSV


Processing files:  76%|███████▋  | 153/200 [06:09<01:53,  2.42s/it]

Appended 5000 rows to CSV


Processing files:  77%|███████▋  | 154/200 [06:11<01:51,  2.42s/it]

Appended 5000 rows to CSV


Processing files:  78%|███████▊  | 155/200 [06:14<01:48,  2.41s/it]

Appended 5000 rows to CSV


Processing files:  78%|███████▊  | 156/200 [06:16<01:46,  2.42s/it]

Appended 5000 rows to CSV


Processing files:  78%|███████▊  | 157/200 [06:18<01:44,  2.42s/it]

Appended 5000 rows to CSV


Processing files:  79%|███████▉  | 158/200 [06:21<01:42,  2.43s/it]

Appended 5000 rows to CSV


Processing files:  80%|███████▉  | 159/200 [06:23<01:39,  2.43s/it]

Appended 5000 rows to CSV


Processing files:  80%|████████  | 160/200 [06:26<01:39,  2.48s/it]

Appended 5000 rows to CSV


Processing files:  80%|████████  | 161/200 [06:29<01:37,  2.50s/it]

Appended 5000 rows to CSV


Processing files:  81%|████████  | 162/200 [06:31<01:33,  2.46s/it]

Appended 5000 rows to CSV


Processing files:  82%|████████▏ | 163/200 [06:33<01:30,  2.45s/it]

Appended 5000 rows to CSV


Processing files:  82%|████████▏ | 164/200 [06:36<01:28,  2.45s/it]

Appended 5000 rows to CSV


Processing files:  82%|████████▎ | 165/200 [06:38<01:25,  2.45s/it]

Appended 5000 rows to CSV


Processing files:  83%|████████▎ | 166/200 [06:41<01:22,  2.43s/it]

Appended 5000 rows to CSV


Processing files:  84%|████████▎ | 167/200 [06:43<01:20,  2.44s/it]

Appended 5000 rows to CSV


Processing files:  84%|████████▍ | 168/200 [06:45<01:17,  2.43s/it]

Appended 5000 rows to CSV


Processing files:  84%|████████▍ | 169/200 [06:48<01:14,  2.42s/it]

Appended 5000 rows to CSV


Processing files:  85%|████████▌ | 170/200 [06:50<01:12,  2.42s/it]

Appended 5000 rows to CSV


Processing files:  86%|████████▌ | 171/200 [06:53<01:10,  2.42s/it]

Appended 5000 rows to CSV


Processing files:  86%|████████▌ | 172/200 [06:55<01:07,  2.41s/it]

Appended 5000 rows to CSV


Processing files:  86%|████████▋ | 173/200 [06:57<01:04,  2.40s/it]

Appended 5000 rows to CSV


Processing files:  87%|████████▋ | 174/200 [07:00<01:02,  2.41s/it]

Appended 5000 rows to CSV


Processing files:  88%|████████▊ | 175/200 [07:02<00:59,  2.40s/it]

Appended 5000 rows to CSV


Processing files:  88%|████████▊ | 176/200 [07:05<00:58,  2.43s/it]

Appended 5000 rows to CSV


Processing files:  88%|████████▊ | 177/200 [07:07<00:55,  2.42s/it]

Appended 5000 rows to CSV


Processing files:  89%|████████▉ | 178/200 [07:10<00:52,  2.41s/it]

Appended 5000 rows to CSV


Processing files:  90%|████████▉ | 179/200 [07:12<00:51,  2.46s/it]

Appended 5000 rows to CSV


Processing files:  90%|█████████ | 180/200 [07:15<00:49,  2.50s/it]

Appended 5000 rows to CSV


Processing files:  90%|█████████ | 181/200 [07:17<00:47,  2.49s/it]

Appended 5000 rows to CSV


Processing files:  91%|█████████ | 182/200 [07:20<00:44,  2.48s/it]

Appended 5000 rows to CSV


Processing files:  92%|█████████▏| 183/200 [07:22<00:42,  2.48s/it]

Appended 5000 rows to CSV


Processing files:  92%|█████████▏| 184/200 [07:25<00:40,  2.51s/it]

Appended 5000 rows to CSV


Processing files:  92%|█████████▎| 185/200 [07:27<00:38,  2.53s/it]

Appended 5000 rows to CSV


Processing files:  93%|█████████▎| 186/200 [07:30<00:34,  2.49s/it]

Appended 5000 rows to CSV


Processing files:  94%|█████████▎| 187/200 [07:32<00:33,  2.54s/it]

Appended 5000 rows to CSV


Processing files:  94%|█████████▍| 188/200 [07:35<00:30,  2.57s/it]

Appended 5000 rows to CSV


Processing files:  94%|█████████▍| 189/200 [07:38<00:29,  2.65s/it]

Appended 5000 rows to CSV


Processing files:  95%|█████████▌| 190/200 [07:41<00:26,  2.66s/it]

Appended 5000 rows to CSV


Processing files:  96%|█████████▌| 191/200 [07:43<00:23,  2.61s/it]

Appended 5000 rows to CSV


Processing files:  96%|█████████▌| 192/200 [07:45<00:20,  2.54s/it]

Appended 5000 rows to CSV


Processing files:  96%|█████████▋| 193/200 [07:48<00:17,  2.51s/it]

Appended 5000 rows to CSV


Processing files:  97%|█████████▋| 194/200 [07:50<00:15,  2.51s/it]

Appended 5000 rows to CSV


Processing files:  98%|█████████▊| 195/200 [07:53<00:12,  2.50s/it]

Appended 5000 rows to CSV


Processing files:  98%|█████████▊| 196/200 [07:55<00:09,  2.48s/it]

Appended 5000 rows to CSV


Processing files:  98%|█████████▊| 197/200 [07:58<00:07,  2.48s/it]

Appended 5000 rows to CSV


Processing files:  99%|█████████▉| 198/200 [08:00<00:04,  2.49s/it]

Appended 5000 rows to CSV


Processing files: 100%|█████████▉| 199/200 [08:03<00:02,  2.51s/it]

Appended 5000 rows to CSV


Processing files: 100%|██████████| 200/200 [08:05<00:00,  2.43s/it]

Appended 4000 rows to CSV
Processing complete. Output saved to playlists.csv





In [10]:
# With `chunksize`, read_csv returns an iterator of DataFrames instead of loading
# the whole file, so only one block of rows is in memory at a time.
chunk_size = 100000
playlists_data = pd.read_csv("playlists.csv", chunksize=chunk_size)

# Pull just the first block (at most `chunk_size` rows) to inspect the columns.
first_chunk = next(playlists_data)
print(first_chunk.head())

Unnamed: 0,name,collaborative,pid,modified_at,num_tracks,num_albums,num_followers,tracks,num_edits,duration_ms,num_artists,description
0,Throwbacks,False,0,1493424000,52,47,1,"[{'pos': 0, 'artist_name': 'Missy Elliott', 't...",6,11532414,37,
1,Awesome Playlist,False,1,1506556800,39,23,1,"[{'pos': 0, 'artist_name': 'Survivor', 'track_...",5,11656470,21,
2,korean,False,2,1505692800,64,51,1,"[{'pos': 0, 'artist_name': 'Hoody', 'track_uri...",18,14039958,31,
3,mat,False,3,1501027200,126,107,1,"[{'pos': 0, 'artist_name': 'Camille Saint-Saën...",4,28926058,86,
4,90s,False,4,1401667200,17,16,2,"[{'pos': 0, 'artist_name': 'The Smashing Pumpk...",7,4335282,16,


In [11]:
# Creating a filtered csv with only the required columns
import pandas as pd
import json
from ast import literal_eval  # Safer than eval for parsing strings to lists/dicts

def filter_playlists_csv(input_csv, output_csv, chunk_size=100000):
    """Flatten a playlist CSV into one row per (playlist, track) pair.

    Only the ``pid`` and ``tracks`` columns of ``input_csv`` are read. Each
    ``tracks`` cell holds the text form of a Python list of dicts (single-quoted,
    so it is parsed with ``ast.literal_eval`` rather than a JSON parser). The
    output has the columns ``pid``, ``artist_name``, ``track_uri`` and
    ``track_name``.

    Args:
        input_csv: Path of the playlist-level CSV.
        output_csv: Path of the track-level CSV to create (overwritten).
        chunk_size: Number of playlist rows read per pass.

    Returns:
        None. Progress is reported with ``print``.
    """
    wanted_fields = ['artist_name', 'track_uri', 'track_name']

    reader = pd.read_csv(input_csv, chunksize=chunk_size, usecols=['pid', 'tracks'])
    for chunk_number, chunk in enumerate(reader):
        is_first = chunk_number == 0

        # Turn each cell's text back into the list of track dicts it represents.
        chunk['tracks'] = chunk['tracks'].apply(literal_eval)

        # One row per track; `pid` is repeated for every track of a playlist.
        per_track = chunk.explode('tracks')

        # Spread the track dicts into columns and keep only the wanted ones.
        track_fields = pd.json_normalize(per_track['tracks'])[wanted_fields]

        # Both sides get a fresh 0..n-1 index so they line up positionally.
        pid_column = per_track[['pid']].reset_index(drop=True)
        result = pd.concat([pid_column, track_fields.reset_index(drop=True)], axis=1)

        # First pass creates the file with a header; later passes append rows only.
        if is_first:
            result.to_csv(output_csv, mode='w', header=True, index=False)
        else:
            result.to_csv(output_csv, mode='a', header=False, index=False)

        print(f"Processed {len(chunk)} rows from input CSV")

    print(f"Filtered data saved to {output_csv}")

In [12]:
# Write filtered_playlists.csv: one row per playlist/track pair with the columns
# pid, artist_name, track_uri and track_name.
filter_playlists_csv(input_csv="playlists.csv", output_csv="filtered_playlists.csv")

Processed 100000 rows from input CSV
Processed 100000 rows from input CSV
Processed 100000 rows from input CSV
Processed 100000 rows from input CSV
Processed 100000 rows from input CSV
Processed 100000 rows from input CSV
Processed 100000 rows from input CSV
Processed 100000 rows from input CSV
Processed 100000 rows from input CSV
Processed 100000 rows from input CSV
Filtered data saved to filtered_playlists.csv


In [16]:
# Open the track-level CSV as a chunked reader (50000 rows per chunk) rather than
# loading it whole.
filtered_playlists_data = pd.read_csv("filtered_playlists.csv", chunksize=50000)

# Look at the first chunk only; the reader keeps its position for later `next` calls.
chunk = next(filtered_playlists_data)
chunk.head()

Unnamed: 0,pid,artist_name,track_uri,track_name
0,0,Missy Elliott,spotify:track:0UaMYEvWZi0ZqiDOoHU3YI,Lose Control (feat. Ciara & Fat Man Scoop)
1,0,Britney Spears,spotify:track:6I9VzXrHxO9rA9A5euc8Ak,Toxic
2,0,Beyoncé,spotify:track:0WqIKmW4BTrj3eJFmnCKMv,Crazy In Love
3,0,Justin Timberlake,spotify:track:1AWQoqb9bSvzTjaLralEkT,Rock Your Body
4,0,Shaggy,spotify:track:1lzr43nnXAijIGYnCT8M8H,It Wasn't Me


In [18]:
# The playlist/track table (filtered_playlists.csv) is the interaction data intended for
# collaborative filtering.
# Next: load the per-track audio-feature table that content-based filtering will use.
tracks_data = pd.read_csv("dataset.csv")
tracks_data.head()

Unnamed: 0.1,Unnamed: 0,track_id,artists,album_name,track_name,popularity,duration_ms,explicit,danceability,energy,...,loudness,mode,speechiness,acousticness,instrumentalness,liveness,valence,tempo,time_signature,track_genre
0,0,5SuOikwiRyPMVoIQDJUgSV,Gen Hoshino,Comedy,Comedy,73,230666,False,0.676,0.461,...,-6.746,0,0.143,0.0322,1e-06,0.358,0.715,87.917,4,acoustic
1,1,4qPNDBW1i3p13qLCt0Ki3A,Ben Woodward,Ghost (Acoustic),Ghost - Acoustic,55,149610,False,0.42,0.166,...,-17.235,1,0.0763,0.924,6e-06,0.101,0.267,77.489,4,acoustic
2,2,1iJBSr7s7jYXzM8EGcbK5b,Ingrid Michaelson;ZAYN,To Begin Again,To Begin Again,57,210826,False,0.438,0.359,...,-9.734,1,0.0557,0.21,0.0,0.117,0.12,76.332,4,acoustic
3,3,6lfxq3CG4xtTiEg7opyCyx,Kina Grannis,Crazy Rich Asians (Original Motion Picture Sou...,Can't Help Falling In Love,71,201933,False,0.266,0.0596,...,-18.515,1,0.0363,0.905,7.1e-05,0.132,0.143,181.74,3,acoustic
4,4,5vjLSffimiIP26QG5WcN2K,Chord Overstreet,Hold On,Hold On,82,198853,False,0.618,0.443,...,-9.681,1,0.0526,0.469,0.0,0.0829,0.167,119.949,4,acoustic


In [19]:
# Creating a filtered csv with only the required columns: track_id, artists, track_name, duration_ms, danceability, energy, key, loudness, mode, speechiness, acousticness, instrumentalness, liveness, valence, tempo, time_signature, track_genre
def filter_tracks_csv(input_csv, output_csv, chunk_size=100000):
    """Copy a fixed subset of columns from ``input_csv`` to ``output_csv``.

    The input is streamed ``chunk_size`` rows at a time. The output columns
    appear in the order listed below, regardless of their order in the input.

    Args:
        input_csv: Path of the source track CSV; must contain every kept column.
        output_csv: Path of the CSV to create (overwritten).
        chunk_size: Number of rows read per pass.

    Returns:
        None. Progress is reported with ``print``.
    """
    keep_columns = [
        'track_id', 'artists', 'track_name', 'duration_ms',
        'danceability', 'energy', 'key', 'loudness', 'mode',
        'speechiness', 'acousticness', 'instrumentalness',
        'liveness', 'valence', 'tempo', 'time_signature',
        'track_genre',
    ]

    chunk_reader = pd.read_csv(input_csv, chunksize=chunk_size)
    for chunk_number, chunk in enumerate(chunk_reader):
        # Only the first pass creates the file and writes the header row.
        is_first = chunk_number == 0
        write_mode = 'w' if is_first else 'a'

        selected = chunk[keep_columns]
        selected.to_csv(output_csv, mode=write_mode, header=is_first, index=False)

        print(f"Processed {len(chunk)} rows from input CSV")

    print(f"Filtered data saved to {output_csv}")

In [20]:
# Write filtered_tracks.csv containing only the identifier, audio-feature and genre
# columns selected inside filter_tracks_csv.
filter_tracks_csv(input_csv="dataset.csv", output_csv="filtered_tracks.csv")

Processed 100000 rows from input CSV
Processed 14000 rows from input CSV
Filtered data saved to filtered_tracks.csv


In [21]:
# Now we have created both the datasets for collaborative filtering and content based filtering
# Now we will start with content based filtering

# Data Preprocessing
import pandas as pd
from sklearn.preprocessing import StandardScaler, LabelBinarizer

# Load the trimmed track table (the output path passed to filter_tracks_csv) fully into memory
tracks = pd.read_csv("filtered_tracks.csv")

# Display the first few rows as a sanity check on the column selection
tracks.head()

Unnamed: 0,track_id,artists,track_name,duration_ms,danceability,energy,key,loudness,mode,speechiness,acousticness,instrumentalness,liveness,valence,tempo,time_signature,track_genre
0,5SuOikwiRyPMVoIQDJUgSV,Gen Hoshino,Comedy,230666,0.676,0.461,1,-6.746,0,0.143,0.0322,1e-06,0.358,0.715,87.917,4,acoustic
1,4qPNDBW1i3p13qLCt0Ki3A,Ben Woodward,Ghost - Acoustic,149610,0.42,0.166,1,-17.235,1,0.0763,0.924,6e-06,0.101,0.267,77.489,4,acoustic
2,1iJBSr7s7jYXzM8EGcbK5b,Ingrid Michaelson;ZAYN,To Begin Again,210826,0.438,0.359,0,-9.734,1,0.0557,0.21,0.0,0.117,0.12,76.332,4,acoustic
3,6lfxq3CG4xtTiEg7opyCyx,Kina Grannis,Can't Help Falling In Love,201933,0.266,0.0596,0,-18.515,1,0.0363,0.905,7.1e-05,0.132,0.143,181.74,3,acoustic
4,5vjLSffimiIP26QG5WcN2K,Chord Overstreet,Hold On,198853,0.618,0.443,2,-9.681,1,0.0526,0.469,0.0,0.0829,0.167,119.949,4,acoustic


In [22]:
# Feature Selection
# Builds `tracks_feature_matrix`: scaled numeric audio columns side by side with the
# binarized genre columns, one row per track.

# Selecting relevant features for content-based filtering
audio_features = ['danceability', 'energy', 'loudness', 'acousticness', 'liveness', 'valence', 'tempo', 'duration_ms']
genres = 'track_genre'

# Normalize the audio features.
# This overwrites the columns of `tracks` in place, so after this cell `tracks` holds
# standardized values rather than the raw ones.
# NOTE(review): the scaler is fitted on every row before the train/validation split made in a
# later cell, so validation rows influence the scaling statistics — confirm this is acceptable.
scaler = StandardScaler()
tracks[audio_features] = scaler.fit_transform(tracks[audio_features])

# One-hot encode the genres (LabelBinarizer emits one indicator column per class when there
# are more than two classes; presumably track_genre has many — TODO confirm)
genres_encoder = LabelBinarizer()
genres_encoded = genres_encoder.fit_transform(tracks[genres])

import numpy as np
# Combine the features: audio columns first, then the genre indicator columns
tracks_feature_matrix = np.hstack([tracks[audio_features], genres_encoded])

In [24]:
# Train-Validation Split
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows for validation; the fixed random_state makes the split repeatable.
# Only the feature matrix is split — no separate target array is passed.
X_train, X_val = train_test_split(tracks_feature_matrix, test_size=0.2, random_state=42)

In [26]:
# Model Architecture
from tensorflow.keras import Model, layers

# Sizes: the feature matrix width fixes both the input and the reconstruction width;
# the bottleneck ("embedding") has 64 units.
input_dimensions = tracks_feature_matrix.shape[1]
embedding_dimensions = 64

# Encoder: input -> 256 -> 128 (ReLU) -> linear embedding layer
inputs = layers.Input(shape=(input_dimensions,))
x = inputs
for hidden_units in (256, 128):
    x = layers.Dense(hidden_units, activation='relu')(x)
embeddings = layers.Dense(embedding_dimensions, name="embedding")(x)

# Decoder (autoencoder-style reconstruction head): embedding -> 128 -> 256 (ReLU)
decoder = embeddings
for hidden_units in (128, 256):
    decoder = layers.Dense(hidden_units, activation='relu')(decoder)

# Linear output layer that reconstructs the input features
outputs = layers.Dense(input_dimensions, activation='linear')(decoder)

model = Model(inputs=inputs, outputs=outputs)

ImportError: Traceback (most recent call last):
  File "C:\Users\ANUBHAV\Desktop\FDS Lab\myFDSLab\Lib\site-packages\tensorflow\python\pywrap_tensorflow.py", line 73, in <module>
    from tensorflow.python._pywrap_tensorflow_internal import *
ImportError: DLL load failed while importing _pywrap_tensorflow_internal: A dynamic link library (DLL) initialization routine failed.


Failed to load the native TensorFlow runtime.
See https://www.tensorflow.org/install/errors for some common causes and solutions.
If you need help, create an issue at https://github.com/tensorflow/tensorflow/issues and include the entire stack trace above this error message.

In [4]:
# Load the track table that already carries the embedding columns (emb_0 ... emb_63 in the
# preview below) from a parquet file; the cell that wrote this file is not part of this notebook view.
# NOTE(review): the info() listing recorded after the preview reports 228000 rows but only
# ~114000 non-null values for the track columns — this looks like a misaligned join/concat when the
# embeddings were attached; verify the row count before using this frame.
import pandas as pd
tracks = pd.read_parquet("tracks_with_embeddings.parquet")
tracks.head()

Unnamed: 0,track_id,artists,track_name,duration_ms,danceability,energy,key,loudness,mode,speechiness,...,emb_54,emb_55,emb_56,emb_57,emb_58,emb_59,emb_60,emb_61,emb_62,emb_63
0,5SuOikwiRyPMVoIQDJUgSV,Gen Hoshino,Comedy,0.024575,0.629244,-0.717148,1,0.300828,0,0.143,...,0.026546,0.040854,6.1e-05,-0.056485,-0.331604,-0.144195,-0.101007,0.00543,-0.018791,0.058135
1,4qPNDBW1i3p13qLCt0Ki3A,Ben Woodward,Ghost - Acoustic,-0.730859,-0.845908,-1.88998,1,-1.784744,1,0.0763,...,0.035766,-0.133507,0.289012,-0.044654,-0.406979,-0.329948,-0.211469,0.164515,-0.114954,-0.026168
2,1iJBSr7s7jYXzM8EGcbK5b,Ingrid Michaelson;ZAYN,To Begin Again,-0.160332,-0.742186,-1.122669,0,-0.293288,1,0.0557,...,-0.045124,0.0011,0.106933,-0.141384,-0.384567,-0.115001,-0.090348,0.184421,-0.078719,-0.084987
3,6lfxq3CG4xtTiEg7opyCyx,Kina Grannis,Can't Help Falling In Love,-0.243214,-1.733304,-2.312994,0,-2.039252,1,0.0363,...,0.255313,-0.432985,0.3301,-0.159431,0.086129,-0.428819,-0.239598,0.131262,-0.250253,0.145362
4,5vjLSffimiIP26QG5WcN2K,Chord Overstreet,Hold On,-0.271919,0.29503,-0.788711,2,-0.28275,1,0.0526,...,0.109243,-0.135456,-0.046755,-0.034329,-0.177423,-0.257376,-0.107344,0.027321,-0.066966,-0.020128


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 228000 entries, 0 to 227999
Data columns (total 81 columns):
 #   Column            Non-Null Count   Dtype  
---  ------            --------------   -----  
 0   track_id          114000 non-null  object 
 1   artists           113999 non-null  object 
 2   track_name        113999 non-null  object 
 3   duration_ms       114000 non-null  float64
 4   danceability      114000 non-null  float64
 5   energy            114000 non-null  float64
 6   key               114000 non-null  float64
 7   loudness          114000 non-null  float64
 8   mode              114000 non-null  float64
 9   speechiness       114000 non-null  float64
 10  acousticness      114000 non-null  float64
 11  instrumentalness  114000 non-null  float64
 12  liveness          114000 non-null  float64
 13  valence           114000 non-null  float64
 14  tempo             114000 non-null  float64
 15  time_signature    114000 non-null  float64
 16  track_genre       11