In [1]:
import json
import pandas as pd
import numpy as np
import random
from sklearn.cluster import KMeans
import joblib
from tqdm import tqdm
from multiprocessing import Pool, Manager, cpu_count
import os

In [2]:
# file_path = "/shared/3/projects/benlitterer/podcastData/processed/mayJune/mayJuneMetadata.jsonl"

output_path = "/shared/3/projects/bangzhao/prosodic_embeddings/podcast_prosodic_features/prosodic_features.csv"
metadata_path = "/shared/3/projects/benlitterer/podcastData/processed/mayJune/mayJuneMetadata.jsonl"
metadata_path2 = "/shared/3/projects/benlitterer/podcastData/processed/floydMonth/floydMonthDataClean.jsonl"

In [5]:
def get_json_objects(file_path, kmeans, output_csv_path):
    all_features = []  # List to hold all DataFrames
    batch_size = 100  # Number of items before saving to CSV
    count = 0  # Counter for processed items

    with open(file_path, 'r') as file:
        total_lines = sum(1 for line in file)
        file.seek(0)  # Reset file pointer to the beginning

        for i, line in enumerate(tqdm(file, total=total_lines, desc="Processing JSON Lines")):
            try:
                json_object = json.loads(line)
                prosodic_path = "/shared/3/projects/benlitterer/podcastData/prosodyMerged/floydMonth" + json_object['potentialOutPath']
                prosodic_path2 = "/shared/3/projects/benlitterer/podcastData/prosodyMerged/mayJuneRemaining" + json_object['potentialOutPath']
                
                try:
                    prosodic_feature = pd.read_csv(prosodic_path)
                except:
                    prosodic_feature = pd.read_csv(prosodic_path2)
                
                prosodic_feature = prosodic_feature.dropna(subset=['mfcc1_sma3', 'mfcc2_sma3', 'mfcc3_sma3', 'mfcc4_sma3', 'F0semitoneFrom27.5Hz_sma3nz', 'F1frequency_sma3nz'])
                X = prosodic_feature[['mfcc1_sma3', 'mfcc2_sma3', 'mfcc3_sma3', 'mfcc4_sma3', 'F0semitoneFrom27.5Hz_sma3nz', 'F1frequency_sma3nz']]
                prosodic_feature = prosodic_feature[['content']]
                prosodic_feature['id'] = i
                
                all_features.append((prosodic_feature, X))
                count += 1
                if count % batch_size == 0:
                    combined_features = pd.concat([item[0] for item in all_features], ignore_index=True)
                    combined_X = pd.concat([item[1] for item in all_features], ignore_index=True)
                    
                    labels = kmeans.predict(combined_X)
                    combined_features['cluster_id'] = labels
                    
                    combined_features.to_csv(output_csv_path, mode='a', header=not bool(count // batch_size), index=False)
                    all_features.clear()  # Clear the list for the next batch
            
            except Exception as e:
                # print(f"Error processing line {i}: {e}")
                continue
                
    # Process any remaining items in the list after the loop ends
    if all_features:
        combined_features = pd.concat([item[0] for item in all_features], ignore_index=True)
        combined_X = pd.concat([item[1] for item in all_features], ignore_index=True)
        labels = kmeans.predict(combined_X)
        combined_features['cluster_id'] = labels
        combined_features.to_csv(output_csv_path, mode='a', header=not bool(count // batch_size), index=False)

In [None]:
# kmeans = joblib.load('/shared/3/projects/bangzhao/prosodic_embeddings/sample/kmeans/kmeans_plusplus_5k_1000.pkl')

# get_json_objects(metadata_path, kmeans, output_path)

In [12]:
def process_line(line_info):
    line, i = line_info
    try:
        json_object = json.loads(line)
        prosodic_path = "/shared/3/projects/benlitterer/podcastData/prosodyMerged/floydMonth" + json_object['potentialOutPath']
        prosodic_path2 = "/shared/3/projects/benlitterer/podcastData/prosodyMerged/mayJuneRemaining" + json_object['potentialOutPath']
        
        try:
            prosodic_feature = pd.read_csv(prosodic_path)
        except:
            prosodic_feature = pd.read_csv(prosodic_path2)
        
        prosodic_feature = prosodic_feature.dropna(subset=['mfcc1_sma3', 'mfcc2_sma3', 'mfcc3_sma3', 'mfcc4_sma3', 'F0semitoneFrom27.5Hz_sma3nz', 'F1frequency_sma3nz'])
        X = prosodic_feature[['mfcc1_sma3', 'mfcc2_sma3', 'mfcc3_sma3', 'mfcc4_sma3', 'F0semitoneFrom27.5Hz_sma3nz', 'F1frequency_sma3nz']]
        prosodic_feature = prosodic_feature[['content']]
        prosodic_feature['id'] = i
        
        return prosodic_feature, X
    except Exception as e:
        return None

def save_to_csv(features, kmeans, output_csv_path, header):
    combined_features = pd.concat([item[0] for item in features], ignore_index=True)
    combined_X = pd.concat([item[1] for item in features], ignore_index=True)
    
    labels = kmeans.predict(combined_X)
    combined_features['cluster_id'] = labels
    
    combined_features.to_csv(output_csv_path, mode='a', header=header, index=False)

def get_json_objects(file_path, kmeans, output_csv_path, num_cores):
    batch_size = 100
    count = 0
    
    with open(file_path, 'r') as file:
        total_lines = sum(1 for line in file)
        file.seek(0)
        
        manager = Manager()
        with Pool(processes=num_cores) as pool:
            all_features = []
            for i, result in enumerate(tqdm(pool.imap(process_line, zip(file, range(total_lines))), total=total_lines, desc="Processing JSON Lines")):
                if result:
                    all_features.append(result)
                    count += 1
                    if count % batch_size == 0:
                        save_to_csv(all_features, kmeans, output_csv_path, not bool(count // batch_size))
                        all_features.clear()
            
            if all_features:
                save_to_csv(all_features, kmeans, output_csv_path, not bool(count // batch_size))

In [14]:
# kmeans = joblib.load('/shared/3/projects/bangzhao/prosodic_embeddings/sample/kmeans/kmeans_plusplus_5k_1000.pkl')

# get_json_objects(metadata_path, kmeans, output_path, 6)

In [15]:
def process_batch(batch):
    results = []
    for i, line in batch:
        try:
            json_object = json.loads(line)
            prosodic_path = "/shared/3/projects/benlitterer/podcastData/prosodyMerged/floydMonth" + json_object['potentialOutPath']
            prosodic_path2 = "/shared/3/projects/benlitterer/podcastData/prosodyMerged/mayJuneRemaining" + json_object['potentialOutPath']
            
            try:
                prosodic_feature = pd.read_csv(prosodic_path)
            except:
                prosodic_feature = pd.read_csv(prosodic_path2)
            
            prosodic_feature = prosodic_feature.dropna(subset=['mfcc1_sma3', 'mfcc2_sma3', 'mfcc3_sma3', 'mfcc4_sma3', 'F0semitoneFrom27.5Hz_sma3nz', 'F1frequency_sma3nz'])
            X = prosodic_feature[['mfcc1_sma3', 'mfcc2_sma3', 'mfcc3_sma3', 'mfcc4_sma3', 'F0semitoneFrom27.5Hz_sma3nz', 'F1frequency_sma3nz']]
            prosodic_feature = prosodic_feature[['content']]
            prosodic_feature['id'] = i
            
            results.append((prosodic_feature, X))
        except Exception as e:
            continue
    return results

def save_to_csv(features, kmeans, output_csv_path, header):
    combined_features = pd.concat([item[0] for item in features], ignore_index=True)
    combined_X = pd.concat([item[1] for item in features], ignore_index=True)
    
    labels = kmeans.predict(combined_X)
    combined_features['cluster_id'] = labels
    
    combined_features.to_csv(output_csv_path, mode='a', header=header, index=False)

def get_json_objects(file_path, kmeans, output_csv_path):
    batch_size = 100
    count = 0
    
    with open(file_path, 'r') as file:
        total_lines = sum(1 for line in file)
        file.seek(0)
        
        lines = list(enumerate(file))
        batches = [lines[i:i + batch_size] for i in range(0, total_lines, batch_size)]
        
        manager = Manager()
        with Pool(processes=6) as pool:
            for batch in tqdm(pool.imap(process_batch, batches), total=len(batches), desc="Processing JSON Lines"):
                if batch:
                    save_to_csv(batch, kmeans, output_csv_path, not bool(count // batch_size))
                    count += len(batch)


In [17]:
# kmeans = joblib.load('/shared/3/projects/bangzhao/prosodic_embeddings/sample/kmeans/kmeans_plusplus_5k_1000.pkl')

# get_json_objects(metadata_path, kmeans, output_path)

In [None]:
CHECKPOINT_FILE = '/shared/3/projects/bangzhao/prosodic_embeddings/podcast_prosodic_features/checkpoint.txt'

def process_batch(batch):
    results = []
    for i, line in batch:
        try:
            json_object = json.loads(line)
            prosodic_path = "/shared/3/projects/benlitterer/podcastData/prosodyMerged/floydMonth" + json_object['potentialOutPath']
            prosodic_path2 = "/shared/3/projects/benlitterer/podcastData/prosodyMerged/mayJuneRemaining" + json_object['potentialOutPath']
            
            try:
                prosodic_feature = pd.read_csv(prosodic_path)
            except:
                prosodic_feature = pd.read_csv(prosodic_path2)
            
            prosodic_feature = prosodic_feature.dropna(subset=['mfcc1_sma3', 'mfcc2_sma3', 'mfcc3_sma3', 'mfcc4_sma3', 'F0semitoneFrom27.5Hz_sma3nz', 'F1frequency_sma3nz'])
            X = prosodic_feature[['mfcc1_sma3', 'mfcc2_sma3', 'mfcc3_sma3', 'mfcc4_sma3', 'F0semitoneFrom27.5Hz_sma3nz', 'F1frequency_sma3nz']]
            prosodic_feature = prosodic_feature[['content']]
            prosodic_feature['id'] = i
            # prosodic_feature['path'] = json_object[potentialOutPath]
            
            results.append((prosodic_feature, X))
        except Exception as e:
            continue
    return results

def save_to_csv(features, kmeans, output_csv_path, header):
    combined_features = pd.concat([item[0] for item in features], ignore_index=True)
    combined_X = pd.concat([item[1] for item in features], ignore_index=True)
    
    labels = kmeans.predict(combined_X)
    combined_features['cluster_id'] = labels
    
    combined_features.to_csv(output_csv_path, mode='a', header=header, index=False)

def load_checkpoint():
    if os.path.exists(CHECKPOINT_FILE):
        with open(CHECKPOINT_FILE, 'r') as file:
            return int(file.read().strip())
    return 0

def save_checkpoint(batch_index):
    with open(CHECKPOINT_FILE, 'w') as file:
        file.write(str(batch_index))

def get_json_objects(file_path, kmeans, output_csv_path):
    batch_size = 200
    count = 0
    start_batch = load_checkpoint()
    
    with open(file_path, 'r') as file:
        total_lines = sum(1 for line in file)
        file.seek(0)
        
        lines = list(enumerate(file))
        batches = [lines[i:i + batch_size] for i in range(0, total_lines, batch_size)]
        
        manager = Manager()
        with Pool(processes=6) as pool:
            for batch_index, batch in enumerate(tqdm(pool.imap(process_batch, batches), total=len(batches), desc="Processing JSON Lines")):
                if batch_index < start_batch:
                    continue  # Skip batches that have already been processed
                if batch:
                    save_to_csv(batch, kmeans, output_csv_path, not bool(count // batch_size))
                    count += len(batch)
                    save_checkpoint(batch_index + 1)

In [24]:
kmeans = joblib.load('/shared/3/projects/bangzhao/prosodic_embeddings/sample/kmeans/kmeans_plusplus_5k_1000.pkl')

get_json_objects(metadata_path, kmeans, output_path)

Processing JSON Lines: 100%|████████████████████████████████████████████████████| 16436/16436 [8:02:24<00:00,  1.76s/it]


## id path mapping

In [4]:
def id_outpath_map(file_path):
    mapping = {}
    with open(file_path, 'r') as file:
        total_lines = sum(1 for line in file)
        file.seek(0)  # Reset file pointer to the beginning
        
        for i, line in enumerate(tqdm(file, total=total_lines, desc="Processing JSON Lines")):
            try:
                json_object = json.loads(line)
                mapping[i] = json_object['potentialOutPath']
            except Exception as e:
                # Optionally, handle the error (e.g., log it)
                continue
    return mapping

In [6]:
mapping = id_outpath_map(metadata_path)

Processing JSON Lines: 100%|███████████████████████████████████████████████| 1643516/1643516 [01:01<00:00, 26657.73it/s]


In [7]:
mapping.to_csv('/shared/3/projects/bangzhao/prosodic_embeddings/podcast_prosodic_features/id_outpath_map.csv', index=False)

AttributeError: 'dict' object has no attribute 'to_csv'