In [2]:
import pandas as pd
import numpy as np

In [103]:
# Reload the cleaned frames from the pickle files that the to_pickle cells
# further down in this notebook write to these same paths.
# NOTE(review): read_pickle can execute arbitrary code while loading; only
# use it on files produced by this notebook.
podcasts = pd.read_pickle('data/xaa_podcasts.pkl')
episodes = pd.read_pickle('data/xaa_episodes.pkl')

In [53]:
def get_episodes(x: pd.Series) -> pd.DataFrame:
    """Build the episode table for a single podcast row.

    Mainly used row-wise, e.g. ``df.apply(get_episodes, axis=1)``.

    Args:
        x (pd.Series): row of the raw dataframe; must provide
            ``x['databaseId']`` and ``x['scraped']['episodes']``.

    Returns:
        pd.DataFrame: one row per entry of ``x['scraped']['episodes']`` with
            an extra ``podcastId`` column holding ``x['databaseId']``.
    """
    # Named ``podcast_id`` (not ``id``) so the builtin is not shadowed.
    podcast_id = x['databaseId']
    episode_table = pd.DataFrame(x['scraped']['episodes'])
    episode_table['podcastId'] = podcast_id
    return episode_table

def split_data(df: pd.DataFrame) -> tuple[pd.DataFrame, pd.DataFrame]:
    """Split a raw chunk into a podcasts frame and an episodes frame.

    Args:
        df (pd.DataFrame): loaded df chunk; every row must provide
            ``databaseId``, ``scraped['meta']`` and ``scraped['episodes']``.

    Returns:
        tuple[pd.DataFrame, pd.DataFrame]: podcasts, episodes.
            ``podcasts`` holds the expanded ``meta`` fields plus an ``id``
            column. ``episodes`` is one flat frame with a row per episode and
            a ``podcastId`` column (empty, with only ``podcastId``, when
            ``df`` has no rows).
    """
    podcasts = df.apply(lambda x: pd.Series(x['scraped']['meta']), axis=1)
    podcasts['id'] = df['databaseId']

    # Applying get_episodes row-wise produces one DataFrame per podcast (a
    # Series of frames), not the single DataFrame promised by the return
    # annotation, so collect the per-podcast frames and concatenate them.
    frames = [get_episodes(row) for _, row in df.iterrows()]
    if frames:
        # Each per-podcast frame is numbered from 0, so rebuild the index
        # instead of keeping colliding labels.
        episodes = pd.concat(frames, ignore_index=True)
    else:
        # pd.concat raises on an empty list; hand back an empty table instead.
        episodes = pd.DataFrame(columns=['podcastId'])

    return podcasts, episodes
    
def filter_podcasts(podcasts: pd.DataFrame, max_na_fraction: float = 0.9) -> pd.DataFrame:
    """Filter podcasts and drop columns.

    Drops every column whose share of missing values exceeds
    ``max_na_fraction``, drops the ``type`` and ``funding`` columns when
    present, and converts ``explicit`` to a plain boolean column.

    The frame is modified in place and also returned.

    Args:
        podcasts (pd.DataFrame): podcasts dataframe
        max_na_fraction (float): columns with a larger fraction of missing
            values are removed. Defaults to 0.9.

    Returns:
        pd.DataFrame: filtered podcasts dataframe (the same object as the
            ``podcasts`` argument)
    """
    # Mean of the boolean NaN mask == fraction of missing values per column.
    na_fraction = podcasts.isna().mean()
    sparse_columns = na_fraction[na_fraction > max_na_fraction].index
    podcasts.drop(columns=sparse_columns, inplace=True)
    podcasts.drop(columns=['type', 'funding'], inplace=True, errors='ignore')

    # ``explicit`` may be absent or may just have been dropped as too sparse.
    if 'explicit' in podcasts.columns:
        explicit = podcasts['explicit']
        # A bare astype(bool) maps NaN to True; mask with notna() so a
        # missing flag becomes False instead.
        # NOTE(review): string flags such as 'no' would still cast to True;
        # confirm the column only carries booleans.
        podcasts['explicit'] = explicit.notna() & explicit.astype(bool)

    return podcasts

from datetime import datetime
def timezone_map(x):
    """Parse the calendar date out of a ``pubDate`` string.

    Only the first four whitespace-separated tokens (e.g.
    ``Wed, 13 Oct 2021``) are parsed; the time of day and the timezone that
    follow are discarded.

    Args:
        x: date string such as ``Wed, 13 Oct 2021 23:50:00 GMT``.

    Returns:
        datetime | None: the parsed date at midnight, or ``None`` when ``x``
            is not a string or its first four tokens do not match the
            ``%a, %d %b %Y`` format. ``%a`` and ``%b`` are locale-dependent.
    """
    date_format = "%a, %d %b %Y"
    try:
        # split() with no argument tolerates leading and repeated whitespace,
        # which split(' ') would turn into empty tokens.
        date_part = " ".join(x.split()[:4])  # remove timestamp, keep only date
        return datetime.strptime(date_part, date_format)
    except (AttributeError, TypeError, ValueError):
        # AttributeError/TypeError: non-string input (None, NaN, bytes);
        # ValueError: text that does not match date_format.
        return None

def filter_episodes(episodes: pd.DataFrame, max_na_fraction: float = 0.9) -> pd.DataFrame:
    """Filter episodes and drop columns.

    Drops every column whose share of missing values exceeds
    ``max_na_fraction``, drops ``funding``, ``transcript`` and ``soundbite``
    when present, converts ``explicit`` to a plain boolean column and maps
    ``pubDate`` through ``timezone_map``.

    The frame is modified in place and also returned.

    Args:
        episodes (pd.DataFrame): episodes dataframe
        max_na_fraction (float): columns with a larger fraction of missing
            values are removed. Defaults to 0.9.

    Returns:
        pd.DataFrame: filtered episodes dataframe (the same object as the
            ``episodes`` argument)
    """
    # Mean of the boolean NaN mask == fraction of missing values per column.
    na_fraction = episodes.isna().mean()
    sparse_columns = na_fraction[na_fraction > max_na_fraction].index
    episodes.drop(columns=sparse_columns, inplace=True)
    episodes.drop(columns=['funding', 'transcript', 'soundbite'], inplace=True, errors='ignore')

    # Either column may be absent or may just have been dropped as too sparse.
    if 'explicit' in episodes.columns:
        explicit = episodes['explicit']
        # A bare astype(bool) maps NaN to True; mask with notna() so a
        # missing flag becomes False instead.
        episodes['explicit'] = explicit.notna() & explicit.astype(bool)

    if 'pubDate' in episodes.columns:
        # Unparseable or missing dates come back as None from timezone_map.
        episodes['pubDate'] = episodes['pubDate'].map(timezone_map)

    return episodes
    

In [3]:
# Load the raw chunk 'data/xan'; lines=True reads the file as one JSON
# object per line.
df = pd.read_json('data/xan', lines=True)

In [22]:
# Expand each row's scraped['meta'] mapping into its own columns, one row per
# podcast (same expression as in split_data).
podcasts = df.apply(lambda x: pd.Series(x['scraped']['meta']), axis=1)

In [27]:
# Carry the database id over from the raw frame as the `id` column.
podcasts['id'] = df['databaseId']

In [13]:
# Preview the episode list of the first podcast. The 'scraped' field is
# selected by label: an integer key on a label-indexed row (`row[0]`) is
# treated positionally only through a deprecated fallback.
pd.DataFrame(df.iloc[0]['scraped']['episodes'])

Unnamed: 0,title,description,duration,enclosure,explicit,funding,guid,imageURL,link,pubDate,soundbite,summary,transcript,value
0,"después de odio en redes sociales, Ciomar a Ca...","Hola a todos. En este episodio, David Mendoza ...",1518,"{'length': '24563220', 'type': 'audio/x-m4a', ...",False,[],5a3c6e00-33e4-4811-b7eb-d0b6c7644a32,https://d3t3ozftmdmh3i.cloudfront.net/producti...,https://anchor.fm/david-ferrufino/episodes/des...,"Wed, 13 Oct 2021 23:50:00 GMT",[],"Hola a todos. En este episodio, David Mendoza ...",[],
1,Resumen deportivo correspondiente al 23 de nov...,"En este episodio, Fabricio Ponce nos trae la e...",401,"{'length': '6501499', 'type': 'audio/x-m4a', '...",False,[],d8368585-78a2-48b2-94e1-10ba78e70ebe,https://d3t3ozftmdmh3i.cloudfront.net/producti...,https://anchor.fm/david-ferrufino/episodes/Res...,"Tue, 23 Nov 2021 13:32:27 GMT",[],"En este episodio, Fabricio Ponce nos trae la e...",[],
2,¿Realmente Apple hace historia con sus nuevos ...,"En este episodio, Tito Valle y David Mendoza, ...",1452,"{'length': '23491678', 'type': 'audio/x-m4a', ...",False,[],077fb77b-e75c-439b-8982-8c271c137aca,https://d3t3ozftmdmh3i.cloudfront.net/producti...,https://anchor.fm/david-ferrufino/episodes/Rea...,"Tue, 19 Oct 2021 22:14:02 GMT",[],"En este episodio, Tito Valle y David Mendoza, ...",[],
3,Podcast deportivo 17 de enero de 2022,"En este episodio, Fabricio Ponce nos actualiza...",576,"{'length': '9321321', 'type': 'audio/x-m4a', '...",False,[],75fd6db8-bd56-4620-9d81-a9932171c95f,https://d3t3ozftmdmh3i.cloudfront.net/producti...,https://anchor.fm/david-ferrufino/episodes/Pod...,"Tue, 18 Jan 2022 00:00:00 GMT",[],"En este episodio, Fabricio Ponce nos actualiza...",[],
4,Analizando las zedes en Honduras,Qué tal oyentes del podcast EL ANÁLISIS. En es...,2350,"{'length': '38020511', 'type': 'audio/x-m4a', ...",False,[],f7a04554-8466-40a6-8303-2e8b6be056a3,https://d3t3ozftmdmh3i.cloudfront.net/producti...,https://anchor.fm/david-ferrufino/episodes/Ana...,"Tue, 02 Nov 2021 15:35:00 GMT",[],Qué tal oyentes del podcast EL ANÁLISIS. En es...,[],
5,"Propuestas electorales en Honduras, conócelas ...","<p>en este episodio, David Mendoza, Fabricio P...",1967,"{'length': '31826009', 'type': 'audio/x-m4a', ...",False,[],a33228b9-949e-4255-9b6b-93eaa3af7f11,https://d3t3ozftmdmh3i.cloudfront.net/producti...,https://anchor.fm/david-ferrufino/episodes/Pro...,"Sun, 28 Nov 2021 03:07:41 GMT",[],"<p>en este episodio, David Mendoza, Fabricio P...",[],
6,EL ANÁLISIS (Trailer),,25,"{'length': '410594', 'type': 'audio/x-m4a', 'u...",False,[],a4dba4c6-f7b2-4ec5-88b8-ba7e391590b3,https://d3t3ozftmdmh3i.cloudfront.net/producti...,https://anchor.fm/david-ferrufino/episodes/ep-...,"Sun, 03 Jan 2021 17:03:47 GMT",[],,[],
7,un resumen deportivo interesante,"<p>en este episodio, el compañero Fabricio Pon...",1529,"{'length': '24744402', 'type': 'audio/x-m4a', ...",False,[],01eda7e7-7a37-41e7-910e-4ea8185d8ab8,https://d3t3ozftmdmh3i.cloudfront.net/producti...,https://anchor.fm/david-ferrufino/episodes/un-...,"Sat, 27 Nov 2021 17:54:38 GMT",[],"<p>en este episodio, el compañero Fabricio Pon...",[],
8,Sección deportiva correspondiente al 22 de oct...,"En este episodio, Fabricio Ponce nos hace un e...",1046,"{'length': '16932346', 'type': 'audio/x-m4a', ...",False,[],6ebe3545-a08b-4ffd-a688-b9106352c832,https://d3t3ozftmdmh3i.cloudfront.net/producti...,https://anchor.fm/david-ferrufino/episodes/Sec...,"Sat, 23 Oct 2021 15:34:02 GMT",[],"En este episodio, Fabricio Ponce nos hace un e...",[],
9,sección deportiva correspondiente al 15 de ene...,"en este episodio, Fabricio Ponce nos habla de ...",615,"{'length': '9956668', 'type': 'audio/x-m4a', '...",False,[],7572804a-ed02-4584-9f56-3f732939e175,https://d3t3ozftmdmh3i.cloudfront.net/producti...,https://anchor.fm/david-ferrufino/episodes/sec...,"Sat, 15 Jan 2022 18:10:00 GMT",[],"en este episodio, Fabricio Ponce nos habla de ...",[],


In [4]:
def get_episodes(x):
    """Build the episode table for a single podcast row.

    Note: this repeats the ``get_episodes`` helper defined in an earlier cell
    of this notebook; whichever cell ran last is the definition in effect.

    Args:
        x (pd.Series): row of the raw dataframe; must provide
            ``x['databaseId']`` and ``x['scraped']['episodes']``.

    Returns:
        pd.DataFrame: one row per entry of ``x['scraped']['episodes']`` with
            an extra ``podcastId`` column holding ``x['databaseId']``.
    """
    # Named ``podcast_id`` (not ``id``) so the builtin is not shadowed.
    podcast_id = x['databaseId']
    episode_table = pd.DataFrame(x['scraped']['episodes'])
    episode_table['podcastId'] = podcast_id
    return episode_table

In [5]:
# Row-wise apply: get_episodes returns a DataFrame for each row, so
# `episodes` is presumably a Series of per-podcast frames at this point
# (hence the .to_list() in the following cell).
episodes = df.apply(get_episodes, axis=1)

In [7]:
# Flatten the per-podcast frames into one table and keep the result. The
# cleaning cells further down use `episodes` as a flat DataFrame (column
# drops, episodes['explicit'], episodes.pubDate), which a concat whose result
# is only displayed never provides.
# ignore_index: each per-podcast frame is numbered from 0, so the original
# labels would repeat.
# NOTE: this rebinds `episodes`; to re-run this cell, re-run the apply cell
# that creates the Series of frames first.
episodes = pd.concat(episodes.to_list(), ignore_index=True)

KeyboardInterrupt: 

In [1]:
# Look at the first element of `episodes`. The recorded NameError comes from
# running this cell first in a fresh kernel (execution count 1), before
# `episodes` had been defined.
episodes[0]

NameError: name 'episodes' is not defined

In [43]:
# Summary statistics of the podcasts frame; in the recorded output `id` is
# the only column reported.
podcasts.describe()

Unnamed: 0,id
count,250000.0
mean,2831034.0
std,131229.7
min,2603673.0
25%,2713973.0
50%,2830568.0
75%,2951848.0
max,3048218.0


In [56]:
# Fraction of missing values per podcast column, sorted ascending. Despite
# its name, `value_counts` holds NaN ratios, not value counts.
value_counts = (podcasts.isna().sum() / podcasts.shape[0]).sort_values()

In [61]:
# Drop the columns that are more than 90% missing. Mutates `podcasts` in
# place and depends on `value_counts` from the preceding cell.
podcasts.drop(columns=value_counts[value_counts > 0.9].index, inplace=True)

In [86]:
# Remove the `type` and `funding` columns; errors='ignore' makes this a no-op
# for columns that are not present (e.g. on a re-run).
podcasts.drop(columns=['type', 'funding'], inplace=True, errors='ignore')

In [102]:
# Persist the cleaned podcasts frame; this is the file the read_pickle cell
# near the top of the notebook loads.
# NOTE: in file order this cell precedes the `explicit` cast below, so a
# top-to-bottom run saves the frame before that conversion (the recorded
# execution counts show the cast was actually run first).
podcasts.to_pickle('data/xaa_podcasts.pkl')

In [96]:
# Convert `explicit` to a plain bool column. A bare astype(bool) maps NaN to
# True, so mask with notna() to make a missing flag False instead.
podcasts['explicit'] = podcasts['explicit'].notna() & podcasts['explicit'].astype(bool)

In [47]:
# Fraction of missing values per episode column, sorted ascending. This
# reuses (and overwrites) the `value_counts` name used for podcasts above.
value_counts = (episodes.isna().sum() / episodes.shape[0]).sort_values()

In [48]:
# Display the per-column missing-value fractions computed in the previous cell.
value_counts

explicit       0.000000
podcastId      0.000000
title          0.000009
enclosure      0.008669
pubDate        0.008950
duration       0.054645
link           0.081812
description    0.137879
imageURL       0.170745
guid           0.228511
summary        0.365771
author         0.490391
subtitle       0.647667
keywords       0.839837
dtype: float64

In [8]:
# Drop the episode columns that are more than 90% missing (in place).
# NOTE(review): this relies on `value_counts` describing `episodes`; the
# recorded execution counts (In [8] here vs In [47] for the cell computing
# it) are out of order, so confirm the intended run order.
episodes.drop(columns=value_counts[value_counts > 0.9].index, inplace=True)

In [9]:
# Convert `explicit` to a plain bool column. A bare astype(bool) maps NaN to
# True, so mask with notna() to make a missing flag False instead.
episodes['explicit'] = episodes['explicit'].notna() & episodes['explicit'].astype(bool)

In [10]:
# Remove the `funding`, `transcript` and `soundbite` columns; errors='ignore'
# makes this a no-op for columns that are not present.
episodes.drop(columns=['funding', 'transcript', 'soundbite'], inplace=True, errors='ignore')

In [40]:
from datetime import datetime
def timezone_map(x):
    """Parse the calendar date out of a ``pubDate`` string.

    Note: this repeats the ``timezone_map`` helper defined in an earlier cell
    of this notebook; whichever cell ran last is the definition in effect.

    Only the first four whitespace-separated tokens (e.g.
    ``Wed, 13 Oct 2021``) are parsed; the time of day and the timezone that
    follow are discarded.

    Args:
        x: date string such as ``Wed, 13 Oct 2021 23:50:00 GMT``.

    Returns:
        datetime | None: the parsed date at midnight, or ``None`` when ``x``
            is not a string or its first four tokens do not match the
            ``%a, %d %b %Y`` format. ``%a`` and ``%b`` are locale-dependent.
    """
    date_format = "%a, %d %b %Y"
    try:
        # split() with no argument tolerates leading and repeated whitespace,
        # which split(' ') would turn into empty tokens.
        date_part = " ".join(x.split()[:4])  # remove timestamp, keep only date
        return datetime.strptime(date_part, date_format)
    except (AttributeError, TypeError, ValueError):
        # AttributeError/TypeError: non-string input (None, NaN, bytes);
        # ValueError: text that does not match date_format.
        return None

In [41]:



# Parse every pubDate string down to its calendar date; timezone_map returns
# None for values it cannot parse.
dates = episodes.pubDate.map(timezone_map)

In [43]:
# Replace the raw pubDate strings with the parsed dates from the previous cell.
episodes['pubDate'] = dates

In [52]:
# Persist the cleaned episodes frame; this is the file the read_pickle cell
# near the top of the notebook loads.
episodes.to_pickle('data/xaa_episodes.pkl')