In [10]:
import spotipy
from spotipy.oauth2 import SpotifyClientCredentials
import pandas as pd
from textblob import TextBlob
import re

In [11]:
CLIENT_ID = "cf66d17bc81344338b463aa9968574fe"
CLIENT_SECRET = "d2281332006e46c3b093d7a4dc222291"

def authenticate_spotify(client_id, client_secret):
    """认证Spotify API并返回Spotipy实例"""
    auth_manager = SpotifyClientCredentials(client_id=client_id, client_secret=client_secret)
    return spotipy.Spotify(auth_manager=auth_manager)

def get_podcast_episodes(sp, podcast_id, max_episodes=50):
    """获取指定播客的剧集信息，包括描述"""
    episodes = []
    offset = 0
    while True:
        response = sp.show_episodes(podcast_id, limit=max_episodes, offset=offset)
        episodes.extend(response['items'])
        if not response['next']:
            break
        offset += max_episodes
    return episodes

def search_podcasts(sp, query, max_podcasts=50, max_results=300):
    """通过搜索关键字获取播客，包括描述，最多返回指定数量"""
    offset = 0
    podcasts = []
    while len(podcasts) < max_results:
        response = sp.search(q=query, type='show', limit=max_podcasts, offset=offset)
        podcasts.extend(response['shows']['items'])
        if not response['shows']['next']:
            break
        offset += max_podcasts
    return podcasts[:max_results]

In [12]:
def crawl_podcasts_and_episodes(client_id, client_secret, keywords, max_episodes_target=10000, episodes_per_keyword=300):
    """爬取播客及其剧集"""
    sp = authenticate_spotify(client_id, client_secret)
    all_data = []
    total_episodes = 0

    for keyword in keywords:
        print(f"正在搜索关键词: {keyword}")
        podcasts = search_podcasts(sp, query=keyword, max_podcasts=50, max_results=50)  # 每次搜索最多50个播客
        keyword_episode_count = 0  # 用于统计当前关键词的剧集数量
        
        for podcast in podcasts:
            if keyword_episode_count >= episodes_per_keyword:  # 达到关键词限制，跳出
                break
            
            podcast_id = podcast['id']
            podcast_name = podcast['name']
            podcast_description = podcast.get('description', 'N/A')  # 获取播客描述
            try:
                episodes = get_podcast_episodes(sp, podcast_id, max_episodes=min(episodes_per_keyword - keyword_episode_count, 50))  # 每次最多抓取剩余需要的剧集数量
                for episode in episodes:
                    episode_description = episode.get('description', 'N/A')  # 获取剧集描述
                    all_data.append({
                        'Podcast Name': podcast_name,
                        'Podcast ID': podcast_id,
                        'Podcast Description': podcast_description,
                        'Episode Name': episode['name'],
                        'Episode ID': episode['id'],
                        'Episode Description': episode_description,
                        'Release Date': episode['release_date'],
                        'Duration (ms)': episode['duration_ms']
                    })
                    keyword_episode_count += 1
                    total_episodes += 1
                    # 如果达到目标数量，停止抓取
                    if total_episodes >= max_episodes_target or keyword_episode_count >= episodes_per_keyword:
                        break
            except Exception as e:
                print(f"错误: {e} (Podcast: {podcast_name})")
            if total_episodes >= max_episodes_target or keyword_episode_count >= episodes_per_keyword:
                break

    print(f"爬取完成，共获取 {len(all_data)} 条剧集信息。")
    return pd.DataFrame(all_data)

def get_podcast_episodes(sp, podcast_id, max_episodes=50):
    """获取指定播客的剧集信息，包括描述"""
    episodes = []
    offset = 0
    while len(episodes) < max_episodes:
        limit = min(50, max_episodes - len(episodes))  # 每次抓取的最大数量
        response = sp.show_episodes(podcast_id, limit=limit, offset=offset)
        episodes.extend(response['items'])
        if not response['next']:
            break
        offset += limit
    return episodes[:max_episodes]  # 确保总数不超过限制


In [None]:
# 保存为CSV
# KEYWORDS = [ 'music', 'philosophy', 'psychology', 'economics']
# KEYWORDS = ['education', 'science', 'technology', 'comedy', 'history', 'health', 'business', 
#             'news', 'sports', 'culture', 'art', 'music', 'philosophy', 'psychology', 'economics']
KEYWORDS = ['Onerepublic',"blockchain","neural network","Madison"]
MAX_EPISODES_TARGET = 100
EPISODES_PER_KEYWORD = 30

# 爬取数据
df = crawl_podcasts_and_episodes(
    client_id=CLIENT_ID,
    client_secret=CLIENT_SECRET,
    keywords=KEYWORDS,
    max_episodes_target=MAX_EPISODES_TARGET,
    episodes_per_keyword=EPISODES_PER_KEYWORD
)

df.to_csv('D:\\.1_assignment\\628\\spotify\\podcast_episodes_with_descriptions.csv', index=False)
print(f"数据已保存至 podcast_episodes_with_descriptions.csv")

正在搜索关键词: education
错误: 'NoneType' object has no attribute 'get' (Podcast: Wild Times: Wildlife Education)
正在搜索关键词: science
错误: 'NoneType' object has no attribute 'get' (Podcast: Science Vs)
正在搜索关键词: technology
正在搜索关键词: comedy
正在搜索关键词: history
正在搜索关键词: health
错误: 'NoneType' object has no attribute 'get' (Podcast: Dr. Berg’s Healthy Keto and Intermittent Fasting Podcast)
错误: 'NoneType' object has no attribute 'get' (Podcast: Let's Talk About Mental Health with Jeremy Godwin)
正在搜索关键词: business
正在搜索关键词: news
正在搜索关键词: sports
错误: 'NoneType' object has no attribute 'get' (Podcast: Go! My Favorite Sports Team)
正在搜索关键词: culture
错误: 'NoneType' object has no attribute 'get' (Podcast: Culture Gabfest)
正在搜索关键词: art
错误: 'NoneType' object has no attribute 'get' (Podcast: Padre José Arturo López Cornejo)
错误: 'NoneType' object has no attribute 'get' (Podcast: Art)
错误: 'NoneType' object has no attribute 'get' (Podcast: Meditate with Gurudev - The Art of Living)
正在搜索关键词: music
错误: 'NoneType' object has n

In [16]:
KEYWORDS = ['Onerepublic',"Cezanne","neural network","NBA","Rome"]
MAX_EPISODES_TARGET = 200
EPISODES_PER_KEYWORD = 40

df = crawl_podcasts_and_episodes(
    client_id=CLIENT_ID,
    client_secret=CLIENT_SECRET,
    keywords=KEYWORDS,
    max_episodes_target=MAX_EPISODES_TARGET,
    episodes_per_keyword=EPISODES_PER_KEYWORD
)

df.to_csv('D:\\.1_assignment\\628\\spotify\\test_set.csv', index=False)

正在搜索关键词: Onerepublic
正在搜索关键词: Cezanne
正在搜索关键词: neural network
正在搜索关键词: NBA
正在搜索关键词: Rome
爬取完成，共获取 200 条剧集信息。


In [7]:
df["Podcast Name"].unique()

array(['Stuff You Should Know',
       'Who Smarted? - Educational Podcast for Kids',
       'Science of Reading: The Podcast', 'TED Talks Daily',
       'The Cult of Pedagogy Podcast',
       'The Science of Everything Podcast',
       'Armchair Expert with Dax Shepard',
       'The Knowledge Matters Podcast', "1000 facts you didn't know",
       'Huberman Lab', 'The Flip Side by Get Your Teach On',
       'Teach Me, Teacher', 'Drac: Educational Missions for Curious Kids',
       'Beginning Teacher Talk: A Podcast for New Elementary Teachers',
       "Let's Learn Everything!", 'SmartLess',
       'Brains On! Science podcast for kids', 'Science Friday',
       'StarTalk Radio',
       "Sean Carroll's Mindscape: Science, Society, Philosophy, Culture, Arts, and Ideas",
       'Science Magazine Podcast', 'Ologies with Alie Ward',
       'Big Picture Science', 'Short Wave', 'Unexplainable',
       'Science Weekly', 'ZOE Science & Nutrition', 'TED Tech',
       'Bloomberg Technology', 'Free

In [8]:
len(df)

10000

In [9]:


def remove_emoji_and_urls(text):
    """
    删除文本中的 emoji 表情和 URL 链接
    """
    if not isinstance(text, str):
        return text
    
    # 删除 URL 链接
    text = re.sub(r"http[s]?://\S+|www\.\S+", "", text)
    
    # 删除 Emoji 表情
    emoji_pattern = re.compile(
        "["
        u"\U0001F600-\U0001F64F"  # 表情符号
        u"\U0001F300-\U0001F5FF"  # 符号 & 图标
        u"\U0001F680-\U0001F6FF"  # 运输 & 地图图标
        u"\U0001F700-\U0001F77F"  # Alchemical Symbols
        u"\U0001F780-\U0001F7FF"  # Geometric Shapes Extended
        u"\U0001F800-\U0001F8FF"  # Supplemental Arrows-C
        u"\U0001F900-\U0001F9FF"  # Supplemental Symbols and Pictographs
        u"\U0001FA00-\U0001FA6F"  # Chess Symbols
        u"\U0001FA70-\U0001FAFF"  # Symbols and Pictographs Extended-A
        u"\U00002702-\U000027B0"  # Dingbats
        u"\U000024C2-\U0001F251"  # Enclosed characters
        "]+", flags=re.UNICODE
    )
    text = emoji_pattern.sub(r"", text)
    
    return text



def clean_csv(input):
  df = input

  for col in df.columns:
      df[col] = df[col].apply(remove_emoji_and_urls)
  return df


cleaned_df = clean_csv(df)

cleaned_df.to_csv("D:\\.1_assignment\\628\\spotify\\cleaned_file.csv", index=False)
