In [6]:
from bs4 import BeautifulSoup
import pandas as pd
import requests
from tqdm import tqdm
import itertools
import datetime
import requests_cache

In [7]:
def get_premiere_df():
    url_list = [f"https://i-amabile.com/premier?pageID={i+1}" for i in range(40)]
    response_list = [requests.get(url) for url in tqdm(url_list)]
    soup_list = [BeautifulSoup(response.text) for response in tqdm(response_list)]
    table_list = list(
        itertools.chain.from_iterable(
            soup.select(".single table") for soup in tqdm(soup_list)
        )
    )
    df = pd.DataFrame(dict(table=table_list))
    df["date"] = df.table.apply(
        lambda x: datetime.datetime.strptime(
            x.select("td")[1].text.split("(")[0], "%Y年%m月%d日"
        )
    )
    df["title"] = df.table.apply(lambda x: x.select_one("a").text.lstrip("●"))
    df["link"] = df.table.apply(lambda x: x.select_one("a")["href"])
    df["text"] = df.table.apply(lambda x: x.text)
    df["orchestra"] = df.title.apply(lambda x: x.split("第")[0])
    orchestra_df = df.orchestra.value_counts().to_frame().reset_index().reset_index()
    orchestra_df.columns = ["orchestra_id", "orchestra", "orchestra_count"]
    df = pd.merge(df, orchestra_df)
    df = df.sort_values("date", ascending=False)[
        [
            "orchestra_id",
            "orchestra",
            "orchestra_count",
            "date",
            "title",
            "text",
            "link",
        ]
    ].reset_index(drop=True)
    return df

In [8]:
requests_cache.install_cache('amabile_cache')

In [9]:
df = get_premiere_df()

100%|██████████| 40/40 [00:00<00:00, 817.93it/s]
100%|██████████| 40/40 [00:00<00:00, 121.94it/s]
100%|██████████| 40/40 [00:00<00:00, 487.32it/s]


In [10]:
df.to_csv("amabile_premiere.csv", index=False)