In [63]:
from google.colab import drive
drive.mount('/content/drive', force_remount=True)

Mounted at /content/drive


## Installations

In [64]:
!pip install ipython-autotime
%load_ext autotime

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting ipython-autotime
  Downloading ipython_autotime-0.3.1-py2.py3-none-any.whl (6.8 kB)
Collecting jedi>=0.10
  Downloading jedi-0.18.2-py2.py3-none-any.whl (1.6 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.6/1.6 MB[0m [31m17.1 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: jedi, ipython-autotime
Successfully installed ipython-autotime-0.3.1 jedi-0.18.2
time: 1.76 ms (started: 2023-03-03 11:02:11 +00:00)


In [65]:
!pip install openai

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting openai
  Downloading openai-0.27.0-py3-none-any.whl (70 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m70.1/70.1 KB[0m [31m2.5 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: openai
Successfully installed openai-0.27.0
time: 6.06 s (started: 2023-03-03 11:02:11 +00:00)


In [66]:
!pip install --upgrade openai

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
time: 5.45 s (started: 2023-03-03 11:02:17 +00:00)


In [67]:
!pip install tiktoken

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting tiktoken
  Downloading tiktoken-0.3.0-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (1.6 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.6/1.6 MB[0m [31m17.9 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting requests>=2.26.0
  Downloading requests-2.28.2-py3-none-any.whl (62 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m62.8/62.8 KB[0m [31m6.5 MB/s[0m eta [36m0:00:00[0m
Collecting blobfile>=2
  Downloading blobfile-2.0.1-py3-none-any.whl (73 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m73.5/73.5 KB[0m [31m8.2 MB/s[0m eta [36m0:00:00[0m
Collecting pycryptodomex~=3.8
  Downloading pycryptodomex-3.17-cp35-abi3-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (2.1 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m2.1/2.1 MB[0m [31m50.3 MB/s[0m eta [36m0:00:00[0m
Installing co

time: 12.9 s (started: 2023-03-03 11:02:23 +00:00)


## Imports

In [68]:
import requests
from bs4 import BeautifulSoup
import pandas as pd
import os
from tqdm import tqdm
import glob
import gzip
import shutil
import re
import tiktoken
from dateutil.relativedelta import relativedelta
from dateutil.parser import parse
from urllib.parse import unquote
import pytz
import datetime
import openai
import json

time: 470 ms (started: 2023-03-03 11:02:36 +00:00)


In [69]:
with open('/content/drive/MyDrive/AdamJonas/ADAM_OPENAI_API_KEY.json') as f:
    data = json.load(f)
    
openai.api_key = data["API_KEY"]
os.environ["OPENAI_API_KEY"] = data["API_KEY"]

time: 678 ms (started: 2023-03-03 11:02:36 +00:00)


## Config

In [92]:
# configs to get token length
tokenizer = tiktoken.get_encoding("cl100k_base")

# if set to True, it will use chatgpt model ("gpt-4-1106-preview") for all the completions
CHATGPT = True

# COMPLETION_MODEL - only applicable if CHATGPT is set to False
COMPLETION_MODEL = "text-davinci-003" # "text-ada-001" 


CURRENT_TIME = datetime.datetime.now(datetime.timezone.utc)
print(f"Current time: {CURRENT_TIME}")
CURRENT_TIMESTAMP = str(CURRENT_TIME.timestamp()).replace(".", "_")

Current time: 2023-03-03 11:43:25.412200+00:00
time: 4.12 ms (started: 2023-03-03 11:43:25 +00:00)


## Utils

In [93]:
def normalize_text(s, sep_token = " \n "):
    s = re.sub(r'\s+',  ' ', s).strip()
    s = re.sub(r". ,","", s)
    s = s.replace("..",".")
    s = s.replace(". .",".")
    s = s.replace("\n", "")
    s = s.replace("#","")
    s = s.strip()
    return s


def is_date(string, fuzzy=False):
    """
    Return whether the string can be interpreted as a date.
    :param string: str, string to check for date
    :param fuzzy: bool, ignore unknown tokens in string if True
    """
    try: 
        parse(string, fuzzy=fuzzy)
        return True
    except ValueError:
        return False

def preprocess_email(email_body):
    email_body = email_body.split("-------------- next part --------------")[0]
    email_lines = email_body.split('\n')
    temp_ = []
    for line in email_lines:
        if line.startswith("On"):
            line = line.replace("-", " ")
            x = re.sub('\d', ' ', line)
            if is_date(x, fuzzy=True):
                continue
            if line.endswith("> wrote:"):
                continue
        if line.endswith("> wrote:"):
                continue
        if line.startswith("Le "):
            continue
        if line.endswith("?crit :"):
            continue
        if line and not line.startswith('>'):
            if line.startswith('-- ') or line.startswith('[') or line.startswith('_____'):
                continue
            temp_.append(line)
    email_string = "\n".join(temp_)
    normalized_email_string = normalize_text(email_string)
    return normalized_email_string


def scrape_email_data(url_):
    r = requests.get(url_)
    body_soup = BeautifulSoup(r.content, 'html.parser').body
    subject = body_soup.find('h1').text
    author = body_soup.find('b').text
    timestamp = body_soup.find('i').text
    timestamp = parse(str(timestamp), fuzzy=True)
    timestamp = timestamp.astimezone(pytz.utc).strftime('%Y-%m-%d %H:%M:%S')
    email_body = body_soup.find('pre').text
    normalized_email_body = preprocess_email(email_body)
    return author, timestamp, normalize_text(subject), normalized_email_body

def get_past_week_data(dataframe, current_time):
    dt_now = current_time
    dt_min = dt_now - datetime.timedelta(days=7)
    dataframe['timestamp'] = pd.to_datetime(dataframe['timestamp'], utc=True)
    sliced_df = dataframe[(dataframe['timestamp'] >= dt_min) & (dataframe['timestamp'] <= dt_now)]
    sliced_df.dropna(inplace=True)
    sliced_df.reset_index(drop=True, inplace=True)
    return sliced_df
    
def get_datetime_format(dataframe):
    date_list = []
    for i, r in dataframe.iterrows():
        date_string = str(r['date'])
        date_string = date_string.replace("?", " ").strip()
        date_list.append(date_string)
    dataframe['date'] = date_list
    dataframe['date'] = pd.to_datetime(dataframe['date'], utc=True)
    dataframe['date'] = pd.to_datetime(dataframe['date'], format='%Y-%m-%d %H:%M:%S', utc=True)
    dataframe['date'] = dataframe['date'].dt.strftime('%Y-%m-%d %H:%M:%S')
    return dataframe

time: 15.7 ms (started: 2023-03-03 11:43:30 +00:00)


### GPT-3

In [94]:
def generate_summary(prompt):
    summarization_prompt = f"Generate a detailed summary from below context factfully without missing any important information. Do not use the word 'summary' in it.\n\n CONTEXT:\n\n{prompt}"
    response = openai.Completion.create(
        model=COMPLETION_MODEL,
        prompt=summarization_prompt,
        temperature=0.7,
        max_tokens=1000,
        top_p=1.0,
        frequency_penalty=0.0,
        presence_penalty=1
        )
    response_str = response["choices"][0]["text"].replace("\n", "").strip()
    return response_str

def consolidate_summary(prompt):
    consolidate_prompt = f"Consolidate below context.\n\n CONTEXT:\n\n{prompt}"
    response = openai.Completion.create(
        model=COMPLETION_MODEL,
        prompt=consolidate_prompt,
        temperature=0.7,
        max_tokens=1000,
        top_p=1.0,
        frequency_penalty=0.0,
        presence_penalty=1
        )
    response_str = response["choices"][0]["text"].replace("\n", "").strip()
    return response_str

def generate_title(prompt):
    title_generation_prompt = f"Generate an appropriate title for below context.\n\n CONTEXT:\n\n{prompt}"
    response = openai.Completion.create(
        model=COMPLETION_MODEL,
        prompt=title_generation_prompt,
        temperature=0.7,
        max_tokens=30,
        top_p=1.0,
        frequency_penalty=0.0,
        presence_penalty=1
        )
    response_str = response["choices"][0]["text"].replace("\n", "").strip()
    return response_str

time: 4.3 ms (started: 2023-03-03 11:43:35 +00:00)


### ChatGPT

In [95]:
def generate_chatgpt_summary(prompt):
    summarization_prompt = f"Generate a detailed summary from below context factfully without missing any important information. Do not use the word 'summary' in it.\n\n CONTEXT:\n\n{prompt}"
    response = openai.ChatCompletion.create(
        model="gpt-4-1106-preview",
        messages=[
                {"role": "system", "content": "You are an intelligent assistant."},
                {"role": "user", "content": f"{summarization_prompt}"},
            ],
        temperature=0.7,
        max_tokens=1000,
        top_p=1.0,
        frequency_penalty=0.0,
        presence_penalty=1
        )
    response_str = response['choices'][0]['message']['content'].replace("\n", "").strip()
    return response_str

def consolidate_chatgpt_summary(prompt):
    consolidate_prompt = f"Consolidate below context.\n\n CONTEXT:\n\n{prompt}"
    response = openai.ChatCompletion.create(
        model="gpt-4-1106-preview",
        messages=[
                {"role": "system", "content": "You are an intelligent assistant."},
                {"role": "user", "content": f"{consolidate_prompt}"},
            ],
        temperature=0.7,
        max_tokens=1000,
        top_p=1.0,
        frequency_penalty=0.0,
        presence_penalty=1
        )
    response_str = response['choices'][0]['message']['content'].replace("\n", "").strip()
    return response_str

def generate_chatgpt_title(prompt):
    title_generation_prompt = f"Generate an appropriate title for below context.\n\n CONTEXT:\n\n{prompt}"
    response = openai.ChatCompletion.create(
        model="gpt-4-1106-preview",
        messages=[
                {"role": "system", "content": "You are an intelligent assistant."},
                {"role": "user", "content": f"{title_generation_prompt}"},
            ],
        temperature=0.7,
        max_tokens=1000,
        top_p=1.0,
        frequency_penalty=0.0,
        presence_penalty=1
        )
    response_str = response['choices'][0]['message']['content'].replace("\n", "").strip()
    return response_str

time: 8.54 ms (started: 2023-03-03 11:43:36 +00:00)


## Data Collection

In [96]:
def collect_email_urls(base_url):
    urls_list = []
    # add current month
    month_route = f"{CURRENT_TIME.strftime('%Y-%B')}"
    email_thread_url = f"{base_url}/{month_route}/"
    urls_list.append(email_thread_url)

    # if current month is not past 7 days, add previous month as well
    if CURRENT_TIME.day < 7:
        prev_month = (CURRENT_TIME - relativedelta(months=1)).strftime('%Y-%B')
        email_thread_url = f"{base_url}/{prev_month}/"
        urls_list.append(email_thread_url)

    all_email_urls = []
    for base_url in urls_list:
        print(f"working on: {base_url}")
        scrape_url = "date.html"
        r = requests.get(base_url + scrape_url)
        soup = BeautifulSoup(r.content, 'html.parser')
        if soup.body:
            ul_soup = soup.body.findAll('ul')[1]
            li_rows = ul_soup.findAll('li')

            # get all emails urls
            email_urls = [base_url + str(i.a['href']).strip() for i in li_rows]
            all_email_urls.extend(email_urls)

    print(f"Fetched Urls: {len(all_email_urls)}")
    return all_email_urls



def scrape_email_urls(email_urls_list):
    df_list = []
    for i in tqdm(email_urls_list):
        auth_, timestamp_, sub_, email_ = scrape_email_data(i)
        df_dict = {
            "timestamp": timestamp_,
            "author": auth_,
            "subject": sub_,
            "email": email_,
            "email_url": i,
        }
        df_list.append(df_dict)
    # data frame of all emails
    emails_df = pd.DataFrame(df_list)

    # filter dataframe to get last week's data only
    df_week = get_past_week_data(emails_df, CURRENT_TIME)
    df_week['tokens'] = df_week['email'].apply(lambda x: len(tokenizer.encode(x)))

    os.makedirs("output", exist_ok=True)
    df_week.to_csv(f"output/df_week_{CURRENT_TIMESTAMP}.csv", index=False)
    return df_week

time: 5.26 ms (started: 2023-03-03 11:43:38 +00:00)


In [75]:
all_email_urls = collect_email_urls("https://lists.linuxfoundation.org/pipermail/bitcoin-dev")

working on: https://lists.linuxfoundation.org/pipermail/bitcoin-dev/2023-March/
working on: https://lists.linuxfoundation.org/pipermail/bitcoin-dev/2023-February/
Fetched Urls: 136
time: 1.06 s (started: 2023-03-03 11:02:41 +00:00)


In [76]:
df_week = scrape_email_urls(all_email_urls)

100%|██████████| 136/136 [01:02<00:00,  2.19it/s]

time: 1min 2s (started: 2023-03-03 11:02:42 +00:00)



A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  return func(*args, **kwargs)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_week['tokens'] = df_week['email'].apply(lambda x: len(tokenizer.encode(x)))


In [77]:
df_week

Unnamed: 0,timestamp,author,subject,email,email_url,tokens
0,2023-03-01 15:05:47+00:00,Greg Sanders,[bitcoin-dev] BIP for OP_VAULT,"Hello James, First off, thank you for crafting...",https://lists.linuxfoundation.org/pipermail/bi...,1044
1,2023-03-01 17:17:58+00:00,David A. Harding,[bitcoin-dev] BIP proposal: Fee-redistribution...,"On 2023 02 27 03:32, Rastislav Budinsky via bi...",https://lists.linuxfoundation.org/pipermail/bi...,540
2,2023-03-01 20:18:22+00:00,Giuseppe B,[bitcoin-dev] Minimum fees,"Hello everyone, I'm relatively new here so wha...",https://lists.linuxfoundation.org/pipermail/bi...,402
3,2023-03-02 00:39:13+00:00,Nadav Ivgi,[bitcoin-dev] Minimum fees,"Hi Giuseppe, One side effect this has is that ...",https://lists.linuxfoundation.org/pipermail/bi...,148
4,2023-03-02 04:46:25+00:00,Anthony Towns,[bitcoin-dev] BIP for OP_VAULT,I like this! I tried to come up with something...,https://lists.linuxfoundation.org/pipermail/bi...,702
5,2023-03-02 06:55:19+00:00,kcalvinalvin,[bitcoin-dev] Using service bit 24 for utreexo...,"Hello all, Wanted to tell the mailing list tha...",https://lists.linuxfoundation.org/pipermail/bi...,174
6,2023-03-02 14:54:31+00:00,Greg Sanders,[bitcoin-dev] BIP for OP_VAULT,"Greetings AJ, Glad I could resurrect the idea!...",https://lists.linuxfoundation.org/pipermail/bi...,518
7,2023-03-02 18:20:35+00:00,Luke Dashjr,[bitcoin-dev] Using service bit 24 for utreexo...,This sounds like something that should be writ...,https://lists.linuxfoundation.org/pipermail/bi...,44
8,2023-03-02 19:51:17+00:00,Andrew Melnychuk Oseen,[bitcoin-dev] BIP for OP_VAULT,I read the draft and this seems to have some f...,https://lists.linuxfoundation.org/pipermail/bi...,314
9,2023-03-02 21:05:26+00:00,Peter Todd,[bitcoin-dev] Using service bit 24 for utreexo...,The purpose of the experimental service bits i...,https://lists.linuxfoundation.org/pipermail/bi...,129


time: 41 ms (started: 2023-03-03 11:03:44 +00:00)


## Data Generation

In [97]:
def get_email_thread_data(sub_df):
    sub_df.sort_values(by='timestamp', ascending=True, inplace=True)
    sub_df.dropna(inplace=True)
    sub_df.reset_index(drop=True, inplace=True)

    first_post_date = ""
    subject = ""
    num_of_replies = sub_df.shape[0]
    author = []
    urls = []
    generated_summary = []
    consolidated_summary = ""
    consolidated_title = ""

    for i, r in tqdm(sub_df.iterrows(), total=sub_df.shape[0]):
        if i == 0:
            first_post_date += (r.timestamp).strftime('%Y-%m-%d %H:%M:%S')
            subject += r.subject
        email_text = r.email
        auth = r.author
        url = r.email_url

        if CHATGPT:
            summary_ = generate_chatgpt_summary(email_text)
        else:
            summary_ = generate_summary(email_text)

        author.append(auth)
        urls.append(url)
        generated_summary.append(summary_)
    
    # consolidated summary
    summary_concat = "\n".join(generated_summary)

    if CHATGPT:
        consolidated_summary += consolidate_chatgpt_summary(summary_concat)
        consolidated_title += generate_chatgpt_title(summary_concat)
    else:
        consolidated_summary += consolidate_summary(summary_concat)
        consolidated_title += generate_title(summary_concat)
    

    data_dict = {
        "date": first_post_date,
        "subject": subject,
        "num_replies": num_of_replies,
        "authors": author,
        "urls": urls,
        "generated_summaries": generated_summary,
        "consolidated_title": consolidated_title,
        "consolidated_summary": consolidated_summary
    }
    return data_dict

time: 3.72 ms (started: 2023-03-03 11:43:43 +00:00)


In [98]:
def generate_newsletter_completion(df):
    grouped_df = df_week.groupby('subject')
    print(f"Number of threads found: {len(grouped_df)}")
    print("-----")

    data_records = []
    for index, sub_df in grouped_df:
        print(f"working on subject: {index}")
        data_dict = get_email_thread_data(sub_df)
        data_records.append(data_dict)
        print("-----")

    df_week_generated = pd.DataFrame(data_records)
    os.makedirs("output", exist_ok=True)
    df_week_generated.to_csv(f"output/df_week_generated_{CURRENT_TIMESTAMP}.csv", index=False)
    return df_week_generated

time: 4.46 ms (started: 2023-03-03 11:43:44 +00:00)


In [80]:
df_week_generated = generate_newsletter_completion(df_week)

Number of threads found: 5
-----
working on subject: [bitcoin-dev] BIP for OP_VAULT


100%|██████████| 4/4 [00:18<00:00,  4.65s/it]


-----
working on subject: [bitcoin-dev] BIP proposal: Fee-redistribution contracts


100%|██████████| 4/4 [00:12<00:00,  3.01s/it]


-----
working on subject: [bitcoin-dev] Minimum fees


100%|██████████| 3/3 [00:09<00:00,  3.12s/it]


-----
working on subject: [bitcoin-dev] Refreshed BIP324


100%|██████████| 2/2 [00:03<00:00,  1.82s/it]


-----
working on subject: [bitcoin-dev] Using service bit 24 for utreexo signaling in testnet and signet


100%|██████████| 3/3 [00:05<00:00,  1.72s/it]


-----
time: 1min 13s (started: 2023-03-03 11:03:44 +00:00)


In [81]:
df_week_generated

Unnamed: 0,date,subject,num_replies,authors,urls,generated_summaries,consolidated_title,consolidated_summary
0,2023-03-01 15:05:47,[bitcoin-dev] BIP for OP_VAULT,4,"[Greg Sanders, Anthony Towns, Greg Sanders, An...",[https://lists.linuxfoundation.org/pipermail/b...,"[The email is from Greg, who is responding to ...","""Proposed Modifications to TLUV-ish Script for...",Context 1: Greg is responding to James' idea a...
1,2023-02-27 13:32:01,[bitcoin-dev] BIP proposal: Fee-redistribution...,4,"[Rastislav Budinsky, HcaFc_jbe, shymaa arafat,...",[https://lists.linuxfoundation.org/pipermail/b...,[The author of a Bachelor's thesis proposes a ...,Proposals for Redistributing Transaction Fees ...,The Bachelor's thesis proposes a fee redistrib...
2,2023-03-01 20:18:22,[bitcoin-dev] Minimum fees,3,"[Giuseppe B, Nadav Ivgi, jk_14 at op.pl]",[https://lists.linuxfoundation.org/pipermail/b...,"[In this proposal, the author suggests a new p...","""Exploring the Implementation of Min_Fees Prot...",The proposal suggests a new protocol rule call...
3,2023-02-28 18:07:06,[bitcoin-dev] Refreshed BIP324,2,"[Dhruv M, Erik Aronesty]",[https://lists.linuxfoundation.org/pipermail/b...,[The discussion regarding the use of short 1-b...,"""Bitcoin Development Team Implements Changes t...",The bitcoin development team has implemented c...
4,2023-03-02 06:55:19,[bitcoin-dev] Using service bit 24 for utreexo...,3,"[kcalvinalvin, Luke Dashjr, Peter Todd]",[https://lists.linuxfoundation.org/pipermail/b...,[The writer of the email plans to use service ...,"""Using Service Bit 24 for Utreexo Capable Node...",The email writer plans to use service bit 24 t...


time: 22.8 ms (started: 2023-03-03 11:04:57 +00:00)


## Save HTML

In [99]:
def save_html_file(df_week_generated, save_file_name):
    # open html
    file_handle = open(f"output/{save_file_name}", "w")

    html_title = "Sample Newsletter"
    html = f'''<!DOCTYPE html><html lang="en"><head><meta charset="UTF-8">
    <meta name="viewport" content="width=device-width, initial-scale=1.0">
    <meta http-equiv="X-UA-Compatible" content="ie=edge">
    <title>{html_title}</title>
    <link rel="stylesheet" href="style.css">
    </head>
    <body>
    <h1 style="text-align:center; font-family:verdana" >Hello World!</h1>
    <br>
    '''
    file_handle.write(html)

    for idx, row in df_week_generated.iterrows():
        # get data
        subject = row.subject
        date = row.date
        num_replies = row.num_replies
        authors = row.authors
        urls = row.urls
        title = row.consolidated_title
        summary = row.consolidated_summary

        # write subjects and all
        html = f"<hr style='border-top: dotted 2px; '><h2 style='text-align:center; font-family:verdana;'>{subject}</h2><b>Date: </b><i>{date}</i><p>Number of replies: {num_replies}</p>"
        file_handle.write(html)

        # write title and summary
        html = f"<h3 style='text-align:center; font-family:verdana; color:#282828;'>{title}</h3><p>{summary}</p><br><b>References:</b>"
        file_handle.write(html)

        for i in range(len(urls)):
            author = authors[i]
            url = urls[i]
            html = f"<ul><li>{author}: <a href='{url}'>{subject}</a></li></ul>"
            file_handle.write(html)
        
        html = f"<br>"
        file_handle.write(html)

    html="</body></html>"
    file_handle.write(html)
    file_handle.close()

    return f"output/{save_file_name}.html"


time: 6.14 ms (started: 2023-03-03 11:43:49 +00:00)


In [86]:
save_html_file(df_week_generated, 'chatgpt_newsletter_03_mar')

'File saved at path: /output/chatgpt_newsletter_03_mar.html'

time: 8.59 ms (started: 2023-03-03 11:07:59 +00:00)


In [88]:
import IPython
IPython.display.HTML(filename="output/chatgpt_newsletter_03_mar.html")

time: 13.5 ms (started: 2023-03-03 11:09:05 +00:00)


# Tldr;

In [100]:
mailing_list_base_url = "https://lists.linuxfoundation.org/pipermail/bitcoin-dev"

all_email_urls = collect_email_urls(mailing_list_base_url)
df_week = scrape_email_urls(all_email_urls)

df_week_generated = generate_newsletter_completion(df_week)

save_file_name = f"html_newsletter_{CURRENT_TIMESTAMP}.html"
html_file_path = save_html_file(df_week_generated, save_file_name)

working on: https://lists.linuxfoundation.org/pipermail/bitcoin-dev/2023-March/
working on: https://lists.linuxfoundation.org/pipermail/bitcoin-dev/2023-February/
Fetched Urls: 136


100%|██████████| 136/136 [01:01<00:00,  2.22it/s]
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  return func(*args, **kwargs)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_week['tokens'] = df_week['email'].apply(lambda x: len(tokenizer.encode(x)))


Number of threads found: 5
-----
working on subject: [bitcoin-dev] BIP for OP_VAULT


100%|██████████| 4/4 [00:21<00:00,  5.26s/it]


-----
working on subject: [bitcoin-dev] BIP proposal: Fee-redistribution contracts


100%|██████████| 4/4 [00:15<00:00,  3.96s/it]


-----
working on subject: [bitcoin-dev] Minimum fees


100%|██████████| 3/3 [00:11<00:00,  3.98s/it]


-----
working on subject: [bitcoin-dev] Refreshed BIP324


100%|██████████| 2/2 [00:04<00:00,  2.14s/it]


-----
working on subject: [bitcoin-dev] Using service bit 24 for utreexo signaling in testnet and signet


100%|██████████| 3/3 [00:06<00:00,  2.20s/it]


-----
time: 2min 24s (started: 2023-03-03 11:44:09 +00:00)
