In [1]:
import re
import pandas as pd
from bs4 import BeautifulSoup

In [2]:
file = 'your_file_path/wp_posts.csv'
df = pd.read_csv(file)

# Keep only published posts (needed when the SQL export did not already
# filter them). The explicit .copy() makes the filtered result an independent
# frame, so the column assignment below does not write into a slice of the
# frame returned by read_csv (which triggers SettingWithCopyWarning).
df = df[(df['post_type'] == 'post') & (df['post_status'] == 'publish')].copy()
# Convert to string to avoid errors caused by NaN titles
df['post_title'] = df['post_title'].astype(str)

# Preview the last ten posts; the original row labels are kept on purpose.
df[['post_title', 'post_content']].tail(10)

Unnamed: 0,post_title,post_content
1195,"How To Find Your Facebook, Instagram, X/Twitte...",<!-- wp:paragraph -->\n<p>The list below lists...
1196,如何找到自己的Facebook XS Cookie和X/Twitter Auth_Token...,<!-- wp:paragraph -->\n<p>以下清單列舉了幾個社群平台所使用的Coo...
1270,Machine Learning / Deep Learning Algorithm And...,"<!-- wp:list {""ordered"":true} -->\n<ol><!-- wp..."
1271,機器學習/深度學習演算法與它們的產地，工作與面試的救生指南,"<!-- wp:list {""ordered"":true} -->\n<ol><!-- wp..."
1293,Use Botsonic Chatbot With Rest API,"<!-- wp:paragraph -->\n<p><a href=""https://wri..."
1323,Python Pip/Pipenv Explanation And Most Used Co...,<!-- wp:paragraph -->\n<p><code>pip</code> and...
1353,Import A Deep Learning Embedding Model To A Tr...,<!-- wp:paragraph -->\n<p>Importing embeddings...
1365,"Run Schedule Jobs Using Mac Automator, An Alte...",<!-- wp:paragraph -->\n<p>Cron is a well-known...
1373,Run Llama 2 And Other Open-Source LLM In Pytho...,"<!-- wp:heading -->\n<h2 class=""wp-block-headi..."
1378,在本機Python中執行Llama 2與其他開源大型語言模型,"<!-- wp:heading -->\n<h2 class=""wp-block-headi..."


In [3]:
# Pick one sample post by title and show its content before cleaning
chk = (
    df.loc[df['post_title'].str.contains('Stock Market Prediction With')]
    .reset_index(drop=True)
)
chk.loc[0, 'post_content']

'<!-- wp:paragraph -->\n<p>Last week was the long weekend for National Day. I didn\'t have any outdoor activities because of the final exams of my Master\'s program and the typhoon. However, I started deploying my project to the Google Cloud Platform in my spare time.</p>\n<!-- /wp:paragraph -->\n\n<!-- wp:paragraph -->\n<p>Previously, I wrote the article\xa0<em><a href="http://localhost:8888/why-is-a-portfolio-website-important-for-a-data-scientist/">Why Is A Portfolio Website Important For A Data Scientist</a></em>\xa0to share why I think side project is important for data scientists. The largest of my side projects is on stock price analysis and prediction using Python and machine learning models. This project started at the end of 2020, from preliminary planning to many technical trials and errors, and finally launched in June this year, during the worst time of COVID-19, and started the following PDCA cycle. This project has allowed me to grow substantially in my professional abil

In [4]:
def extract_wordpress_content(content):
    '''
    Turn the HTML of one WordPress post into plain text.

    - Block delimiter comments (`<!-- wp:... -->` and `<!-- /wp:... -->`)
      are removed before parsing.
    - The text of every <p> and <h1>-<h6> element is collected first.
    - The text of every <pre class="wp-block-code"> element is collected
      afterwards, each prefixed with "Code:\\n". Code blocks therefore end up
      after all paragraph/heading text, not at their original position.

    Returns the collected pieces joined by a single space.
    '''
    # Strip closing block markers first, then opening ones
    html = content
    for marker_pattern in (r'<!-- /wp:.*? -->', r'<!-- wp:.*? -->'):
        html = re.sub(marker_pattern, '', html)

    soup = BeautifulSoup(html, 'html.parser')

    # Paragraph and heading text
    prose_tags = ['p', 'h1', 'h2', 'h3', 'h4', 'h5', 'h6']
    pieces = [node.get_text() for node in soup.find_all(prose_tags)]

    # Code blocks, labelled so they can be told apart from prose
    pieces.extend(
        f"Code:\n{pre.get_text()}"
        for pre in soup.find_all('pre', class_='wp-block-code')
    )

    return ' '.join(pieces)


def extract_post_content(df=None, file=None, limit=None, output_path=None):
    '''
    Collect the title and cleaned content of posts into one string.

    Parameters
    - df: DataFrame with `post_title` and `post_content` columns. Rows are
      processed in the order given; the frame itself is never modified and
      its index may hold any labels (e.g. after filtering).
    - file: path of a CSV with the same columns plus `post_date`. When given,
      it takes precedence over `df` and rows are sorted newest first.
    - limit: set it if documents are too many; only the first `limit` rows
      are used. A falsy value (None or 0) means no limit.
    - output_path: set it to also save the content to that file (UTF-8).

    Returns
    - str: for each post whose `post_content` is not missing, the title, the
      text from `extract_wordpress_content`, and a line of 20 "=" characters,
      all concatenated without extra separators.

    Raises
    - AssertionError: if neither `df` nor `file` is provided.
    '''
    assert df is not None or file is not None, 'Either df or file must be provided'

    if file is not None:
        df = pd.read_csv(file)
        df = (df
              .sort_values(by='post_date', ascending=False)
              .reset_index(drop=True))

    print('Total posts:', len(df))
    if limit:
        df = df.head(limit)
    # Convert titles to string to avoid errors caused by NaN. assign() builds
    # a new frame, so neither the caller's DataFrame nor a head() slice is
    # written to.
    df = df.assign(post_title=df['post_title'].astype(str))
    print(df.head())

    chunks = []
    collected_posts = 0
    # Iterate over values rather than df.loc[i, ...] with i in range(len(df)):
    # .loc is label-based and fails (or reads the wrong row) whenever the
    # index is not exactly 0..n-1, e.g. for a filtered frame.
    for title, raw_content in zip(df['post_title'], df['post_content']):
        # Skip posts without content (NaN / None)
        if pd.isna(raw_content):
            continue

        chunks.append(title + extract_wordpress_content(raw_content) + '=' * 20)
        collected_posts += 1

    # Join once instead of growing a string inside the loop
    post_content = ''.join(chunks)

    print(f'Successfully collected {collected_posts} posts')
    if output_path:
        with open(output_path, 'w', encoding='utf-8') as f:
            f.write(post_content)
    return post_content

In [5]:
# Content after cleaning: run the sample post through the extractor
extracted_content = extract_wordpress_content(
    chk.loc[0, 'post_content']
)
print(extracted_content)

Last week was the long weekend for National Day. I didn't have any outdoor activities because of the final exams of my Master's program and the typhoon. However, I started deploying my project to the Google Cloud Platform in my spare time. Previously, I wrote the article Why Is A Portfolio Website Important For A Data Scientist to share why I think side project is important for data scientists. The largest of my side projects is on stock price analysis and prediction using Python and machine learning models. This project started at the end of 2020, from preliminary planning to many technical trials and errors, and finally launched in June this year, during the worst time of COVID-19, and started the following PDCA cycle. This project has allowed me to grow substantially in my professional ability, and combining my professional ability with my investment ability has been one of the smartest life decisions I have made recently. Side Project Can Go Further With Monetization Capabilities T