In [None]:
import csv
import requests
from bs4 import BeautifulSoup
import time

# Default HTTP request headers: a desktop-browser User-Agent string.
_USER_AGENT = 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/119.0.0.0 Safari/537.36 Edg/119.0.0.0'
headers = {'User-Agent': _USER_AGENT}

def get_page_content(start, max_retries=3, delay=2):
    """Download one page of Douban group search results and parse it.

    Args:
        start: Value placed in the ``start`` query parameter of the search URL.
        max_retries: Maximum number of GET attempts before giving up.
        delay: Seconds to sleep between two consecutive attempts.

    Returns:
        The BeautifulSoup tree of the response body for the first HTTP 200
        answer, or None when every attempt failed.
    """
    # Construct the URL with the start parameter
    url = f'https://www.douban.com/group/search?start={start}&cat=1013&q=%E8%B0%B7%E7%88%B1%E5%87%8C'

    # Set the headers for the request
    headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/119.0.0.0 Safari/537.36 Edg/119.0.0.0'}

    for attempt in range(1, max_retries + 1):
        try:
            response = requests.get(url, headers=headers, timeout=30)
        except requests.RequestException as exc:
            failure = f"request error: {exc}"
        else:
            if response.status_code == 200:
                return BeautifulSoup(response.content, 'html.parser')
            # A non-200 answer must also use up an attempt; otherwise the
            # loop would spin forever while the server keeps refusing.
            failure = f"HTTP {response.status_code}"

        print(f"Attempt {attempt}/{max_retries} for start={start} failed: {failure}")

        # Pause between attempts, but not after the last one.
        if attempt < max_retries:
            time.sleep(delay)

    # Every attempt failed
    return None

def extract_links(soup):
    """Return the href of every <a href=...> inside the first <tbody> of ``soup``.

    An empty list is returned when the document has no <tbody>.
    """
    table_body = soup.find('tbody')
    if not table_body:
        return []

    # Only anchors that actually carry an href attribute are considered
    return [anchor['href'] for anchor in table_body.find_all('a', href=True)]

def get_comments(soup):
    """Return the text of all <p class="reply-content"> elements in ``soup``.

    Each element's text is stripped and the pieces are joined with newlines;
    the result is an empty string when there are no such elements.
    """
    reply_nodes = soup.find_all('p', class_='reply-content')
    return "\n".join(node.get_text(strip=True) for node in reply_nodes)

def get_context(url, headers, writer):
    """Fetch one topic page and write its title, body text and replies as a CSV row.

    Args:
        url: Address of the page to download.
        headers: HTTP headers passed to ``requests.get``.
        writer: Object with a ``writerow`` method (e.g. a ``csv.writer``);
            receives ``[title, content_text, comment_text]`` on success.

    Returns:
        None. A non-200 response or a ``requests.RequestException`` is
        reported with ``print`` and no row is written.
    """
    try:
        # Send an HTTP GET request to the URL
        response = requests.get(url, headers=headers, timeout=10)

        if response.status_code == 200:
            # Parse the HTML content
            soup = BeautifulSoup(response.content, 'html.parser')

            # A page without an <h1> makes find() return None; fall back to
            # an empty title instead of failing on None.get_text().
            title_tag = soup.find('h1')
            title = title_tag.get_text(strip=True) if title_tag is not None else ""

            # Same fallback for a missing topic body
            topic_content = soup.find(class_='rich-content topic-richtext')
            content_text = topic_content.get_text(strip=True) if topic_content is not None else ""

            # Replies, newline-joined
            comment_text = get_comments(soup)

            # Write the data to the CSV file
            writer.writerow([title, content_text, comment_text])

        else:
            print(f"Failed to retrieve content from {url}")

    except requests.RequestException as e:
        # Handle request exceptions
        print(f"Error occurred while fetching {url}: {e}")

    finally:
        # Throttle after every attempt, including failed ones, so a run of
        # errors does not turn into back-to-back requests.
        time.sleep(1)

total_pages = 5  # Set the total number of pages to scrape

all_links = []

# Collect the links of every search-result page
for page in range(1, total_pages + 1):
    # Offset passed as the `start` query parameter: 0, 50, 100, ...
    start = 50 * (page - 1)

    # Get the HTML content of the page
    soup = get_page_content(start)

    if soup:
        # Extract the links from the page and add them to all_links
        page_links = extract_links(soup)
        all_links.extend(page_links)

        # Add a delay to prevent frequent requests
        time.sleep(1)

    else:
        print(f"Failed to retrieve content from page {page}")

# Keep only the even-indexed links (positions 0, 2, 4, ...).
# NOTE(review): presumably each search result contributes two anchors and
# only the first one is the topic page — confirm against the page markup.
filtered_links = all_links[::2]

# Write the header row and then one row per topic using a single file handle
with open('Gu Eileen.csv', 'w', newline='', encoding='utf-8-sig') as file:
    writer = csv.writer(file)
    writer.writerow(['Title', 'Content', 'Comment'])

    # Loop through the filtered links
    for url in filtered_links:
        # Call the get_context function to retrieve and write the data
        get_context(url, headers, writer)

In [None]:
import pandas as pd

# Merge the first acquisition and the additional data into one CSV file.
# NOTE(review): these input names ('Gu Ailing*.csv') differ from the
# 'Gu Eileen.csv' file written by the scraping cell — confirm they are the
# intended inputs.
csv_files = ['Gu Ailing.csv', 'Gu Ailing1.csv']

# Read every input once, then concatenate in a single call
dfs_to_concat = [pd.read_csv(file) for file in csv_files]

# ignore_index=True renumbers the rows 0..n-1 across both inputs
merged_Gu = pd.concat(dfs_to_concat, ignore_index=True)

merged_Gu.to_csv('merged_data.csv', index=False)

In [None]:
import pandas as pd
import jieba
from collections import Counter
import re

# Path of the CSV file to clean (replace with your own file)
csv_file_path = 'Gu Eileen.csv'

# Load the file; the 'Comment' column is read with dtype str
df = pd.read_csv(
    csv_file_path,
    dtype={'Comment': str},
    encoding='utf-8-sig',
)

# Column holding the post comments
post_comments = df['Comment']

# Matches every character that is NOT an ASCII letter, a CJK character in
# U+4E00-U+9FA5, or whitespace. Compiled once instead of on every call.
_NON_TEXT_PATTERN = re.compile(r'[^a-zA-Z\u4e00-\u9fa5\s]')


# Function to clean text data
def clean_text(text):
    """Strip everything except English letters, Chinese characters and whitespace.

    Digits, emoji and punctuation all fall outside the whitelist above, so a
    single substitution removes them; no separate digit or emoji pass is
    needed.

    Args:
        text: Value to clean. Anything that is not a ``str`` (for example a
            missing value read as NaN) is treated as empty.

    Returns:
        The cleaned string, or "" for non-string input.
    """
    if not isinstance(text, str):
        return ""
    return _NON_TEXT_PATTERN.sub('', text)

# Add a 'Cleaned_Comment' column holding the cleaned text of 'Comment'
df['Cleaned_Comment'] = df['Comment'].apply(clean_text)

# Save the frame, including the new column, to a separate CSV file
cleaned_csv_file_path = 'Cleaned_Gu.csv'
df.to_csv(
    cleaned_csv_file_path,
    encoding='utf-8-sig',
    index=False,
)