In [1]:
import praw
import pandas as pd

reddit = praw.Reddit(
    client_id='',         # Replace with your client ID
    client_secret='', # Replace with your client secret
    user_agent=''      # Replace with a user agent (a string)
)


# Keywords to search for subreddits
keywords = ['customer service automation','GenAI in contact center','customer service','generative ai']
posts = []

def search_subreddits(keywords):
    """Search for subreddits that match the given keywords."""
    subreddit_list = set()
    for keyword in keywords:
        for submission in reddit.subreddit('all').search(keyword, limit=5):  # Limit to avoid excessive API calls
            subreddit_list.add(submission.subreddit.display_name)
    return list(subreddit_list)

def is_valid_subreddit(subreddit):
    """Check if a subreddit exists and is public."""
    try:
        subreddit_instance = reddit.subreddit(subreddit)
        # Check if the subreddit is private or doesn't exist
        return not subreddit_instance.over18 and subreddit_instance.display_name is not None
    except Exception as e:
        print(f"Error checking subreddit {subreddit}: {e}")
        return False

def scrape_subreddit(subreddit):
    """Scrape all posts and comments from a valid subreddit."""
    print(f"Accessing subreddit: {subreddit}")
    try:
        # Attempt to fetch new submissions
        for submission in reddit.subreddit(subreddit).new(limit=10):  # Adjust limit as needed
            submission.comments.replace_more(limit=0)  # Fetch all comments
            for comment in submission.comments.list():
                posts.append({
                    'title': submission.title,
                    'url': submission.url,
                    'score': submission.score,
                    'created': submission.created_utc,
                    'subreddit': subreddit,
                    'submission_content': submission.selftext,
                    'comment': comment.body,
                    'comment_author': comment.author.name if comment.author else "Unknown"
                })
        print(f"Successfully scraped posts and comments from {subreddit}.")
    except praw.exceptions.RedditAPIException as api_error:
        print(f"Reddit API error while accessing {subreddit}: {api_error}")
    except Exception as e:
        print(f"An error occurred while accessing {subreddit}: {e}")

# Search for relevant subreddits
found_subreddits = search_subreddits(keywords)

# Scraping Reddit
for subreddit in found_subreddits:
    if is_valid_subreddit(subreddit):
        scrape_subreddit(subreddit)
    else:
        print(f"Subreddit {subreddit} is invalid or inaccessible.")

# Convert to DataFrame
reddit_df = pd.DataFrame(posts)

# Store results in an Excel file
output_file = 'reddit_posts_1.xlsx'
reddit_df.to_excel(output_file, index=False)

# Display the results
print(f"Data saved to {output_file}.")
print(reddit_df)


Accessing subreddit: classicwow
Successfully scraped posts and comments from classicwow.
Accessing subreddit: ArtificialInteligence
Successfully scraped posts and comments from ArtificialInteligence.
Accessing subreddit: europe
Successfully scraped posts and comments from europe.
Accessing subreddit: lifehacks
Successfully scraped posts and comments from lifehacks.
Accessing subreddit: Millennials
Successfully scraped posts and comments from Millennials.
Accessing subreddit: OculusQuest
Successfully scraped posts and comments from OculusQuest.
Accessing subreddit: instacart
Successfully scraped posts and comments from instacart.
Accessing subreddit: PetPeeves
Successfully scraped posts and comments from PetPeeves.
Accessing subreddit: nvidia
Successfully scraped posts and comments from nvidia.
Accessing subreddit: LifeProTips
Successfully scraped posts and comments from LifeProTips.
Accessing subreddit: mathmemes
Successfully scraped posts and comments from mathmemes.
Accessing subredd

In [2]:
import requests
import pandas as pd
import json

# List of keywords to search
keywords = ["customer service", "machine learning", "generative ai", "chatbot services"]

# Function to extract the required fields from each item in the data
def extract_news_data(data):
    news_data = []
    try:
        for item in data['hits']:
            # Use .get() to safely access fields that might be missing
            title = item.get('title', 'No Title Available')
            story_date = item.get('created_at', 'No Date Available')
            author = item.get('author', 'Unknown Author')
            url = item.get('url', 'No URL Available')
            
            # Get matched keywords and full text with highlights removed, with safe access
            matched_keywords = item.get('_highlightResult', {}).get('title', {}).get('matchedWords', [])
            news_text = item.get('_highlightResult', {}).get('title', {}).get('value', '').replace("<em>", "").replace("</em>", "")

            # Append the extracted data as a dictionary to the list
            news_data.append({
                "News Title": title,
                "News Text": news_text,
                "Story Date": story_date,
                "Matched Keywords": ", ".join(matched_keywords),
                "URL": url,
                "Author Name": author
            })
    except KeyError as e:
        print(f"KeyError: {e}")
    return news_data

# Collect data from the API for each keyword
final_news_data = []
for keyword in keywords:
    url = f"https://hn.algolia.com/api/v1/search?query={keyword}"
    response = requests.get(url)
    
    # Check if request was successful
    if response.status_code == 200:
        data = response.json()
        
        # Append extracted data to final_news_data
        final_news_data.extend(extract_news_data(data))
    else:
        print(f"Failed to fetch data for keyword: {keyword}")

# Convert the list of dictionaries to a DataFrame
df = pd.DataFrame(final_news_data)

# Save the DataFrame to a CSV file
output_file = "hacker_news_search_results.csv"
df.to_csv(output_file, index=False)

print(f"Data has been saved to {output_file}")


Data has been saved to hacker_news_search_results.csv
