In [1]:
import praw
import requests
import spacy
import re
import ssl
import json
import torch
from bs4 import BeautifulSoup

# Check if CUDA is available
if torch.cuda.is_available():
    device = torch.device("cuda")
    print("Using CUDA-enabled GPU for processing.")
else:
    device = torch.device("cpu")
    print("CUDA not available, using CPU.")

# Try to load Spacy model, fallback to regex if unavailable
try:
    nlp = spacy.load("en_core_web_sm")
except OSError:
    print("[WARNING] Spacy model not found. Using basic regex-based optimization.")
    nlp = None

# Reddit API Credentials
reddit = praw.Reddit(
    client_id="As44RYqjOK4m9FJ53EwC-g",
    client_secret="9jscdaZAODphthPzuVumkAgvH5EYJw",
    user_agent="Narative/0.1 by your_reddit_username"
)

# Fix SSL Issue
try:
    ssl._create_default_https_context = ssl._create_unverified_context
except AttributeError:
    pass

# Step 1: Optimize the query
def optimize_query(query):
    """Optimizes user query using NLP or regex-based keyword extraction."""
    if nlp:
        doc = nlp(query)
        keywords = [token.lemma_ for token in doc if not token.is_stop and token.is_alpha]
        return " ".join(keywords)
    else:
        # Fallback if Spacy is not installed
        query = query.lower()
        query = re.sub(r"[^a-zA-Z0-9 ]", "", query)  # Remove special characters
        keywords = query.split()  # Basic word split
        return " ".join(keywords)

# Step 2: Find the most relevant subreddit
def find_best_subreddit(query):
    """Finds the most relevant subreddit based on a given query."""
    search_url = f"https://www.reddit.com/search/?q={query}&type=sr"
    headers = {'User-Agent': 'Mozilla/5.0'}
    response = requests.get(search_url, headers=headers)
    if response.status_code == 200:
        soup = BeautifulSoup(response.text, 'html.parser')
        subreddit_links = soup.find_all('a', href=True)
        for link in subreddit_links:
            href = link['href']
            if '/r/' in href and not href.startswith("https://www.reddit.com/user/"):
                return href.split('/r/')[1].split('/')[0]  # Extract subreddit name
    return "all"  # Default to r/all if no specific subreddit is found

# Step 3: Fetch posts from the best subreddit
def fetch_reddit_posts(query):
    """Fetches top 10 posts from the best subreddit for the query and stores in a JSON file."""
    optimized_query = optimize_query(query)
    best_subreddit = find_best_subreddit(optimized_query)
    print(f"\n🔍 Searching in subreddit: r/{best_subreddit} for '{optimized_query}'\n")
    try:
        subreddit = reddit.subreddit(best_subreddit)
        top_posts = subreddit.search(optimized_query, limit=50)
        results = []
        for post in top_posts:
            post.comments.replace_more(limit=0)  # Load all top-level comments
            comments = [comment.body for comment in post.comments[:50]]  # Get first 50 comments
            results.append({
                "title": post.title,
                "url": post.url,
                "comments": comments
            })
        # Save results to JSON file
        with open("reddit_results.json", "w", encoding="utf-8") as f:
            json.dump(results, f, indent=4, ensure_ascii=False)
        print("✅ Data saved to reddit_results.json")
        return results
    except Exception as e:
        print(f"⚠️ Error fetching posts: {e}")
        return []

# Step 4: Get user query and display results
if __name__ == "__main__":
    user_query = input("Enter your search query: ")
    posts = fetch_reddit_posts(user_query)
    if posts:
        for idx, post in enumerate(posts):
            print(f"\n🔹 {idx+1}. {post['title']}")
            print(f"🔗 URL: {post['url']}")
            print("💬 Top Comments:")
            for comment in post['comments'][:5]:  # Show only first 5 comments in the terminal
                print(f"   - {comment}")
    else:
        print("⚠️ No relevant posts found.")


Using CUDA-enabled GPU for processing.

🔍 Searching in subreddit: r/RoughRomanMemes for 'roman'

✅ Data saved to reddit_results.json

🔹 1. He was a proud Roman
🔗 URL: https://i.redd.it/60dgmpbkxz8e1.jpeg
💬 Top Comments:
   - 
Thank you for your submission, citizen!

[Come join the Rough Roman Forum Discord server!](https://discord.gg/roughromanforum)


*I am a bot, and this action was performed automatically. Please [contact the moderators of this subreddit](/message/compose/?to=/r/RoughRomanMemes) if you have any questions or concerns.*
   - He almost got Caesar killed for basically being married to the wrong girl
   - Yes yes I also love political purges
   - Optimate dogs are barking. Reform the Roman army is what he did, give more rights to Italians he did. When the republic called on him to defeat the Cimbrians, he did. IN THIS HOUSE, GAIUS MARIUS IS A HERO. END OF StORY
   - Sulla was a brutal dictator that did more to destroy the fabric of the republic, in the name of restoring 

In [2]:
import re
import json
import spacy
from collections import Counter
from transformers import pipeline

# Load NLP model for topic detection
try:
    nlp = spacy.load("en_core_web_sm")
except OSError:
    nlp = None

# Define a helper function to clean text
def clean_text(text):
    text = re.sub(r'http\S+', '', text)  # Remove URLs
    text = re.sub(r'[^\w\s\.\,]', '', text)  # Remove special characters
    text = re.sub(r'\s+', ' ', text)  # Normalize whitespace
    return text.strip()

# Extract key topics from text
def extract_topics(text, top_n=5):
    if nlp:
        doc = nlp(text)
        words = [token.lemma_ for token in doc if token.is_alpha and not token.is_stop]
        return [word[0] for word in Counter(words).most_common(top_n)]
    return []  # Fallback if Spacy is unavailable

# Function to process and extract key content from JSON data
def preprocess_data(data):
    processed_text = []
    for entry in data:
        key_topics = extract_topics(entry["title"])  # Extract key topics from the title
        text_chunk = f"Title: {clean_text(entry['title'])}. "
        insightful_comments = []
        for comment in entry["comments"]:
            comment_lower = comment.lower().strip()
            if comment_lower in ["[deleted]", "[removed]", "thanks", "lol", "nice"]:
                continue  # Skip irrelevant comments
            insightful_comments.append(clean_text(comment))
        text_chunk += " ".join(insightful_comments[:5])  # Take top 5 meaningful comments
        processed_text.append((text_chunk, key_topics))
    return processed_text

# Splitting text into smaller chunks
def chunk_text(text, max_chunk_size=1024):
    words = text.split()
    chunks = []
    current_chunk = []
    current_length = 0

    for word in words:
        current_length += len(word) + 1  # +1 for space
        if current_length > max_chunk_size:
            chunks.append(" ".join(current_chunk))
            current_chunk = [word]
            current_length = len(word)
        else:
            current_chunk.append(word)
    
    if current_chunk:
        chunks.append(" ".join(current_chunk))
    
    return chunks

# Summarization function
def summarize_text(text_chunks, summarizer, max_length=200, min_length=50):
    summaries = []
    for chunk in text_chunks:
        summary = summarizer(chunk, max_length=max_length, min_length=min_length, do_sample=False)
        summaries.append(summary[0]["summary_text"])
    return summaries

# Load Reddit results
data_file = "reddit_results.json"
with open(data_file, "r", encoding="utf-8") as file:
    data = json.load(file)

# Pre-process text data
processed_data = preprocess_data(data)

# Initialize the summarization pipeline
summarizer = pipeline("summarization", model="facebook/bart-large-cnn")

# Generate summaries with key topics
final_summaries = []
for text, topics in processed_data:
    chunks = chunk_text(text)
    chunk_summaries = summarize_text(chunks, summarizer)
    full_summary = " ".join(chunk_summaries)
    if topics:
        full_summary = f"Key Topics: {', '.join(topics)}. {full_summary}"
    final_summaries.append(full_summary)

# Save final summaries
with open("reddit_summary.json", "w", encoding="utf-8") as f:
    json.dump(final_summaries, f, indent=4, ensure_ascii=False)

print("✅ Final summarized results saved to reddit_summary.json")





Device set to use cuda:0
Your max_length is set to 200, but your input_length is only 170. Since this is a summarization task, where outputs shorter than the input are typically wanted, you might consider decreasing max_length manually, e.g. summarizer('...', max_length=85)
Your max_length is set to 200, but your input_length is only 33. Since this is a summarization task, where outputs shorter than the input are typically wanted, you might consider decreasing max_length manually, e.g. summarizer('...', max_length=16)
Your max_length is set to 200, but your input_length is only 87. Since this is a summarization task, where outputs shorter than the input are typically wanted, you might consider decreasing max_length manually, e.g. summarizer('...', max_length=43)
Your max_length is set to 200, but your input_length is only 139. Since this is a summarization task, where outputs shorter than the input are typically wanted, you might consider decreasing max_length manually, e.g. summarizer

✅ Final summarized results saved to reddit_summary.json


In [5]:
from transformers import pipeline

def summarize_key_points(filename):
    import json
    
    # Load the JSON file
    with open(filename, "r", encoding="utf-8") as file:
        data = json.load(file)

    combined_text = " ".join(data)

    # Limit the input text to 1024 tokens (approx. 4000 characters)
    combined_text = combined_text[:4000]  # Adjust if needed

    # Initialize summarization pipeline
    summarizer = pipeline("summarization", model="facebook/bart-large-cnn", device=-1)

    # Summarize in smaller chunks if text is too long
    summary = summarizer(combined_text, max_length=500, min_length=150, do_sample=False)

    return summary[0]["summary_text"]

# Example usage
summary_result = summarize_key_points("reddit_summary.json")
print("Final Summary:", summary_result)


Device set to use cpu


Final Summary: Sulla was a brutal dictator that did more to destroy the fabric of the republic, in the name of restoring it, than any other person did. He almost got Caesar killed for basically being married to the wrong girl. The thing with the gold never happened to Crassus. Its a popular myth. Most of the political power the Emperors had were derived from them being the Tribune of Plebs. But no, Marcus Aurelius apparently wanted some grizzled general fellow to inherit his powers and then pass them onto the kind old men of the senate. We need a good Caesar in Gaul movie. There is now a gaul sub. This is an open rebellion romans.. Thank you for your submission, citizen. Come join the Rough Roman Forum Discord server.
