In [None]:
!pip install praw
!pip install keybert
!pip install sentence-transformers
!pip install transformers
!pip install spacy
!python -m spacy download en_core_web_sm


Collecting en-core-web-sm==3.8.0
  Downloading https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-3.8.0/en_core_web_sm-3.8.0-py3-none-any.whl (12.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m12.8/12.8 MB[0m [31m43.7 MB/s[0m eta [36m0:00:00[0m
[?25h[38;5;2m✔ Download and installation successful[0m
You can now load the package via spacy.load('en_core_web_sm')
[38;5;3m⚠ Restart to reload dependencies[0m
If you are in a Jupyter or Colab notebook, you may need to restart Python in
order to load all the package's dependencies. You can do this by selecting the
'Restart kernel' or 'Restart runtime' option.


In [None]:
!pip install nltk



In [None]:
import nltk

# NLTK data packages fetched by this notebook.
NLTK_RESOURCES = ("punkt", "stopwords", "vader_lexicon", "wordnet", "omw-1.4")

# nltk.download() reports failure through its boolean return value, so collect
# the packages that could not be fetched instead of discarding that result.
# quiet=True keeps the per-package progress log out of the cell output.
failed_downloads = [name for name in NLTK_RESOURCES if not nltk.download(name, quiet=True)]
if failed_downloads:
    print("Could not download NLTK resources:", ", ".join(failed_downloads))


[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package vader_lexicon to /root/nltk_data...
[nltk_data]   Package vader_lexicon is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package omw-1.4 to /root/nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!


True

In [None]:
import os
import re
from collections import Counter
from getpass import getpass

import praw
import spacy
from keybert import KeyBERT
from sentence_transformers import SentenceTransformer
from transformers import pipeline

# Reddit API setup
# Credentials are never hardcoded: each one is taken from an environment
# variable when set, otherwise the user is prompted. The secret is read with
# getpass so it is not echoed into the notebook output.
reddit = praw.Reddit(
    client_id=os.environ.get("REDDIT_CLIENT_ID") or input(" enter client_id").strip(),
    client_secret=os.environ.get("REDDIT_CLIENT_SECRET") or getpass(" enter client_secret: ").strip(),
    user_agent="reddit-persona-bot by u/username",
    # Jupyter/Colab kernels run an event loop, which makes PRAW print an
    # "use Async PRAW" warning on every request; this notebook is synchronous
    # by design, so disable that check.
    check_for_async=False,
)

# Load spaCy NER model
nlp = spacy.load("en_core_web_sm")

# Get Reddit username
# Normalise the typed value: surrounding whitespace and a pasted "u/" or "/u/"
# prefix are removed so the lookup uses the bare account name.
username = input("Enter Reddit username: ").strip()
for prefix in ("/u/", "u/"):
    if username.lower().startswith(prefix):
        username = username[len(prefix):]
        break
user = reddit.redditor(username)

# Maximum number of newest submissions / comments to fetch (each).
FETCH_LIMIT = 50

# Scrape posts and comments. Every entry is a dict with the keys
# "text", "url" and "subreddit".
posts, comments = [], []

# Fetch submissions safely: a failure is reported and whatever was collected
# before the error is kept, so the notebook can continue with partial data.
try:
    for submission in user.submissions.new(limit=FETCH_LIMIT):
        posts.append({
            # Link posts have an empty selftext; strip the dangling separator.
            "text": f"{submission.title} {submission.selftext}".strip(),
            "url": f"https://reddit.com{submission.permalink}",
            "subreddit": str(submission.subreddit)
        })
except Exception as e:
    print("Could not fetch submissions:", e)

# Fetch comments safely (same best-effort policy as above).
try:
    for comment in user.comments.new(limit=FETCH_LIMIT):
        comments.append({
            "text": comment.body,
            "url": f"https://reddit.com{comment.permalink}",
            "subreddit": str(comment.subreddit)
        })
except Exception as e:
    print("Could not fetch comments:", e)

# Submissions first, then comments.
all_texts = posts + comments

# Build the persona report from the fetched entries.
# Inputs (from the cells above): all_texts, posts, comments, username, nlp.
# Output: persona_text, which is bound on every path so the save cell
# never raises NameError or writes a persona left over from a previous run.
if not all_texts:
    persona_text = f"❌ No posts or comments found for user '{username}'. Cannot generate a persona."
    print(persona_text)
else:
    # Combine all text into one string for the document-level models.
    summary_text = " ".join(entry["text"] for entry in all_texts)

    # --- Keywords -----------------------------------------------------------
    kw_model = KeyBERT(model=SentenceTransformer("all-MiniLM-L6-v2"))
    keywords = kw_model.extract_keywords(summary_text, top_n=20, stop_words='english')
    generic_terms = {'india', 'car', 'cars', 'people', 'thing', 'delhi', 'uttar', 'pradesh', 'lko'}
    # dict.fromkeys de-duplicates while keeping the model's ranking order.
    top_keywords = list(dict.fromkeys(
        kw for kw, _score in keywords if kw.lower() not in generic_terms
    ))
    # Cite the first entry that contains the keyword as a whole word
    # (a plain substring test would let "got" match "forgotten").
    # Values stay lists holding at most one URL.
    keyword_citations = {}
    for kw in top_keywords:
        whole_word = re.compile(rf"\b{re.escape(kw)}\b", re.IGNORECASE)
        first_url = next(
            (entry["url"] for entry in all_texts if whole_word.search(entry["text"])),
            None,
        )
        keyword_citations[kw] = [first_url] if first_url else []

    # --- Sentiment ----------------------------------------------------------
    # Classify every entry and report the most common label, instead of
    # judging the whole account by the first 512 characters of the joined
    # text. The citation is the first entry that actually received that label.
    sentiment_pipeline = pipeline("sentiment-analysis", model="cardiffnlp/twitter-roberta-base-sentiment")
    label_map = {"LABEL_0": "Negative", "LABEL_1": "Neutral", "LABEL_2": "Positive"}
    predictions = sentiment_pipeline(
        [entry["text"] for entry in all_texts],
        truncation=True,  # long posts are cut to the model's input limit
        max_length=512,
    )
    entry_sentiments = [label_map.get(pred["label"], "Unknown") for pred in predictions]
    sentiment = Counter(entry_sentiments).most_common(1)[0][0]
    sentiment_citation = all_texts[entry_sentiments.index(sentiment)]["url"]

    # --- Named entities -----------------------------------------------------
    doc = nlp(summary_text)
    named_entities = [
        ent.text.strip() for ent in doc.ents if ent.label_ in ("ORG", "GPE", "PERSON")
    ]
    # Most frequently mentioned first, so the report is reproducible
    # (set() ordering is arbitrary) and shows the most relevant names.
    unique_entities = [
        name for name, _count in Counter(filter(None, named_entities)).most_common()
    ]

    # --- Subreddit stats and writing style ----------------------------------
    CONCISE_WORD_LIMIT = 15  # average words per entry below this => "Concise"
    word_counts = [len(item["text"].split()) for item in all_texts]
    avg_len = sum(word_counts) / len(word_counts)
    is_concise = avg_len < CONCISE_WORD_LIMIT
    style = "Concise" if is_concise else "Detailed"
    # Cite an entry whose own length falls on the same side of the limit as
    # the average; at least one such entry always exists.
    style_citation = next(
        (
            entry["url"]
            for entry, n_words in zip(all_texts, word_counts)
            if (n_words < CONCISE_WORD_LIMIT) == is_concise
        ),
        all_texts[0]["url"],
    )
    top_subreddits = Counter(item["subreddit"] for item in all_texts).most_common(3)
    subreddit_citations = {
        sub: next((entry["url"] for entry in all_texts if entry["subreddit"] == sub), 'N/A')
        for sub, _ in top_subreddits
    }

    # --- Generate user persona ----------------------------------------------
    persona_text = f"\nUser Persona for: u/{username}\n\n🔹 Top Interests:\n"
    for kw in top_keywords:
        url = keyword_citations[kw][0] if keyword_citations[kw] else "N/A"
        persona_text += f"- {kw} (e.g., {url})\n"

    entity_summary = ', '.join(unique_entities[:5]) if unique_entities else 'N/A'
    persona_text += f"\n🔹 Named Entities Mentioned: {entity_summary}\n\n🔹 Frequent Subreddits:\n"
    for sub, _ in top_subreddits:
        citation = subreddit_citations[sub]
        persona_text += f"- r/{sub} (e.g., {citation})\n"

    persona_text += f"\n🔹 Writing Style: {style} (e.g., {style_citation})"
    persona_text += f"\n🔹 Sentiment: {sentiment} (e.g., {sentiment_citation})\n"

    # Up to two posts and two comments, shortened to 100 characters each.
    persona_text += "\n🔹 Sample Evidence:\n"
    for entry in posts[:2] + comments[:2]:
        snippet = entry["text"].replace("\n", " ").strip()
        if len(snippet) > 100:
            snippet = snippet[:97] + "..."
        persona_text += f"• \"{snippet}\" — {entry['url']}\n"

    print(persona_text)

Enter Reddit username: mcmrarm


It is strongly recommended to use Async PRAW: https://asyncpraw.readthedocs.io.
See https://praw.readthedocs.io/en/latest/getting_started/multiple_instances.html#discord-bots-and-asynchronous-environments for more info.

It is strongly recommended to use Async PRAW: https://asyncpraw.readthedocs.io.
See https://praw.readthedocs.io/en/latest/getting_started/multiple_instances.html#discord-bots-and-asynchronous-environments for more info.

Device set to use cpu



User Persona for: u/mcmrarm

🔹 Top Interests:
- server (e.g., https://reddit.com/r/MCPE/comments/7uaj6p/i_got_a_dedicated_server_running_for_mcpe_am/)
- mcpe (e.g., https://reddit.com/r/MCPE/comments/7uaj6p/i_got_a_dedicated_server_running_for_mcpe_am/)
- skywars (e.g., https://reddit.com/r/MCPE/comments/7uaj6p/i_got_a_dedicated_server_running_for_mcpe_am/)
- dedicated (e.g., https://reddit.com/r/MCPE/comments/7uaj6p/i_got_a_dedicated_server_running_for_mcpe_am/)
- removed (e.g., https://reddit.com/r/MCPE/comments/7uaj6p/i_got_a_dedicated_server_running_for_mcpe_am/)
- got (e.g., https://reddit.com/r/MCPE/comments/7uaj6p/i_got_a_dedicated_server_running_for_mcpe_am/)
- test (e.g., https://reddit.com/r/MCPE/comments/7uaj6p/i_got_a_dedicated_server_running_for_mcpe_am/)
- running (e.g., https://reddit.com/r/MCPE/comments/7uaj6p/i_got_a_dedicated_server_running_for_mcpe_am/)

🔹 Named Entities Mentioned: MCPE & am running

🔹 Frequent Subreddits:
- r/MCPE (e.g., https://reddit.com/r/MCPE/c

In [None]:
# Write the generated persona report to disk as UTF-8 text.
with open("persona_output.txt", mode="w", encoding="utf-8") as persona_file:
    persona_file.write(persona_text)


In [None]:
# Offer the saved persona as a browser download when running on Google Colab.
# The google.colab package is only importable inside a Colab runtime; guard
# the import so a Run All elsewhere reports the file location instead of
# aborting with ModuleNotFoundError.
try:
    from google.colab import files
except ImportError:
    print("Not running in Google Colab; persona saved to persona_output.txt")
else:
    files.download("persona_output.txt")


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>