In [2]:
!pip install -q beautifulsoup4 faiss-cpu scikit-learn sentence-transformers google-generativeai easyocr openai-whisper

In [None]:
import os
import json
import requests
from bs4 import BeautifulSoup
from sentence_transformers import SentenceTransformer
from sklearn.preprocessing import normalize
from datetime import datetime
import google.generativeai as genai
from uuid import uuid4
import faiss
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity
import pandas as pd
from urllib.parse import urljoin
import easyocr 
import whisper

os.environ["GOOGLE_API_KEY"] = "AIzaSyAG-ekUHZXwJpeHTGKAFyiRD2uKVuYn00"
genai.configure(api_key=os.environ["GOOGLE_API_KEY"])
gemini = genai.GenerativeModel('models/gemini-1.5-flash-latest')
encoder = SentenceTransformer("all-MiniLM-L6-v2")

2025-06-30 19:06:39.326923: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:477] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1751310399.349365     152 cuda_dnn.cc:8310] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1751310399.356137     152 cuda_blas.cc:1418] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered


In [7]:
def extract_article_text_body(url):
    try:
        res = requests.get(url, timeout=10)
        res.raise_for_status() 
        soup = BeautifulSoup(res.text, 'html.parser')

        for script_or_style in soup(['script', 'style']):
            script_or_style.decompose()
            
        main_content_div = soup.find(['article', 'main'], class_=[
            'entry-content', 'article-content', 'post-content', 'main-content',
            'blog-post-content', 'single-post-content', 'content-area'
        ])

        if not main_content_div:
            main_content_div = soup.find('div', id=['content', 'main', 'bodyContent'])
        if not main_content_div:
            main_content_div = soup.find('div', class_=['content', 'post', 'article', 'page'])

        text_parts = []

        page_title = soup.title.string if soup.title else ''
        if page_title:
            text_parts.append(page_title.strip())

        if main_content_div:
            full_text = main_content_div.get_text(separator='\n').strip()
        else:
            full_text = soup.body.get_text(separator='\n').strip()

        cleaned_lines = [line.strip() for line in full_text.splitlines() if line.strip()]
        filtered_lines = []
        for line in cleaned_lines:
            
            if len(line) < 5 and not any(char.isalnum() for char in line): 
                continue
            if "copyright" in line.lower() or "all rights reserved" in line.lower():
                continue 

            filtered_lines.append(line)

        final_text = "\n\n".join(filtered_lines)
        return final_text

    except requests.exceptions.RequestException as e:
        print(f"❌ Network error or invalid URL: {e}")
        return ""
    except Exception as e:
        print(f"❌ Error during parsing for {url}: {e}")
        return ""

def get_user_inputs(mode="profile"):
    if mode == "topic":
        prompt_title = " Provide 1 Topic/Idea for Your New Post:"
        max_posts = 1
    else:
        prompt_title = " Paste 3–5 of your text posts below:"
        max_posts = 5

    method = input(f"Choose Input Type:\n1. Paste Text\n2. Upload File\n3. Enter URL\n4. Use Infographic Image (from dataset)\n5. Use Audio Clip (from dataset)\nEnter choice (1-5): ").strip()
    
    posts = []
    extracted_texts = []

    if method == "1":  # Paste Text
        print(prompt_title)
        for i in range(max_posts):
            post = input(f"Post {i+1}: ").strip()
            if post:
                posts.append(post)
            if mode == "topic" and posts:
                break

    elif method == "2":  # File
        path = input("Enter file path: ").strip()
        try:
            with open(path, 'r') as f:
                lines = [line.strip() for line in f if line.strip()]
                posts.extend(lines[:max_posts])
        except Exception as e:
            print(f"❌ Error: {e}")

    elif method == "3":  # URL
        for i in range(max_posts):
            url = input(f"URL {i+1}: ").strip()
            if not url:
                continue
            print(" Scraping...")
            text_content = extract_article_text_body(url)
            if text_content:
                print("\n Extracted Text from URL:\n" + "-"*50)
                print(text_content[:1000] + ("..." if len(text_content) > 1000 else ""))
                print("-"*50)
                posts.append(text_content)
                extracted_texts.append(text_content)
                if mode == "topic":
                    break
            else:
                print(f"❌ Couldn’t extract content from {url}")

    elif method == "4":  # Image (Infographic)
        print(" Using image from Kaggle input dataset")
        image_path = input("Enter full image path (e.g., /kaggle/input/your-dataset-name/infographic.png): ").strip()
        try:
            import easyocr
            reader = easyocr.Reader(['en'])
            ocr_results = reader.readtext(image_path)
            extracted_text = " ".join([text[1] for text in ocr_results])
            if extracted_text.strip():
                print("\n Extracted Text from Image:\n" + "-"*50)
                print(extracted_text)
                print("-"*50)
                posts.append(extracted_text)
        except Exception as e:
            print(f"❌ OCR Error: {e}")

    elif method == "5":  # Audio
        print(" Using audio file from Kaggle input dataset")
        audio_path = input("Enter full audio file path (e.g., /kaggle/input/your-dataset-name/audio.mp3): ").strip()
        try:
            import whisper
            model = whisper.load_model("base")
            transcription = model.transcribe(audio_path)
            if transcription["text"].strip():
                print("\n Transcribed Text from Audio:\n" + "-"*50)
                print(transcription["text"])
                print("-"*50)
                posts.append(transcription["text"])
        except Exception as e:
            print(f"❌ Audio Transcription Error: {e}")

    else:
        print("❌ Invalid choice.")

    return posts[:max_posts], extracted_texts

#voice calibration
voice_pairs = [
    ("Cognitive Framing", "I like to simplify complexity so people get the ‘aha!’ moment.", "I like to explore nuanced topics, even if they’re a bit messy or open-ended."),
    ("Value Communication", "I try to give readers something they can immediately apply.", "I aim to change how readers think about a topic."),
    ("Emotional Tone", "I prefer to keep a calm, composed tone in my content.", "I’m okay showing strong opinions and emotional highs/lows."),
    ("Communication Texture", "I speak in plain, precise language.", "I enjoy using analogies, metaphors, or playful phrasing."),
    ("Risk Appetite", "I prefer to play it safe — stay useful, stay relevant.", "I’m okay challenging norms or triggering strong reactions."),
    ("Relational Energy", "I tend to speak to the audience like a guide or coach.", "I like writing more like a peer, friend, or co-explorer."),
]

def get_tone_vector():
    tone = {}
    for label, a, b in voice_pairs:
        print(f"\n🎯 {label}:")
        print(f"A: {a}\nB: {b}")
        while True:
            ans = input("Choose A or B: ").strip().upper()
            if ans in ["A", "B"]:
                tone[label] = 0.0 if ans == "A" else 1.0
                break
    return tone

def get_user_metadata():
    niche = input("\nWhat niche do you create content in? ").strip()

    goal_options = ['reach', 'sales', 'trust', 'educate', 'thought leadership']
    print("Select 1–2 primary content goals:")
    for i, opt in enumerate(goal_options):
        print(f"{i+1}. {opt}")
    selected_goals = input("Enter goal numbers (comma-separated): ").strip().split(',')
    goals = [goal_options[int(i)-1] for i in selected_goals if i.isdigit() and 0 < int(i) <= len(goal_options)]

    arch_options = [
        "The Teacher", "The Challenger", "The Storyteller", "The Visionary",
        "The Operator", "The Curator", "The Trend Decoder", "The Builder"
    ]
    print("\n Select 1–2 content archetypes:")
    for i, opt in enumerate(arch_options):
        print(f"{i+1}. {opt}")
    selected_archetypes = input("Enter archetype numbers: ").strip().split(',')
    archetypes = [arch_options[int(i)-1] for i in selected_archetypes if i.isdigit() and 0 < int(i) <= len(arch_options)]

    platform = input("\n Primary platform (LinkedIn, Instagram, etc): ").strip().lower()

    return {"niche": niche, "goals": goals, "platform": platform, "archetypes": archetypes}

#LLM summary
def generate_llm_summary(posts, tone_vec, metadata):
    short_snips = "\n".join([f"{i+1}. {p[:100]}" for i, p in enumerate(posts[:3])])

    compact_prefs = {
        "Nuanced": int(tone_vec.get("Cognitive Framing", 0)),
        "Actionable": int(tone_vec.get("Value Communication", 0) == 0),
        "Emotional": int(tone_vec.get("Emotional Tone", 0)),
        "Analogies": int(tone_vec.get("Communication Texture", 0)),
        "Bold": int(tone_vec.get("Risk Appetite", 0)),
        "Relatable": int(tone_vec.get("Relational Energy", 0)),
    }

    prompt = f"""
Summarize the writing style of this creator,posts written by the same person:{short_snips}

Using both the posts and this style embedding, summarize the user's writing style with:
- Cognitive style (logic-driven vs intuition-driven)
- Energy signature (calm, aggressive, witty, etc.)
- Lexical patterns (word choice, punctuation)
- Narrative identity (e.g., underdog, guide)
- Content archetype (educator, motivator, etc.)
- Audience intent (clients, recruiters, etc.)

Focus on voice, tone, and energy. Conclude with how they typically frame ideas and close posts.
Niche: {metadata['niche']}
Goals: {metadata['goals']}
Archetypes: {metadata['archetypes']}
Preferences: {compact_prefs}
"""

    return gemini.generate_content(prompt).text.strip()

def build_psyprint_dataset(posts, tone_vec, metadata):
    if posts and isinstance(posts[0], list):
        posts = [item for sublist in posts for item in sublist]

    embeddings = encoder.encode(posts, convert_to_numpy=True)
    style_vec = normalize(embeddings.mean(axis=0).reshape(1, -1))[0]

    summary = generate_llm_summary(posts, tone_vec, metadata)
    print("\n📝 Writing Style Summary:\n")
    print(summary)

    profile = {
        "user_id": f"user_{datetime.now().timestamp()}",
        "created_at": datetime.now().isoformat(),
        **metadata,
        "tone_vector": tone_vec,
        "tone_summary": summary,
        "style_embedding": style_vec.tolist(),
        "writing_samples": posts
    }

    dataset_post = {
        "post_id": str(uuid4()),
        "content": " ".join(posts),
        "platform": metadata.get("platform", "unknown"),
        "niche": metadata.get("niche", "unknown"),
        "archetypes": metadata.get("archetypes", []),
        "tone_summary": summary,
        "style_embedding": style_vec.tolist(),
        "tone_vector": tone_vec
    }

    with open("dataset2_user_profiles.json", "a") as f:
        f.write(json.dumps(dataset_post) + "\n")

    print("\nProfile and dataset2 record saved.")
    return summary 

def build_faiss_high_performing(csv_path):
    df_perf = pd.read_csv(csv_path)

    df_perf.columns = [
        "id", "content", "platform", "author_name", "source_link", "format",
        "niche", "meta_niche", "audience_fit", "hook_type", "emotion_type",
        "cta_style", "persuasion_type", "structure_type", "value_delivery_type",
        "content_intent", "engagement_score", "engagement_bucket"
    ]

    df_perf = df_perf[df_perf["engagement_bucket"].str.strip() == "Top 10%"].reset_index(drop=True)

    df_perf["combined_text"] = df_perf["content"].fillna("") + " [Niche: " + df_perf["niche"].fillna("") + "]"

    perf_texts = df_perf["combined_text"].tolist()

    perf_embeddings = encoder.encode(
        perf_texts, convert_to_numpy=True, normalize_embeddings=True
    ).astype('float32')

    dim = perf_embeddings.shape[1]
    faiss_index = faiss.IndexFlatL2(dim)
    faiss_index.add(perf_embeddings)

    return faiss_index, perf_texts, df_perf
 

def search_relevant_content(query, user_niche, faiss_index, perf_texts, df_perf, top_n=5):
    query_combined = f"{query} [Niche: {user_niche}]"
    query_emb = encoder.encode([query_combined], convert_to_numpy=True, normalize_embeddings=True).astype('float32')  
    
    distances, indices = faiss_index.search(query_emb, top_n)

    results = []
    for idx, dist in zip(indices[0], distances[0]):
        if idx < len(perf_texts):
            post_row = df_perf.iloc[idx]
            results.append({
                "similarity": 1 - dist,  
                "content": post_row['content'],
                "engagement_score": post_row['engagement_score'],
                "niche": post_row['niche'],
                "hook_type": post_row['hook_type'],
                "emotion_type": post_row['emotion_type']
            })

    return results

def generate_custom_post(query, retrieved_contents, user_style_summary, platform="LinkedIn"):
    ref_posts = "\n\n".join([f"- {text[:300]}" for text in retrieved_contents])
    prompt = f"""
Task: Ensure the tone, lexicon, energy, and audience intent reflect the user's style. Write a brand new post for {platform} that resonates with the above style and the topic.
Make sure the post is actionable, engaging, and appropriate for {platform}'s audience.
Style: {user_style_summary}
Topic: {query}
Here are high-performing reference posts:
{ref_posts}
"""
    response = gemini.generate_content(prompt)
    return response.text.strip()

def get_user_feedback(generated_post):
    print("\n📝 Generated Post:\n")
    print(generated_post)
    print("\nWhat would you like to do with this post?")
    print("Options: accept / reject / modify_entirely / modify_partially")

    feedback = input("Your choice: ").strip().lower()
    modified_post = ""

    if feedback in ["modify_entirely", "modify_partially"]:
        print("\nPlease provide your modified version of the post below:\n")
        modified_post = input("Modified Post:\n").strip()

    return {
        "feedback": feedback,
        "final_post": modified_post if modified_post else generated_post
    }

def save_interaction_log(user_inputs):
    log_filename = "user_interaction_log.json"
    
    with open(log_filename, "a") as f:
        f.write(json.dumps(user_inputs) + "\n")

    print(f"📁 Interaction saved to {log_filename}")

In [9]:
if __name__ == "__main__":
    CSV_PATH = "/kaggle/input/highest/dataset2_combined_output.csv"
    faiss_high_perf, high_perf_texts, df_perf = build_faiss_high_performing(CSV_PATH)

    print("\n Step 1: Provide your own posts")
    user_posts, _ = get_user_inputs(mode="profile")

    if len(user_posts) < 1:
        print("\n⚠️ Not enough content samples. Please restart and try again.")
        exit()

    tone_vector = get_tone_vector()
    user_meta = get_user_metadata()
    user_style_summary = build_psyprint_dataset(user_posts, tone_vector, user_meta)
    print("\n✅ FAISS index loaded. Ready for topic generation.")

    print("\n🧠 Step 2: Provide your post topic (text, voice, URL, etc.)")
    topic_inputs, _ = get_user_inputs(mode="topic")

    if not topic_inputs:
        print("\n⚠️ No topic provided. Exiting.")
        exit()

    query = topic_inputs[0]
    user_niche = user_meta["niche"]

    matches = search_relevant_content(query, user_niche, faiss_high_perf, high_perf_texts, df_perf, top_n=5)

    print("\n📌 Top relevant posts:\n")
    for i, res in enumerate(matches, 1):
        print(f"{i}. Similarity: {res['similarity']:.3f} | Engagement Score: {res['engagement_score']}")
        print(f"    Niche: {res['niche']} | Hook: {res['hook_type']} | Emotion: {res['emotion_type']}")
        print(f"    → {res['content'][:300]}{'...' if len(res['content']) > 300 else ''}")
        print("-" * 80)

    retrieved_texts = [res['content'] for res in matches]
    generated_post = generate_custom_post(query, retrieved_texts, user_style_summary, user_meta['platform'])

    print("\n📝 Generated Post:\n")
    print(generated_post)

    feedback_data = get_user_feedback(generated_post)

    interaction_log = {
        "timestamp": datetime.now().isoformat(),
        "query": query,
        "user_meta": user_meta,
        "platform": user_meta['platform'],
        "tone_vector": tone_vector,
        "user_style_summary": user_style_summary,
        "retrieved_reference_posts": retrieved_texts,
        "generated_post": generated_post,
        "feedback": feedback_data['feedback'],
        "final_post": feedback_data['final_post']
    }

    save_interaction_log(interaction_log)

Batches:   0%|          | 0/35 [00:00<?, ?it/s]


 Step 1: Provide your own posts


Choose Input Type:
1. Paste Text
2. Upload File
3. Enter URL
4. Use Infographic Image (from dataset)
5. Use Audio Clip (from dataset)
Enter choice (1-5):  4


 Using image from Kaggle input dataset


Enter full image path (e.g., /kaggle/input/your-dataset-name/infographic.png):  /kaggle/input/image-test/ocr1.png



 Extracted Text from Image:
--------------------------------------------------
Find the right person for company. Fast: Linkedln Jobs can help you: Target the most Screen for qualified Review andrate the relevant candidates: applicants: best matches: Linkedl-UC dato anqinsinntsto pictutt opplicant; Filtrr andtalcyour applicznts Mmach %Nur ciitorio ith norson'& sxills, quclihcnticns with Gssessment tocls like FOCe Kussd Ihs 0n3s you considar Gtocrrnco gncl; putlina your jnb scrocting 7unsticns Gnd,kill Onco Unkccln #iiccommcno ont cuelovonimmche M mke Ossassmstis ycurjob post 10 Ceadia cajicnjotioM apply How to post job on Linkedln: KaKeTh [ent Ma Wtt| Cutenet Asnneaan *arnhen 915 100 Start from the Jcbs poge Set your budgel Post und mancge Youi eos allin one place; Hgatfiom Ater "cupost Yullco j0iues Vcu ccn cnuoge dally thc: Linkudln Jobx puge total buudlgel uromnote Yout Fosl wvill aulomalically Laue"ten v0uvereachedvou: lola_buudel ndna Le Pian Mota Iet KiEFit Eit5 Etis FieEtct Jun

Choose A or B:  A



🎯 Value Communication:
A: I try to give readers something they can immediately apply.
B: I aim to change how readers think about a topic.


Choose A or B:  A



🎯 Emotional Tone:
A: I prefer to keep a calm, composed tone in my content.
B: I’m okay showing strong opinions and emotional highs/lows.


Choose A or B:  A



🎯 Communication Texture:
A: I speak in plain, precise language.
B: I enjoy using analogies, metaphors, or playful phrasing.


Choose A or B:  A



🎯 Risk Appetite:
A: I prefer to play it safe — stay useful, stay relevant.
B: I’m okay challenging norms or triggering strong reactions.


Choose A or B:  A



🎯 Relational Energy:
A: I tend to speak to the audience like a guide or coach.
B: I like writing more like a peer, friend, or co-explorer.


Choose A or B:  A

What niche do you create content in?  AI


Select 1–2 primary content goals:
1. reach
2. sales
3. trust
4. educate
5. thought leadership


Enter goal numbers (comma-separated):  1,2



 Select 1–2 content archetypes:
1. The Teacher
2. The Challenger
3. The Storyteller
4. The Visionary
5. The Operator
6. The Curator
7. The Trend Decoder
8. The Builder


Enter archetype numbers:  1

 Primary platform (LinkedIn, Instagram, etc):  LinkedIn


Batches:   0%|          | 0/1 [00:00<?, ?it/s]


📝 Writing Style Summary:

Based on the provided text snippet ("Find the right person for company. Fast: Linkedln Jobs can help you: Target the most Screen for qual"), the creator's writing style can be summarized as follows:

* **Cognitive Style:** Logic-driven. The writing is concise and focuses on presenting facts and solutions (using LinkedIn Jobs).  There's no room for intuition or subjective opinions.

* **Energy Signature:**  Aggressive (in a business sense). The words "Fast" and the implied urgency suggest a direct, results-oriented approach.  It's not calm or witty; it's focused on efficiency.

* **Lexical Patterns:** The writing is telegraphic, using short sentences and minimal punctuation.  Word choice is functional rather than expressive.  There's a noticeable lack of complete sentences and grammatical correctness (e.g., "Screen for qual").  This suggests a fast-paced, possibly rushed style.

* **Narrative Identity:** Guide/Expert. The post implies the creator possesses the

Choose Input Type:
1. Paste Text
2. Upload File
3. Enter URL
4. Use Infographic Image (from dataset)
5. Use Audio Clip (from dataset)
Enter choice (1-5):  1


 Provide 1 Topic/Idea for Your New Post:


Post 1:  job recruitement


Batches:   0%|          | 0/1 [00:00<?, ?it/s]


📌 Top relevant posts:

1. Similarity: 0.215 | Engagement Score: 9214
    Niche: AI | Hook: Relatable Statement | Emotion: Empathy
    → AI has changed job hunting forever. Use the right tools to accelerate your career. Land your dream job today. Use the Ultimate ChatGPT Guide for job hunting. Forget the painful job application process. Make things easier and steal this guide: 1) Critical AI Application Tools - Practice real-life job...
--------------------------------------------------------------------------------
2. Similarity: 0.149 | Engagement Score: 866
    Niche: AI | Hook: Relatable Statement | Emotion: Inspiration
    → Job opportunity: Business Manager at 20-30 LPA, for a startup building the #1 tech talent network on the planet. Be part of a company: ➝ That has grown to Rs 22 crores (~$3 million) in revenues ➝ Will grow to $10 million by next year ➝ With 100,000+ software engineers in our talent network An ideal ...
----------------------------------------------------------

Your choice:  accept


📁 Interaction saved to user_interaction_log.json
