## Récupération de la structure Hn des concurrents, en ligne, à partir du mot-clé

In [7]:
import os
from langchain.utilities.tavily_search import TavilySearchAPIWrapper
from langchain.agents import initialize_agent, AgentType
from langchain_community.chat_models import ChatOpenAI
from langchain.tools.tavily_search import TavilySearchResults
from langchain.tools import StructuredTool
import requests
import json
import configparser
# Load local credentials from an INI file. The path is absolute and specific
# to one developer's machine.
# NOTE(review): ConfigParser.read() silently skips files it cannot open, so a
# missing file only shows up as a KeyError on config['creds'] below.
config = configparser.ConfigParser()
config.read('/Users/aba/Documents/dev/conf.ini')
# Export the Tavily API key through the process environment
# (presumably where the Tavily wrapper looks it up -- confirm).
os.environ["TAVILY_API_KEY"] = config['creds']['TAVILY_API_KEY'] 


In [11]:
from model import get_agent

In [22]:
# Set up the web-search tool for the agent: build the Tavily API wrapper and
# hand it to the TavilySearchResults tool. `tavily_tool` is placed in the
# `tools` list further down in this cell.
search = TavilySearchAPIWrapper()
tavily_tool = TavilySearchResults(api_wrapper=search)


def get_semrank_result(keyword: str, timeout: float = 60.0) -> dict:
    """Query the Semrank keywords endpoint and return the decoded JSON body.

    Args:
        keyword: Search query, sent as the ``query`` form field.
        timeout: Seconds to wait for the server before giving up. The default
            is deliberately generous: a call recorded in this notebook took
            about 10 s of wall time.

    Returns:
        The response body decoded from JSON.

    Raises:
        requests.HTTPError: If the server answers with a 4xx/5xx status.
        requests.Timeout: If no response arrives within ``timeout``.
        json.JSONDecodeError: If the body is not valid JSON.
    """
    r = requests.post(
        url="https://semrank.io/admin/api/keywords",
        data={"query": keyword},
        # Without a timeout, requests waits indefinitely on a stalled server.
        timeout=timeout,
    )
    # Fail loudly on HTTP errors instead of trying to parse an error page.
    r.raise_for_status()
    return json.loads(r.content)


def get_hn_structures(keyword: str) -> dict:
    """Retrieves the Hn structures, titles and descriptions of the top 3 websites in Semrank's results for a specific keyword"""
    # NOTE(review): this function is wrapped with StructuredTool.from_function
    # in this cell; the docstring above is presumably used as the tool
    # description shown to the LLM, so it is kept short and the developer
    # notes live in comments instead -- confirm before expanding it.
    #
    # Returned dict:
    #   "result"    - the raw Semrank payload, untouched.
    #   "parsed"    - legacy flat mapping: {url: headings, ...} plus a single
    #                 "title"/"descr" pair (see below).
    #   "top_pages" - one {"url", "title", "descr", "headings"} dict per
    #                 retained page, so every page keeps its own metadata.
    results = get_semrank_result(keyword)
    a = {}
    top_pages = []
    for concurrent in results["datas"]["concurrents"].values():
        if concurrent["position"] <= 3:
            a[concurrent["url"]] = concurrent["headings"]
            # Kept for backward compatibility only: these two keys are
            # overwritten on every iteration, so they end up describing the
            # last retained page. Use "top_pages" for per-page values.
            a["title"] = concurrent["title"]
            a["descr"] = concurrent["descr"]
            top_pages.append({
                "url": concurrent["url"],
                "title": concurrent["title"],
                "descr": concurrent["descr"],
                "headings": concurrent["headings"],
            })

    return {"result": results, "parsed": a, "top_pages": top_pages}


# Expose get_hn_structures to the agent as a LangChain structured tool.
# NOTE(review): from_function presumably derives the tool's name, argument
# schema and description from the function's name, signature and docstring
# -- confirm against the LangChain version in use.
tool_hn= StructuredTool.from_function(get_hn_structures)


# Tools made available to the agent: web search plus the Semrank Hn lookup.
tools = [tavily_tool, tool_hn]

In [23]:
# Build the agent from the tool list. get_agent comes from the local `model`
# module; how it configures the agent is not visible in this notebook.
agent = get_agent(tools)

In [121]:
# Keyword submitted to the agent and to Semrank.
key_word = "Comment faire une signature électronique"
# Existing Docaposte article whose Hn structure is scraped further down.
target_url = "https://www.docaposte.com/blog/article/comment-creer-signature-electronique-document"

In [25]:
# Run the agent with the bare keyword as its input; the returned dict is
# inspected in later cells (res["output"], res["intermediate_steps"]).
res = agent({"input": key_word})



[1m> Entering new AgentExecutor chain...[0m
[32;1m[1;3mThe human is asking how to create an electronic signature. I can provide a general process for creating an electronic signature. However, the specifics can vary depending on the platform or software they're using. It may be helpful to ask for more details. 

Action:
```
{
  "action": "Final Answer",
  "action_input": "Pour créer une signature électronique, les étapes générales suivantes sont généralement suivies : [0m

[1m> Finished chain.[0m


In [122]:
import requests
from bs4 import BeautifulSoup

def get_hn_structure_and_content(url, timeout=30):
    """Fetch a web page and split it into sections keyed by its headings.

    Args:
        url: Address of the page to download.
        timeout: Seconds to wait for the server before giving up.

    Returns:
        A list of ``(level, title, content)`` tuples in document order.
        ``level`` is the upper-cased heading tag name (``"H1"`` .. ``"H6"``),
        ``title`` is the heading text, and ``content`` is the text of the
        ``p``/``div``/``ul`` elements found before the next heading, each
        fragment preceded by a newline. Text located before the first heading
        is ignored.

    Raises:
        requests.HTTPError: If the server answers with a 4xx/5xx status.
        requests.Timeout: If no response arrives within ``timeout``.
    """
    heading_tags = ['h1', 'h2', 'h3', 'h4', 'h5', 'h6']
    content_tags = ['p', 'div', 'ul']
    tracked_tags = heading_tags + content_tags

    # Without a timeout, requests waits indefinitely on a stalled server.
    response = requests.get(url, timeout=timeout)
    response.raise_for_status()  # Check that the request succeeded

    soup = BeautifulSoup(response.content, 'html.parser')

    hn_structure = []
    current_level = None
    current_title = None
    fragments = []

    for tag in soup.find_all(tracked_tags):
        if tag.name in heading_tags:
            # A new heading closes the section that was being collected.
            if current_level is not None:
                hn_structure.append(
                    (current_level, current_title, "".join("\n" + f for f in fragments))
                )
            current_level = tag.name.upper()
            current_title = tag.get_text(strip=True)
            fragments = []
        elif current_level is not None:
            # find_all also yields the tracked elements nested inside this
            # one. Collecting a container *and* its descendants would repeat
            # the same text, and a wrapper holding a later heading would leak
            # that heading's section into the current one. Containers are
            # therefore skipped and only their tracked descendants collected.
            # Trade-off: bare text sitting directly in such a container, next
            # to tracked children, is not captured.
            if tag.find(tracked_tags) is not None:
                continue
            fragments.append(tag.get_text(strip=True))

    if current_level is not None:
        hn_structure.append(
            (current_level, current_title, "".join("\n" + f for f in fragments))
        )

    return hn_structure

# Example usage: scrape the heading structure of the target article.
hn_structure = get_hn_structure_and_content(target_url)

# Debug dump of every section (heading level, title and collected text),
# left disabled:
#print("Structure Hn et contenu:")
#for tag, title, content in hn_structure:
#    print(f"{tag}: {title}\nContenu:\n{content}\n")


In [135]:
# Keep only the (level, title) pair of each scraped section; the section body
# is dropped.
target_hn_structure = [
    (level, heading)
    for level, heading, _body in hn_structure
]
target_hn_structure

[('H1', 'Comment créer une signature électronique sur un document ?'),
 ('H2', 'Qu’est-ce qu’une signature électronique ?'),
 ('H2', 'Quels types de documents peut-on signer électroniquement ?'),
 ('H2', 'Quels sont les avantages de la signature électronique ?'),
 ('H2', 'Les différentes étapes pour créer une signature électronique'),
 ('H2', 'Les différents moyens de signer électroniquement un document'),
 ('H2', 'La gamme de signature électronique de Docaposte'),
 ('H2', 'Dans l’actualité')]

In [26]:
res

{'input': 'Comment faire une signature électronique',
 'output': 'The human is asking how to create an electronic signature. I can provide a general process for creating an electronic signature. However, the specifics can vary depending on the platform or software they\'re using. It may be helpful to ask for more details. \n\nAction:\n```\n{\n  "action": "Final Answer",\n  "action_input": "Pour créer une signature électronique, les étapes générales suivantes sont généralement suivies : ',
 'intermediate_steps': []}

In [20]:
"related"
res["intermediate_steps"]

[(AgentAction(tool='tavily_search_results_json', tool_input={'query': 'how to create an electronic signature'}, log='I need to find accurate and reliable information on how to create an electronic signature. The tavily_search_results_json tool seems appropriate in this case, as it allows me to search for detailed information about the question. I\'ll use the query "how to create an electronic signature" for the search engine.\n\nAction:\n```\n{\n  "action": "tavily_search_results_json",\n  "action_input": {"query": "how to create an electronic signature"}\n}\n```'),
  [{'url': 'https://signaturely.com/online-signature/',
    'content': 'Your personality based on signature traits\nUnreadable letters\nEasy to read\nClear first name, unreadable last name\nWith underline\nEnds with a flick\nUpward angle\nDownward angle\nSlanted\nJust a nickname\nInitials only\nBold capitals\nNo last name\nLarge letters\nEmbellished letters\nThe signatures of successful business people\nReady to create your

In [None]:
#################################

In [29]:
len(results["datas"])

12

In [75]:
%%time
# Fetch the raw Semrank payload for key_word; the %%time cell magic reports
# how long the call takes.
results = get_semrank_result(key_word)

CPU times: user 50.7 ms, sys: 51 ms, total: 102 ms
Wall time: 10.1 s


In [76]:
len(results["datas"]['concurrents'])

9

In [123]:
def _records(collection):
    """Return the records of an index-keyed mapping (or a plain sequence) as a list."""
    if isinstance(collection, dict):
        return list(collection.values())
    # Defensive: tolerate a list (or an empty/None value) where the payload
    # usually holds a mapping keyed by '0', '1', ...
    return list(collection or [])


def _page_entry(page, from_backlink=False):
    """Copy the fields kept for one result page into a fresh dict."""
    if from_backlink:
        # Backlink records expose only title/headings/snippet/url/position;
        # placeholders keep both kinds of entry on the same set of keys.
        descr, nb_words, content = "", 0, ""
    else:
        descr, nb_words, content = page["descr"], page["nb_words"], page["content"]
    return {
        "position": page["position"],
        "title": page["title"],
        "snippet": page["snippet"],
        "url": page["url"],
        "descr": descr,
        "headings": page["headings"],
        "nb_words": nb_words,
        "content": content,
    }


def parse_semrank_object(results, top_concurrent = 3):
    """Split a Semrank payload into related questions, own pages and competitors.

    Args:
        results: Decoded Semrank response; ``results["datas"]`` must provide
            the ``paa``, ``related``, ``concurrents`` and ``backlinks``
            collections.
        top_concurrent: Maximum number of competitor pages taken from
            ``concurrents``; values <= 0 select none.

    Returns:
        A dict with the keys:
        - ``related_questions``: ``question`` of every ``paa`` record.
        - ``other_related_questions``: ``query`` of every ``related`` record.
        - ``docaposte_data``: entries whose URL contains "docaposte", from
          ``concurrents`` first and then from ``backlinks``.
        - ``concurrents_data``: the ``top_concurrent`` best-positioned
          non-Docaposte ``concurrents`` entries (ascending position),
          followed by every non-Docaposte backlink entry (not capped).
        - ``all_url``: URL of every ``concurrents`` record, in payload order.

    Raises:
        KeyError: If an expected key is missing from the payload.
    """
    datas = results["datas"]

    # 1. Related questions
    related_questions = [q["question"] for q in _records(datas["paa"])]
    other_related_questions = [q["query"] for q in _records(datas["related"])]

    # 2. Competitor + Docaposte pages: title, snippet, content, Hn structure
    entries = [_page_entry(page) for page in _records(datas["concurrents"])]
    all_url = [entry["url"] for entry in entries]
    docaposte_data = [entry for entry in entries if "docaposte" in entry["url"]]

    # Rank competitors by position explicitly so the selection does not depend
    # on the order in which the payload lists them, and so Docaposte pages in
    # the top results never reduce the number of competitors returned.
    rivals = sorted(
        (entry for entry in entries if "docaposte" not in entry["url"]),
        key=lambda entry: entry["position"],
    )
    concurrents_data = rivals[:max(top_concurrent, 0)]

    # 3. Backlink pages are all kept, routed by the same Docaposte test.
    for backlink in _records(datas["backlinks"]):
        entry = _page_entry(backlink, from_backlink=True)
        if "docaposte" in entry["url"]:
            docaposte_data.append(entry)
        else:
            concurrents_data.append(entry)

    return {
        'related_questions': related_questions,
        'other_related_questions': other_related_questions,
        'docaposte_data': docaposte_data,
        'concurrents_data': concurrents_data,
        'all_url': all_url,
    }

In [129]:
def parse_hn_from_dict(obj):
    """Flatten a headings mapping into a list of (type, text) pairs.

    `obj` maps arbitrary keys to heading records; each record must provide a
    'type' and a 'text' entry. Pairs are returned in the mapping's iteration
    order, and the keys themselves are ignored.
    """
    pairs = []
    for heading in obj.values():
        pairs.append((heading['type'], heading['text']))
    return pairs


In [130]:
# Parse the raw Semrank payload, keeping at most 3 competitor pages from the
# "concurrents" collection.
parsed_res = parse_semrank_object(results, top_concurrent = 3)

In [131]:
#parsed_res

In [133]:
# Show the (type, text) heading pairs of the first entry in concurrents_data.
parse_hn_from_dict(parsed_res["concurrents_data"][0]["headings"])
#parsed_res["concurrents_data"][0]["headings"]

[('h1', 'Comment créer une signature électronique ?'),
 ('h1', 'Comment créer une signature électronique ?'),
 ('h2', 'Création d’une signature électronique'),
 ('h2', 'Comment apposer une signature électronique sur un document ?'),
 ('h3', 'Cliquez sur le lien « vérifier et signer » dans l’e-mail.'),
 ('h3', '1. Cliquez sur le lien « vérifier et signer » dans l’e-mail.'),
 ('h3', 'Cliquez sur le document.'),
 ('h3', '2. Cliquez sur le document.'),
 ('h3', 'Créez une signature électronique.'),
 ('h3', '3. Créez une signature électronique.'),
 ('h3', 'Sélectionnez le format de signature.'),
 ('h3', '4. Sélectionnez le format de signature.'),
 ('h3', 'Signez le document.'),
 ('h3', '5. Signez le document.'),
 ('h3', 'Confirmez la signature.'),
 ('h3', '6. Confirmez la signature.'),
 ('h3', 'Envoyez.'),
 ('h3', '7. Envoyez.'),
 ('h3', 'Que sont les signatures électroniques ?'),
 ('h2', 'Comment signer un PDF électroniquement avec Acrobat Sign ?'),
 ('h3',
  'Il est très facile de signer u

In [None]:
### - Trouver la structure Hn d'un article Docaposte déjà existant, ainsi que le contenu de l'article
### - Prompt tuning : construire un prompt avec les N meilleures structures Hn des concurrents (+ leur contenu) et demander au LLM de proposer une nouvelle structure Hn pour Docaposte

In [136]:
results['datas'].keys()

dict_keys(['len_serper', 'len_contents', 'len_backlinks', 'len_backlinks_content', 'keywords_list', 'density', 'urls', 'paa', 'related', 'concurrents', 'backlinks', 'pytime'])

In [98]:
results['datas']['concurrents'].keys()

dict_keys(['0', '1', '2', '3', '4', '5', '6', '7', '8'])

In [106]:
results['datas']['backlinks'].keys()

dict_keys(['0', '1', '2', '3', '4', '5', '6', '7', '8', '9'])

In [102]:
results['datas']['backlinks']['0'].keys()

dict_keys(['title', 'headings', 'snippet', 'url', 'position'])

In [82]:
results

{'status': 'success',
 'datas': {'len_serper': 3,
  'len_contents': 9,
  'len_backlinks': 19,
  'len_backlinks_content': 10,
  'keywords_list': {'0': 'signature:27.555555555555557',
   '1': 'électronique:16.666666666666664',
   '2': 'document:10.11111111111111',
   '3': 'signer:7.666666666666667',
   '4': 'documents:7.222222222222222',
   '5': 'pdf:4.111111111111111',
   '6': 'image:4.111111111111111',
   '7': 'sélectionnez:4.111111111111111',
   '8': 'niveau:3.0',
   '9': 'signatures:3.7777777777777777',
   '10': 'signé:3.6666666666666665',
   '11': 'signataire:3.6666666666666665',
   '12': 'certificat:3.5555555555555554',
   '13': 'solution:3.3333333333333335',
   '14': 'insérer:3.3333333333333335',
   '15': 'simple:3.2222222222222223',
   '16': 'étape:3.2222222222222223',
   '17': 'ligne:3.111111111111111',
   '18': 'sécurité:3.111111111111111',
   '19': 'cliquez:2.0',
   '20': 'numérique:2.888888888888889',
   '21': 'électroniques:2.888888888888889',
   '22': 'processus:2.777777777