# **Crawling some of the gov.tn websites**

## **santetunisie.rns.tn**

In [None]:
import requests
from bs4 import BeautifulSoup

# Scrape the FAQ page of santetunisie.rns.tn and print every question/answer pair.
url = "http://www.santetunisie.rns.tn/fr/questions-frequentes"

# A timeout keeps the cell from hanging on an unresponsive server, and
# raise_for_status() stops an HTTP error page from being parsed as the FAQ.
response = requests.get(url, timeout=30)
response.raise_for_status()
html = response.content

# Parse the HTML content using Beautiful Soup
soup = BeautifulSoup(html, "html.parser")

# Each accordion item is searched for one question header and one answer panel.
qa_pairs = soup.find_all(class_="jp-accordion-item")

question_class = "ui-accordion-header ui-helper-reset ui-state-default ui-corner-all"
answer_class = "ui-accordion-content ui-helper-reset ui-widget-content ui-corner-bottom"

# Collect the text of each pair as {"question": ..., "answer": ...}
qas = []
for pair in qa_pairs:
    question_tag = pair.find(class_=question_class)
    answer_tag = pair.find(class_=answer_class)
    # find() returns None when the element is absent; skip incomplete items
    # instead of failing on None.get_text().
    if question_tag is None or answer_tag is None:
        continue
    qas.append({"question": question_tag.get_text(), "answer": answer_tag.get_text()})

# Print the extracted question and answer pairs
for qa in qas:
    print("Question: ", qa["question"])
    print("Answer: ", qa["answer"])


In [None]:
import pandas as pd


# Scrape the santetunisie.rns.tn FAQ page into two parallel lists:
# questions_list[i] is the question whose answer is answers_list[i].
page_url = "http://www.santetunisie.rns.tn/fr/questions-frequentes"

# Fail fast on a hung connection or an HTTP error instead of parsing garbage.
response = requests.get(page_url, timeout=30)
response.raise_for_status()
html_content = response.content
soup = BeautifulSoup(html_content, 'html.parser')

# Scrape questions and answers
questions = soup.find_all(attrs={"class": "ui-accordion-header ui-helper-reset ui-state-default ui-corner-all"})
answers = soup.find_all(attrs={"class": "ui-accordion-content ui-helper-reset ui-widget-content ui-corner-bottom"})

# .text drops the markup; strip() trims the surrounding whitespace.
# A single page needs no nesting, so the flat lists are built directly.
questions_list = [question.text.strip() for question in questions]
answers_list = [answer.text.strip() for answer in answers]

In [None]:
# Two-column table (questions, answers) for santetunisie.rns.tn; preview the first rows.
QA_dict = dict(questions=questions_list, answers=answers_list)
QA1 = pd.DataFrame(data=QA_dict)
QA1.head()

## **finances.gov.tn**

In [None]:
# Work out how many result pages the finances.gov.tn FAQ has (stored in max_pages).
from urllib.parse import parse_qs, urlparse

url = 'http://www.finances.gov.tn/fr/faq?body_value=&field_themef_target_id=All&page=0'
# Fail fast on a hung connection or an HTTP error instead of parsing garbage.
response = requests.get(url, timeout=30)
response.raise_for_status()
soup = BeautifulSoup(response.content, 'lxml')

# Find the "last page" pager link; find() returns None when it is absent.
last_page_link = soup.find('a', attrs={"title": "Aller à la dernière page"})
last_page_url = last_page_link.get('href') if last_page_link is not None else None

# Read the `page` query parameter by name rather than relying on it being the
# last `=`-separated piece of the URL.
page_values = parse_qs(urlparse(last_page_url or '').query).get('page', [])

# Page numbering starts from 0, hence the +1. Without a usable pager link,
# fall back to scraping a single page.
max_pages = int(page_values[-1]) + 1 if page_values else 1

In [None]:
# Walk every finances.gov.tn FAQ page and collect the questions and answers
# into two parallel flat lists.
questions_list = []
answers_list = []

for page in range(max_pages):

    page_url = f'http://www.finances.gov.tn/fr/faq?body_value=&field_themef_target_id=All&page={page}'
    # Fail fast on a hung connection or an HTTP error instead of parsing garbage.
    response = requests.get(page_url, timeout=30)
    response.raise_for_status()
    html_content = response.content
    soup = BeautifulSoup(html_content, 'html.parser')

    # Scrape questions and answers
    questions = soup.find_all(attrs={"class": "question"})
    answers = soup.find_all(attrs={"class": "reponse"})

    # The two lists are paired by position, so a count mismatch on any page
    # would shift every later pair; stop instead of building misaligned data.
    if len(questions) != len(answers):
        raise ValueError(
            f'page {page}: found {len(questions)} questions but {len(answers)} answers'
        )

    # Remove html tags and extend the flat lists directly (no flattening pass needed).
    questions_list.extend(question.text.strip() for question in questions)
    answers_list.extend(answer.text.strip() for answer in answers)

In [None]:
# Two-column table (questions, answers) for finances.gov.tn; preview the first rows.
QA_dict = dict(questions=questions_list, answers=answers_list)
QA2 = pd.DataFrame(data=QA_dict)
QA2.head()

## **environnement.gov.tn**

In [None]:
# Scrape the environnement.gov.tn FAQ page into two parallel lists.
url = 'http://www.environnement.gov.tn/faq'
# Fail fast on a hung connection or an HTTP error instead of parsing garbage.
response = requests.get(url, timeout=30)
response.raise_for_status()
soup = BeautifulSoup(response.content, 'lxml')

# Questions are the <h3 class="toggleTrigger"> elements, answers the
# <div class="jpfaqAnswer"> elements; they are paired by position.
questions = soup.find_all('h3', attrs={"class": "toggleTrigger"})
answers = soup.find_all('div', attrs={"class": "jpfaqAnswer"})

# Remove html tags
questions_list = [question.text.strip() for question in questions]
answers_list = [answer.text.strip() for answer in answers]

In [None]:
# Two-column table (questions, answers) for environnement.gov.tn; preview the first rows.
QA_dict = dict(questions=questions_list, answers=answers_list)
QA3 = pd.DataFrame(data=QA_dict)
QA3.head()

## **tunisie.gov.tn**

In [None]:
# Scrape the fr.tunisie.gov.tn FAQ page into two parallel lists.
url = 'http://fr.tunisie.gov.tn/7-foire-aux-questions.htm?idtf=7'
# Fail fast on a hung connection or an HTTP error instead of parsing garbage.
response = requests.get(url, timeout=30)
response.raise_for_status()
soup = BeautifulSoup(response.content, 'lxml')

questions = soup.find_all(attrs={"class": "panel-title"})
# NOTE(review): this collects every <p> on the page, not only paragraphs that
# belong to an answer, so the number of answers may differ from the number of
# questions. Confirm against the page markup and narrow the selector if needed.
answers = soup.find_all('p')

# Remove html tags
questions_list = [question.text.strip() for question in questions]
answers_list = [answer.text.strip() for answer in answers]

In [None]:
# Two-column table (questions, answers) for tunisie.gov.tn; preview the first rows.
QA_dict = dict(questions=questions_list, answers=answers_list)
QA4 = pd.DataFrame(data=QA_dict)
QA4.head()

## **social.gov.tn**

In [None]:
# Work out how many result pages the social.gov.tn FAQ has (stored in max_pages).
from urllib.parse import parse_qs, urlparse

url = 'https://www.social.gov.tn/fr/faq?keyword=&service=All&page=0'
# NOTE(review): verify=False turns off TLS certificate verification, so the
# server is not authenticated. Kept as-is; confirm whether the certificate
# chain can be validated (or a CA bundle supplied) instead.
response = requests.get(url, verify=False, timeout=30)
# Fail on an HTTP error instead of parsing the error page.
response.raise_for_status()
soup = BeautifulSoup(response.content, 'lxml')

# Find the "last page" pager link; find() returns None when it is absent.
last_page_link = soup.find('a', attrs={"title": "Go to last page"})
last_page_url = last_page_link.get('href') if last_page_link is not None else None

# Read the `page` query parameter by name rather than relying on it being the
# last `=`-separated piece of the URL.
page_values = parse_qs(urlparse(last_page_url or '').query).get('page', [])

# Page numbering starts from 0, hence the +1. Without a usable pager link,
# fall back to scraping a single page.
max_pages = int(page_values[-1]) + 1 if page_values else 1

In [None]:
# Walk every social.gov.tn FAQ page and collect the questions and answers
# into two parallel flat lists.
questions_list = []
answers_list = []

for page in range(max_pages):

    page_url = f'https://www.social.gov.tn/fr/faq?keyword=&service=All&page={page}'
    # NOTE(review): verify=False turns off TLS certificate verification; kept
    # as-is, confirm whether it is still required for this host.
    response = requests.get(page_url, verify=False, timeout=30)
    # Fail on an HTTP error instead of parsing the error page.
    response.raise_for_status()
    html_content = response.content
    soup = BeautifulSoup(html_content, 'html.parser')

    # Scrape questions and answers
    questions = soup.find_all('h4', attrs={"class": "field-content c-font-14"})
    answers = soup.find_all('div', attrs={"class": "views-field views-field-field-reponse"})

    # The two lists are paired by position, so a count mismatch on any page
    # would shift every later pair; stop instead of building misaligned data.
    if len(questions) != len(answers):
        raise ValueError(
            f'page {page}: found {len(questions)} questions but {len(answers)} answers'
        )

    # Remove html tags; answers also get non-breaking spaces (\xa0) replaced
    # by regular spaces before trimming.
    questions_list.extend(question.text.strip() for question in questions)
    answers_list.extend(answer.text.replace('\xa0', ' ').strip() for answer in answers)

In [None]:
# Two-column table (questions, answers) for social.gov.tn; preview the first rows.
QA_dict = dict(questions=questions_list, answers=answers_list)
QA5 = pd.DataFrame(data=QA_dict)
QA5.head()

## **Final output**

In [None]:
# Stack the five per-site tables into one, renumbering the rows from 0.
site_frames = [QA1, QA2, QA3, QA4, QA5]
final_df = pd.concat(site_frames, ignore_index=True)

In [None]:
# Bare expression: shows the combined DataFrame as the cell's output.
final_df

In [None]:
# Write the combined Q&A table to data.csv, leaving out the row index.
final_df.to_csv('data.csv', index=False)

# **Training, Fine-tuning and evaluating the model**

In [None]:
!pip install transformers

In [None]:
import torch
from transformers import AutoTokenizer, AutoModelForQuestionAnswering, AdamW
import pandas as pd

# Fine-tune bert-base-cased for extractive question answering on the scraped
# FAQ pairs in data.csv, then save the model and tokenizer to ./qa_model.

# load the pre-trained BERT model and tokenizer
model_name = 'bert-base-cased'
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForQuestionAnswering.from_pretrained(model_name)

# Train on the GPU when one is available, otherwise on the CPU.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model.to(device)

# load the training data from the CSV file
# Empty cells are read back as NaN (a float); drop those rows so every
# question and context is a real string.
data = pd.read_csv('data.csv').dropna(subset=['questions', 'answers'])
question_texts = data['questions'].astype(str).tolist()
context_texts = data['answers'].astype(str).tolist()
if not question_texts:
    raise ValueError('data.csv contains no usable question/answer rows')

# Encode each (question, context) as a sentence pair so the tokenizer inserts
# [CLS]/[SEP] itself and fills token_type_ids. Only the context is truncated,
# and every row is padded to max_length so the rows stack into one tensor.
encodings = tokenizer(
    question_texts,
    context_texts,
    add_special_tokens=True,
    max_length=512,
    truncation='only_second',
    padding='max_length',
    return_tensors='pt',
)
input_ids = encodings['input_ids']
attention_mask = encodings['attention_mask']
token_type_ids = encodings['token_type_ids']

# start/end labels must be token indices into the encoded sequence. The
# context here is the scraped answer itself, so the labelled span runs from
# the first to the last context token.
start_positions = []
end_positions = []
for row in range(input_ids.shape[0]):
    # sequence_ids: None for special/padding tokens, 0 for question tokens,
    # 1 for context tokens.
    sequence_ids = encodings.sequence_ids(row)
    first_context_token = sequence_ids.index(1)
    last_context_token = len(sequence_ids) - 1 - sequence_ids[::-1].index(1)
    start_positions.append(first_context_token)
    end_positions.append(last_context_token)
start_positions = torch.tensor(start_positions)
end_positions = torch.tensor(end_positions)

dataset = torch.utils.data.TensorDataset(
    input_ids, attention_mask, token_type_ids, start_positions, end_positions
)
dataloader = torch.utils.data.DataLoader(dataset, batch_size=8, shuffle=True)


# fine-tune the BERT model for question answering
# torch.optim.AdamW is used in place of the deprecated transformers.AdamW;
# weight_decay=0.0 mirrors that class's default.
optimizer = torch.optim.AdamW(model.parameters(), lr=5e-5, weight_decay=0.0)
model.train()
for epoch in range(3):
    for batch in dataloader:
        optimizer.zero_grad()
        # Per-batch names keep the full-dataset tensors above from being rebound.
        batch_ids, batch_mask, batch_types, batch_starts, batch_ends = (
            tensor.to(device) for tensor in batch
        )
        outputs = model(
            batch_ids,
            attention_mask=batch_mask,
            token_type_ids=batch_types,
            start_positions=batch_starts,
            end_positions=batch_ends,
        )
        loss = outputs.loss
        loss.backward()
        optimizer.step()

# save the fine-tuned model
model.save_pretrained('qa_model')
tokenizer.save_pretrained('qa_model')


In [None]:
from transformers import AutoTokenizer, AutoModelForQuestionAnswering
import torch

# Load the fine-tuned model from ./qa_model and print its predicted answer
# span for a few sample questions.

# Load the fine-tuned tokenizer and model
tokenizer = AutoTokenizer.from_pretrained("qa_model")
model = AutoModelForQuestionAnswering.from_pretrained("qa_model")
model.eval()  # inference only: make sure dropout is disabled

# List of questions to ask the model
questions = [
    "C'est quoi la grippe A(H1N1)?",
    "Who is the health minister of Tunisia ?",
    "Comment vérifie l'administration fiscale"
]

# Loop through each question and ask the model
for question in questions:
    # NOTE(review): only the question is encoded, with no context passage, so
    # the predicted span can only be drawn from the question's own tokens.
    # Confirm whether a context should be supplied alongside the question.
    # No padding for a single sequence: padding positions would otherwise be
    # candidates for the argmax below.
    inputs = tokenizer(question, truncation=True, max_length=64, return_tensors='pt')

    # Forward pass without autograd bookkeeping (no gradients are needed here).
    with torch.no_grad():
        outputs = model(**inputs)

    # Most likely start token and (exclusive) end token of the answer span.
    answer_start = torch.argmax(outputs.start_logits)
    answer_end = torch.argmax(outputs.end_logits) + 1

    # Decode the answer and print it; an end before the start yields an empty
    # slice, i.e. an empty answer. Special tokens are left out of the text.
    answer_tokens = inputs['input_ids'][0][answer_start:answer_end]
    answer = tokenizer.decode(answer_tokens, skip_special_tokens=True)
    print(f"Question: {question}")
    print(f"Answer: {answer}")