In [1]:
import json
import os
import re

import requests
from bs4 import BeautifulSoup
from selenium import webdriver
from selenium.common.exceptions import TimeoutException
from selenium.webdriver.common.by import By
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.ui import WebDriverWait

In [5]:
def printProgressBar (iteration, total, prefix = '', suffix = '', decimals = 1, length = 100, fill = '█', printEnd = "\r"):
    """
    Call in a loop to draw a single-line text progress bar on stdout.

    @params:
        iteration   - Required  : current iteration (Int)
        total       - Required  : total iterations (Int)
        prefix      - Optional  : prefix string (Str)
        suffix      - Optional  : suffix string (Str)
        decimals    - Optional  : positive number of decimals in percent complete (Int)
        length      - Optional  : character length of bar (Int)
        fill        - Optional  : bar fill character (Str)
        printEnd    - Optional  : end character, e.g. carriage return or CR+LF (Str)

    An empty workload (total == 0) is drawn as a completed bar instead of
    raising ZeroDivisionError.
    """
    if total:
        ratio = iteration / float(total)
        filledLength = int(length * iteration // total)
    else:
        # Nothing to process: report "done" rather than dividing by zero.
        ratio = 1.0
        filledLength = length

    percent = f"{100 * ratio:.{decimals}f}"
    bar = fill * filledLength + '-' * (length - filledLength)
    # The leading '\r' rewinds to the start of the line so successive calls overwrite each other.
    print(f'\r{prefix} |{bar}| {percent}% {suffix}', end = printEnd)
    # Print a new line once the last iteration has been reported
    if iteration == total:
        print()

# **Labs**

In [33]:
# Scrape the laboratory index page: one {'nom', 'link'} entry per <section> of the listing.
BASE_URL = 'https://fstt.ac.ma/Portail2023/laboratoires-de-recherche/'
# Browser-like User-Agent sent with the request.
REQUEST_HEADERS = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/123.0.0.0 Safari/537.36 Edg/123.0.0.0'}
# Seconds to wait for the server; without it requests.get can block indefinitely.
REQUEST_TIMEOUT = 30

labs_info = []

url = BASE_URL

print(url)
response = requests.get(url, headers=REQUEST_HEADERS, timeout=REQUEST_TIMEOUT)
# Fail on HTTP errors here instead of on an AttributeError while parsing an error page.
response.raise_for_status()
html = response.text
soup = BeautifulSoup(html, 'html.parser')

container = soup.find('div', {'data-id': 'bd7a7fc'})
if container is None:
    raise RuntimeError(f"Lab listing container (data-id='bd7a7fc') not found at {url}")
temp_attributes = container.find_all('section')

for attribute in temp_attributes:
    lab = {
        'nom': attribute.find('h4').text,
        # The detail-page URL is stored in a data attribute of the clickable <div>.
        'link': attribute.find('div', {'data-wts-link': 'yes'}).get('data-wts-url'),
    }
    labs_info.append(lab)

print(json.dumps(labs_info, indent=4, ensure_ascii=False))

https://fstt.ac.ma/Portail2023/laboratoires-de-recherche/
[
    {
        "nom": "Computer Science And Smart Systems (C3S)",
        "link": "https://fstt.ac.ma/Portail2023/laboratoire-computer-science-and-smart-systems-c3s/"
    },
    {
        "nom": "Génie Chimique, Biochimique , Modélisation et Valorisation des Ressources (CBM-VR )",
        "link": "https://fstt.ac.ma/Portail2023/laboratoire-genie-chimique-biochimique-modelisation-et-valorisation-des-ressources-cbm-vr/"
    },
    {
        "nom": "Intelligent Automation & BioMedGenomics  (IABL)",
        "link": "https://fstt.ac.ma/Portail2023/laboratoire-intelligent-automation-biomedgenomics-iabl/"
    },
    {
        "nom": "Materials, Systems and Energy Engineering (MaSEEL)",
        "link": "https://fstt.ac.ma/Portail2023/laboratoire-materials-systems-and-energy-engineering-maseel/"
    },
    {
        "nom": "Mathématiques et Applications (LMA)",
        "link": "https://fstt.ac.ma/Portail2023/laboratoire-mathematiques-et

In [34]:
# Visit every lab page collected in labs_info and enrich its dict in place with
# director, speciality, contact details, research teams, research axes and members.
EMAIL_PATTERN = r"\S+@\S+\.\S+"
# Browser-like User-Agent sent with every request.
REQUEST_HEADERS = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/123.0.0.0 Safari/537.36 Edg/123.0.0.0'}
# Seconds to wait for the server; without it requests.get can block indefinitely.
REQUEST_TIMEOUT = 30

for lab in labs_info:
    url = lab['link']
    print(url)
    response = requests.get(url, headers=REQUEST_HEADERS, timeout=REQUEST_TIMEOUT)
    # Fail on HTTP errors here instead of on an AttributeError while parsing an error page.
    response.raise_for_status()
    html = response.text
    soup = BeautifulSoup(html, 'html.parser')

    # Tab 1671: director, speciality, phone and e-mail, read from its <p> elements.
    container = soup.find(name='div', attrs={'id': 'elementor-tab-content-1671'})
    temp_attributes = container.find_all(name='p')

    lab['directeur'] = temp_attributes[0].text.strip()
    lab['specialite'] = temp_attributes[1].text.replace('Spécialité : ', '').strip()

    # With exactly four paragraphs phone/e-mail sit at indexes 2/3; otherwise they are read from 3/4.
    phone_index = 2 if len(temp_attributes) == 4 else 3
    lab['telephone'] = temp_attributes[phone_index].text.replace(u'\xa0', u'').strip()
    emails = re.findall(EMAIL_PATTERN, temp_attributes[phone_index + 1].text)
    # A paragraph without an address yields '' instead of an IndexError.
    lab['email'] = emails[0] if emails else ''

    # Tab 1672: one link per research team.
    container = soup.find(name='div', attrs={'id': 'elementor-tab-content-1672'})
    lab['Equipes de recherche'] = [
        {'label': anchor.text.replace(u'\xa0', u''), 'link': anchor.get('href')}
        for anchor in container.find_all(name='a')
    ]

    # Tab 1673: one list item per research axis.
    container = soup.find(name='div', attrs={'id': 'elementor-tab-content-1673'})
    lab['Axe(s) de recherche'] = [
        item.text.replace(u'\xa0', u'') for item in container.find_all(name='li')
    ]

    # Tab 1676: members table; the first two cells are joined into 'nom', the third is the e-mail.
    container = soup.find(name='div', attrs={'id': 'elementor-tab-content-1676'})
    lab['Membres'] = []
    for row in container.find(name='tbody').find_all(name='tr'):
        cells = row.find_all(name='td')
        if len(cells) == 0:
            print('Empty row')
            continue
        lab['Membres'].append({
            'nom': cells[0].text + ' ' + cells[1].text,
            'email': cells[2].text,
        })

print(json.dumps(labs_info, indent=4, ensure_ascii=False))

https://fstt.ac.ma/Portail2023/laboratoire-computer-science-and-smart-systems-c3s/
https://fstt.ac.ma/Portail2023/laboratoire-genie-chimique-biochimique-modelisation-et-valorisation-des-ressources-cbm-vr/
https://fstt.ac.ma/Portail2023/laboratoire-intelligent-automation-biomedgenomics-iabl/
https://fstt.ac.ma/Portail2023/laboratoire-materials-systems-and-energy-engineering-maseel/
https://fstt.ac.ma/Portail2023/laboratoire-mathematiques-et-applications/
https://fstt.ac.ma/Portail2023/laboratoire-mecanique-et-genie-civil/
https://fstt.ac.ma/Portail2023/laboratoire-physico-chimie-des-materiaux-substances-naturelles-et-environnement-lamse/
https://fstt.ac.ma/Portail2023/laboratoire-recherche-et-developpement-en-geosciences-appliquees/
[
    {
        "nom": "Computer Science And Smart Systems (C3S)",
        "link": "https://fstt.ac.ma/Portail2023/laboratoire-computer-science-and-smart-systems-c3s/",
        "directeur": "Pr.EL AMRANI CHAKER",
        "specialite": "Informatique",
       

# **Equipes**

In [39]:
# Scrape the research-team index page: one {'nom', 'link'} entry per <section> of the listing.
BASE_URL = 'https://fstt.ac.ma/Portail2023/equipes-de-recherche/'
# Browser-like User-Agent sent with the request.
REQUEST_HEADERS = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/123.0.0.0 Safari/537.36 Edg/123.0.0.0'}
# Seconds to wait for the server; without it requests.get can block indefinitely.
REQUEST_TIMEOUT = 30

equipes_info = []

url = BASE_URL

print(url)
response = requests.get(url, headers=REQUEST_HEADERS, timeout=REQUEST_TIMEOUT)
# Fail on HTTP errors here instead of on an AttributeError while parsing an error page.
response.raise_for_status()
html = response.text
soup = BeautifulSoup(html, 'html.parser')

container = soup.find('div', {'data-id': '78ced8b'})
if container is None:
    raise RuntimeError(f"Team listing container (data-id='78ced8b') not found at {url}")
temp_attributes = container.find_all('section')

for attribute in temp_attributes:
    equipe = {}
    equipe['nom'] = attribute.find('h4').text

    link_holder = attribute.find('div', {'data-wts-link': 'yes'})
    link = link_holder.get('data-wts-url') if link_holder is not None else None
    # Fall back to the first <a> when the data attribute is empty OR missing.
    # (The previous test `is not None and == ''` skipped the fallback for a missing attribute.)
    if not link:
        anchor = attribute.find('a')
        link = anchor.get('href') if anchor is not None else None
    equipe['link'] = link

    equipes_info.append(equipe)

print(json.dumps(equipes_info, indent=4, ensure_ascii=False))

https://fstt.ac.ma/Portail2023/equipes-de-recherche/
[
    {
        "nom": "Biochimie et Génétique Moléculaire (UAE/U07FST)",
        "link": "https://fstt.ac.ma/Portail2023/equipe-biochimie-et-genetique-moleculaire/"
    },
    {
        "nom": "Biotechnologies et Génie des Biomolécules (ERBGB)",
        "link": "https://fstt.ac.ma/Portail2023/equipe-biochimie-et-genetique-moleculaire/"
    },
    {
        "nom": "Couches Minces et Nanomatériaux (ERCMN)",
        "link": "https://fstt.ac.ma/Portail2023/equipe-georisques-georessources-g2r-copy/"
    },
    {
        "nom": "DATA & INTELLIGENT SYSTEMS (DIS)",
        "link": "https://fstt.ac.ma/Portail2023/equipe-geoinformation-amenagement-du-territoire-et-environnement-gate/"
    },
    {
        "nom": "Data Science, Artificial Intelligence and Smart Systems (E-DSAI2S)",
        "link": "https://fstt.ac.ma/Portail2023/equipe-geoinformation-amenagement-du-territoire-et-environnement-gate/"
    },
    {
        "nom": "Génie Chimique 

In [40]:
# Visit every team page collected in equipes_info and enrich its dict in place with
# director, speciality, contact details, research axes and members.
EMAIL_PATTERN = r"\S+@\S+\.\S+"
# Browser-like User-Agent sent with every request.
REQUEST_HEADERS = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/123.0.0.0 Safari/537.36 Edg/123.0.0.0'}
# Seconds to wait for the server; without it requests.get can block indefinitely.
REQUEST_TIMEOUT = 30

for equipe in equipes_info:
    url = equipe['link']
    print(url)
    response = requests.get(url, headers=REQUEST_HEADERS, timeout=REQUEST_TIMEOUT)
    # Fail on HTTP errors here instead of on an AttributeError while parsing an error page.
    response.raise_for_status()
    html = response.text
    soup = BeautifulSoup(html, 'html.parser')

    # Tab 1671: director, speciality, phone and e-mail, read from <p> elements 0..3.
    container = soup.find(name='div', attrs={'id': 'elementor-tab-content-1671'})
    temp_attributes = container.find_all(name='p')

    equipe['directeur'] = temp_attributes[0].text.strip()
    equipe['specialite'] = temp_attributes[1].text.replace('Spécialité : ', '').strip()
    equipe['telephone'] = temp_attributes[2].text.replace(u'\xa0', u'').strip()
    emails = re.findall(EMAIL_PATTERN, temp_attributes[3].text)
    # A paragraph without an address yields '' instead of an IndexError.
    equipe['email'] = emails[0] if emails else ''

    # Tab 1672: one list item per research axis.
    # (The unused find_all('a') and the repeated lookup of this tab were removed.)
    container = soup.find(name='div', attrs={'id': 'elementor-tab-content-1672'})
    equipe['Axe(s) de recherche'] = [
        item.text.replace(u'\xa0', u'') for item in container.find_all(name='li')
    ]

    # Tab 1676: members table; the first two cells are joined into 'nom', the third is the e-mail.
    container = soup.find(name='div', attrs={'id': 'elementor-tab-content-1676'})
    equipe['Membres'] = []
    for row in container.find(name='tbody').find_all(name='tr'):
        cells = row.find_all(name='td')
        if len(cells) == 0:
            print('Empty row')
            continue
        equipe['Membres'].append({
            'nom': cells[0].text + ' ' + cells[1].text,
            'email': cells[2].text,
        })

print(json.dumps(equipes_info, indent=4, ensure_ascii=False))

https://fstt.ac.ma/Portail2023/equipe-biochimie-et-genetique-moleculaire/
https://fstt.ac.ma/Portail2023/equipe-biochimie-et-genetique-moleculaire/
https://fstt.ac.ma/Portail2023/equipe-georisques-georessources-g2r-copy/
https://fstt.ac.ma/Portail2023/equipe-geoinformation-amenagement-du-territoire-et-environnement-gate/
https://fstt.ac.ma/Portail2023/equipe-geoinformation-amenagement-du-territoire-et-environnement-gate/
https://fstt.ac.ma/Portail2023/equipe-genie-chimique-et-valorisation-des-ressources-gcvr/
https://fstt.ac.ma/Portail2023/equipe-geoinformation-amenagement-du-territoire-et-environnement-gate-2/
https://fstt.ac.ma/Portail2023/equipe-geoinformation-amenagement-du-territoire-et-environnement-gate-2/
https://fstt.ac.ma/Portail2023/equipe-environnement-marin-et-risques-naturels-eremrn/
https://fstt.ac.ma/Portail2023/equipe-industrial-systems-engineering-and-energy-conversion-iseec/
https://fstt.ac.ma/Portail2023/equipe-industrial-systems-engineering-and-energy-conversion-is

In [45]:
# Bundle the scraped laboratories and teams under one object, show it, and save it as JSON.
recherche = {
    'laboratoires': labs_info,
    'equipes': equipes_info,
}

print(json.dumps(recherche, indent=4, ensure_ascii=False))

# open(..., 'w') does not create missing directories, so make sure data/ exists first.
os.makedirs('data', exist_ok=True)
with open('data/recherche.json', 'w', encoding='utf-8') as f:
    json.dump(recherche, f, indent=4, ensure_ascii=False)

{
    "laboratoires": [
        {
            "nom": "Computer Science And Smart Systems (C3S)",
            "link": "https://fstt.ac.ma/Portail2023/laboratoire-computer-science-and-smart-systems-c3s/",
            "directeur": "Pr.EL AMRANI CHAKER",
            "specialite": "Informatique",
            "telephone": "Tél : 05 39 39 39 / 54 / 55 GSM : 06 70 47 05 21",
            "email": "celamrani@uae.ac.ma",
            "Equipes de recherche": [
                {
                    "label": "Smart Systems & Emerging Technologies (2set)",
                    "link": "https://fstt.ac.ma/Portail2023/equipe-smart-systems-emerging-technologies/"
                },
                {
                    "label": "Data & Intelligent Systems (Dis)",
                    "link": "https://fstt.ac.ma/Portail2023/equipe-geoinformation-amenagement-du-territoire-et-environnement-gate/"
                },
                {
                    "label": "Data Science, Artificial Intelligence And Smart

# **Actualités**

In [32]:
# Collect title, date and link of every news post by walking the paginated category archive.
BASE_URL = 'https://fstt.ac.ma/Portail2023/category/articles/actualites/'
# Browser-like User-Agent sent with every request.
REQUEST_HEADERS = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/123.0.0.0 Safari/537.36 Edg/123.0.0.0'}
# Seconds to wait for the server; without it requests.get can block indefinitely.
REQUEST_TIMEOUT = 30
# Highest archive page requested (presumably the page count when this was written - confirm).
LAST_PAGE = 56

actualites_info = []

for page in range(1, LAST_PAGE + 1):
    # Page 1 is the category root; later pages live under .../page/<n>/.
    url = BASE_URL if page == 1 else f'{BASE_URL}page/{page}/'
    print(url)

    response = requests.get(url, headers=REQUEST_HEADERS, timeout=REQUEST_TIMEOUT)
    if page > 1 and response.status_code == 404:
        # The archive has fewer pages than LAST_PAGE: stop instead of parsing an error page.
        print(f'Page {page} does not exist, stopping.')
        break
    response.raise_for_status()
    html = response.text
    soup = BeautifulSoup(html, 'html.parser')

    container = soup.find('div', {'data-id': '742cae3'})
    if container is None:
        if page == 1:
            raise RuntimeError(f"Post listing container (data-id='742cae3') not found at {url}")
        print(f'No post listing on page {page}, stopping.')
        break

    temp_attributes = container.find_all('div', {'class': 'elementor-post__text'})

    for attribute in temp_attributes:
        actualite = {}
        actualite['titre'] = attribute.find('h3', {'class': 'elementor-post__title'}).text.replace('\n', '').replace('\t', '').strip()
        actualite['date'] = attribute.find('span', {'class': 'elementor-post-date'}).text.replace('\n', '').replace('\t', '').strip()
        actualite['link'] = attribute.find('a').get('href')

        actualites_info.append(actualite)

print(json.dumps(actualites_info, indent=4, ensure_ascii=False))

https://fstt.ac.ma/Portail2023/category/articles/actualites/
https://fstt.ac.ma/Portail2023/category/articles/actualites/page/2/
https://fstt.ac.ma/Portail2023/category/articles/actualites/page/3/
https://fstt.ac.ma/Portail2023/category/articles/actualites/page/4/
https://fstt.ac.ma/Portail2023/category/articles/actualites/page/5/
https://fstt.ac.ma/Portail2023/category/articles/actualites/page/6/
https://fstt.ac.ma/Portail2023/category/articles/actualites/page/7/
https://fstt.ac.ma/Portail2023/category/articles/actualites/page/8/
https://fstt.ac.ma/Portail2023/category/articles/actualites/page/9/
https://fstt.ac.ma/Portail2023/category/articles/actualites/page/10/
https://fstt.ac.ma/Portail2023/category/articles/actualites/page/11/
https://fstt.ac.ma/Portail2023/category/articles/actualites/page/12/
https://fstt.ac.ma/Portail2023/category/articles/actualites/page/13/
https://fstt.ac.ma/Portail2023/category/articles/actualites/page/14/
https://fstt.ac.ma/Portail2023/category/articles/a

In [33]:
# Download every news article and store its body text in actualite['contenu'],
# with links rewritten as markdown "[text](href) ".
# Browser-like User-Agent sent with every request.
REQUEST_HEADERS = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/123.0.0.0 Safari/537.36 Edg/123.0.0.0'}
# Seconds to wait for the server; without it requests.get can block indefinitely.
REQUEST_TIMEOUT = 30

if actualites_info:
    # Skipped for an empty list so the bar is never asked to divide by a zero total.
    printProgressBar(0, len(actualites_info), prefix = 'Progress:', suffix = 'Complete', length = 150)

# enumerate gives the position directly; list.index() rescanned the list on every
# iteration and returns the first match when two entries are equal.
for position, actualite in enumerate(actualites_info, start=1):
    url = actualite['link']
    # No raise_for_status here: a page without the expected container is recorded as empty below.
    html = requests.get(url, headers=REQUEST_HEADERS, timeout=REQUEST_TIMEOUT).text
    soup = BeautifulSoup(html, 'html.parser')

    container = soup.find('div', {'data-id': 'faf7450'})

    if container is None:
        actualite['contenu'] = ''
    else:
        # Find all 'a' tags
        a_tags = container.find_all('a')
        for a in a_tags:
            # Read href for THIS tag every time; it is None when the attribute is absent,
            # so a value left over from a previous tag can no longer be reused.
            href = a.get('href')
            text = a.text
            if href is None:
                # No target to link to: keep only the visible text.
                a.replace_with(text)
                continue
            if text == '':
                text = href

            # Replace the 'a' tag with the markdown equivalent
            a.replace_with(f'[{text}]({href}) ')

        content_div = container.find('div', {'class': 'elementor-widget-container'})
        if content_div is None:
            actualite['contenu'] = ''
        else:
            actualite['contenu'] = content_div.text.replace('\n', '').replace('\t', '').replace(u'\u2029', '').strip()

    # Advance the bar for every article, including the ones without content.
    printProgressBar(position, len(actualites_info), prefix = 'Progress:', suffix = 'Complete', length = 150)

print(json.dumps(actualites_info, indent=4, ensure_ascii=False))

Progress: |██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 100.0% Complete
[
    {
        "titre": "AVIS DE CONCOURS DE RECRUTEMENT DE NEUF (09) MAÎTRES DE CONFÉRENCES",
        "date": "mai 22, 2024",
        "link": "https://fstt.ac.ma/Portail2023/avis-de-concours-de-recrutement-de-trois-09-maitres-de-conferences/",
        "contenu": "[Avis de recrutement des Maîtres de Conférences](https://fstt.ac.ma/Portail2023/wp-content/uploads/2024/05/Avis-de-recrutement-des-Maitres-de-Conferences.pdf)"
    },
    {
        "titre": "APPEL À CANDIDATURES : BOURSES DE MASTER DE L’UNIVERSITÉ DE SALERNO",
        "date": "mai 22, 2024",
        "link": "https://fstt.ac.ma/Portail2023/appel-a-candidatures-bourses-de-master-de-luniversite-de-salerno/",
        "contenu": "L’appel à candidatures pour les bourses offertes par l’Université de Salerne (Salerno) en Italie pour l’année académique 2024/

In [34]:
# Save the scraped news as JSON.
# open(..., 'w') does not create missing directories, so make sure data/ exists first.
os.makedirs('data', exist_ok=True)
with open('data/actualitees.json', 'w', encoding='utf-8') as f:
    json.dump(actualites_info, f, indent=4, ensure_ascii=False)

In [36]:
# Debug aid: show every name currently bound in the notebook's global namespace.
print(dict.keys(globals()))

dict_keys(['__name__', '__doc__', '__package__', '__loader__', '__spec__', '__builtin__', '__builtins__', '_ih', '_oh', '_dh', 'In', 'Out', 'get_ipython', 'exit', 'quit', 'open', '_', '__', '___', '__vsc_ipynb_file__', '_i', '_ii', '_iii', '_i1', 'webdriver', 'Keys', 'By', 'WebDriverWait', 'EC', 'TimeoutException', 'BeautifulSoup', 'requests', 're', 'json', '_i2', 'BASE_URL', 'actualites_info', 'url', 'html', 'soup', 'container', 'temp_attributes', 'attribute', 'actualite', 'i', '_i3', '_i4', '_i5', 'printProgressBar', '_i6', '_i7', '_i8', '_i9', '_i10', 'test', 'a_tags', 'a', 'href', 'text', 'final_text', '_i11', '_i12', '_i13', '_i14', '_i15', '_i16', '_i17', '_i18', '_i19', '_i20', '_i21', '_i22', '_i23', '_i24', '_i25', '_i26', '_26', '_i27', '_i28', '_i29', '_i30', '_30', '_i31', 'f', '_i32', '_i33', '_i34', '_i35', '_i36'])


# **Clean memory**

In [37]:
# Remove the scraping intermediates from the kernel namespace.
# pop(name, None) is used instead of `del` so the cell is idempotent: a name that was
# never bound (e.g. a / href / text when no article contained a link) or a second run
# of this cell no longer raises NameError.
for _name in (
    'actualites_info', 'url', 'html', 'soup', 'container', 'temp_attributes',
    'actualite', 'a_tags', 'a', 'href', 'text', 'BASE_URL',
):
    globals().pop(_name, None)
del _name