# DicoCo

Alban Zurkowski

In [1]:
# import necessaire à la création des widgets
from IPython.display import display, clear_output, HTML, Markdown, Audio
import ipywidgets as widgets
from requests_html import HTMLSession
from bs4 import BeautifulSoup
import pandas as pd
import os
import re
import matplotlib.pyplot as plt
#import librosa
#import librosa.display
import numpy as np

os.chdir("/home/jovyan/cnam-athon-pronunciation-main/data/dictionary")

In [2]:
buchanan = pd.read_csv("Buchanan_enrichi.csv", sep=",", encoding="utf-8")

In [3]:
#récupérer les URL des fichiers MP3
def getMP3URL(mot):
    session = HTMLSession()
    response = session.get("https://dictionary.cambridge.org/fr/dictionnaire/anglais/"+mot)
    html_doc = response.content
    soup = BeautifulSoup(html_doc, 'html.parser')
    
    #on prend que les fichiers mp3
    mp3all = soup.findAll(type="audio/mpeg")
    
    uk_pron = "no audio"
    us_pron = "no audio"
    #on ne prend que le premier audio américain et anglais
    uk_found = False
    us_found = False
    for m in mp3all:
        if "uk_pron" in m.get_attribute_list('src')[0] and uk_found == False:
            uk_pron = "https://dictionary.cambridge.org"+m.get_attribute_list('src')[0]
            uk_found = True
        elif "us_pron" in m.get_attribute_list('src')[0] and us_found == False:
            us_pron = "https://dictionary.cambridge.org"+m.get_attribute_list('src')[0]
            us_found = True

    return uk_pron, us_pron

#récupérer un IPA du mot sur le site Cambridge et son "part of speech"
def getIPAPOS(mot):
    session = HTMLSession()
    response = session.get("https://dictionary.cambridge.org/fr/dictionnaire/anglais/"+mot)
    html_doc = response.content
    soup = BeautifulSoup(html_doc, 'html.parser')
    #on récupère la prononciation IPA et on ne renverra que le premier élément
    IPA = soup.findAll("span", {"class": "ipa dipa lpr-2 lpl-1"})
    typeMot = soup.findAll ("span", {"class": "pos dpos"})
    
    POS = []
    ipacontent = []
    
    for motIPA in IPA:
        ipacontent.append(motIPA.text)
        
    for categoryMot in typeMot:
        POS.append(categoryMot.text)
        
    if ipacontent and POS:
        return ipacontent[0], POS[0]
    else:
        return "no info for that word", "no info for that word"

In [4]:
# definition du widget permettant de taper un texte 
textTarget = widgets.Text(
       value="",
       description='Entrer un mot',
       layout={'width': '200px'} )

# definition du widget permettant de taper un texte pour l'ipa
ipaTarget = widgets.Text(
       value="",
       description='IPA',
       layout={'width': '200px'} )

# sélection d'une plage de valeurs
rangeSyllMin = 1
rangeSyllMax = 27
optionsNbSyll = range(1, 27)
wid_nbsyll = widgets.SelectionRangeSlider(
    options=optionsNbSyll,
    index=(min(optionsNbSyll)-1,max(optionsNbSyll)-1),
    description='Nb syllabes',
    disabled=False
)

nbResultMax = 20
wid_nbResult = widgets.IntSlider(
    value=nbResultMax,
    min=1,
    max=nbResultMax,
    step=1,
    description='Nb résultats max.',
    disabled=False,
    continuous_update=False,
    orientation='horizontal',
    readout=True,
    readout_format='d'
)

In [8]:
# définition du widget button permettant de lancer l'extraction
button = widgets.Button(description='Démarrer')

# définition du widget permettant l'affichage d'une sortie
cambridge = widgets.Output()
longman = widgets.Output()
linguee = widgets.Output()
tableau = widgets.Output()
cam_audio_uk = widgets.Output()
cam_audio_us = widgets.Output()

def removeRegex(inputString):
    outputstring = ""
    for c in inputString:
        if c.isalpha():
            outputstring = outputstring + c
    return outputstring

# spécifier les actions à réaliser lors de l'appui sur le bouton
def on_button_clicked(_):
    
    path = r"/home/jovyan/cnam-athon-pronunciation-main/data/audio/"
    
    # récupérer le texte saisi par l'utilisateur
    if len(textTarget.value) != 0:
        colToUse = 'transcription'
        targetWord = removeRegex(textTarget.value)
        ipaTarget.value
        ipaTarget.value = getIPAPOS(targetWord)[0]
        
    else:
        colToUse = 'ipa'
        targetWord = ipaTarget.value
        
    
    with cambridge:
        clear_output()
        urlCambridge = "https://dictionary.cambridge.org/fr/dictionnaire/anglais/"
        urlTarget = urlCambridge+targetWord
        display(HTML('<a href="'+urlTarget+'" target="_blank" >'+'Cambridge : '+targetWord+'</a>'))

    with longman:
        clear_output()        
        urlLongman = "https://www.ldoceonline.com/dictionary/"
        urlTarget = urlLongman+targetWord
        display(HTML('<a href="'+urlTarget+'" target="_blank" >'+'longman : '+targetWord+'</a>'))

    with linguee:
        clear_output()
        urlLinguee = "https://www.linguee.fr/anglais-francais/traduction/"
        urlTarget = urlLinguee+targetWord+'.html'
        display(HTML('<a href="'+urlTarget+'" target="_blank" >'+'linguee : '+targetWord+'</a>'))
        
        
    #tableau filtré
    buchananFiltered = buchanan[buchanan[colToUse].str.contains(textTarget.value, na=False, regex=True)]
    buchananFiltered = buchananFiltered[buchananFiltered['sizePron'].between(wid_nbsyll.value[0], wid_nbsyll.value[1], inclusive=True)]


    with tableau:
        clear_output()
        display(buchananFiltered.head(n=wid_nbResult.value))
        
    for wordInfo in buchananFiltered.head(n=wid_nbResult.value).itertuples():
        #tout mettre sur une ligne -- GET IPA / POS
        row = [wordInfo.transcription,
               getIPAPOS(wordInfo.transcription)[0],
               getIPAPOS(wordInfo.transcription)[1],
               "",
               ""]

        #INFOS
        items = [widgets.Label(row[0].upper()+":"), widgets.Label(row[2]), widgets.Label(" - "+row[1])]
        display(widgets.HBox(items))

        #chargement des AUDIOS
        if getMP3URL(wordInfo.transcription)[0] != "no audio":
            row[3] = getMP3URL(wordInfo.transcription)[1]
            audio_uk = True
            #DOWNLOAD MP3
            session_uk = HTMLSession()
            audio = session_uk.get(row[3])
            with open(path+wordInfo.transcription+'_uk.mp3', 'wb') as f:
                f.write(audio.content)
        if getMP3URL(wordInfo.transcription)[1] != "no audio":
            row[4] = getMP3URL(wordInfo.transcription)[1]
            audio_us = True
            #DOWNLOAD MP3
            session_us = HTMLSession()
            audio = session_us.get(row[4])
            with open(path+wordInfo.transcription+'_us.mp3', 'wb') as f:
                f.write(audio.content)

        #AUDIO - player
        if audio_uk and audio_us:
            audio_widgets = []
            for i in range(2):
                if i == 0:
                    #widget audio
                    outAudio = widgets.Output()
                    with outAudio:
                        clear_output()
                        display(Audio(url=row[3]))

                    #widget spectrogram
     #               y, sr = librosa.load(path+wordInfo.transcription+'_uk.mp3')
      #              D = librosa.stft(y)  # STFT of y
       #             S_db = librosa.amplitude_to_db(np.abs(D), ref=np.max)
#
 #                   outSpec = widgets.Output()
  #                  with outSpec:
   #                     fig, ax = plt.subplots()
    #                    img = librosa.display.specshow(S_db, x_axis='time', y_axis='linear', ax=ax)
     #                   ax.set(title='UK spectrogram')
      #                  fig.colorbar(img, ax=ax, format="%+2.f dB")
       #                 plt.show(fig)

                    #audio_widgets.append(widgets.VBox([widgets.Label("UK"), outAudio, outSpec]))
                    display(widgets.VBox([widgets.Label("UK"), outAudio])) 

                if i == 1:
                    #widget audio
                    outAudio = widgets.Output()
                    with outAudio:
                        clear_output()
                        display(Audio(url=row[4]))

                    #widget spectrogram
     #               y, sr = librosa.load(path+wordInfo.transcription+'_us.mp3')
      #              D = librosa.stft(y)  # STFT of y
       #             S_db = librosa.amplitude_to_db(np.abs(D), ref=np.max)
#
 #                   outSpec = widgets.Output()
  #                 with outSpec:
   #                   fig, ax = plt.subplots()
    #                    img = librosa.display.specshow(S_db, x_axis='time', y_axis='linear', ax=ax)
     #                  ax.set(title='US spectrogram')
      #                 fig.colorbar(img, ax=ax, format="%+2.f dB")
       #                 plt.show(fig)

                    #audio_widgets.append(widgets.VBox([widgets.Label("US"), outAudio, outSpec]))
                    display(widgets.VBox([widgets.Label("US"), outAudio])) 

            display(widgets.HBox(audio_widgets))

        elif audio_uk:
            #widget audio
            outAudio = widgets.Output()
            with outAudio:
                clear_output()
                display(Audio(url=row[3]))

            #widget spectrogram
  #          y, sr = librosa.load(path+wordInfo.transcription+'_uk.mp3')
   #         D = librosa.stft(y)  # STFT of y
    #        S_db = librosa.amplitude_to_db(np.abs(D), ref=np.max)
#

 #           outSpec = widgets.Output()
  #          with outSpec:
   #             fig, ax = plt.subplots()
    #            img = librosa.display.specshow(S_db, x_axis='time', y_axis='linear', ax=ax)
     #           ax.set(title='UK spectrogram')
      #          fig.colorbar(img, ax=ax, format="%+2.f dB")
       #         plt.show(fig)

            #display(widgets.VBox([widgets.Label("UK"), outAudio, outSpec]))   
            display(widgets.VBox([widgets.Label("UK"), outAudio])) 


        else:
            #widget audio
            out = widgets.Output()
            with out:
                clear_output()
                display(Audio(url=row[4]))

            #widget spectrogram
  #          y, sr = librosa.load(path+wordInfo.transcription+'_us.mp3')
  #          D = librosa.stft(y)  # STFT of y
  #          S_db = librosa.amplitude_to_db(np.abs(D), ref=np.max)


 #           outSpec = widgets.Output()
 #           with outSpec:
 #               fig, ax = plt.subplots()
 #               img = librosa.display.specshow(S_db, x_axis='time', y_axis='linear', ax=ax)
 #               ax.set(title='US spectrogram')
 #               fig.colorbar(img, ax=ax, format="%+2.f dB")
 #               plt.show(fig)

            #display(widgets.VBox([widgets.Label("US"), outAudio, outSpec])) 
            display(widgets.VBox([widgets.Label("US"), outAudio])) 
        
button.on_click(on_button_clicked)

les spectrogrammes n'ont pas l'air de lire de mp3

In [9]:
# définition du widget button permettant de lancer l'extraction
button = widgets.Button(description='Démarrer')

# définition du widget permettant l'affichage d'une sortie
out1 = widgets.Output()
out2 = widgets.Output()
button.on_click(on_button_clicked)

In [10]:
# les widgets de filtre input
vbFiltres = widgets.HBox([textTarget, ipaTarget, wid_nbsyll, wid_nbResult, button])
# les widgets d'output
vbOuts = widgets.VBox([cambridge, longman, linguee, tableau])

hbAllWidgets = widgets.VBox([vbFiltres, vbOuts])
hbAllWidgets

VBox(children=(HBox(children=(Text(value='flower', description='Entrer un mot', layout=Layout(width='200px')),…

HBox(children=(Label(value='FLOWER:'), Label(value='noun'), Label(value=' - flaʊər')))

VBox(children=(Label(value='UK'), Output()))

VBox(children=(Label(value='US'), Output()))

HBox()

HBox(children=(Label(value='GILLIFLOWER:'), Label(value='no info for that word'), Label(value=' - no info for …

VBox(children=(Label(value='UK'), Output()))

VBox(children=(Label(value='US'), Output()))

HBox()

HBox(children=(Label(value='SUNFLOWER:'), Label(value='noun'), Label(value=' - ˈsʌnˌflaʊər')))

VBox(children=(Label(value='UK'), Output()))

VBox(children=(Label(value='US'), Output()))

HBox()

HBox(children=(Label(value='ACCESS:'), Label(value='noun'), Label(value=' - ˈæk.ses')))

VBox(children=(Label(value='UK'), Output()))

VBox(children=(Label(value='US'), Output()))

HBox()

HBox(children=(Label(value='ACCESSIBLE:'), Label(value='adjective'), Label(value=' - əkˈses.ə.bəl')))

VBox(children=(Label(value='UK'), Output()))

VBox(children=(Label(value='US'), Output()))

HBox()

HBox(children=(Label(value='ACCESSION:'), Label(value='noun'), Label(value=' - əkˈseʃ.ən')))

VBox(children=(Label(value='UK'), Output()))

VBox(children=(Label(value='US'), Output()))

HBox()