# Tester les 2 modèles:

In [None]:
!pip install spacy-transformers

In [None]:
import pandas as pd
from matplotlib import colors as mcolors

import spacy
import spacy_transformers
from spacy import displacy

from IPython.display import display, HTML

In [None]:
# Silence every Python warning for the rest of the session. No `category` is
# passed, so the 'ignore' filter applies to all warnings, not only to
# DeprecationWarning.
import warnings
warnings.filterwarnings('ignore')

In [None]:
# Load the two spaCy pipelines compared in this notebook from disk.
# model_fact_env: pipeline stored under 'fact_env'; model_best: pipeline stored under 'model-best'.
model_fact_env=spacy.load('/path_to_/fact_env')
model_best=spacy.load('/path_to_/model-best')

In [None]:
# Sample English passage fed to both models in the cells below.
text ="""Natural resources are another critical environmental factor that can have significant impacts on the world. These resources include things like water, minerals, and energy sources such as fossil fuels and renewable energy. Human activities such as mining, logging, and fishing can have negative impacts on these resources, leading to depletion and degradation of natural ecosystems.
Pollution is yet another environmental factor that can have serious consequences for both humans and the environment. Air pollution can lead to respiratory problems, while water pollution can harm aquatic life and make water unsafe for human consumption. The improper disposal of waste and chemicals can also lead to soil contamination and the loss of fertile land.

Human activity is a significant environmental factor that can impact the planet in various ways. For example, deforestation can lead to the loss of biodiversity and contribute to climate change, while urbanization and land use changes can alter ecosystems and fragment habitats. Agriculture and livestock production can also have significant environmental impacts, including soil erosion, water depletion, and pollution.

In conclusion, environmental factors play a crucial role in shaping our world, and their impacts can be far-reaching and long-lasting. It is important that we work to protect and preserve our natural resources, reduce pollution, and mitigate the negative impacts of human activities on the environment, in order to ensure a sustainable future for generations to come."""


In [None]:
# Run the 'fact_env' pipeline on the sample text and render its entities inline.
var_env_fact = model_fact_env(text)
displacy.render(var_env_fact, style="ent", jupyter=True)

In [None]:
# Run the 'model-best' pipeline on the same text and render its entities inline.
var_de_best = model_best(text)
displacy.render(var_de_best, style="ent", jupyter=True)


Après avoir testé les deux modèles, il est clair que le premier modèle est capable de détecter facilement le texte contenant l'entité "ENVIRONMENTAL FACTOR" mais rien d'autre. En revanche, le deuxième modèle est capable de détecter toutes les entités pour lesquelles il a été entraîné sur le corpus NERO, mais il ne détecte pas très bien "ENVIRONMENTAL FACTOR" car cela n'était pas bien couvert dans NERO. C'est pourquoi nous avons décidé de combiner de manière appropriée les deux modèles pour obtenir des résultats plus complets et précis.

# Combiner les deux modèles:

In [None]:
# Blank English pipeline: used below only to turn the raw text into a Doc
# that displaCy can render.
empty = spacy.blank("en")

class NlpCombined:
    """Combine the entities of ``model_fact_env`` and ``model_best`` for one text.

    The class relies on notebook-level names: ``model_fact_env``,
    ``model_best``, ``empty`` (a blank spaCy pipeline), ``displacy``,
    ``display`` and ``HTML``.

    Args:
        text: The raw text to analyse and render.
    """

    def __init__(self, text):
        self.text = text

    @staticmethod
    def _merge_entities(primary, secondary):
        """Merge two sequences of ``(text, label)`` pairs into one dict.

        Every ``primary`` pair is kept as is. A ``secondary`` pair is skipped
        when its text is contained in one of the primary texts; otherwise it is
        added, with the label 'EnvironmentalFactor' renamed to
        'ENVIRONMENTAL FACTOR'.
        """
        merged = {}
        for ent_text, label in primary:
            merged[ent_text] = label

        # Compare against each primary text separately. Testing against the
        # space-joined keys would also match phrases that straddle two
        # unrelated entities (e.g. "erosion water" inside
        # "soil erosion water depletion").
        primary_texts = tuple(merged)

        for ent_text, label in secondary:
            if any(ent_text in known for known in primary_texts):
                continue
            if label == 'EnvironmentalFactor':
                merged[ent_text] = 'ENVIRONMENTAL FACTOR'
            else:
                merged[ent_text] = label
        return merged

    @staticmethod
    def _highlight(markup, entities):
        """Wrap every occurrence of an entity text in ``markup`` in a styled span.

        Only text between tags is rewritten, in a single pass with the longest
        entity texts tried first. Tag attributes and spans inserted for another
        entity are therefore never modified. Matching is plain substring
        matching, so an entity text is also highlighted inside longer words.

        Args:
            markup: HTML string whose text nodes are HTML-escaped.
            entities: Mapping of entity text to entity label.

        Returns:
            The HTML string with the entity occurrences highlighted.
        """
        import re
        from html import escape

        # The rendered text is HTML-escaped (&, <, >, "), so the keys must be
        # escaped the same way to be found. Empty keys are ignored.
        labels = {
            escape(key, quote=False).replace('"', '&quot;'): label
            for key, label in entities.items()
            if key
        }
        if not labels:
            return markup

        longest_first = sorted(labels, key=len, reverse=True)
        pattern = re.compile('|'.join(re.escape(key) for key in longest_first))

        def wrap(match):
            key = match.group(0)
            entity = labels[key]
            return f"<span style='background-color:#f0f0e9; border-radius: 10px; padding: 0.5em; margin: 0.2em; font-weight: bold; opacity: 1;'>{key}<span style='font-size: 0.8em; margin-left: 0.5em;'> {entity}</span></span>"

        # re.split with a capturing group alternates text (even indices) and
        # tags (odd indices); only the text pieces are rewritten.
        pieces = re.split(r'(<[^>]*>)', markup)
        pieces[::2] = [pattern.sub(wrap, piece) for piece in pieces[::2]]
        return ''.join(pieces)

    def entities_dict(self):
        """Return a dict mapping entity text to label for ``self.text``.

        Entities of ``model_fact_env`` take precedence; entities of
        ``model_best`` are added unless their text is contained in one of them.
        """
        var_env_fact = model_fact_env(self.text)
        var_de_best = model_best(self.text)

        return self._merge_entities(
            [(ent.text, ent.label_) for ent in var_env_fact.ents],
            [(ent2.text, ent2.label_) for ent2 in var_de_best.ents],
        )

    def display_entities(self):
        """Return the HTML of ``self.text`` with the combined entities highlighted."""
        doc = empty(self.text)
        # jupyter=False makes displaCy return the markup as a string. With
        # auto-detection it may display the HTML itself inside a notebook
        # instead of returning it.
        markup = displacy.render(doc, style="ent", jupyter=False)

        return self._highlight(markup, self.entities_dict())

    def display_text(self):
        """Display the highlighted text in the notebook output."""
        display(HTML(self.display_entities()))


In [None]:
combined = NlpCombined(text)
combined.display_text()

### Version Finale avec les couleurs

In [None]:
# Load only the 'entity' and 'semantic_class_entity' columns of result_entities.csv,
# then show how many distinct semantic classes the file contains.
df=pd.read_csv('/path_to_/result_entities.csv',usecols=['entity','semantic_class_entity'])
df['semantic_class_entity'].unique().size

102

In [None]:
# Keep the CSS4 named colours whose red, green and blue channels are all >= 0.1,
# then cap the list at 102 entries.
light_colors = [
    color
    for color in mcolors.CSS4_COLORS.values()
    if min(mcolors.to_rgb(color)) >= 0.1
]
light_colors = light_colors[:102]

# Semantic classes in order of first appearance; 'EnvironmentalFactor' is
# replaced by 'ENVIRONMENTAL FACTOR', moved to the end of the list.
entities = list(df['semantic_class_entity'].unique())
entities.remove('EnvironmentalFactor')
entities.append('ENVIRONMENTAL FACTOR')

# Pair each class with one colour, position by position.
colors = dict(zip(entities, light_colors))

In [None]:
# Blank English pipeline: used below only to turn the raw text into a Doc
# that displaCy can render.
empty = spacy.blank("en")

class NlpCombined:
    """Combine the entities of ``model_fact_env`` and ``model_best`` for one
    text and render them with one background colour per entity label.

    The class relies on notebook-level names: ``model_fact_env``,
    ``model_best``, ``empty`` (a blank spaCy pipeline), ``colors`` (mapping of
    label to CSS colour), ``displacy``, ``display`` and ``HTML``.

    Args:
        text: The raw text to analyse and render.
    """

    def __init__(self, text):
        self.text = text

    @staticmethod
    def _merge_entities(primary, secondary):
        """Merge two sequences of ``(text, label)`` pairs into one dict.

        Every ``primary`` pair is kept as is. A ``secondary`` pair is skipped
        when its text is contained in one of the primary texts; otherwise it is
        added, with the label 'EnvironmentalFactor' renamed to
        'ENVIRONMENTAL FACTOR'.
        """
        merged = {}
        for ent_text, label in primary:
            merged[ent_text] = label

        # Compare against each primary text separately. Testing against the
        # space-joined keys would also match phrases that straddle two
        # unrelated entities (e.g. "erosion water" inside
        # "soil erosion water depletion").
        primary_texts = tuple(merged)

        for ent_text, label in secondary:
            if any(ent_text in known for known in primary_texts):
                continue
            if label == 'EnvironmentalFactor':
                merged[ent_text] = 'ENVIRONMENTAL FACTOR'
            else:
                merged[ent_text] = label
        return merged

    @staticmethod
    def _highlight(markup, entities, palette):
        """Wrap every occurrence of an entity text in ``markup`` in a coloured span.

        Only text between tags is rewritten, in a single pass with the longest
        entity texts tried first. Tag attributes and spans inserted for another
        entity are therefore never modified. Matching is plain substring
        matching, so an entity text is also highlighted inside longer words.

        Args:
            markup: HTML string whose text nodes are HTML-escaped.
            entities: Mapping of entity text to entity label.
            palette: Mapping of entity label to CSS colour. A label missing
                from it falls back to '#f0f0e9' instead of raising KeyError.

        Returns:
            The HTML string with the entity occurrences highlighted.
        """
        import re
        from html import escape

        # The rendered text is HTML-escaped (&, <, >, "), so the keys must be
        # escaped the same way to be found. Empty keys are ignored.
        labels = {
            escape(key, quote=False).replace('"', '&quot;'): label
            for key, label in entities.items()
            if key
        }
        if not labels:
            return markup

        longest_first = sorted(labels, key=len, reverse=True)
        pattern = re.compile('|'.join(re.escape(key) for key in longest_first))

        def wrap(match):
            key = match.group(0)
            entity = labels[key]
            background = palette.get(entity, '#f0f0e9')
            return f"<span style='background-color:{background}; border-radius: 10px; padding: 0.5em; margin: 0.2em; font-weight: bold; opacity: 1;'>{key}<span style='font-size: 0.8em; margin-left: 0.5em;'> {entity}</span></span>"

        # re.split with a capturing group alternates text (even indices) and
        # tags (odd indices); only the text pieces are rewritten.
        pieces = re.split(r'(<[^>]*>)', markup)
        pieces[::2] = [pattern.sub(wrap, piece) for piece in pieces[::2]]
        return ''.join(pieces)

    def entities_dict(self):
        """Return a dict mapping entity text to label for ``self.text``.

        Entities of ``model_fact_env`` take precedence; entities of
        ``model_best`` are added unless their text is contained in one of them.
        """
        var_env_fact = model_fact_env(self.text)
        var_de_best = model_best(self.text)

        return self._merge_entities(
            [(ent.text, ent.label_) for ent in var_env_fact.ents],
            [(ent2.text, ent2.label_) for ent2 in var_de_best.ents],
        )

    def display_entities(self):
        """Return the HTML of ``self.text`` with the combined entities coloured by label."""
        doc = empty(self.text)
        # jupyter=False makes displaCy return the markup as a string. With
        # auto-detection it may display the HTML itself inside a notebook
        # instead of returning it.
        markup = displacy.render(doc, style="ent", jupyter=False)

        return self._highlight(markup, self.entities_dict(), colors)

    def display_text(self):
        """Display the highlighted text in the notebook output."""
        display(HTML(self.display_entities()))

In [None]:
combined = NlpCombined(text)
combined.display_text()

La solution proposée dans ce code consiste à combiner deux modèles de traitement du langage naturel (NLP) pour améliorer la détection des entités d'un texte. Le premier modèle, "**model_fact_env**", est spécifiquement conçu pour détecter les entités liées à l'environnement. Le deuxième modèle, "**model_best**", est entraîné sur un corpus plus large et peut détecter divers types d'entités. Cependant, il ne fonctionne pas bien pour détecter les entités liées à l'environnement. 

En combinant ces deux modèles, la classe "**NlpCombined**" permet de détecter toutes les entités du texte, y compris les entités liées à l'environnement.

 La méthode "**entities_dict**" crée un dictionnaire d'entités en utilisant les sorties des deux modèles et en combinant les résultats de manière appropriée. 
 
 La méthode "**display_entities**" construit le rendu HTML du texte à partir de la sortie de displacy, en surlignant chaque entité avec un code couleur ; elle renvoie ce HTML sans l'afficher.
 
Le texte est coloré selon la nature de l'entité détectée et les entités liées à l'environnement sont spécifiquement mises en évidence en utilisant une couleur différente et une étiquette spécifique. 

La méthode "**display_text**" affiche le texte avec les entités détectées et colorées de manière appropriée.

# Analyse des relations entre les entités de NERO

In [None]:
df

Unnamed: 0,entity,semantic_class_entity
0,Amph2,GP
1,Amph2,GP
2,Amph2,GP
3,Amph2,GP
4,Bcl-2,GP
...,...,...
224598,formaldehyde,NonProteinOrNucleicAcidChemical
224599,CO2,NonProteinOrNucleicAcidChemical
224600,T antigen,Protein
224601,radiation,EnvironmentalFactor


In [None]:
# For every semantic class, join all entity strings of that class into one
# space-separated text.
d = {
    label: ' '.join(df.loc[df['semantic_class_entity'] == label, 'entity'])
    for label in df['semantic_class_entity'].unique()
}

# One row per class: the class name goes in 'entity', the joined text in 'text'.
df_ent = pd.DataFrame({'text': d})
df_ent = df_ent.reset_index().rename(columns={'index': 'entity'})
df_ent

Unnamed: 0,entity,text
0,15466533,gw70_applenvironmicrob_70_10_5916_s_27
1,9371768,gw60_pnas_94_24_12875_s_111
2,AbstractConcept,transport intermediates lifestyle parental pra...
3,AminoAcid,mAb 47 mAb 47 mAb 38C2 HNS peptide Gln-Ser-Tyr...
4,AminoAcidPeptide,peptide N-glycosidase peptide stems of glycan ...
...,...,...
97,protein,Escherichia coli K-12 DNA polymerase I
98,thing,Millennium MLC
99,transcription of,siderophore transporter gene
100,unconjugated,bile pigments


In [None]:
from gensim.scripts.glove2word2vec import glove2word2vec
from gensim.models import KeyedVectors
import numpy as np
import os

In [None]:
# --- Global parameters ---------------------------------------------------
# Root folder, optional data sub-folder, and the GloVe vectors file name.
root_folder = '/path/'
data_folder_name = ''
glove_filename = 'glove.6B.100d.txt'

# Absolute path of the data directory and of the GloVe file inside it.
DATA_PATH = os.path.abspath(os.path.join(root_folder, data_folder_name))
glove_path = os.path.abspath(os.path.join(DATA_PATH, glove_filename))

# Train and test sets both live in the root data directory.
train_path = test_path = DATA_PATH

# Relevant column names.
TEXT_COLUMN, TARGET_COLUMN = 'text', 'target'

In [None]:
# Convert the GloVe text file to word2vec format (written next to the
# notebook as '<glove file name>.word2vec'), then load it as KeyedVectors.
word2vec_output_file = f'{glove_filename}.word2vec'
glove2word2vec(glove_path, word2vec_output_file)
model = KeyedVectors.load_word2vec_format(word2vec_output_file, binary=False)

In [None]:
# Average, for each row of df_ent, the GloVe vectors of the words of its text.
# Words missing from the GloVe vocabulary are skipped. When none of a row's
# words is known, np.mean of the empty list yields NaN.
# The loop variable is deliberately not called `text`, so the notebook-level
# sample `text` passed to NlpCombined is not overwritten.
embeddings = []
for class_text in df_ent['text']:
    words = class_text.split()
    embedding = np.mean([model.get_vector(w) for w in words if w in model.key_to_index], axis=0)
    embeddings.append(embedding)

df_ent_embeddings = pd.DataFrame(embeddings)
df_ent_embeddings['entity'] = df_ent['entity']

In [None]:
# Name the two columns of df_ent_embeddings 'numb' (embedding) and 'entity'.
# NOTE(review): this assignment only succeeds if the frame has exactly two
# columns at this point — confirm the shape produced by the previous cell.
df_ent_embeddings.columns=['numb','entity']

In [None]:
df_ent[df_ent_embeddings['numb'].isna()]

Unnamed: 0,entity,text
0,15466533,gw70_applenvironmicrob_70_10_5916_s_27
1,9371768,gw60_pnas_94_24_12875_s_111
24,G,TNF-alpha dnaA
28,GeographicLocation,United States
83,dr,disopyramide
90,o,SHR
98,thing,Millennium MLC


In [None]:
# Drop those rows: keep only the rows of df_ent whose 'numb' value in
# df_ent_embeddings is not NaN.
df_ent=df_ent[~df_ent_embeddings['numb'].isna()]

In [None]:
# Recompute the averaged GloVe embedding of every remaining row of df_ent.
# Words missing from the GloVe vocabulary are skipped.
# The loop variable is deliberately not called `text`, so the notebook-level
# sample `text` passed to NlpCombined is not overwritten.
embeddings = []
for class_text in df_ent['text']:
    words = class_text.split()
    embedding = np.mean([model.get_vector(w) for w in words if w in model.key_to_index], axis=0)
    embeddings.append(embedding)

# One row per semantic class, indexed by the class name.
df_ent_embeddings = pd.DataFrame(embeddings)
df_ent_embeddings.index = list(df_ent['entity'])
df_ent_embeddings

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,90,91,92,93,94,95,96,97,98,99
AbstractConcept,-0.091864,0.159307,-0.062938,-0.007006,-0.085520,0.029818,-0.057861,-0.180446,-0.038896,0.126173,...,-0.209241,-0.087427,-0.027975,0.158962,0.074319,0.024703,-0.023114,-0.692475,0.427399,-0.031513
AminoAcid,-0.256504,0.057629,0.044181,-0.217711,0.047636,0.017638,-0.058559,0.243541,-0.338652,0.319878,...,-0.576637,-0.101858,-0.652531,0.008350,-0.054041,-0.160458,0.008326,-0.740322,0.255387,0.034468
AminoAcidPeptide,-0.027397,0.204055,-0.078496,-0.112102,0.045049,0.044676,-0.127014,0.014585,-0.322520,0.080941,...,-0.431000,-0.034106,-0.453061,-0.057560,-0.099685,-0.193987,0.015831,-0.782654,0.232140,-0.073517
Aminoacid,-0.949250,0.086356,-0.351800,-1.158200,0.636980,-0.290160,-0.351210,0.091724,0.322730,0.892570,...,0.120730,0.237900,-0.523050,-0.145590,0.205660,-0.296880,0.276710,-0.767990,-0.474130,0.910230
AnatomicalPart,-0.209551,0.177351,0.128925,-0.019666,-0.018483,0.004032,-0.079002,0.234134,0.161927,0.092760,...,-0.136413,0.008804,-0.145572,0.171670,0.050366,0.095915,0.025975,-0.452918,0.459672,-0.151518
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
process,-0.022251,-0.321846,-0.155048,0.078421,0.152749,-0.207378,0.019768,0.217840,0.041364,0.209590,...,-0.518086,0.001662,-0.020433,0.249394,0.521303,-0.339350,0.487638,-0.741018,0.533153,-0.233333
protein,-0.077745,0.008905,-0.439810,0.043380,-1.106305,-0.020698,0.681755,-0.647475,0.016725,-0.119352,...,-0.286000,0.525810,-0.414170,0.053515,0.917945,-0.732505,0.892735,-1.105295,-0.285827,0.069000
transcription of,0.351790,0.023627,-0.216170,-0.179273,-0.122528,-0.339547,0.583263,0.025153,0.205983,-0.106778,...,-0.116122,0.027382,-0.215850,0.237627,0.525497,-0.187607,0.374968,-0.622321,0.064920,-0.184443
unconjugated,-0.046609,-0.253045,-0.471990,-0.930600,0.885735,-0.368425,0.123059,-0.354470,0.520635,0.005636,...,-0.393445,-0.818990,-0.515360,0.054515,-0.264275,-0.037324,-0.993480,-0.194325,0.028385,-0.016775


In [None]:
df_ent_embeddings.T.corr()

Unnamed: 0,AbstractConcept,AminoAcid,AminoAcidPeptide,Aminoacid,AnatomicalPart,Atom,BiologicalProcess,Body part,BodyPart,Cell,...,organism,organism part,p,person,physical phenomenon,process,protein,transcription of,unconjugated,warfarin
AbstractConcept,1.000000,0.568664,0.649949,0.027518,0.811674,0.676890,0.940036,0.683646,0.779029,0.752667,...,0.269957,0.510724,0.490635,0.694352,0.325415,0.685615,0.298040,0.407974,0.268817,0.519734
AminoAcid,0.568664,1.000000,0.895330,0.468261,0.601538,0.705982,0.596485,0.429558,0.554566,0.553451,...,0.314698,0.356321,0.394759,0.281834,0.390439,0.600263,0.520581,0.546595,0.468642,0.295997
AminoAcidPeptide,0.649949,0.895330,1.000000,0.254237,0.678555,0.734828,0.697868,0.462239,0.621964,0.668816,...,0.260495,0.482486,0.403374,0.390659,0.329319,0.655597,0.488842,0.546131,0.384533,0.395239
Aminoacid,0.027518,0.468261,0.254237,1.000000,0.062245,0.124679,0.005668,0.025946,0.074997,0.010351,...,-0.005230,-0.042259,0.099703,-0.285081,0.385430,0.116884,0.197330,0.224100,0.375023,-0.017569
AnatomicalPart,0.811674,0.601538,0.678555,0.062245,1.000000,0.704085,0.866032,0.701412,0.947760,0.802310,...,0.422838,0.595147,0.495054,0.607182,0.317209,0.657977,0.284364,0.362778,0.354358,0.499397
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
process,0.685615,0.600263,0.655597,0.116884,0.657977,0.520864,0.818303,0.440144,0.621314,0.775081,...,0.209315,0.491617,0.499336,0.367559,0.314892,1.000000,0.409922,0.544219,0.301710,0.710124
protein,0.298040,0.520581,0.488842,0.197330,0.284364,0.285342,0.328015,0.219589,0.322776,0.382309,...,0.169786,0.283350,0.193654,0.134173,0.293009,0.409922,1.000000,0.506806,0.182314,0.235415
transcription of,0.407974,0.546595,0.546131,0.224100,0.362778,0.335432,0.439452,0.300259,0.379600,0.476649,...,0.146852,0.330460,0.213062,0.078006,0.417466,0.544219,0.506806,1.000000,0.289188,0.435911
unconjugated,0.268817,0.468642,0.384533,0.375023,0.354358,0.346634,0.261956,0.454107,0.416634,0.416669,...,0.270271,0.333935,0.316487,-0.028731,0.271343,0.301710,0.182314,0.289188,1.000000,0.238118


In [None]:
# Pairwise correlation between the class embeddings (one column per class
# after the transpose).
corr_matrix = df_ent_embeddings.T.corr()

# Keep only off-diagonal coefficients with |r| > 0.95. The diagonal is removed
# with an explicit boolean mask instead of writing into `corr_matrix.values`,
# which is not guaranteed to be a writable view of the frame. Masking it before
# the dropna also lets classes without any strong correlation be dropped.
off_diagonal = ~np.eye(len(corr_matrix), dtype=bool)
corr_matrix = corr_matrix.where((corr_matrix.abs() > 0.95) & off_diagonal)
corr_matrix = corr_matrix.dropna(how='all', axis=0).dropna(how='all', axis=1)

# Long format: one line per (class, class) pair above the threshold. Each pair
# appears in both orders because the matrix is symmetric.
stacked_corr = corr_matrix.stack().dropna()
print(stacked_corr.to_string())

BiologicalProcess                  MolecularProcess                     0.960329
                                   Process                              0.964594
BodyPart                           OrganismPart                         0.969355
Chemical                           Molecule                             0.950310
                                   NonProteinOrNucleicAcidChemical      0.974331
                                   SmallMolecule                        0.980168
Duration                           Time                                 0.987412
                                   TimePoint                            0.971931
ExperimentalFactor                 LaboratoryExperimentalFactor         0.985352
                                   NamedEntity                          0.951023
GeographicalLocation               UnproperNamedGeographicalLocation    0.975085
LaboratoryExperimentalFactor       ExperimentalFactor                   0.985352
MedicalDevice               

Nous sommes heureux d'avoir travaillé sur ce projet passionnant de data science, et nous aimerions le terminer par une citation qui nous semble de plus en plus vraie avec le temps :

In [3]:
#!pip install pyfiglet
from pyfiglet import Figlet

# Render the closing quote as ASCII art with the 'big' font and print it.
quote = "Without data you're just another person with an opinion !"
f = Figlet(font='big')
ascii_art = f.renderText(quote)

print(ascii_art)

__          ___ _   _                 _         _       _        
\ \        / (_) | | |               | |       | |     | |       
 \ \  /\  / / _| |_| |__   ___  _   _| |_    __| | __ _| |_ __ _ 
  \ \/  \/ / | | __| '_ \ / _ \| | | | __|  / _` |/ _` | __/ _` |
   \  /\  /  | | |_| | | | (_) | |_| | |_  | (_| | (_| | || (_| |
    \/  \/   |_|\__|_| |_|\___/ \__,_|\__|  \__,_|\__,_|\__\__,_|
                                                                 
                                                                 
                   _              _           _   
                  ( )            (_)         | |  
 _   _  ___  _   _|/ _ __ ___     _ _   _ ___| |_ 
| | | |/ _ \| | | | | '__/ _ \   | | | | / __| __|
| |_| | (_) | |_| | | | |  __/   | | |_| \__ \ |_ 
 \__, |\___/ \__,_| |_|  \___|   | |\__,_|___/\__|
  __/ |                         _/ |              
 |___/                         |__/               
                   _   _                                        