# Notebook 1: Preprocesamiento de los datos




In [None]:
# Librerías

import pandas as pd
import re
import numpy as np

#from itertools import chain, groupby
from itertools import groupby
from bs4 import BeautifulSoup
from collections import Counter

import nltk
nltk.download('stopwords')
from nltk.corpus import stopwords

import spacy

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


In [None]:
# Mount Google Drive at /content/drive so the files read below are reachable
# (this cell relies on the google.colab package).
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
# Load the raw dialogue dataset; the fields of this CSV are separated by semicolons.
test = pd.read_csv('/content/drive/MyDrive/TFM_Diabetes/data/DM Dialogs.csv', sep = ";")

In [None]:
# Preview the first five rows of the loaded data.
test.head(5)

Unnamed: 0,text
0,"Hey, how are you?"
1,Hi! I'm feeling great today.
2,What did you do today?
3,I played a baskeball match this morning.
4,How was it? Did you win?


In [None]:
# Dictionary used to expand English contractions into their full forms.
# Keys are the contractions, values the expanded text; first-person entries
# are listed both capitalised ("I'm") and lower-cased ("i'm").

contraction_mapping_upper = {"ain't": "is not","can't": "cannot", "'cause": "because", "could've": "could have", 

                           "he'd": "he would","he'll": "he will", "he's": "he is", "how'd": "how did", "how'd'y": "how do you", "how'll": "how will", "how's": "how is",

                           "I'd": "I would", "I'd've": "I would have", "I'll": "I will", "I'll've": "I will have","I'm": "I am", "I've": "I have", "i'd": "i would",

                           "i'd've": "i would have", "i'll": "i will",  "i'll've": "i will have","i'm": "i am", "i've": "i have", "it'd": "it would",

                           "it'd've": "it would have", "it'll": "it will", "it'll've": "it will have", "let's": "let us", "ma'am": "madam",

                           "mayn't": "may not", "might've": "might have", "mightn't've": "might not have", "must've": "must have",

                           "mustn't've": "must not have", "needn't've": "need not have","o'clock": "of the clock",

                           "oughtn't": "ought not", "oughtn't've": "ought not have", "sha'n't": "shall not", "shan't've": "shall not have",

                           "she'd": "she would", "she'd've": "she would have", "she'll": "she will", "she'll've": "she will have",

                           "shouldn't've": "should not have", "so've": "so have","so's": "so as",

                           "this's": "this is","that'd": "that would", "that'd've": "that would have", "that's": "that is", "there'd": "there would",

                           "there'd've": "there would have", "there's": "there is", "here's": "here is","they'd": "they would", "they'd've": "they would have",

                           "they'll": "they will", "they'll've": "they will have", "they're": "they are", "they've": "they have", "to've": "to have",

                           "we'd": "we would", "we'd've": "we would have", "we'll": "we will", "we'll've": "we will have", "we're": "we are",

                           "we've": "we have", "what'll": "what will", "what'll've": "what will have", "what're": "what are",

                           "what's": "what is", "what've": "what have", "when's": "when is", "when've": "when have", "where'd": "where did", "where's": "where is",

                           "where've": "where have", "who'll": "who will", "who'll've": "who will have", "who's": "who is", "who've": "who have",

                           "why's": "why is", "why've": "why have", "will've": "will have", "won't've": "will not have",

                           "would've": "would have", "wouldn't've": "would not have", "y'all": "you all",

                           "y'all'd": "you all would","y'all'd've": "you all would have","y'all're": "you all are","y'all've": "you all have",

                           "you'd've": "you would have", "you'll've": "you will have"}

# Lower-case every key of the contraction dictionary (values are kept as written;
# when two keys collapse to the same lower-cased form, the later entry wins).
contraction_mapping = {
    contraction.lower(): expansion
    for contraction, expansion in contraction_mapping_upper.items()
}

# Stop words: words that carry little meaning on their own (articles, pronouns, prepositions)
stop_words = set(stopwords.words('english'))

In [None]:
def clean_conversations(df, remove_stopwords):
  """Normalise a single utterance and tag its greetings and farewells.

  The text is lower-cased, every character other than letters, digits,
  spaces, '.' and '?' is replaced by a space, and a space is inserted in
  front of each '.' and '?'. Greeting phrases are then replaced by "<SOS>"
  and farewell phrases by "<EOS>".

  Contractions are not expanded: apostrophes are turned into spaces by the
  character filter, so "i'm" becomes "i m".

  Args:
    df: The utterance to clean, as a string (the parameter name is kept for
      backward compatibility).
    remove_stopwords: When true, tokens present in the module-level
      `stop_words` set are dropped from the result.

  Returns:
    The remaining tokens joined by single spaces.
  """
  clean = df.lower()  # everything in lower case
  clean = re.sub("[^a-zA-Z 0-9 . ?]", " ", clean)  # drop unusual symbols, keep '.' and '?'
  clean = clean.replace("?", " ?")  # put a space in front of question marks
  clean = clean.replace(".", " .")  # put a space in front of full stops

  # Every phrase below ends with a space; padding the text lets a phrase that
  # closes the utterance (e.g. a bare "bye") match as well. The padding is
  # discarded by the final split/join.
  clean = clean + " "

  # Apostrophes have already been replaced by spaces, so "what's up ?" has to
  # be listed in its cleaned form to be able to match.
  start_words = ["hello ", "hi ", "good morning ", "good afternoon ", "good evening ",
                 "good night ", "hey ", "yo ", "what s up ? "]
  # NOTE: "good night " is also a start word and greetings are replaced first,
  # so the entry below never matches.
  end_words = ["goodbye ", "bye bye ", "bye ", "see you soon", "you later ",
               "have a nice day ", "take care ", "good night "]

  # A phrase only counts when it is not glued to a preceding letter or digit;
  # otherwise "hey " would also fire inside "they ".
  not_inside_word = r"(?<![a-z0-9])"

  # Replace greetings with <SOS>
  for hello_word in start_words:
    clean = re.sub(not_inside_word + re.escape(hello_word), "<SOS> ", clean)

  # Replace farewells with <EOS>
  for bye_word in end_words:
    clean = re.sub(not_inside_word + re.escape(bye_word), "<EOS> ", clean)

  # Optional: drop the stop words while tokenising
  if remove_stopwords:
    tokens = [w for w in clean.split() if w not in stop_words]
  else:
    tokens = clean.split()

  return " ".join(tokens).strip()

In [None]:
def join_conversations(df):
  """Group consecutive utterances into conversations.

  Utterances are accumulated in order; an utterance containing "<EOS>"
  closes the current conversation, which is stored as one space-joined
  string. Consecutive utterances that each contain "<EOS>" therefore yield
  separate conversations.

  Args:
    df: Iterable of cleaned utterance strings, in dialogue order.

  Returns:
    A list with one string per conversation. Utterances left over after the
    last "<EOS>" are returned as a final conversation instead of running
    past the end of the input.
  """
  conjunto = []
  conversation = []

  # Iterate over the argument itself, so the function does not depend on the
  # length of any variable defined outside of it.
  for utterance in df:
    conversation.append(utterance)
    if "<EOS>" in utterance:
      conjunto.append(" ".join(conversation).strip())
      conversation = []

  # Keep a trailing conversation that was never closed by "<EOS>"
  if conversation:
    conjunto.append(" ".join(conversation).strip())

  return conjunto

In [None]:
# Clean every utterance keeping the stop words, then group the utterances into dialogues
cleaned_text = [clean_conversations(t, remove_stopwords = False) for t in test['text']]
clean_dialogues = join_conversations(cleaned_text)

# Same pipeline with the stop words removed
cleaned_textSW = [clean_conversations(t, remove_stopwords = True) for t in test['text']]
clean_dialoguesSW = join_conversations(cleaned_textSW)

In [None]:
# Display the raw 'text' column for comparison with the cleaned dialogues.
test['text']

0                                      Hey, how are you?
1                           Hi! I'm feeling great today.
2                                 What did you do today?
3               I played a baskeball match this morning.
4                               How was it? Did you win?
                             ...                        
302    I should have, but I don't really remember if ...
303    That could explain why you woke up high. You s...
304    Yes, I will. At least this will make my hangov...
305              Definetely. I hope you get better soon.
306                           Thanks, talk to you later.
Name: text, Length: 307, dtype: object

In [None]:
# Inspect the first three cleaned conversations
for number in range(1, 4):
    print("Conversación #", number)
    print(clean_dialogues[number - 1])
    print()

Conversación # 1
<SOS> how are you ? <SOS> i m feeling great today . what did you do today ? i played a baskeball match this morning . how was it ? did you win ? yes and i scored 40 points . wow you did an amazing match did the sport afect your glucose ? yes my sugar level was low after lunch . did you inject too much insulin ? yes i didn t take into account the exercise . next time think about the exercise when you calculate the insulin dose . yes i will . <EOS> .

Conversación # 2
<EOS> .

Conversación # 3
<SOS> <SOS> how is it going ? great i had a very good blood glucose level today . that s nice keep it up having a good glucose level is great . thank you but it may rise this afternoon . why is that ? because i just ate a big sandwich now . and did you put your insulin ? yes but maybe it was not enough . then you should check your glucose in an hour and a half and correct the dose if necessary . thanks for your advice . no problem see <EOS> .



In [None]:
# Wrap the cleaned dialogues in a DataFrame and write them to Drive as CSV,
# passing index = False to to_csv.
clean_dialogues_pd = pd.DataFrame(clean_dialogues)

clean_dialogues_pd.to_csv("/content/drive/MyDrive/TFM_Diabetes/data/clean_dialogues.csv", index = False)

### Análisis concreto de las palabras del texto: word embedding

In [None]:
def count_words(count_dict, text):
    '''Tally how often each whitespace-separated word occurs in the sentences of `text`.

    `count_dict` is updated in place (word -> number of occurrences);
    nothing is returned.
    '''
    for sentence in text:
        for token in sentence.split():
            count_dict[token] = count_dict.get(token, 0) + 1

In [None]:
# Count how many times each word is used in the stop-word-free dialogues;
# the number of distinct words is the vocabulary size.
word_counts = {}

count_words(word_counts, clean_dialoguesSW)

print("Tamaño del vocabulario:", len(word_counts))

In [None]:
# List the (word, count) pairs from the most to the least frequent word;
# itemgetter(1) selects the count as the sort key.
import operator
sorted(word_counts.items(), key=operator.itemgetter(1), reverse=True )

##### ***Word embedding***:  la forma de representación de los símbolos básicos

ConceptNet Numberbatch es un conjunto de vectores semánticos (**word embeddings**) que permite comparar los significados de las palabras de forma numérica. Se han descargado directamente del [GitHub](https://github.com/commonsense/conceptnet-numberbatch) del proyecto Open Mind Common Sense.

In [None]:
# Load the ConceptNet Numberbatch vectors into a {word: vector} dictionary
embeddings_index = {}
with open('/content/drive/MyDrive/TFM/4. Abstraction summ (1)/ConceptNet/numberbatch-en.txt', encoding='utf-8') as f:
    for line_number, line in enumerate(f):
        # Strip the trailing newline so it cannot end up in the last field
        values = line.rstrip().split(' ')
        # word2vec-style text files start with a "<rows> <dimensions>" header;
        # it is not a word vector and must not be stored as one
        if line_number == 0 and len(values) == 2:
            continue
        # Ignore blank or malformed lines that carry no vector components
        if len(values) < 2:
            continue
        word = values[0]
        embedding = np.asarray(values[1:], dtype='float32')
        embeddings_index[word] = embedding

print('Word embeddings:', len(embeddings_index))

In [None]:
# Count the words that are missing from ConceptNet Numberbatch (CN) and are used
# more than `threshold` times.

# Words missing from CN can be added to the word-embedding matrix, but only
# when they are common enough in the texts.
missing_words = 0
threshold = 1

for word, count in word_counts.items():
    if count > threshold and word not in embeddings_index:
        print(word)
        missing_words += 1

# Scale to a percentage before rounding: multiplying an already rounded ratio by
# 100 can reintroduce floating-point noise. An empty vocabulary gives 0.0
# instead of a division by zero.
missing_ratio = round(100 * missing_words / len(word_counts), 2) if word_counts else 0.0

print("Número de palabras que no están en el CN:", missing_words)
print("Porcentaje de palabras que no están en el CN: {}%".format(missing_ratio))