# TP1 - Text Mining and Chatbots

Name: Jiangnan Huang and You Zuo

Date: 14/01/2020

## pre-processing

In [1]:
# libraries required
import numpy as np 
import re

import gensim
from gensim.models import Word2Vec
from gensim.models import FastText

In [2]:
def clean_str(string):
    """
    Tokenization/string cleaning for dataset
    """
    string = re.sub(r"[!?,.\'\`]", " ", string)   
    string = re.sub(r"\s{2,}", " ", string)    
    return string.strip().lower()


# reads the content of the file passed as an argument.
# if limit > 0, this function will return only the first "limit" sentences in the file.
def loadTexts(filename, limit=-1):
    f = open(filename, "r", encoding='utf-8')
    dataset=[]
    line =  f.readline()
    cpt=1
    skip=0
    while line :
        cleanline = clean_str(line).split()
        if cleanline: 
            dataset.append(cleanline)
        else: 
            line = f.readline()
            skip+=1
            continue
        if limit > 0 and cpt >= limit: 
            break
        line = f.readline()
        cpt+=1        
        
    f.close()
    print("Load ", cpt, " lines from ", filename , " / ", skip ," lines discarded")
    return dataset

In [3]:
filename1 = '../TP_ISD2020/QUAERO_FrenchMed/QUAERO_FrenchMed_traindev.ospl'
filename2 = '../TP_ISD2020/QUAERO_FrenchPress/QUAERO_FrenchPress_traindev.ospl'

data1 = loadTexts(filename1)
data2 = loadTexts(filename2)

Load  3022  lines from  ../TP_ISD2020/QUAERO_FrenchMed/QUAERO_FrenchMed_traindev.ospl  /  70  lines discarded
Load  38547  lines from  ../TP_ISD2020/QUAERO_FrenchPress/QUAERO_FrenchPress_traindev.ospl  /  2  lines discarded


## Word embeddings training

The first step of the work is to create python and bash scripts to train the different embeddings approaches: word2vec (Cbow, skipgram) and fasttext (Cbow), on the two medical and non-medical corpora, resulting to 6 embeddings models.

### Word2Vec

In [4]:
## word2vec: CBow

### parameters:
### sg = 0 for cbow, 1 for skipgram
### min_count: ignores all words with total frequency lower than this.
### size: dimension of the word vector
### window: maximum distance between the current and predicted word within a sentence

## cbow trained on data1
w2v_cbow1 = Word2Vec(sg=0, min_count=1, size=100, window=10)
w2v_cbow1.build_vocab(sentences=data1)
w2v_cbow1.train(data1,total_examples=w2v_cbow1.corpus_count, epochs=20)

(687391, 933100)

In [5]:
## cbow trained on data2
w2v_cbow2 = Word2Vec(sg=0, min_count=1, size=100, window=10)
w2v_cbow2.build_vocab(sentences=data2)
w2v_cbow2.train(data2,total_examples=w2v_cbow2.corpus_count, epochs=20)

(16750313, 22751300)

In [6]:
# word2vec: skipgram
## skipgram trained on data1
w2v_sg1 = Word2Vec(sg=1, min_count=1, size=100, window=10)
w2v_sg1.build_vocab(sentences=data1)
w2v_sg1.train(data1,total_examples=w2v_sg1.corpus_count, epochs=20)

(687479, 933100)

In [7]:
## skipgram trained on data2
w2v_sg2 = Word2Vec(sg=1, min_count=1, size=100, window=10)
w2v_sg2.build_vocab(sentences=data2)
w2v_sg2.train(data2,total_examples=w2v_sg2.corpus_count, epochs=20)

(16749052, 22751300)

### FastText

In [8]:
# with the same hyper-paramters with word2vec
fasttext1 = FastText(data1, min_count=1, size=100, window=10)

In [9]:
fasttext2 = FastText(data2, min_count=1, size=100, window=10)

In [10]:
# save six models

w2v_cbow1.save("models/cbow_medical.model")
w2v_cbow2.save("models/cbow_non-medical.model")
w2v_sg1.save("models/skipgram_medical.model")
w2v_sg2.save("models/skipgram_non-medical.model")
fasttext1.save("models/fasttext_medical.model")
fasttext2.save("models/fasttext_non-medical.model")

## Semantic similarity

The second step is to find the closest words to a given word based on the cosine similarity calculation.

### Evaluation on the same corpus

In [11]:
# the candidate word list
test_words = ['patient', 'traitement', 'maladie', 'solution', 'jaune']

In [12]:
cbow_medical = Word2Vec.load('models/cbow_medical.model')
skipgram_medical = Word2Vec.load('models/skipgram_medical.model')
fasttext_medical = FastText.load('models/fasttext_medical.model')

for word in test_words:
    print(word+':')
    print(cbow_medical.wv.most_similar(word)[0])
    print(skipgram_medical.wv.most_similar(word)[0])
    print(fasttext_medical.wv.most_similar(word)[0],'\n')

patient:
('symptômes', 0.994620680809021)
('carte', 0.8950115442276001)
('patiente', 0.9999990463256836) 

traitement:
('expérience', 0.9798558354377747)
('six', 0.7178391218185425)
('taaitement', 0.9999990463256836) 

maladie:
('aggravation', 0.9832441806793213)
('recklinghausen', 0.8165718913078308)
('malade', 0.9999963045120239) 

solution:
('conditionnés', 0.9851862192153931)
('réchauffer', 0.9186955690383911)
('dissolution', 0.9999988079071045) 

jaune:
('initial', 0.9970934391021729)
('blanc', 0.8862963318824768)
('une', 0.9999927878379822) 



In [13]:
cbow_non_medical = Word2Vec.load('models/cbow_non-medical.model')
skipgram_non_medical = Word2Vec.load('models/skipgram_non-medical.model')
fasttext_non_medical = FastText.load('models/fasttext_non-medical.model')

for word in test_words:
    print(word+':')
    print(cbow_non_medical.wv.most_similar(word)[0])
    print(skipgram_non_medical.wv.most_similar(word)[0])
    print(fasttext_non_medical.wv.most_similar(word)[0],'\n')

patient:
('flagrant', 0.5910066366195679)
('soignant', 0.7369797229766846)
('patientent', 0.9877076745033264) 

traitement:
('système', 0.7374351620674133)
('médicamenteux', 0.5933451652526855)
('promptement', 0.9832788705825806) 

maladie:
('douleur', 0.6258751153945923)
('dingue', 0.6802621483802795)
('malnutrie', 0.9487839937210083) 

solution:
('alternative', 0.6906707286834717)
('lancinant', 0.626539409160614)
('révolution', 0.9928566217422485) 

jaune:
('maillot', 0.8880891799926758)
('maillot', 0.7969251871109009)
('lune', 0.9644564986228943) 



### Evaluation on different corpora

In [14]:
cbow_medical = Word2Vec.load('models/cbow_medical.model')
cbow_non_medical = Word2Vec.load('models/cbow_non-medical.model')

for word in test_words:
    print(word+':')
    print(cbow_medical.wv.most_similar(word)[0])
    print(cbow_non_medical.wv.most_similar(word)[0],'\n')

patient:
('symptômes', 0.994620680809021)
('flagrant', 0.5910066366195679) 

traitement:
('expérience', 0.9798558354377747)
('système', 0.7374351620674133) 

maladie:
('aggravation', 0.9832441806793213)
('douleur', 0.6258751153945923) 

solution:
('conditionnés', 0.9851862192153931)
('alternative', 0.6906707286834717) 

jaune:
('initial', 0.9970934391021729)
('maillot', 0.8880891799926758) 



In [15]:
skipgram_medical = Word2Vec.load('models/skipgram_medical.model')
skipgram_non_medical = Word2Vec.load('models/skipgram_non-medical.model')

for word in test_words:
    print(word+':')
    print(skipgram_medical.wv.most_similar(word)[0])
    print(skipgram_non_medical.wv.most_similar(word)[0],'\n')

patient:
('carte', 0.8950115442276001)
('soignant', 0.7369797229766846) 

traitement:
('six', 0.7178391218185425)
('médicamenteux', 0.5933451652526855) 

maladie:
('recklinghausen', 0.8165718913078308)
('dingue', 0.6802621483802795) 

solution:
('réchauffer', 0.9186955690383911)
('lancinant', 0.626539409160614) 

jaune:
('blanc', 0.8862963318824768)
('maillot', 0.7969251871109009) 



In [16]:
fasttext_medical = Word2Vec.load('models/fasttext_medical.model')
fasttext_non_medical = Word2Vec.load('models/fasttext_non-medical.model')

for word in test_words:
    print(word+':')
    print(fasttext_medical.wv.most_similar(word)[0])
    print(fasttext_non_medical.wv.most_similar(word)[0],'\n')

patient:
('patiente', 0.9999990463256836)
('patientent', 0.9877076745033264) 

traitement:
('taaitement', 0.9999990463256836)
('promptement', 0.9832788705825806) 

maladie:
('malade', 0.9999963045120239)
('malnutrie', 0.9487839937210083) 

solution:
('dissolution', 0.9999988079071045)
('révolution', 0.9928566217422485) 

jaune:
('une', 0.9999927878379822)
('lune', 0.9644564986228943) 

