# TP1 - Text Mining and Chatbots

Name: Jiangnan Huang and You Zuo

Date: 14/01/2020

## pre-processing

In [1]:
# libraries required
import numpy as np 
import re

import gensim
from gensim.models import Word2Vec
from gensim.models import FastText

In [2]:
def clean_str(string):
    """
    Tokenization/string cleaning for dataset
    """
    string = re.sub(r"[!?,.\'\`]", " ", string)   
    string = re.sub(r"\s{2,}", " ", string)    
    return string.strip().lower()


# reads the content of the file passed as an argument.
# if limit > 0, this function will return only the first "limit" sentences in the file.
def loadTexts(filename, limit=-1):
    f = open(filename, "r", encoding='utf-8')
    dataset=[]
    line =  f.readline()
    cpt=1
    skip=0
    while line :
        cleanline = clean_str(line).split()
        if cleanline: 
            dataset.append(cleanline)
        else: 
            line = f.readline()
            skip+=1
            continue
        if limit > 0 and cpt >= limit: 
            break
        line = f.readline()
        cpt+=1        
        
    f.close()
    print("Load ", cpt, " lines from ", filename , " / ", skip ," lines discarded")
    return dataset

In [3]:
filename1 = 'TP_ISD2020/QUAERO_FrenchMed/QUAERO_FrenchMed_traindev.ospl'
filename2 = 'TP_ISD2020/QUAERO_FrenchPress/QUAERO_FrenchPress_traindev.ospl'

data1 = loadTexts(filename1)
data2 = loadTexts(filename2)

Load  3022  lines from  TP_ISD2020/QUAERO_FrenchMed/QUAERO_FrenchMed_traindev.ospl  /  70  lines discarded
Load  38547  lines from  TP_ISD2020/QUAERO_FrenchPress/QUAERO_FrenchPress_traindev.ospl  /  2  lines discarded


## Word embeddings training

The first step of the work is to create python and bash scripts to train the different embeddings approaches: word2vec (Cbow, skipgram) and fasttext (Cbow), on the two medical and non-medical corpora, resulting to 6 embeddings models.

### Word2Vec

In [4]:
## word2vec: CBow

### parameters:
### sg = 0 for cbow, 1 for skipgram
### min_count: ignores all words with total frequency lower than this.
### size: dimension of the word vector
### window: maximum distance between the current and predicted word within a sentence

## cbow trained on data1
w2v_cbow1 = Word2Vec(sg=0, min_count=1, size=100, window=10)
w2v_cbow1.build_vocab(sentences=data1)
w2v_cbow1.train(data1,total_examples=w2v_cbow1.corpus_count, epochs=20)

(687560, 933100)

In [5]:
## cbow trained on data2
w2v_cbow2 = Word2Vec(sg=0, min_count=1, size=100, window=10)
w2v_cbow2.build_vocab(sentences=data2)
w2v_cbow2.train(data2,total_examples=w2v_cbow2.corpus_count, epochs=20)

(16747921, 22751300)

In [6]:
# word2vec: skipgram
## skipgram trained on data1
w2v_sg1 = Word2Vec(sg=1, min_count=1, size=100, window=10)
w2v_sg1.build_vocab(sentences=data1)
w2v_sg1.train(data1,total_examples=w2v_sg1.corpus_count, epochs=20)

(687615, 933100)

In [7]:
## skipgram trained on data2
w2v_sg2 = Word2Vec(sg=1, min_count=1, size=100, window=10)
w2v_sg2.build_vocab(sentences=data2)
w2v_sg2.train(data2,total_examples=w2v_sg2.corpus_count, epochs=20)

(16749108, 22751300)

### FastText

In [8]:
# with the same hyper-paramters with word2vec
fasttext1 = FastText(data1, min_count=1, size=100, window=10)

In [9]:
fasttext2 = FastText(data2, min_count=1, size=100, window=10)

In [13]:
# save six models

w2v_cbow1.save("models/cbow_medical.model")
w2v_cbow2.save("models/cbow_non-medical.model")
w2v_sg1.save("models/skipgram_medical.model")
w2v_sg2.save("models/skipgram_non-medical.model")
fasttext1.save("models/fasttext_medical.model")
fasttext2.save("models/fasttext_non-medical.model")

## Semantic similarity

The second step is to find the closest words to a given word based on the cosine similarity calculation.

### Evaluation on the same corpus

In [14]:
# the candidate word list
test_words = ['patient', 'traitement', 'maladie', 'solution', 'jaune']

In [25]:
cbow_medical = Word2Vec.load('models/cbow_medical.model')
skipgram_medical = Word2Vec.load('models/skipgram_medical.model')
fasttext_medical = FastText.load('models/fasttext_medical.model')

for word in test_words:
    print(word+':')
    print(cbow_medical.wv.most_similar(word)[0])
    print(skipgram_medical.wv.most_similar(word)[0])
    print(fasttext_medical.wv.most_similar(word)[0],'\n')

patient:
('symptômes', 0.9948393106460571)
('carte', 0.8920832872390747)
('patiente', 0.9999991655349731) 

traitement:
('expérience', 0.9797210693359375)
('confirmé', 0.7372004985809326)
('taaitement', 0.9999992251396179) 

maladie:
('infection', 0.986403226852417)
('liée', 0.8259382247924805)
('maladies', 0.9999959468841553) 

solution:
('conditionnés', 0.9839076399803162)
('réchauffer', 0.910857617855072)
('dissolution', 0.9999986886978149) 

jaune:
('capsule', 0.9976205229759216)
('blanc', 0.878764271736145)
('une', 0.9999926090240479) 



In [26]:
cbow_non_medical = Word2Vec.load('models/cbow_non-medical.model')
skipgram_non_medical = Word2Vec.load('models/skipgram_non-medical.model')
fasttext_non_medical = FastText.load('models/fasttext_non-medical.model')

for word in test_words:
    print(word+':')
    print(cbow_non_medical.wv.most_similar(word)[0])
    print(skipgram_non_medical.wv.most_similar(word)[0])
    print(fasttext_non_medical.wv.most_similar(word)[0],'\n')

patient:
('mototaxi', 0.5873354077339172)
('hospitalisé', 0.6993711590766907)
('contient', 0.9885638356208801) 

traitement:
('système', 0.7244834899902344)
('médicamenteux', 0.5807715654373169)
('farouchement', 0.9810761213302612) 

maladie:
('perte', 0.5813629627227783)
('neurologique', 0.6623693108558655)
('trilogie', 0.9433014988899231) 

solution:
('alternative', 0.6969314813613892)
('lancinant', 0.6705160140991211)
('résolution', 0.9928999543190002) 

jaune:
('maillot', 0.883590817451477)
('maillot', 0.8162438273429871)
('lune', 0.9673931002616882) 



### Evaluation on different corpora

In [27]:
cbow_medical = Word2Vec.load('models/cbow_medical.model')
cbow_non_medical = Word2Vec.load('models/cbow_non-medical.model')

for word in test_words:
    print(word+':')
    print(cbow_medical.wv.most_similar(word)[0])
    print(cbow_non_medical.wv.most_similar(word)[0],'\n')

patient:
('symptômes', 0.9948393106460571)
('mototaxi', 0.5873354077339172) 

traitement:
('expérience', 0.9797210693359375)
('système', 0.7244834899902344) 

maladie:
('infection', 0.986403226852417)
('perte', 0.5813629627227783) 

solution:
('conditionnés', 0.9839076399803162)
('alternative', 0.6969314813613892) 

jaune:
('capsule', 0.9976205229759216)
('maillot', 0.883590817451477) 



In [28]:
skipgram_medical = Word2Vec.load('models/skipgram_medical.model')
skipgram_non_medical = Word2Vec.load('models/skipgram_non-medical.model')

for word in test_words:
    print(word+':')
    print(skipgram_medical.wv.most_similar(word)[0])
    print(skipgram_non_medical.wv.most_similar(word)[0],'\n')

patient:
('carte', 0.8920832872390747)
('hospitalisé', 0.6993711590766907) 

traitement:
('confirmé', 0.7372004985809326)
('médicamenteux', 0.5807715654373169) 

maladie:
('liée', 0.8259382247924805)
('neurologique', 0.6623693108558655) 

solution:
('réchauffer', 0.910857617855072)
('lancinant', 0.6705160140991211) 

jaune:
('blanc', 0.878764271736145)
('maillot', 0.8162438273429871) 



In [29]:
fasttext_medical = Word2Vec.load('models/fasttext_medical.model')
fasttext_non_medical = Word2Vec.load('models/fasttext_non-medical.model')

for word in test_words:
    print(word+':')
    print(fasttext_medical.wv.most_similar(word)[0])
    print(fasttext_non_medical.wv.most_similar(word)[0],'\n')

patient:
('patiente', 0.9999991655349731)
('contient', 0.9885638356208801) 

traitement:
('taaitement', 0.9999992251396179)
('farouchement', 0.9810761213302612) 

maladie:
('maladies', 0.9999959468841553)
('trilogie', 0.9433014988899231) 

solution:
('dissolution', 0.9999986886978149)
('résolution', 0.9928999543190002) 

jaune:
('une', 0.9999926090240479)
('lune', 0.9673931002616882) 

