In [2]:
'''
import pretrained vector-space model
'''

# This model is Facebook's FastText (skip-gram).
# The vectors are read from a hard-coded, machine-specific path; binary=False
# asks gensim for the plain-text word2vec format (the `.vec` file).
# `model` is the object later passed to the distance functions.
import gensim
model = gensim.models.KeyedVectors.load_word2vec_format('C:/Users/antoi/Documents/UDEM/Article2/ID_L2/vecModels/wiki.fr.vec', binary=False)

'''
Import and tokenize
'''

import os
import nltk
from unidecode import unidecode # used to convert non-breaking spaces to ASCII characters
import codecs # for a module that tolerates accents? (not used in the cells shown here)
from nltk import word_tokenize
from pprint import pprint
from nltk.corpus import stopwords # there are also the Veronicus* stopwords, which would be worth checking because this list is strict
import numpy
import pandas
from itertools import combinations, islice



In [3]:
# Function definitions for the cosine-distance sliding window that iterates over the word lists

def sliding_window(seq, n):
    """
    Yield each run of n consecutive items of an iterable as a tuple.

    seq -> (s0, s1, ..., s[n-1]), (s1, s2, ..., sn), ...

    Params:
        seq: any iterable
        n: width of the window
    Yields:
        tuples of n items, advancing one item at a time; for n >= 1 nothing
        is yielded when the iterable holds fewer than n items
    """
    source = iter(seq)
    # Fill the first window; islice stops early if the input is too short.
    window = tuple(islice(source, n))
    if len(window) == n:
        yield window
    # Slide by one: drop the oldest item, append the newest.
    for item in source:
        window = (*window[1:], item)
        yield window
def window_distance(wordVector, window):
    """
    Compute the distance between every pair of words in one window.

    Params:
        wordVector: your word vector model; either an object exposing its
            vectors under a ``wv`` attribute, or the keyed vectors themselves
            (any object with a ``distance(w1, w2)`` method that raises
            KeyError for an unknown word)
        window: a list of words
    Returns:
        a numpy array with the distance of each 2-word combination of the
        window, in itertools.combinations order; numpy.nan where at least
        one word of the pair is unknown
        a set of words missing from the word vector space
    """
    # Bare keyed vectors do not reliably expose `.wv` (deprecated, then
    # removed, in gensim), so fall back to the object itself.
    vectors = getattr(wordVector, "wv", wordVector)
    rsl = []
    missingWords = set()
    for first, second in combinations(window, 2):
        try:
            rsl.append(vectors.distance(first, second))
        except KeyError:
            # Keep one slot per pair so the result length stays predictable.
            rsl.append(numpy.nan)
            # Find which word(s) of the pair are missing from the vector space.
            for w in (first, second):
                if w in missingWords:
                    continue
                try:
                    vectors.distance(w, w)
                except KeyError:
                    missingWords.add(w)
    return numpy.array(rsl), missingWords
def sentence_distance(wordVector, sentence, windowSize):
    """
    Run window_distance over every sliding window of a tokenized text.

    Params:
        wordVector: your word vector model
        sentence: a list of tokenized words
        windowSize: size of the sliding window
    Returns:
        a numpy array built from the per-window results of window_distance,
        one entry per window
        a set of the words any window reported as missing from the word
        vector space
    """
    perWindow = [
        window_distance(wordVector, window)
        for window in sliding_window(sentence, windowSize)
    ]
    windowResults = [values for values, _ in perWindow]
    # Merge the per-window missing-word sets into one.
    missingWords = set().union(*(missing for _, missing in perWindow))
    return numpy.array(windowResults), missingWords

In [4]:
# Load every .txt transcript of the data directory into `fullSpeech`,
# a dict mapping file name -> list of cleaned, lowercased tokens.
pathToData = os.path.join('C:/', 'Users', 'antoi', 'Documents' , 'UDEM', 'Article2', 'ID_L2', 'data')
# The working directory is switched to the data directory; relative paths used
# afterwards (e.g. output files) resolve there.
os.chdir(pathToData)
# Not ideal, but all punctuation is removed manually with this fixed list.
punctuation = [",", "?", "!", ".", "’","'", "..."]
fullSpeech = {}
# List the same directory that `pathToData` names, so the location is defined once.
for i in os.listdir(pathToData):
    if i.endswith('.txt'):
        # Open through the full path so the read does not depend on the current directory.
        with open(os.path.join(pathToData, i), mode='r', encoding='utf-8-sig') as j:
            # Hyphens are replaced by spaces (e.g. 'pique-nique', 'cerf-volant')
            # because the model does not handle compound words.
            lireSansTiret = j.read().replace('-', ' ')
        Tokens = word_tokenize(lireSansTiret) # NLTK's default tokenizer
        # Removes the stray non-breaking space, but also every accent.
        # NOTE(review): the French model presumably stores accented forms, so
        # de-accented tokens may end up missing from the vector space - confirm.
        TokensASCII = [unidecode(word) for word in Tokens]
        tokens = [word.lower() for word in TokensASCII] # the model is probably already lowercase
        sample = [m for m in tokens if m not in punctuation] # drop the punctuation tokens
        fullSpeech[i] = sample

In [5]:
# Width, in tokens, of the sliding window used for every transcript.
WINDOW_SIZE = 50

# One row per transcript: its label and the NaN-ignoring mean of all its
# window distances. The set of missing words is not used here.
csvRowList = []
for speechName, speechTokens in fullSpeech.items():
    windowDistances, _missing = sentence_distance(model, speechTokens, WINDOW_SIZE)
    csvRowList.append({
        "name": "patient{}".format(speechName),
        "distance": numpy.nanmean(windowDistances),
    })

# Fix the column order, then write the table (index column included).
outputFilename = "distance.csv"
df = pandas.DataFrame(csvRowList)
df = df[["name", "distance"]]
df.to_csv(outputFilename, sep=",")


