In [42]:
import numpy as np
import os
import json
import re
from gensim.models import KeyedVectors
import sqlite3
import string
from collections import Counter
import operator
import time

#### This notebook contains an algorithm to correct sentences. The method is a mix of Peter Norvig article (https://norvig.com/spell-correct.html) and word2vec

In [43]:
stop_words=["a","abord","absolument","afin","ah","ai","aie","aient","aies","ailleurs","ainsi","ait","allaient","allo","allons","allô","alors","anterieur","anterieure","anterieures","apres","après","as","assez","attendu","au","aucun","aucune","aucuns","aujourd","aujourd'hui","aupres","auquel","aura","aurai","auraient","aurais","aurait","auras","aurez","auriez","aurions","aurons","auront","aussi","autre","autrefois","autrement","autres","autrui","aux","auxquelles","auxquels","avaient","avais","avait","avant","avec","avez","aviez","avions","avoir","avons","ayant","ayez","ayons","b","bah","bas","basee","bat","beau","beaucoup","bien","bigre","bon","boum","bravo","brrr","c","car","ce","ceci","cela","celle","celle-ci","celle-là","celles","celles-ci","celles-là","celui","celui-ci","celui-là","celà","cent","cependant","certain","certaine","certaines","certains","certes","ces","cet","cette","ceux","ceux-ci","ceux-là","chacun","chacune","chaque","cher","chers","chez","chiche","chut","chère","chères","ci","cinq","cinquantaine","cinquante","cinquantième","cinquième","clac","clic","combien","comme","comment","comparable","comparables","compris","concernant","contre","couic","crac","d","da","dans","de","debout","dedans","dehors","deja","delà","depuis","dernier","derniere","derriere","derrière","des","desormais","desquelles","desquels","dessous","dessus","deux","deuxième","deuxièmement","devant","devers","devra","devrait","different","differentes","differents","différent","différente","différentes","différents","dire","directe","directement","dit","dite","dits","divers","diverse","diverses","dix","dix-huit","dix-neuf","dix-sept","dixième","doit","doivent","donc","dont","dos","douze","douzième","dring","droite","du","duquel","durant","dès","début","désormais","e","effet","egale","egalement","egales","eh","elle","elle-même","elles","elles-mêmes","en","encore","enfin","entre","envers","environ","es","essai","est","et","etant","etc","etre","eu","eue","eues","euh","eurent","eus","eusse","eussent","eusses","eussiez","eussions","eut","eux","eux-mêmes","exactement","excepté","extenso","exterieur","eûmes","eût","eûtes","f","fais","faisaient","faisant","fait","faites","façon","feront","fi","flac","floc","fois","font","force","furent","fus","fusse","fussent","fusses","fussiez","fussions","fut","fûmes","fût","fûtes","g","gens","h","ha","haut","hein","hem","hep","hi","ho","holà","hop","hormis","hors","hou","houp","hue","hui","huit","huitième","hum","hurrah","hé","hélas","i","ici","il","ils","importe","j","je","jusqu","jusque","juste","k","l","la","laisser","laquelle","las","le","lequel","les","lesquelles","lesquels","leur","leurs","longtemps","lors","lorsque","lui","lui-meme","lui-même","là","lès","m","ma","maint","maintenant","mais","malgre","malgré","maximale","me","meme","memes","merci","mes","mien","mienne","miennes","miens","mille","mince","mine","minimale","moi","moi-meme","moi-même","moindres","moins","mon","mot","moyennant","multiple","multiples","même","mêmes","n","na","naturel","naturelle","naturelles","ne","neanmoins","necessaire","necessairement","neuf","neuvième","ni","nombreuses","nombreux","nommés","non","nos","notamment","notre","nous","nous-mêmes","nouveau","nouveaux","nul","néanmoins","nôtre","nôtres","o","oh","ohé","ollé","olé","on","ont","onze","onzième","ore","ou","ouf","ouias","oust","ouste","outre","ouvert","ouverte","ouverts","o|","où","p","paf","pan","par","parce","parfois","parle","parlent","parler","parmi","parole","parseme","partant","particulier","particulière","particulièrement","pas","passé","pendant","pense","permet","personne","personnes","peu","peut","peuvent","peux","pff","pfft","pfut","pif","pire","pièce","plein","plouf","plupart","plus","plusieurs","plutôt","possessif","possessifs","possible","possibles","pouah","pour","pourquoi","pourrais","pourrait","pouvait","prealable","precisement","premier","première","premièrement","pres","probable","probante","procedant","proche","près","psitt","pu","puis","puisque","pur","pure","q","qu","quand","quant","quant-à-soi","quanta","quarante","quatorze","quatre","quatre-vingt","quatrième","quatrièmement","que","quel","quelconque","quelle","quelles","quelqu'un","quelque","quelques","quels","qui","quiconque","quinze","quoi","quoique","r","rare","rarement","rares","relative","relativement","remarquable","rend","rendre","restant","reste","restent","restrictif","retour","revoici","revoilà","rien","s","sa","sacrebleu","sait","sans","sapristi","sauf","se","sein","seize","selon","semblable","semblaient","semble","semblent","sent","sept","septième","sera","serai","seraient","serais","serait","seras","serez","seriez","serions","serons","seront","ses","seul","seule","seulement","si","sien","sienne","siennes","siens","sinon","six","sixième","soi","soi-même","soient","sois","soit","soixante","sommes","son","sont","sous","souvent","soyez","soyons","specifique","specifiques","speculatif","stop","strictement","subtiles","suffisant","suffisante","suffit","suis","suit","suivant","suivante","suivantes","suivants","suivre","sujet","superpose","sur","surtout","t","ta","tac","tandis","tant","tardive","te","tel","telle","tellement","telles","tels","tenant","tend","tenir","tente","tes","tic","tien","tienne","tiennes","tiens","toc","toi","toi-même","ton","touchant","toujours","tous","tout","toute","toutefois","toutes","treize","trente","tres","trois","troisième","troisièmement","trop","très","tsoin","tsouin","tu","té","u","un","une","unes","uniformement","unique","uniques","uns","v","va","vais","valeur","vas","vers","via","vif","vifs","vingt","vivat","vive","vives","vlan","voici","voie","voient","voilà","vont","vos","votre","vous","vous-mêmes","vu","vé","vôtre","vôtres","w","x","y","z","zut","à","â","ça","ès","étaient","étais","était","étant","état","étiez","étions","été","étée","étées","étés","êtes","être","ô"]
stop_words+=["»","«","''"," ","–"]
stop_words=set(stop_words)
table = str.maketrans(string.punctuation, ' '*len(string.punctuation))

In [3]:
model= KeyedVectors.load_word2vec_format('../Database/frWac_non_lem_no_postag_no_phrase_200_skip_cut100.bin', binary=True)

In [44]:
table = str.maketrans(string.punctuation, ' '*len(string.punctuation))
def remove_tag(text):
    cleanr = re.compile('<.*?>')
    cleantext = re.sub(cleanr, '', text)
    return cleantext

def clean_text(text):
    text_without_tag=remove_tag(text)
    text_without_tag=''.join([i for i in text_without_tag if not i.isdigit()])
    text_split=text_without_tag.translate(table).lower().replace('\n', ' ').split(' ')
    text_without_tag=[i for i in text_split if i not in stop_words and i!=""]
    return text_without_tag

def get_counters(text):
    text_without_tag=remove_tag(text)
    text_without_tag=''.join([i for i in text_without_tag if not i.isdigit()])
    text_split=text_without_tag.translate(table).lower().replace('\n', ' ').split(' ')    
    counter_text=Counter(text_split)
    return counter_text

In [45]:
path='/home/gabriel/Documents/MPRI/Web_Data_Management/wikiextractor-master/text/'
 #map punctuation to space
voc_dict={}
count=0

for w,i in enumerate(os.listdir(path)):
    for j in os.listdir(path+i):
        for filename in os.listdir(path+i+'/'+j):
            with open(path+i+'/'+j+'/'+filename) as f:
                lines = [line.rstrip('\n') for line in f]
            for line_index,line in enumerate(lines):
                a=json.loads(line)
                if 'text' in a:
                    counter_text=get_counters(a['text'])
                    for key in counter_text.keys():
                        if key not in voc_dict:
                            voc_dict[key]=counter_text[key]
                        else:
                            voc_dict[key]+=counter_text[key]
                    count+=1
                if count==500000:
                    assert(True==False)

AssertionError: 

In [46]:
#computes the frequency of each word

length_voc=sum([value for key,value in voc_dict.items()])
for key in voc_dict.keys():
    voc_dict[key]/=length_voc

In [47]:
j = json.dumps(voc_dict)
f = open("../Database/voc_dict.json","w")
f.write(j)
f.close()
f = open('../Database/voc_dict.json')
voc_dict = json.load(f)
f.close()

In [60]:
get_correct_sentence("théorie des graphz")

['théorie'] ['graphz']
0.2107248306274414 0.0045588016510009766


'théorie des graphe'

In [76]:
print(get_correct_sentence("je veix arret de manger"))

['arret', 'manger'] ['veix']
0.10121321678161621 0.13947439193725586
je deux arret de manger


In [74]:
# function which modifies a word by doing insert, removal and swapping og characters
def modify_word(word):
    letters=list("abcdefghijklmnopqrstuvwxyzéèàêûöù")
    if type(word)==str:
        word_list=list(word)
        
    modify_1_letter=[]
    for i in range(0,len(word_list)):
        for j in range(len(letters)):
            tmp=list(word)
            tmp[i]=letters[j]
            modify_1_letter.append(tmp)

    swap_one_letter=[]
    for i in range(0,len(word_list)-1):
        tmp=list(word)
        a=word_list[i]
        tmp[i]=tmp[i+1]
        tmp[i+1]=a
        swap_one_letter.append(tmp)
    
    insert_one_letter=[]
    for i in range(0,len(word_list)+1):
        for j in range(len(letters)):
            tmp=list(word)
            tmp.insert(i,letters[j])
            insert_one_letter.append(tmp)
    
    missing_one_letter=[]
    for i in range(0,len(word_list)):
        tmp=list(word)
        del tmp[i]
        missing_one_letter.append(tmp)
    
    return [''.join(i) for i in modify_1_letter+swap_one_letter+insert_one_letter+missing_one_letter]

def distance(a,b):
    return np.mean((a.reshape(1,-1)-b.reshape(1,-1))**2)

def get_correct_sentence(sentence):
    path="/home/gabriel/Documents/MPRI/Web_Data_Management/search_engine/Database/"
    somme=lambda x:sum([i[1] for i in x])
    query_sentence_ori=sentence.lower().replace('\n', ' ').split(' ')
    # query_sentence=sentence.translate(table).lower().replace('\n', ' ').split(' ')
    query_sentence=clean_text(sentence)
    conn = sqlite3.connect(path+'Database.db')
    c = conn.cursor()
    well_spelled_words=[]
    mispelled_words=[]
    
    is_in=lambda x:c.execute("Select count(WORD) from Vocabulary where WORD='"+x+"';").fetchone()[0]>0
    for i in query_sentence:
        if is_in(i):
                well_spelled_words.append(i)
        else:
            mispelled_words.append(i)
    print(well_spelled_words,mispelled_words)
    if len(mispelled_words)==0:
        return ""
    
    w2v_query=np.mean(np.array([model[i] for i in well_spelled_words if i in model]),axis=0)
    for word in mispelled_words:
        a=time.time()
        l=[modify_word(word)]+[modify_word(i) for i in set(modify_word(word))]
        reco = set([item for sublist in l for item in sublist if item in voc_dict])
        dict1 = Counter(word) 
        reco_list=[]
        b=time.time()
        for i in reco:
            dict2 = Counter(i)
            commonDict = dict1 & dict2
            diff1=len(word)/np.sum([value for key,value in commonDict.items()])
            diff2=abs(len(word)-len(i))
            reco_list.append([i,voc_dict[i]*(1/diff1)*(1/(1+diff2))])
        
        reco_list=sorted(reco_list,key=lambda x:x[1],reverse=True)
        reco_list=[[i[0],float(i[1])/somme(reco_list)] for i in reco_list]
        w2v=[[i[0],distance(model[i[0]],w2v_query)] for i in reco_list if i[0] in model]
        w2v=[[i[0],i[1]/somme(w2v)] for i in w2v]
        both=sorted([[w2v[i][0],0.5*w2v[i][1]+0.5*reco_list[i][1]] for i in range(len(w2v))],key=lambda x:x[1],reverse=True)
        if len(both)>0:
            query_sentence_ori=[i if i!=word else both[0][0] for i in query_sentence_ori]
        c=time.time()
        print(b-a,c-b)
    conn.close()
    return ' '.join(query_sentence_ori)