## Nouvelle suggestion métier / rome
- utilisant la lib whoosh
- stemming, accent folding
- fuzzy search, ...

#### idées
- highlighting du matching ?
- query expansion / more like this ?

In [70]:
import os
import sys
import logging
from whoosh.index import create_in, open_dir
from whoosh.fields import Schema, TEXT, STORED, KEYWORD
from whoosh.analysis import CharsetFilter, StemmingAnalyzer
from whoosh.support.charset import accent_map
from whoosh.qparser import QueryParser, FuzzyTermPlugin
from whoosh.query import FuzzyTerm
from whoosh import scoring

# getLogger() without a name returns the root logger, so the DEBUG level set
# here applies to every logger that does not define its own level.
logger = logging.getLogger()
logger.setLevel(logging.DEBUG)

def createTable(name):
    """Create a whoosh index in directory ``name`` and return ``name``.

    Schema of the created index:
      * ``label``  - stored text, analyzed with a stemming analyzer followed
        by accent folding.
      * ``rome``   - stored only.
      * ``source`` - keyword field.

    Args:
        name: Path of the directory that will hold the index. It is created,
            together with any missing parent directories, when absent.

    Returns:
        The directory name that was passed in (not an index object); use
        ``open_dir(name)`` to obtain the index itself.
    """
    # NOTE(review): StemmingAnalyzer() is built with its default settings
    # while the labels look French - confirm the stemmer language is the
    # intended one.
    analyzer = StemmingAnalyzer() | CharsetFilter(accent_map)
    schema = Schema(
        label=TEXT(stored=True, analyzer=analyzer),
        rome=STORED,
        source=KEYWORD,
    )

    # makedirs(exist_ok=True) avoids the check-then-create race of
    # "if not exists: mkdir" and also supports nested paths.
    os.makedirs(name, exist_ok=True)

    # NOTE(review): create_in presumably starts a new, empty index in the
    # directory - confirm before calling this on an index holding data.
    create_in(name, schema)
    return name


def writeRecord(idx, **fields):
    """Add one document to the index and commit it.

    Args:
        idx: An opened index object exposing ``writer()``.
        **fields: Field name / value pairs forwarded to
            ``writer.add_document``.

    Raises:
        Exception: Whatever ``add_document`` raises is propagated after the
            writer has been cancelled.
    """
    writer = idx.writer()
    try:
        writer.add_document(**fields)
    except Exception:
        # Cancel so the failed writer is not left open, then let the
        # original error surface unchanged.
        writer.cancel()
        raise
    # Without a commit the added document is never persisted.
    writer.commit()
    
def writeDataframe(idx_name, df):
    """Append every row of ``df`` to the index stored in ``idx_name``.

    Each row is converted to a dict and passed to ``add_document`` as keyword
    arguments, so the dataframe column names must be field names of the index
    schema. All rows are written with a single writer and committed once.

    Args:
        idx_name: Directory of an existing index (opened with ``open_dir``).
        df: pandas DataFrame whose rows are the documents to add.

    Raises:
        RuntimeError: If adding or committing the documents fails; the
            underlying error is attached as ``__cause__``.
    """
    idx = open_dir(idx_name)
    writer = idx.writer()
    try:
        logger.info("Writing dataframe to index")
        for _, row in df.iterrows():
            # Build the dict once; it is both echoed and indexed.
            record = row.to_dict()
            print(record)
            writer.add_document(**record)
        writer.commit()
    except Exception as err:
        writer.cancel()
        raise RuntimeError('Failed writing') from err
    except BaseException:
        # KeyboardInterrupt / SystemExit: still cancel the writer, but do
        # not disguise the interrupt as a RuntimeError.
        writer.cancel()
        raise
    else:
        logger.info("Done writing dataframe to index")
        print('done')
        
def get_index(idx_name):
    """Open the index stored in directory ``idx_name`` and return it."""
    opened = open_dir(idx_name)
    return opened

class FuzzyConfig(FuzzyTerm):
    """``FuzzyTerm`` subclass whose ``maxdist`` defaults to 10.

    All arguments are forwarded unchanged to ``FuzzyTerm.__init__``; only the
    default values declared in this signature differ from a direct call.
    """

    def __init__(self, fieldname, text, boost=1.0, maxdist=10,
                 prefixlength=1, constantscore=True):
        super().__init__(
            fieldname, text, boost, maxdist, prefixlength, constantscore
        )
        
def match(idx, query_str, limit=40):
    """Search the ``label`` field of ``idx`` and return the matching entries.

    The query is parsed with ``FuzzyConfig`` as term class and with the
    ``FuzzyTermPlugin`` enabled.

    Args:
        idx: An opened index object.
        query_str: Query text handed to the query parser.
        limit: Maximum number of hits requested from the searcher.

    Returns:
        A list of ``[label, rome]`` pairs, one per hit, in the order the
        searcher returned them.
    """
    with idx.searcher() as searcher:
        label_parser = QueryParser('label', idx.schema, termclass=FuzzyConfig)
        label_parser.add_plugin(FuzzyTermPlugin())
        parsed = label_parser.parse(query_str)
        hits = searcher.search(parsed, limit=limit)
        # Stored fields are read while the searcher is still open.
        return [[hit['label'], hit['rome']] for hit in hits]
    

In [26]:
import pandas as pd

# Two sample records, each laid out as [rome, label, source].
data = [
    ['A1298', 'Sourcier', 'rome'],
    ['B9812', 'Abricotier', 'arbre'],
]
# The column names match the schema fields built in createTable, because
# writeDataframe passes every row to add_document as keyword arguments.
df = pd.DataFrame(data, columns=['rome', 'label', 'source'])
# Bare expression: displays the dataframe as the cell output.
df

Unnamed: 0,rome,label,source
0,A1298,Sourcier,rome
1,B9812,Abricotier,arbre


In [3]:
# NOTE: createTable returns the directory name it was given, so `idx` holds
# the string 'index_dir', not an index object.
idx = createTable('index_dir')

In [31]:
# Adds every row of df to the index stored in directory `idx`.
# NOTE(review): re-running this cell presumably adds the same rows again -
# the repeated hits in the search output further down suggest it was run
# several times; confirm and rebuild the index if duplicates are unwanted.
writeDataframe(idx, df)

{'rome': 'A1298', 'label': 'Sourcier', 'source': 'rome'}
{'rome': 'B9812', 'label': 'Abricotier', 'source': 'arbre'}
done


In [76]:
# Open the index from its directory and search the labels for the partial
# word "abico" (terms are built with the FuzzyConfig term class).
res = match(get_index(idx), "abico")
# Bare expression: displays the list of [label, rome] hits.
res

[['Abricotier', 'B9812'],
 ['Abricotier', 'B9812'],
 ['Abricotier', 'B9812'],
 ['Abricotier', 'B9812'],
 ['Abricotier', 'B9812']]