In [92]:
%load_ext autoreload
%autoreload 2

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [93]:
import sys
sys.path.insert(0, '../src')

import os
os.environ['TF_CPP_MIN_LOG_LEVEL'] = '3'

from utils.exp_saver import ExpSaver

from data_loading.load_cas import load_cas
from data_loading.load_embeddings import load_vectors
from data_loading.load_neg_filters import load_filters

from data_processing.preprocessing import preprocess_data, map_text

from model.LSTM import embed_bilstm

from train.train import train
from utils.utils import format_results

In [98]:
import pandas as pd
pd.set_option('display.max_rows',100)
pd.set_option('display.max_columns',None)

import numpy as np
import glob

from gensim.models.fasttext import load_facebook_model, load_facebook_vectors

import pickle as pkl


In [95]:
# Load the pretrained fastText vectors once at notebook level: the resulting
# object is handed to predict() through its `ft_vec` argument, so the binary
# file is not re-read on every prediction call.
# NOTE(review): the path is relative to the notebook's working directory.
ft_vec = load_facebook_vectors('../embeddings/cc.fr.300.bin')

In [156]:
def torch_predict(
    text,
    model_path,
    params,
    embeddings=None,
    ):
    """Tag every token of ``text`` with the trained model stored in ``model_path``.

    Parameters
    ----------
    text : sequence of token sequences
        One entry per sentence; each entry is iterated token by token when
        the predicted labels are paired with the input.
    model_path : str
        Directory containing ``model_function.pkl`` (pickled model factory)
        and ``model.h5`` (weights saved with ``torch.save``).
    params : dict
        Must provide ``'batch_size'``, ``'hyper_params'`` and
        ``'model_structure'``.
    embeddings : optional
        Forwarded to ``map_text``. When it is ``None`` the sentences are
        padded by ``preprocess_data`` (``pad_sentences=True``).

    Returns
    -------
    numpy.ndarray
        Object array with one entry per sentence, each entry being a sequence
        of ``[token, label]`` pairs with labels in ``{'O', 'B-NEG', 'I-NEG'}``.
    """
    batch_size = params['batch_size']
    hyper_params = params['hyper_params']
    model_structure = params['model_structure']

    raw_sentences = map_text(
        text,
        embeddings=embeddings,
    )

    sentences = preprocess_data(
        raw_sentences,
        model_structure=model_structure,
        pad_sentences=(embeddings is None),
        return_sentences_only=True,
    )

    # Read the model dimensions before squeezing, while the array still has
    # its full rank.
    max_sentence_size = sentences.shape[1]
    input_size = sentences.shape[-1]

    # NOTE(review): np.squeeze drops *every* size-1 axis, so a batch holding a
    # single sentence presumably loses its batch axis here -- confirm against
    # what model.predict expects.
    sentences = np.squeeze(sentences)

    print("""
    ########################
    # Beginning prediction #
    ########################
    """)

    # Load model function.
    # SECURITY: unpickling executes arbitrary code; only use model directories
    # that come from a trusted source.
    with open(os.path.join(model_path, 'model_function.pkl'), 'rb') as f:
        model_function = pkl.load(f)

    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

    # `.to(device)` works for both CPU and GPU; the previous `.cuda(device)`
    # raised on machines without CUDA even though a CPU fallback was selected.
    model = model_function(
        input_size=input_size,
        sentence_size=max_sentence_size,
        output_size=3,
        device=device,
        hyper_params=hyper_params,
    ).to(device)

    # Load model weights (torch.load also relies on pickle -- same trust caveat).
    state_dict = torch.load(os.path.join(model_path, 'model.h5'), map_location=device)

    # Keep only the saved tensors whose top-level module exists in this model,
    # so extra entries in the checkpoint do not make load_state_dict fail.
    module_names = [name for name, _ in model.named_modules()]
    existing_state_dict = {
        key: value
        for key, value in state_dict.items()
        if key.split('.')[0] in module_names
    }

    model.load_state_dict(existing_state_dict)

    print(model.name)

    results = model.predict(
        sentences,
        batch_size=batch_size,
    )

    id2tag = {0: 'O', 1: 'B-NEG', 2: 'I-NEG'}

    # Highest-scoring class index along the last axis, mapped to its tag name.
    labels = torch.max(results, -1)[1].cpu().numpy()
    labels = np.vectorize(id2tag.get)(labels)

    # zip() stops at the shorter of the two, so only as many labels as there
    # are tokens in the sentence are kept.
    annot = [
        [[token, tag] for token, tag in zip(sentence, labels[i])]
        for i, sentence in enumerate(text)
    ]

    # Sentences generally differ in length; dtype=object is required for such
    # ragged input on recent NumPy versions (otherwise a ValueError is raised).
    return np.array(annot, dtype=object)

In [157]:
import ruamel.yaml
import spacy
from tqdm import tqdm_notebook as tqdm
import torch

def predict(
    data_path,
    model_path,
    embeddings_path=None,
    extensions=None,
    ft_vec=None,
):
    """Tokenize the input text with spaCy and tag it with a saved model.

    Parameters
    ----------
    data_path : str
        Either a single text file (its content is split on ``'.'`` and each
        piece is tokenized separately) or a directory (every file matching
        one of ``extensions`` is tokenized as a whole).
    model_path : str
        Directory containing ``config.yaml`` plus the files read by
        ``torch_predict``.
    embeddings_path : str, optional
        Only checked for truthiness: when the config enables
        ``pretrainedEmbed`` but no path is given, the flag is switched off.
    extensions : list of str, optional
        File extensions (without the dot) to pick up when ``data_path`` is a
        directory. Required in that case.
    ft_vec : optional
        Already-loaded embeddings, forwarded to ``torch_predict``.

    Returns
    -------
    The annotations returned by ``torch_predict``.

    Raises
    ------
    AssertionError
        If ``data_path`` is a directory and ``extensions`` is ``None``.
    ValueError
        If ``data_path`` is neither a file nor a directory.
    """
    nlp = spacy.load("fr_core_news_sm")

    if os.path.isfile(data_path):
        with open(data_path, 'r') as f:
            raw_sentences = f.read().split('.')
        # Sentences have different token counts; dtype=object keeps the
        # ragged outer array valid on recent NumPy versions.
        text = np.array(
            [np.array([token for token in nlp(sentence)]) for sentence in raw_sentences],
            dtype=object,
        )

    elif os.path.isdir(data_path):
        assert extensions is not None, "data_path is a directory but no extensions is defined"
        # glob has no '|' alternation, so collect the matches one extension
        # at a time instead of joining the patterns into a single string.
        all_files = []
        for ext in extensions:
            all_files.extend(glob.glob(os.path.join(data_path, f'*.{ext}')))

        text = []
        for file_path in tqdm(all_files):
            # Context manager so every handle is closed right after reading.
            with open(file_path) as f:
                text.append(nlp(f.read()))
    else:
        # Previously a bare string expression (a no-op) that left `text`
        # undefined and produced an unrelated error further down.
        raise ValueError(
            f"data_path is neither a file nor a directory: {data_path!r}"
        )

    # Load params
    with open(os.path.join(model_path, 'config.yaml'), 'r') as f:
        params = ruamel.yaml.safe_load(f)

    if params.get("pretrainedEmbed") and not embeddings_path:
        print('Embeddings specified in model parameters but no embedding path specified, switching to no embeddings')
        params['pretrainedEmbed'] = False

    # The embeddings are supplied by the caller through `ft_vec` rather than
    # loaded here from `embeddings_path`.
    annot = torch_predict(
        text,
        model_path,
        params,
        embeddings=ft_vec,
        )

    return annot
    
# Run the tagger on one document and keep the result for the export cell below.
# - `ft_vec` carries the embeddings that are actually forwarded to the model;
#   `embeddings_path` is only tested for being non-empty inside predict().
# - `extensions` is only consulted when `data_path` is a directory.
# NOTE(review): both paths are absolute and machine-specific; adjust them (or
# move them to a config cell) before running this notebook elsewhere.
annotation = predict(
    data_path='/home/taille/these/data/corpus_xavier/00001-W-44--1006755177289277516--873794080268159383-8548032564789574962.txt',
    model_path='/home/taille/these/logs/torch_BiLSTM___pretrainedEmbed+fasttext___label_level+word___neg_only+False___n_fold+10___batch_size+128___epochs+30___lr+0.01___lr_decay+1e-06___no_pos___no_cue/16012020_124030',
    embeddings_path='../embeddings/cc.fr.300.bin',
    extensions=['txt'],
    ft_vec=ft_vec,
)


    ########################
    # Beginning prediction #
    ########################
    
torch_BiLSTM


In [169]:
# Export the predictions: one "token label" pair per line, with an empty line
# closing each sentence. Tokens made only of spaces/newlines are skipped.

result_file = '/home/taille/these/predictions/results.conll'

with open(result_file, 'w') as f:
    for annot in annotation:
        for word, label in annot:
            word_text = str(word)
            if word_text.strip(' \n') == '':
                # Whitespace-only token: nothing to write for it.
                continue
            f.write(word_text + ' ' + label + '\n')
        # Sentence separator.
        f.write('\n')