#  FIR Final Project

In [4]:
!pip uninstall -y gensim
!pip3 install gensim
!pip install nltk

Collecting gensim
  Downloading gensim-4.3.2-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (26.6 MB)
[K     |████████████████████████████████| 26.6 MB 2.7 MB/s eta 0:00:01     |████████████▍                   | 10.3 MB 3.3 MB/s eta 0:00:05     |██████████████                  | 11.6 MB 727 kB/s eta 0:00:21     |████████████████████████▉       | 20.6 MB 4.6 MB/s eta 0:00:02
Collecting scipy>=1.7.0
  Downloading scipy-1.10.1-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (34.5 MB)
[K     |████████████████████████████████| 34.5 MB 9.5 kB/s eta 0:00:01    |▍                               | 389 kB 2.7 MB/s eta 0:00:13     |███████████▉                    | 12.8 MB 6.3 MB/s eta 0:00:04     |██████████████████▌             | 20.0 MB 4.4 MB/s eta 0:00:04
[?25hCollecting smart-open>=1.8.1
  Downloading smart_open-6.4.0-py3-none-any.whl (57 kB)
[K     |████████████████████████████████| 57 kB 2.7 MB/s eta 0:00:01
[31mERROR: scipy 1.10.1 has requirement numpy<1.27.0,>=1.1

In [1]:
elastic = True
training = False

In [70]:
import re
import json
import nltk
if elastic:
    import elasticsearch
    import elasticsearch.helpers

from sklearn.metrics.pairwise import cosine_similarity
from nltk.tokenize import sent_tokenize, word_tokenize
from gensim.models.word2vec import Word2Vec
from tqdm import tqdm
from itertools import islice
import numpy as np
import math

nltk.download('punkt')

[nltk_data] Downloading package punkt to /home/ut-student/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

### Elastic Search - Text similarity search with vector fields
In Elasticsearch 7.0, we introduced experimental field types for high-dimensional vectors, and now the 7.3 release brings support for using these vectors in document scoring.
We could use text embeddings to allow for retrieving similar questions:
- During indexing, each question is run through a sentence embedding model to produce a numeric vector.
- When a user enters a query, it is run through the same sentence embedding model to produce a vector. To rank the responses, we calculate the vector similarity between each question and the query vector. When comparing embedding vectors, it is common to use cosine similarity.

For this we need to create the Elasticsearch index, which includes mappings for the properties (title, question, tags, etc) encoded as a vector.

**source:** https://www.elastic.co/search-labs/text-similarity-search-with-vectors-in-elasticsearch

In [3]:
def read_documents(file_name):
    """
    Returns a generator of documents to be indexed by elastic, read from file_name
    """
    with open(file_name, 'r') as documents:
        for line in documents:
            #for index in range(2000):  # change this to retrive more than 1000 docs
            #line = next(documents)
            doc_line = json.loads(line)
            if ('index' in doc_line):
                id = doc_line['index']['_id']
            elif ('PMID' in doc_line):
                doc_line['_id'] = id
                yield doc_line
            else:
                raise ValueError('Woops, error in index file')

def create_index(es, index_name, body={}):
    # delete index when it already exists
    es.indices.delete(index=index_name, ignore=[400, 404])
    # create the index 
    es.indices.create(index=index_name, body=body)
                
def index_documents(es, collection_file_name, index_name, body={}):
    create_index(es, index_name, body)
    # bulk index the documents from file_name
    return elasticsearch.helpers.bulk(
        es, 
        read_documents(collection_file_name),
        index=index_name,
        chunk_size=2000,
        request_timeout=30
    )


In [17]:
# Connect to the ElasticSearch server
if elastic:
    es = elasticsearch.Elasticsearch(host='localhost', timeout=40)
    body = {
               "mappings": {
                    "properties": {
                        "title-abstract-vector": {
                            "type": "dense_vector",
                            "dims": 300,
                            "index": "true",
                            "similarity": "cosine"
                        }
                    }
                }
            }
    # Index the collection into the index called 'genomics-word2vec'
    index_name = 'genomics-word2vec'
    index_documents(es, 'data01/FIR-s05-medline.json', 'genomics-word2vec', body)



### We want to user Word2Vec representation to retrive the documents for every query. 
We need to have an embedding for every title/abstract from our dataset. For this, we need to train the word2vec model using our title/abstract data from the dataset. That's why we retrieved all of them from elastic and our sentences for training the word2vec model are concatenated title and abstract for every item.

Word2Vec documentation: https://rare-technologies.com/word2vec-tutorial/

In [18]:
def get_all_titles_abstract_from_elastic():
    query = {
              "size": 1000,
              "_source": {
                "include": [
                  "AB", "TI", "title-abstract-vector"
                ]
              },
              "query": {
                "match_all": {}
              }
        }

    resp = es.search(index=index_name, body=query, scroll='1m')

    print('Total %d hits found.' % resp["hits"]["total"]["value"])

    results = resp['hits']['hits']
    max_length = resp['hits']["total"]["value"]
    while len(results) < max_length:
        resp = es.scroll(scroll_id=resp['_scroll_id'], scroll="1m")
        results += resp['hits']['hits']

    return results

In [20]:
if elastic:
    sentences = get_all_titles_abstract_from_elastic()



Total 263080 hits found.


### This is the retriving from disk version of the above code.

In [16]:
def get_all_titles_abstract_from_disk(input_file_path):
    data = []
    ids = []

    with open(input_file_path) as f:
        for line in tqdm(f):
            # json_line = json.loads(line)
            if '_id' in line:
                ids.append(json.loads(line)['index']['_id'])
            if 'TI' in line:
                data.append(json.loads(line))

    data = [elem['AB']+' '+elem['TI'] for elem in data if 'AB' in list(elem.keys())]

    return data, ids

In [17]:
if not elastic:
    input_file_path = 'FIR-s05-medline.json'
    sentences, ids = get_all_titles_abstract_from_disk(input_file_path)

In [21]:
if elastic:
    documents = {elem['_id'] : elem['_source']['AB']+' '+elem['_source']['TI'] for elem in sentences if 'AB' in list(elem['_source'].keys())}
else:
    documents = {ids[i] : elem for i,elem in enumerate(sentences)}
print(documents['3'])

The global fold of maltose binding protein in complex with beta-cyclodextrin has been determined using a CNS-based torsion angle molecular dynamics protocol involving direct refinement against dipolar couplings and carbonyl chemical shift changes that occur upon alignment. The shift changes have been included as structural restraints using a new module, CANI, that has been incorporated into CNS. Force constants and timesteps have been determined that are particularly effective in structure refinement applications involving high molecular weight proteins with small to moderate numbers of NOE restraints. Solution structures of the N Direct structure refinement of high molecular weight proteins against residual dipolar couplings and carbonyl chemical shift changes upon alignment: an application to maltose binding protein.


## Preprocess Our Data
Prepare your text data by tokenizing and cleaning it.We need only the abstract and the title, so only the 'AB' and 'TI' keys are retrieved.

In [22]:
if elastic:
    sentences = [elem['_source']['AB']+' '+elem['_source']['TI'] for elem in sentences if 'AB' in list(elem['_source'].keys())]

In [18]:
if training:
    w2v_sentences = []
    for s in sentences:
        for sent in sent_tokenize(s):
            w2v_sentences.append(word_tokenize(sent.lower()))
    print(w2v_sentences[0])

### Training Word2Vec model using **Gensim**
We will save the model in 'w2v_genomics_model.bin' so we can skip this step and use the loaded version of it. We are training the model with the parameters from this paper: Section 4.1.4 - https://dl.acm.org/doi/10.1145/3476415.3476433 

In [19]:
# Train a word2vec model
if training:
    w2v_genomics = Word2Vec(workers=8, min_count=10, window=10, vector_size=300)
    w2v_genomics.build_vocab(w2v_sentences)
    w2v_genomics.train(w2v_sentences, total_examples=w2v_genomics.corpus_count, epochs=10)

    w2v_genomics.save('w2v_genomics_model.bin')

In [23]:
if not training:
    w2v_genomics = Word2Vec.load('w2v_genomics_model.bin')

In [14]:
w2v_genomics.wv['dipolar'][0]

0.16212283

###  Every document embbeding
We calculate this as the average of embbedings of every word of a document

In [24]:
doc_embeddings = {}
for (key, value) in documents.items():
    word_vector = [0] * 300
    words_length = 0

    for word in word_tokenize(value):
        if word in w2v_genomics.wv:
            words_length += 1
            embd = w2v_genomics.wv[word]
            for index in range(300):
                word_vector[index] += embd[index]

    for index in range(300):
        if words_length != 0:
            word_vector[index] /= words_length

    doc_embeddings[key] = word_vector

#len(doc_embeddings['3'])

300

### This is a step only for elastic approach
Now we need to update the **title-abstract-vector** field with the embbeding values we computed in the above cell.

In [30]:
if elastic:
    for key, value in doc_embeddings.items():
        document = {"doc": {"title-abstract-vector": value}}
        es.update(index=index_name, id=int(key), body=document)

TransportError: TransportError(500, 'i_o_exception', 'No space left on device')

In [25]:
def get_first_item(index_name, es, doc_id):
    result = es.get(index=index_name, id=doc_id)
    return result['_source']

In [26]:
if elastic:
    print(get_first_item(index_name, es, 3))

{'AB': 'The global fold of maltose binding protein in complex with beta-cyclodextrin has been determined using a CNS-based torsion angle molecular dynamics protocol involving direct refinement against dipolar couplings and carbonyl chemical shift changes that occur upon alignment. The shift changes have been included as structural restraints using a new module, CANI, that has been incorporated into CNS. Force constants and timesteps have been determined that are particularly effective in structure refinement applications involving high molecular weight proteins with small to moderate numbers of NOE restraints. Solution structures of the N', 'AD': 'Protein Engineering Network Center of Excellence and Department of Medical Genetics and Microbiology, University of Toronto, Ontario, Canada.', 'CY': 'Netherlands', 'DA': '20011105', 'DCOM': '20020401', 'DP': '2001 Sep', 'EDAT': '2001/11/06 10:00', 'IP': '1', 'IS': '0925-2738', 'JID': '9110829', 'LA': 'eng', 'LR': '20021101', 'MHDA': '2002/04

### The final step
Based on the **title-abstract-vector** we can calculate similarity using cosineSImilarity. The source property contains a *painless* script that checks if the document has a dense vector field and computes the similarity between the doc embbeding and the query embbeding. 

We also can use for calculating the similarity these metrics (https://www.elastic.co/guide/en/elasticsearch/reference/current/dense-vector.html):
##### Elastic approach
- dot product
- l2 norm

##### Disk approach
- any metric that make sense :)

In [39]:
def calculate_query_embbeding(query):
    query_vector = [0] * 300

    for word in word_tokenize(query):
        word_vector = w2v_genomics.wv[word]
        for i in range(300):
            query_vector += word_vector[i]
    query_vector = [x/len(query) for x in query_vector]

    return query_vector

In [40]:
def find_similar_doc_with_query(query, index_name, es, top):  # elastic approach
    query_vector = calculate_query_embbeding(query)

    script_query = {
        "script_score": {
            "query": {"match_all": {}},
            "script": {
              "source": "doc['title-abstract-vector'].size() == 0 ? 0 : cosineSimilarity(params.query_vector, 'title-abstract-vector') + 1",
              "params": {"query_vector": query_vector}
            }
        }
    }
    body = {
        "size": top,
        "query": script_query
    }
    res = es.search(index=index_name, body=body)
    # print(len(res['hits']['hits']))
    return res['hits']['hits'][:top]

In [41]:
def find_similar_doc_with_query_disk(query, top):
    query_vector = calculate_query_embbeding(query)
    embbeding_similarity = {}
    for key, value in doc_embeddings.items():
        embbeding_similarity[key] = cosine_similarity([query_vector], [value])
    similar_docs = dict(sorted(embbeding_similarity.items(), key=lambda item: item[1]), reversed=True)
    return dict(islice(similar_docs.items(), top))

In [15]:
hits = 100 #number of similar docs to be returned
if elastic:
    most_similar_docs = find_similar_doc_with_query('molecule', index_name, es, hits)
else:
    most_similar_docs = find_similar_doc_with_query_disk('molecule', hits)

In [16]:
most_similar_docs

[{'_index': 'genomics-word2vec',
  '_type': '_doc',
  '_id': '27650',
  '_score': 1.1626884,
  '_ignored': ['AB.keyword'],
  '_source': {'AB': 'Structural maintenance of chromosomes (SMC) proteins play central roles in higher-order chromosome dynamics from bacteria to humans. In eukaryotes, two different SMC protein complexes, condensin and cohesin, regulate chromosome condensation and sister chromatid cohesion, respectively. Each of the complexes consists of a heterodimeric pair of SMC subunits and two or three non-SMC subunits. Previous studies have shown that a bacterial SMC homodimer has a symmetrical structure in which two long coiled-coil arms are connected by a flexible hinge. A catalytic domain with DNA',
   'AD': 'Department of Cell Biology, Duke University Medical Center, Durham, NC 27710, USA.',
   'AID': 'jcb.200111002 [pii]',
   'CY': 'United States',
   'DA': '20020205',
   'DCOM': '20020405',
   'DP': '2002 Feb 4',
   'EDAT': '2002/01/30 10:00',
   'IP': '3',
   'IS': '0

# Boolean retrieval

The first task for the boolean retrieval part is to index the title and abstract of all documents using ElasticSearch.

In [12]:
boolean = {
  "settings" : {
    "index.blocks.read_only": False,
    # a single shard, so we do not suffer from approximate document frequencies
    "number_of_shards" : 1
  },
  "mappings": {
      "properties": {
        "AB": {
          "type": "text",
          "copy_to": "title-abstract",
          "similarity": "boolean"
        },
        "TI": {
          "type": "text",
          "copy_to": "title-abstract",
          "similarity": "boolean"
        },
        "title-abstract": {  # compound field
          "type": "text",
          "similarity": "boolean"
        }
      }
  }
}

es = elasticsearch.Elasticsearch(host='localhost', timeout=40)
index_documents(es, 'data01/FIR-s05-medline.json', 'genomics-bool', body=boolean)

(263080, [])

In [13]:
es.indices.get(index='genomics-bool')

{'genomics-bool': {'aliases': {},
  'mappings': {'properties': {'AB': {'type': 'text',
     'copy_to': ['title-abstract'],
     'similarity': 'boolean'},
    'AD': {'type': 'text',
     'fields': {'keyword': {'type': 'keyword', 'ignore_above': 256}}},
    'AID': {'type': 'text',
     'fields': {'keyword': {'type': 'keyword', 'ignore_above': 256}}},
    'CI': {'type': 'text',
     'fields': {'keyword': {'type': 'keyword', 'ignore_above': 256}}},
    'CIN': {'type': 'text',
     'fields': {'keyword': {'type': 'keyword', 'ignore_above': 256}}},
    'CN': {'type': 'text',
     'fields': {'keyword': {'type': 'keyword', 'ignore_above': 256}}},
    'CON': {'type': 'text',
     'fields': {'keyword': {'type': 'keyword', 'ignore_above': 256}}},
    'CY': {'type': 'text',
     'fields': {'keyword': {'type': 'keyword', 'ignore_above': 256}}},
    'DA': {'type': 'text',
     'fields': {'keyword': {'type': 'keyword', 'ignore_above': 256}}},
    'DCOM': {'type': 'text',
     'fields': {'keyword': {'t

# Making the TREC run

We save the corresponding documents for each of the retrieval methods, in separate files, so that they can be accessed later. The runs are controlled by parameter *new_run* which creates new files only if it is set to True.

In [42]:
new_run = True

In [43]:
if new_run:
    def make_trec_run(es, topics_file_name, run_file_name, index_name, run_name):
        with open(run_file_name, 'w') as run_file:
            with open(topics_file_name, 'r') as test_queries:
                for line in test_queries:
                    (qid, query) = line.strip().split('\t')
                    results = es.search(index=index_name, body={'query':{
                        'multi_match':{
                            'query': query}
                    }, 'fields':['TI', 'AB']})


                    for i, d in enumerate(results['hits']['hits']):
                        did = str(d['_source']['PMID']) + ' '
                        rank = str(i) + ' '
                        score = str(d['_score']) + ' '
                        run_file.write(str(qid) + ' Q0 ' + did + rank + score + str(run_name) + '\n')

    make_trec_run(es, 'data01/FIR-s05-training-queries-simple.txt', 'word2vec.run', 'genomics-word2vec', run_name='test_w2v')
    #make_trec_run(es, 'data01/FIR-s05-training-queries-simple.txt', 'boolean.run', 'genomics-bool', run_name='test_bool')

## Retrieving the extracted documents

Now we will save the corresponding entries in 4 vectors, which will be later used as input for the statistical tests.

In [44]:
def read_qrels_file(qrels_file):  # reads the content of he qrels file
    trec_relevant = dict()  # query_id -> set([docid1, docid2, ...])
    with open(qrels_file, 'r') as qrels:
        for line in qrels:
            (qid, q0, doc_id, rel) = line.strip().split()
            if qid not in trec_relevant:
                trec_relevant[qid] = set()
            if (rel == "1"):
                trec_relevant[qid].add(doc_id)
    return trec_relevant

def read_run_file(run_file):  
    # read the content of the run file produced by our IR system 
    # (in the following exercises you will create your own run_files)
    trec_retrieved = dict()  # query_id -> [docid1, docid2, ...]
    with open(run_file, 'r') as run:
        for line in run:
            (qid, q0, doc_id, rank, score, tag) = line.strip().split()
            if qid not in trec_retrieved:
                trec_retrieved[qid] = []
            trec_retrieved[qid].append(doc_id) 
    return trec_retrieved
    
all_relevant = read_qrels_file('data01/FIR-s05-training-qrels.txt')
all_w2v = read_run_file('word2vec.run')
all_bool = read_run_file('boolean.run')

# Metrics implementation

Underneath, a few metrics that will be computed for each of the 2 Information Retrieval systems have been created, as presented in [Manning, Raghavan and Schütze, 2008](https://nlp.stanford.edu/IR-book/pdf/08eval.pdf).

Because the chosen IR systems provide unranked results, the main selected metrics for comparison the average precision and average F-measure for all topics.

In [45]:
# count all relevant retrieved entries
def count_rel(rel, ret):
    r = 0
    for entry in ret:
        if entry in rel:
            r+=1
    return r

# measure of precision given list of relevant documents and retrieved documents
def precision(rel, ret):
    return count_rel(rel, ret)/len(ret)

# measure of recall given list of relevant documents and retrieved documents
def recall(rel, ret):
    return count_rel(rel, ret)/len(rel)

In [60]:
# F-measure given Beta
def f_measure(rel, ret, b):
    p = precision(rel, ret)
    r = recall(rel, ret)
    coef = b**2
    
    if p == 0 and r == 0:
        return 0
    return ((coef + 1)*p*r) / (coef * p + r)

In [55]:
# average precision for topics qs
def topic_precision(all_rel, all_ret):
    total_prec = 0.0
    for q in all_rel:
        total_prec += precision(list(all_rel[q]), list(all_ret[q]))
    return total_prec/len(all_rel)

# average recall for topics qs
def topic_recall(all_rel, all_ret):
    total_rec = 0
    for q in all_rel:
        total_rec += recall(list(all_rel[q]), list(all_ret[q]))
    return total_rec/len(all_rel)

# average f_measire for topics qs
def topic_f_measure(all_rel, all_ret, b):
    total_f = 0
    for q in all_rel:
        total_f += f_measure(list(all_rel[q]), list(all_ret[q]), b)
    return total_f/len(all_rel)


# Statistical tests

In line with [Smucker, Allan and Carterette, 2007](https://ciir-publications.cs.umass.edu/getpdf.php?id=744), 3 statistical tests will be performed for both the average precision and avergae F-measure computed above. Each test will return a p-value, to be compared with a given threshold within a separate function.

## Sign test

The sign test computes the p-value based on the number of instances in which one of the IR sytems performs better than the other.

In [102]:
def sign_test(metric, all_rel, all_ret_1, all_ret_2, b):
    # count for better and lower performance
    improve = 0
    deter = 0
    for q in all_rel:
        if metric == f_measure:
            v1 = metric(all_rel[q], all_ret_1[q], b)
            v2 = metric(all_rel[q], all_ret_2[q], b)
        else:
            v1 = metric(all_rel[q], all_ret_1[q])
            v2 = metric(all_rel[q], all_ret_2[q])
        if v1 > v2:
            improve += 1
        elif v1 < v2:
            deter += 1
    # the p-value will be the sum of probabilities of improvement and respectively, deterioration in performance
    p_val = (improve + deter) / len(all_rel)
    return p_val

## Wilcoxon test

The Wilcoxon test computes a W-value based on a sum of ranks of positive differences in a chosen metric, between 2 IR systems.

In [103]:
def wilcoxon_test(metric, all_rel, all_ret_1, all_ret_2, b):
    dif = [] #signs for all differences
    abs_dif = [] #absolute values
    # for each item compute the absolute difference between the 2 IR measurements
    for q in all_rel:
        if metric == f_measure:
            d = metric(all_rel[q], all_ret_1[q], b) - metric(all_rel[q], all_ret_2[q], b)
        else:
            d = metric(all_rel[q], all_ret_1[q]) - metric(all_rel[q], all_ret_2[q])
        
        # save only strictly positive differences
        if d != 0:
            dif.append(d)
            abs_dif.append(abs(d))
        
    idx = np.argsort(abs_dif)
    W = 0
    
    # compute the sum of ranks for positive differences
    for i in idx:
        if abs_dif[i] > 0:
            W += i
            
    return W

## Student t-test

For this last test, we apply the Student t-test for independent samples with unequal, unknown variances and equal means, which returns a test statistic t-value, to be compared with the critical value within the following exercise. 

Additionally, we compute the degrees of freedom to be used to determine the critical value within a separate function.

In [104]:
def t_test(metric, all_rel, all_ret_1, all_ret_2, b):
    n = len(all_rel) #sample size
    m1 = m2 = []
    
    # for each IR save the performance measurements
    for q in all_rel:
        if metric == f_measure:
            m1.append(metric(all_rel[q], all_ret_1[q], b))
            m2.append(metric(all_rel[q], all_ret_2[q], b))
        else:
            m1.append(metric(all_rel[q], all_ret_1[q]))
            m2.append(metric(all_rel[q], all_ret_2[q]))
    
    # compute the sample means and variances
    mean1 = np.mean(m1)
    mean2 = np.mean(m2)
    var1 = np.var(m1)
    var2 = np.var(m2)
    
    # mu1 = mu2 under H0
    t_val = (mean1 + mean2)
    t_val /= math.sqrt(var1/n + var2/n)
    
    # compute degrees of freedom
    d_freedom = deg_freedom(var1, var2, n, n)
    
    return t_val, d_freedom

# degrees of freedom given variances and sample sizes 
def deg_freedom(var1, var2, n1, n2):
    f1 = var1 / n1
    f2 = var2 / n2
    nom = (f1 + f2) ** 2
    denom = f1 ** 2 / (n1 - 1) + f2 ** 2 / (n2 - 1)
    
    return nom / denom

# Analysis

Lastly, we use the tests implemented above to determine whether the performance of the 2 IR systems differs significantly. For this purpose, 2 hypothesis are defined and the value of each performed test will be compared with a relevant critical value (taken from standard tables, according to the sample size, confidence level and, possibly, degrees of freedom).

H_0.0: Word2Vec has the same precision as the boolean IR (i.e. precision(W2V) - precision(bool) = 0).
<br>H_0.1: Word2Vec has a significantly higher precision than boolean search (i.e. precision(W2V) - precision(bool) > 0).

H_1.0: Word2Vec has the same F-measure as the boolean IR (i.e. precision(W2V) - precision(bool) = 0).
<br>H_1.1: Word2Vec has a significantly higher F-measure than boolean search (i.e. precision(W2V) - precision(bool) > 0).

In [111]:
def significance(test, metric, all_rel, all_ret_1, all_ret_2, c_val, beta):
    if test == t_test:
        result = test(metric, all_rel, all_ret_1, all_ret_2, beta)[1]
    else:
        result = test(metric, all_rel, all_ret_1, all_ret_2, beta)
        
    if result > c_val:
        return str(test)[10:-19] + ' -> reject H0'
    else:
        return str(test)[10:-19] + ' -> do not reject H0'

In [61]:
def check_metrics(run, all_rel, all_ret, beta):
    print(run)
    print('Precision:', topic_precision(all_rel, all_ret))
    print('Recall:', topic_recall(all_rel, all_ret))
    print('F-measure:', topic_f_measure(all_rel, all_ret, beta))
    
check_metrics('Word2vec', all_relevant, all_w2v, 1)
check_metrics('Boolean', all_relevant, all_bool, 1)

Word2vec
Precision: 0.047368421052631594
Recall: 0.20000606354596168
F-measure: 0.06719104443845815
Boolean
Precision: 0.018421052631578946
Recall: 0.10047295658501092
F-measure: 0.026215679409819715


In [92]:
print('Number of samples:', len(all_relevant))
print('Degrees of freedom:', t_test(precision, all_relevant, all_w2v, all_bool, 1)[1])

Number of samples: 38
Degrees of freedom: 74.0


In [114]:
beta = 1

print('Significance tests for precision:')
print(significance(sign_test, precision, all_relevant, all_w2v, all_bool, 12, beta))
print(significance(wilcoxon_test, precision, all_relevant, all_w2v, all_bool, 256, beta))
print(significance(t_test, precision, all_relevant, all_w2v, all_bool, 1.666, beta))

print('\nSignificance tests for F-measure:')
print(significance(sign_test, f_measure, all_relevant, all_w2v, all_bool, 12, beta))
print(significance(wilcoxon_test, f_measure, all_relevant, all_w2v, all_bool, 256, beta))
print(significance(t_test, f_measure, all_relevant, all_w2v, all_bool, 1.666, beta))

Significance tests for precision:
sign_test -> do not reject H0
wilcoxon_test -> do not reject H0
t_test -> reject H0

Significance tests for F-measure:
sign_test -> do not reject H0
wilcoxon_test -> do not reject H0
t_test -> reject H0
