In [1]:
import sys
import os
from elasticsearch import Elasticsearch
from zipfile import ZipFile
import re
import pandas as pd
import json
import requests
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer 
from nltk.corpus import stopwords
import nltk
import pickle
import pytrec_eval
nltk.download('stopwords')
nltk.download('punkt')
from elasticsearch import NotFoundError

es = Elasticsearch()

[nltk_data] Downloading package stopwords to /home/katya/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to /home/katya/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [2]:
# Presumably the directory of the raw documents; it is not referenced by any cell shown below.
input_dir = 'data/docs'
# Directory the run files are written to and later read back from for evaluation.
output_dir = 'b07k1'

In [3]:
def strip_punct(s):
    """Return ``s`` lower-cased with every run of non-alphanumeric ASCII collapsed.

    Only ``A-Z``, ``a-z`` and ``0-9`` survive; any other character acts as a
    separator, and the surviving tokens are joined by single spaces.
    """
    tokens = re.findall('[A-Za-z0-9]+', s)
    return " ".join(token.lower() for token in tokens)

In [4]:
# Name of the Elasticsearch index that the following cells create, fill and query.
index = 'b07k1_index'

## Create a new index and adjust the BM25 parameters

In [5]:
# Index settings: one shard, no replicas, and BM25 with b=0.7 / k1=1 as the
# default similarity for every field.
request_body = {
    "settings": {
        "number_of_shards": 1,
        "number_of_replicas": 0,
        "similarity": {
            "default": {"type": "BM25", "b": 0.7, "k1": 1},
        },
    },
}

es.indices.create(body=request_body, index=index)

{'acknowledged': True, 'shards_acknowledged': True, 'index': 'b07k1_index'}

In [6]:
# The file holds a single JSON array; later cells treat its even positions as
# bulk action entries (each has an 'index' key).
with open('bulk_data_json.txt') as bulk_file:
    bulk_data = json.loads(bulk_file.read())

In [7]:
len(bulk_data)

10060

In [8]:
# Point every action entry (even positions of the list) at the index used in this notebook.
for action in bulk_data[0::2]:
    action['index']['_index'] = index
# Show the first action entry as a sanity check.
print(bulk_data[0])

{'index': {'_index': 'b07k1_index', '_type': 'doc', '_id': 'clueweb12-1913wb-84-09751'}}


In [9]:
def chunks(lst, n):
    """Lazily produce consecutive slices of ``lst``, each holding at most ``n`` items."""
    offsets = range(0, len(lst), n)
    yield from (lst[offset:offset + n] for offset in offsets)

In [10]:
bulks = chunks(bulk_data, 100)

In [11]:
bulks

<generator object chunks at 0x7fa76599ddd0>

In [12]:
# Send the bulk payload in batches of 100 entries. The batch size must stay even
# so that an action entry and the entry that follows it are never split across
# two requests. A fresh generator is created here so the cell can be re-run;
# a generator that was already consumed would silently index nothing.
failed_items = []
for batch in chunks(bulk_data, 100):
    res = es.bulk(index = index, body = batch)
    # The bulk API reports per-item failures in the response body instead of
    # raising, so they have to be collected explicitly.
    if res.get('errors'):
        failed_items.extend(
            item for item in res['items']
            if 'error' in next(iter(item.values()))
        )

if failed_items:
    print('{} bulk actions failed; first failure: {}'.format(len(failed_items), failed_items[0]))


#es.search(body={"query": {"match_all": {}}}, index = 'example_index')
#es_indices.get_mapping(index = 'example_index')



In [13]:
# Spot check: fetch one document by id from the freshly filled index.
res = es.get(index=index,doc_type='doc',id='clueweb12-1214wb-88-29751')



In [14]:
res

{'_index': 'b07k1_index',
 '_type': 'doc',
 '_id': 'clueweb12-1214wb-88-29751',
 '_version': 1,
 '_seq_no': 3633,
 '_primary_term': 1,
 'found': True,
 '_source': {'query': 'what and is and the and difference and between and sex and and and love',
  'title': '<em>Difference</em> <em>Between</em> <em>Love</em> <em>And</em> <em>Sex</em> - Student.com Articles',
  'title_lem': 'difference love sex student article',
  'num': '1',
  'uuid': 'b0054765-4007-52c9-a7c4-70c15891b4cc',
  'score': 2406.7341,
  'document': 'Difference Between Love And Sex\nYou are here:\nStudent.com Articles Student Health & Sexuality Sexual Health Difference Between Love And Sex\nRelated Articles\n<p>Search, Compare, and Apply for student loans at <a href="http://www.simpletuition.com?brnd=studentcom" title="Simpletuition.com" target="_blank">Simpletuition.com</a>.</p>\nPaying For College\nSex VS. Love Love and sex are NOT the same thing. Love is an emotion or a feeling. There is no one definition of love because 

In [14]:
#es.indices.delete(index='b05k1_index')

{'acknowledged': True}

In [15]:
es.cat.count(index, params={"format": "json"})

[{'epoch': '1613039997', 'timestamp': '10:39:57', 'count': '5017'}]

## Create different runs

In [16]:
# NOTE(security): pickle.load can execute arbitrary code, so only load a topics
# file that was produced locally by a trusted step.
# The context manager closes the file even if unpickling fails.
with open("data/topics_lemmatized.txt", 'rb') as topics_file:
    queries_lem = pickle.load(topics_file)

In [19]:
def search(search_param, filename, q, ind):
    """Run every query in ``q`` against index ``ind`` and write the hits as a run file.

    Parameters
    ----------
    search_param : dict
        Elasticsearch request body whose ``query.bool.should`` list holds
        ``match`` clauses. The value of every field in every clause is
        overwritten with the current query text, so the dict passed in is
        modified in place.
    filename : str
        Base name of the output file; the result is written to
        ``<output_dir>/<filename>.txt``.
    q : iterable
        Queries in topic order. A dict entry contributes its ``'title'``
        value; any other entry is used as the query text directly.
    ind : str
        Name of the index to search.

    The output has no header and is space separated, with the columns
    qid, Q0, doc id, rank, score, tag. Query ids and ranks both start at 1.
    """
    should_clauses = search_param['query']['bool']['should']
    rows = []

    for num, topic in enumerate(q, start=1):
        # isinstance also accepts dict subclasses, which an exact type comparison rejects.
        query_text = topic['title'] if isinstance(topic, dict) else topic
        for clause in should_clauses:
            for field in clause['match']:
                clause['match'][field] = query_text

        response = es.search(index=ind, body=search_param)
        for rank, hit in enumerate(response['hits']['hits'], start=1):
            rows.append((num, 'Q0', hit['_id'], rank, hit['_score'], 'uh-t2-thor'))

    # Create the output directory on first use so a fresh checkout can run the notebook.
    os.makedirs(output_dir, exist_ok=True)
    df = pd.DataFrame(rows, columns=['qid', 'Q0', 'doc', 'rank', 'score', 'tag'])
    df.to_csv((output_dir+'/'+filename+'.txt'), sep = ' ', index = False, header = False)

In [20]:
# Template for the title-only run: a single match clause on `title_lem`;
# search() fills in the query text.
search_param = {
    "size": 50,
    "query": {
        "bool": {
            "should": [
                {"match": {"title_lem": ""}},
            ],
        },
    },
}

search(search_param, 'run_title', queries_lem, index)

In [21]:
# Template for the run that matches on the `lem` field only;
# search() fills in the query text.
search_param = {
    "size": 50,
    "query": {
        "bool": {
            "should": [
                {"match": {"lem": ""}},
            ],
        },
    },
}

search(search_param, 'run_doc', queries_lem, index)

In [22]:
# Template for the run that matches on the `args` field only;
# search() fills in the query text.
search_param = {
    "size": 50,
    "query": {
        "bool": {
            "should": [
                {"match": {"args": ""}},
            ],
        },
    },
}

search(search_param, 'run_arg', queries_lem, index)

In [23]:
# Template combining `title_lem` and `args`: one match clause per field,
# both filled with the same query text by search().
search_param = {
    "size": 50,
    "query": {
        "bool": {
            "should": [{"match": {field: ""}} for field in ("title_lem", "args")],
        },
    },
}

search(search_param, 'run_title_arg', queries_lem, index)

In [24]:
# Template combining `title_lem` and `lem`: one match clause per field,
# both filled with the same query text by search().
search_param = {
    "size": 50,
    "query": {
        "bool": {
            "should": [{"match": {field: ""}} for field in ("title_lem", "lem")],
        },
    },
}

search(search_param, 'run_title_doc', queries_lem, index)

In [25]:
# Template combining `lem` and `args`: one match clause per field,
# both filled with the same query text by search().
search_param = {
    "size": 50,
    "query": {
        "bool": {
            "should": [{"match": {field: ""}} for field in ("lem", "args")],
        },
    },
}

search(search_param, 'run_doc_arg', queries_lem, index)

In [26]:
# Template combining all three fields: one match clause each for `lem`,
# `title_lem` and `args`, all filled with the same query text by search().
search_param = {
    "size": 50,
    "query": {
        "bool": {
            "should": [{"match": {field: ""}} for field in ("lem", "title_lem", "args")],
        },
    },
}

search(search_param, 'run_title_doc_arg', queries_lem, index)

## Add synonyms

In [27]:
def search_extended(search_param, filename, q, ind):
    """Run the original-plus-synonym queries in ``q`` against ``ind`` and write a run file.

    Parameters
    ----------
    search_param : dict
        Elasticsearch request body whose ``query.bool.should`` list holds
        ``match`` clauses. A clause whose field value is a dict containing
        ``'boost'`` receives the row's ``'query'`` text in its ``'query'``
        entry; every other clause has its field value replaced by the row's
        ``'syn'`` text. The dict passed in is modified in place.
    filename : str
        Base name of the output file; the result is written to
        ``<output_dir>/<filename>.txt``.
    q : pandas.DataFrame
        One row per topic, in topic order, with the columns ``'query'`` and ``'syn'``.
    ind : str
        Name of the index to search.

    The output has no header and is space separated, with the columns
    qid, Q0, doc id, rank, score, tag. Query ids and ranks both start at 1.
    """
    should_clauses = search_param['query']['bool']['should']
    rows = []

    for num, (_, row) in enumerate(q.iterrows(), start=1):
        for clause in should_clauses:
            for field, value in list(clause['match'].items()):
                # Test the type first: from the second row on, a synonym clause
                # holds the previous row's text, and a bare `in` test would then
                # be a substring search (or a TypeError on a NaN synonym).
                if isinstance(value, dict) and 'boost' in value:
                    value['query'] = row['query']
                else:
                    clause['match'][field] = row['syn']

        response = es.search(index=ind, body=search_param)
        for rank, hit in enumerate(response['hits']['hits'], start=1):
            rows.append((num, 'Q0', hit['_id'], rank, hit['_score'], 'uh-t2-thor'))

    # Create the output directory on first use so a fresh checkout can run the notebook.
    os.makedirs(output_dir, exist_ok=True)
    df = pd.DataFrame(rows, columns=['qid', 'Q0', 'doc', 'rank', 'score', 'tag'])
    df.to_csv((output_dir+'/'+filename+'.txt'), sep = ' ', index = False, header = False)

In [28]:
# Tab-separated topics table; search_extended() reads its 'query' and 'syn' columns.
df_syn = pd.read_csv('data/q_for_syn.tsv', sep = '\t')

In [29]:
# Synonym template for `title_lem`: the boosted clause receives the original
# query, the plain clause receives the synonym text. The plain placeholder is an
# empty string (not a set literal) so the template is already a valid request body.
search_param = {
    "size": 50,
    "query": {
        "bool": {
            "should": [
                {"match": {"title_lem": {"query": "", "boost": 5}}},
                {"match": {"title_lem": ""}},
            ],
        },
    },
}

search_extended(search_param, 'run_title_syn', df_syn, index)

In [30]:
# Synonym template for `lem`: the boosted clause receives the original query,
# the plain clause receives the synonym text. The plain placeholder is an empty
# string (not a set literal) so the template is already a valid request body.
search_param = {
    "size": 50,
    "query": {
        "bool": {
            "should": [
                {"match": {"lem": {"query": "", "boost": 5}}},
                {"match": {"lem": ""}},
            ],
        },
    },
}

search_extended(search_param, 'run_doc_syn', df_syn, index)

In [31]:
# Synonym template for `args`: the boosted clause receives the original query,
# the plain clause receives the synonym text. The plain placeholder is an empty
# string (not a set literal) so the template is already a valid request body.
search_param = {
    "size": 50,
    "query": {
        "bool": {
            "should": [
                {"match": {"args": {"query": "", "boost": 5}}},
                {"match": {"args": ""}},
            ],
        },
    },
}

search_extended(search_param, 'run_args_syn', df_syn, index)

In [32]:
# Synonym template for `title_lem` + `args`: boosted clauses (original query)
# come first, followed by plain clauses (synonym text) in the same field order.
# Plain placeholders are empty strings (not set literals) so the template is
# already a valid request body.
search_param = {
    "size": 50,
    "query": {
        "bool": {
            "should": (
                [{"match": {field: {"query": "", "boost": 5}}} for field in ("title_lem", "args")]
                + [{"match": {field: ""}} for field in ("title_lem", "args")]
            ),
        },
    },
}

search_extended(search_param, 'run_title_arg_syn', df_syn, index)

In [33]:
# Synonym template for `title_lem` + `lem`: boosted clauses (original query)
# come first, followed by plain clauses (synonym text) in the same field order.
# Plain placeholders are empty strings (not set literals) so the template is
# already a valid request body.
search_param = {
    "size": 50,
    "query": {
        "bool": {
            "should": (
                [{"match": {field: {"query": "", "boost": 5}}} for field in ("title_lem", "lem")]
                + [{"match": {field: ""}} for field in ("title_lem", "lem")]
            ),
        },
    },
}

search_extended(search_param, 'run_title_doc_syn', df_syn, index)

In [34]:
# Synonym template for `lem` + `args`: boosted clauses (original query) come
# first, followed by plain clauses (synonym text) in the same field order.
# Plain placeholders are empty strings (not set literals) so the template is
# already a valid request body.
search_param = {
    "size": 50,
    "query": {
        "bool": {
            "should": (
                [{"match": {field: {"query": "", "boost": 5}}} for field in ("lem", "args")]
                + [{"match": {field: ""}} for field in ("lem", "args")]
            ),
        },
    },
}

search_extended(search_param, 'run_doc_arg_syn', df_syn, index)

In [35]:
# Synonym template for all three fields: boosted clauses (original query) come
# first, followed by plain clauses (synonym text) in the same field order.
# Plain placeholders are empty strings (not set literals) so the template is
# already a valid request body.
search_param = {
    "size": 50,
    "query": {
        "bool": {
            "should": (
                [{"match": {field: {"query": "", "boost": 5}}} for field in ("title_lem", "args", "lem")]
                + [{"match": {field: ""}} for field in ("title_lem", "args", "lem")]
            ),
        },
    },
}

search_extended(search_param, 'run_title_doc_arg_syn', df_syn, index)

## Evaluate and compare results

In [36]:
def evaluate_run(run_path, qrel_path, n_topics=50):
    """Score one run file against relevance judgments with pytrec_eval.

    Parameters
    ----------
    run_path : str
        Space-separated, header-less run file with the columns
        topic, Q0, id, rank, score, team.
    qrel_path : str
        Space-separated, header-less judgment file with the columns
        topic, Q0, id, relevance.
    n_topics : int, optional
        Topics are expected to be numbered ``1..n_topics``. The averages are
        always divided by this number, regardless of how many topics appear
        in the run. Defaults to 50.

    Returns
    -------
    tuple of float
        ``(mean ndcg_cut_10, mean recall_10, mean P_10, f1)``, where ``f1`` is
        the harmonic mean of the two averaged values (0 when both are 0).

    Raises
    ------
    KeyError
        If either file mentions a topic outside ``1..n_topics``.

    Side effects
    ------------
    Writes the per-topic ``ndcg_cut_10`` values to
    ``<run_path without extension>_evaluated.json``.
    """
    run = pd.read_csv(run_path, sep = ' ', names = ['topic','Q0','id','rank','score','team'])
    qrel = pd.read_csv(qrel_path, sep = ' ', names = ['topic','Q0','id','relevance'])

    topic_ids = [str(i) for i in range(1, n_topics + 1)]

    # Every topic gets an entry up front so topics without judgments or hits are still present.
    qrels = {topic: {} for topic in topic_ids}
    for _, row in qrel.iterrows():
        # pytrec_eval expects plain integer relevance labels.
        qrels[str(row['topic'])][row['id']] = int(row['relevance'])

    runs = {topic: {} for topic in topic_ids}
    for _, row in run.iterrows():
        # pytrec_eval expects plain float scores.
        runs[str(row['topic'])][row['id']] = float(row['score'])

    evaluator = pytrec_eval.RelevanceEvaluator(
        qrels, {'map_cut', 'ndcg_cut', 'recall', 'P'})
    res = evaluator.evaluate(runs)

    results = {topic: scores['ndcg_cut_10'] for topic, scores in res.items()}
    recall = {topic: scores['recall_10'] for topic, scores in res.items()}
    pres = {topic: scores['P_10'] for topic, scores in res.items()}

    # Name the per-topic file after the run being evaluated. The previous
    # sys.argv[1] lookup resolves to a kernel launch flag inside a notebook, so
    # every run overwrote the same file.
    out_base = os.path.splitext(run_path)[0]
    with open(out_base + '_evaluated.json', 'w') as f:
        json.dump(results, f)

    s = sum(results.values()) / n_topics
    r = sum(recall.values()) / n_topics
    p = sum(pres.values()) / n_topics
    f1 = (2 * (r * p) / (r + p)) if r + p != 0 else 0

    return s, r, p, f1


In [37]:
# One list per result column; the next cell assembles them into a DataFrame.
run, ndcg, rec, prec, f1s = [], [], [], [], []

for filename in os.listdir(output_dir):
    # Skip anything in the output directory that is not a .txt file.
    if not filename.endswith(".txt"):
        continue
    s, r, p, f1 = evaluate_run("{}/{}".format(output_dir, filename), 'touche2020-task2-relevance-withbaseline(1).qrels')
    run.append(filename)
    ndcg.append(s)
    rec.append(r)
    prec.append(p)
    f1s.append(f1)

In [38]:
# One row per run file. 'precision' holds the averaged P_10 value and 'f1-score'
# the value derived from the averaged recall_10 and P_10 in evaluate_run().
df = pd.DataFrame({'run':run,'ndcg_cut10':ndcg, 'recall_10':rec, 'precision':prec, 'f1-score':f1s})

### All results

In [39]:
df.sort_values(by = ['ndcg_cut10'], ascending = False)

Unnamed: 0,run,ndcg_cut10,recall_10,precision,f1-score
3,run_title_doc.txt,0.438816,0.2719,0.422,0.330715
10,run_title_doc_syn.txt,0.435886,0.275804,0.422,0.333587
12,run_title_arg.txt,0.383981,0.248372,0.372,0.297868
0,run_title_doc_arg.txt,0.378856,0.238482,0.36,0.286905
1,run_title_arg_syn.txt,0.378704,0.249624,0.37,0.298119
11,run_title_doc_arg_syn.txt,0.372436,0.236267,0.358,0.284665
2,run_title.txt,0.371139,0.233088,0.358,0.282345
13,run_title_syn.txt,0.353955,0.220739,0.342,0.268305
7,run_doc.txt,0.321105,0.198525,0.296,0.237656
4,run_doc_syn.txt,0.316965,0.199248,0.3,0.239458


In [40]:
df.sort_values(by = ['recall_10'], ascending = False)

Unnamed: 0,run,ndcg_cut10,recall_10,precision,f1-score
10,run_title_doc_syn.txt,0.435886,0.275804,0.422,0.333587
3,run_title_doc.txt,0.438816,0.2719,0.422,0.330715
1,run_title_arg_syn.txt,0.378704,0.249624,0.37,0.298119
12,run_title_arg.txt,0.383981,0.248372,0.372,0.297868
0,run_title_doc_arg.txt,0.378856,0.238482,0.36,0.286905
11,run_title_doc_arg_syn.txt,0.372436,0.236267,0.358,0.284665
2,run_title.txt,0.371139,0.233088,0.358,0.282345
13,run_title_syn.txt,0.353955,0.220739,0.342,0.268305
4,run_doc_syn.txt,0.316965,0.199248,0.3,0.239458
7,run_doc.txt,0.321105,0.198525,0.296,0.237656


In [41]:
df.sort_values(by = ['precision'], ascending = False)

Unnamed: 0,run,ndcg_cut10,recall_10,precision,f1-score
10,run_title_doc_syn.txt,0.435886,0.275804,0.422,0.333587
3,run_title_doc.txt,0.438816,0.2719,0.422,0.330715
12,run_title_arg.txt,0.383981,0.248372,0.372,0.297868
1,run_title_arg_syn.txt,0.378704,0.249624,0.37,0.298119
0,run_title_doc_arg.txt,0.378856,0.238482,0.36,0.286905
11,run_title_doc_arg_syn.txt,0.372436,0.236267,0.358,0.284665
2,run_title.txt,0.371139,0.233088,0.358,0.282345
13,run_title_syn.txt,0.353955,0.220739,0.342,0.268305
4,run_doc_syn.txt,0.316965,0.199248,0.3,0.239458
7,run_doc.txt,0.321105,0.198525,0.296,0.237656


In [42]:
df.sort_values(by = ['f1-score'], ascending = False)

Unnamed: 0,run,ndcg_cut10,recall_10,precision,f1-score
10,run_title_doc_syn.txt,0.435886,0.275804,0.422,0.333587
3,run_title_doc.txt,0.438816,0.2719,0.422,0.330715
1,run_title_arg_syn.txt,0.378704,0.249624,0.37,0.298119
12,run_title_arg.txt,0.383981,0.248372,0.372,0.297868
0,run_title_doc_arg.txt,0.378856,0.238482,0.36,0.286905
11,run_title_doc_arg_syn.txt,0.372436,0.236267,0.358,0.284665
2,run_title.txt,0.371139,0.233088,0.358,0.282345
13,run_title_syn.txt,0.353955,0.220739,0.342,0.268305
4,run_doc_syn.txt,0.316965,0.199248,0.3,0.239458
7,run_doc.txt,0.321105,0.198525,0.296,0.237656


### Results without doc

In [43]:
df_p = df[~df.run.str.contains("doc")]

In [44]:
df_p.sort_values(by = ['ndcg_cut10'], ascending = False)

Unnamed: 0,run,ndcg_cut10,recall_10,precision,f1-score
12,run_title_arg.txt,0.383981,0.248372,0.372,0.297868
1,run_title_arg_syn.txt,0.378704,0.249624,0.37,0.298119
2,run_title.txt,0.371139,0.233088,0.358,0.282345
13,run_title_syn.txt,0.353955,0.220739,0.342,0.268305
9,run_arg.txt,0.265277,0.179445,0.262,0.213003
8,run_args_syn.txt,0.263858,0.176608,0.254,0.208349


In [45]:
df_p.sort_values(by = ['recall_10'], ascending = False)

Unnamed: 0,run,ndcg_cut10,recall_10,precision,f1-score
1,run_title_arg_syn.txt,0.378704,0.249624,0.37,0.298119
12,run_title_arg.txt,0.383981,0.248372,0.372,0.297868
2,run_title.txt,0.371139,0.233088,0.358,0.282345
13,run_title_syn.txt,0.353955,0.220739,0.342,0.268305
9,run_arg.txt,0.265277,0.179445,0.262,0.213003
8,run_args_syn.txt,0.263858,0.176608,0.254,0.208349


In [46]:
df_p.sort_values(by = ['precision'], ascending = False)

Unnamed: 0,run,ndcg_cut10,recall_10,precision,f1-score
12,run_title_arg.txt,0.383981,0.248372,0.372,0.297868
1,run_title_arg_syn.txt,0.378704,0.249624,0.37,0.298119
2,run_title.txt,0.371139,0.233088,0.358,0.282345
13,run_title_syn.txt,0.353955,0.220739,0.342,0.268305
9,run_arg.txt,0.265277,0.179445,0.262,0.213003
8,run_args_syn.txt,0.263858,0.176608,0.254,0.208349


In [47]:
df_p.sort_values(by = ['f1-score'], ascending = False)

Unnamed: 0,run,ndcg_cut10,recall_10,precision,f1-score
1,run_title_arg_syn.txt,0.378704,0.249624,0.37,0.298119
12,run_title_arg.txt,0.383981,0.248372,0.372,0.297868
2,run_title.txt,0.371139,0.233088,0.358,0.282345
13,run_title_syn.txt,0.353955,0.220739,0.342,0.268305
9,run_arg.txt,0.265277,0.179445,0.262,0.213003
8,run_args_syn.txt,0.263858,0.176608,0.254,0.208349


### Results for pipeline

In [48]:
df_p = df[~df.run.str.contains("arg")]
df_p = df_p[~df_p.run.str.contains("syn")]

In [49]:
df_p.sort_values(by = ['ndcg_cut10'], ascending = False)

Unnamed: 0,run,ndcg_cut10,recall_10,precision,f1-score
3,run_title_doc.txt,0.438816,0.2719,0.422,0.330715
2,run_title.txt,0.371139,0.233088,0.358,0.282345
7,run_doc.txt,0.321105,0.198525,0.296,0.237656


In [50]:
df_p.sort_values(by = ['recall_10'], ascending = False)

Unnamed: 0,run,ndcg_cut10,recall_10,precision,f1-score
3,run_title_doc.txt,0.438816,0.2719,0.422,0.330715
2,run_title.txt,0.371139,0.233088,0.358,0.282345
7,run_doc.txt,0.321105,0.198525,0.296,0.237656


In [51]:
df_p.sort_values(by = ['precision'], ascending = False)

Unnamed: 0,run,ndcg_cut10,recall_10,precision,f1-score
3,run_title_doc.txt,0.438816,0.2719,0.422,0.330715
2,run_title.txt,0.371139,0.233088,0.358,0.282345
7,run_doc.txt,0.321105,0.198525,0.296,0.237656


In [52]:
df_p.sort_values(by = ['f1-score'], ascending = False)

Unnamed: 0,run,ndcg_cut10,recall_10,precision,f1-score
3,run_title_doc.txt,0.438816,0.2719,0.422,0.330715
2,run_title.txt,0.371139,0.233088,0.358,0.282345
7,run_doc.txt,0.321105,0.198525,0.296,0.237656


### Results for synonyms

In [53]:
# Keep only the runs whose file name does not contain "arg".
# NOTE(review): this rebinds `df_syn`, which earlier held the synonym query table
# passed to search_extended(); re-running those search cells after this one
# would hand them the results table instead.
df_syn = df[~df.run.str.contains("arg")]

In [54]:
df_syn.sort_values(by = ['ndcg_cut10'], ascending = False)

Unnamed: 0,run,ndcg_cut10,recall_10,precision,f1-score
3,run_title_doc.txt,0.438816,0.2719,0.422,0.330715
10,run_title_doc_syn.txt,0.435886,0.275804,0.422,0.333587
2,run_title.txt,0.371139,0.233088,0.358,0.282345
13,run_title_syn.txt,0.353955,0.220739,0.342,0.268305
7,run_doc.txt,0.321105,0.198525,0.296,0.237656
4,run_doc_syn.txt,0.316965,0.199248,0.3,0.239458


In [55]:
df_syn.sort_values(by = ['recall_10'], ascending = False)

Unnamed: 0,run,ndcg_cut10,recall_10,precision,f1-score
10,run_title_doc_syn.txt,0.435886,0.275804,0.422,0.333587
3,run_title_doc.txt,0.438816,0.2719,0.422,0.330715
2,run_title.txt,0.371139,0.233088,0.358,0.282345
13,run_title_syn.txt,0.353955,0.220739,0.342,0.268305
4,run_doc_syn.txt,0.316965,0.199248,0.3,0.239458
7,run_doc.txt,0.321105,0.198525,0.296,0.237656


In [56]:
df_syn.sort_values(by = ['precision'], ascending = False)

Unnamed: 0,run,ndcg_cut10,recall_10,precision,f1-score
10,run_title_doc_syn.txt,0.435886,0.275804,0.422,0.333587
3,run_title_doc.txt,0.438816,0.2719,0.422,0.330715
2,run_title.txt,0.371139,0.233088,0.358,0.282345
13,run_title_syn.txt,0.353955,0.220739,0.342,0.268305
4,run_doc_syn.txt,0.316965,0.199248,0.3,0.239458
7,run_doc.txt,0.321105,0.198525,0.296,0.237656


In [57]:
df_syn.sort_values(by = ['f1-score'], ascending = False)

Unnamed: 0,run,ndcg_cut10,recall_10,precision,f1-score
10,run_title_doc_syn.txt,0.435886,0.275804,0.422,0.333587
3,run_title_doc.txt,0.438816,0.2719,0.422,0.330715
2,run_title.txt,0.371139,0.233088,0.358,0.282345
13,run_title_syn.txt,0.353955,0.220739,0.342,0.268305
4,run_doc_syn.txt,0.316965,0.199248,0.3,0.239458
7,run_doc.txt,0.321105,0.198525,0.296,0.237656


### Results for arguments

In [58]:
df_arg = df[~df.run.str.contains("syn")]

In [59]:
df_arg.sort_values(by = ['ndcg_cut10'], ascending = False)

Unnamed: 0,run,ndcg_cut10,recall_10,precision,f1-score
3,run_title_doc.txt,0.438816,0.2719,0.422,0.330715
12,run_title_arg.txt,0.383981,0.248372,0.372,0.297868
0,run_title_doc_arg.txt,0.378856,0.238482,0.36,0.286905
2,run_title.txt,0.371139,0.233088,0.358,0.282345
7,run_doc.txt,0.321105,0.198525,0.296,0.237656
6,run_doc_arg.txt,0.285208,0.190936,0.278,0.226386
9,run_arg.txt,0.265277,0.179445,0.262,0.213003


In [60]:
df_arg.sort_values(by = ['recall_10'], ascending = False)

Unnamed: 0,run,ndcg_cut10,recall_10,precision,f1-score
3,run_title_doc.txt,0.438816,0.2719,0.422,0.330715
12,run_title_arg.txt,0.383981,0.248372,0.372,0.297868
0,run_title_doc_arg.txt,0.378856,0.238482,0.36,0.286905
2,run_title.txt,0.371139,0.233088,0.358,0.282345
7,run_doc.txt,0.321105,0.198525,0.296,0.237656
6,run_doc_arg.txt,0.285208,0.190936,0.278,0.226386
9,run_arg.txt,0.265277,0.179445,0.262,0.213003


In [61]:
df_arg.sort_values(by = ['precision'], ascending = False)

Unnamed: 0,run,ndcg_cut10,recall_10,precision,f1-score
3,run_title_doc.txt,0.438816,0.2719,0.422,0.330715
12,run_title_arg.txt,0.383981,0.248372,0.372,0.297868
0,run_title_doc_arg.txt,0.378856,0.238482,0.36,0.286905
2,run_title.txt,0.371139,0.233088,0.358,0.282345
7,run_doc.txt,0.321105,0.198525,0.296,0.237656
6,run_doc_arg.txt,0.285208,0.190936,0.278,0.226386
9,run_arg.txt,0.265277,0.179445,0.262,0.213003


In [62]:
df_arg.sort_values(by = ['f1-score'], ascending = False)

Unnamed: 0,run,ndcg_cut10,recall_10,precision,f1-score
3,run_title_doc.txt,0.438816,0.2719,0.422,0.330715
12,run_title_arg.txt,0.383981,0.248372,0.372,0.297868
0,run_title_doc_arg.txt,0.378856,0.238482,0.36,0.286905
2,run_title.txt,0.371139,0.233088,0.358,0.282345
7,run_doc.txt,0.321105,0.198525,0.296,0.237656
6,run_doc_arg.txt,0.285208,0.190936,0.278,0.226386
9,run_arg.txt,0.265277,0.179445,0.262,0.213003
