In [79]:
import sys
import os
from elasticsearch import Elasticsearch
from zipfile import ZipFile
import re
import pandas as pd
import json
import requests
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer 
from nltk.corpus import stopwords
import nltk
import pickle
import pytrec_eval
nltk.download('stopwords')
nltk.download('punkt')
from elasticsearch import NotFoundError

es = Elasticsearch()

[nltk_data] Downloading package stopwords to /home/katya/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to /home/katya/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [80]:
# Location of the source documents; not referenced by any other cell visible in this notebook.
input_dir = 'data/docs'
# Directory the run files are written to and later listed from for evaluation.
# NOTE(review): the name presumably encodes the BM25 settings used below (b=0.67, k1=1.2) — confirm.
output_dir = 'b067k12'

In [81]:
def strip_punct(s):
    """Normalize a string to lowercase ASCII alphanumerics separated by single spaces.

    Every character outside A-Z, a-z and 0-9 is replaced by a space, the text is
    lowercased, and runs of whitespace are collapsed (leading/trailing whitespace
    is dropped).
    """
    tokens = re.sub('[^A-Za-z0-9]', ' ', s).lower().split()
    return " ".join(tokens)

In [82]:
index = 'b067k12_index'

## Create a new index and adjust the BM25 parameters

In [85]:
# Index settings: one shard, no replicas, and a default similarity of BM25
# with b=0.67 and k1=1.2.
request_body = {
    "settings": {
        "number_of_shards": 1,
        "number_of_replicas": 0,
        "similarity": {
            "default": {
                "type": "BM25",
                "b": 0.67,
                "k1": 1.2,
            }
        },
    }
}

es.indices.create(index=index, body=request_body)

{'acknowledged': True, 'shards_acknowledged': True, 'index': 'b067k12_index'}

In [86]:
# Load the pre-built bulk payload from disk as parsed JSON.
# The following cells treat it as a flat list whose even-indexed entries are
# bulk action lines (dicts with an 'index' key).
with open('bulk_data_json.txt') as f:
    bulk_data = json.load(f)

In [87]:
len(bulk_data)

10060

In [88]:
# Point every action line (the even-indexed entries) at the index created above,
# then show the first one as a sanity check.
for action in bulk_data[0::2]:
    action['index']['_index'] = index
print(bulk_data[0])

{'index': {'_index': 'b067k12_index', '_type': 'doc', '_id': 'clueweb12-1913wb-84-09751'}}


In [89]:
def chunks(lst, n):
    """Lazily split ``lst`` into consecutive slices of at most ``n`` items.

    The final slice is shorter when ``len(lst)`` is not a multiple of ``n``.
    """
    offsets = range(0, len(lst), n)
    yield from (lst[start:start + n] for start in offsets)

In [90]:
bulks = chunks(bulk_data, 100)

In [91]:
bulks

<generator object chunks at 0x7f281f5b1dd0>

In [92]:
# Send the payload to Elasticsearch one chunk at a time.
for x in bulks:
    res = es.bulk(index = index, body = x)
    # The bulk API reports per-item failures in the response body rather than
    # raising, so surface them instead of silently dropping documents.
    if res.get('errors'):
        failed = [result
                  for item in res.get('items', [])
                  for result in item.values()
                  if 'error' in result]
        print('bulk request reported {} failed item(s); first failure: {}'.format(
            len(failed), failed[:1]))


#es.search(body={"query": {"match_all": {}}}, index = 'example_index')
#es_indices.get_mapping(index = 'example_index')



In [93]:
res = es.get(index=index,doc_type='doc',id='clueweb12-1214wb-88-29751')



In [94]:
res

{'_index': 'b067k12_index',
 '_type': 'doc',
 '_id': 'clueweb12-1214wb-88-29751',
 '_version': 1,
 '_seq_no': 3633,
 '_primary_term': 1,
 'found': True,
 '_source': {'query': 'what and is and the and difference and between and sex and and and love',
  'title': '<em>Difference</em> <em>Between</em> <em>Love</em> <em>And</em> <em>Sex</em> - Student.com Articles',
  'title_lem': 'difference love sex student article',
  'num': '1',
  'uuid': 'b0054765-4007-52c9-a7c4-70c15891b4cc',
  'score': 2406.7341,
  'document': 'Difference Between Love And Sex\nYou are here:\nStudent.com Articles Student Health & Sexuality Sexual Health Difference Between Love And Sex\nRelated Articles\n<p>Search, Compare, and Apply for student loans at <a href="http://www.simpletuition.com?brnd=studentcom" title="Simpletuition.com" target="_blank">Simpletuition.com</a>.</p>\nPaying For College\nSex VS. Love Love and sex are NOT the same thing. Love is an emotion or a feeling. There is no one definition of love becaus

In [95]:
#es.indices.delete(index=index)

In [96]:
es.cat.count(index, params={"format": "json"})

[{'epoch': '1613042723', 'timestamp': '11:25:23', 'count': '4638'}]

## Create different runs

In [97]:
# Load the lemmatized topics that are used as queries below.
# NOTE(review): pickle.load can execute arbitrary code from the file — only load
# pickles that were produced by a trusted step of this project.
# The context manager closes the file even if unpickling fails.
with open("data/topics_lemmatized.txt", 'rb') as f:
    queries_lem = pickle.load(f)

In [98]:
def search(search_param, filename, q, ind):
    """Run one Elasticsearch query per topic and write the hits as a run file.

    Parameters
    ----------
    search_param : dict
        Query template containing ``query.bool.should``, a list of ``match``
        clauses. The value of every field in every ``match`` clause is replaced
        by the topic text. The template itself is left untouched.
    filename : str
        Base name of the run file; it is written to
        ``<output_dir>/<filename>.txt``.
    q : iterable
        Topics. Each item is either a dict (its ``'title'`` value is used as the
        query text) or the query text itself.
    ind : str
        Name of the index to search.

    The run file is space-separated without header, one line per hit, with the
    columns: topic number (1-based position in ``q``), the literal ``Q0``,
    document id, rank (1-based), score, and the tag ``uh-t2-thor``.
    """
    import copy

    # Work on a private copy so the caller's template is not left holding the
    # text of the last topic.
    query_body = copy.deepcopy(search_param)
    clauses = query_body['query']['bool']['should']

    columns = ['qid', 'Q0', 'doc', 'rank', 'score', 'tag']
    rows = []

    for num, topic in enumerate(q, start=1):
        text = topic['title'] if isinstance(topic, dict) else topic
        for clause in clauses:
            for field in clause['match']:
                clause['match'][field] = text

        response = es.search(index=ind, body=query_body)
        for rank, hit in enumerate(response['hits']['hits'], start=1):
            rows.append({
                'qid': num,
                'Q0': 'Q0',
                'doc': hit['_id'],
                'rank': rank,
                'score': hit['_score'],
                'tag': 'uh-t2-thor',
            })

    # Explicit columns keep the file layout stable even when there are no hits.
    df = pd.DataFrame(rows, columns=columns)

    # Make sure the target directory exists before writing.
    os.makedirs(output_dir, exist_ok=True)
    df.to_csv((output_dir+'/'+filename+'.txt'), sep = ' ', index = False, header = False)

In [99]:
# Template with a single `match` clause on `title_lem`; `search` fills in the
# empty query text for each topic.
search_param = {
    "size": 50,
    "query": {
        "bool": {
            "should": [
                {"match": {"title_lem": ""}},
            ]
        }
    },
}

search(search_param, 'run_title', queries_lem, index)

In [100]:
# Template with a single `match` clause on `lem`; `search` fills in the
# empty query text for each topic.
search_param = {
    "size": 50,
    "query": {
        "bool": {
            "should": [
                {"match": {"lem": ""}},
            ]
        }
    },
}

search(search_param, 'run_doc', queries_lem, index)

In [101]:
# Template with a single `match` clause on `args`; `search` fills in the
# empty query text for each topic.
search_param = {
    "size": 50,
    "query": {
        "bool": {
            "should": [
                {"match": {"args": ""}},
            ]
        }
    },
}

search(search_param, 'run_arg', queries_lem, index)

In [102]:
# Template combining `match` clauses on `title_lem` and `args`; `search` fills
# in the empty query text of both clauses for each topic.
search_param = {
    "size": 50,
    "query": {
        "bool": {
            "should": [
                {"match": {"title_lem": ""}},
                {"match": {"args": ""}},
            ]
        }
    },
}

search(search_param, 'run_title_arg', queries_lem, index)

In [103]:
# Template combining `match` clauses on `title_lem` and `lem`; `search` fills
# in the empty query text of both clauses for each topic.
search_param = {
    "size": 50,
    "query": {
        "bool": {
            "should": [
                {"match": {"title_lem": ""}},
                {"match": {"lem": ""}},
            ]
        }
    },
}

search(search_param, 'run_title_doc', queries_lem, index)

In [104]:
# Template combining `match` clauses on `lem` and `args`; `search` fills in the
# empty query text of both clauses for each topic.
search_param = {
    "size": 50,
    "query": {
        "bool": {
            "should": [
                {"match": {"lem": ""}},
                {"match": {"args": ""}},
            ]
        }
    },
}

search(search_param, 'run_doc_arg', queries_lem, index)

In [105]:
# Template combining `match` clauses on `lem`, `title_lem` and `args`; `search`
# fills in the empty query text of all three clauses for each topic.
search_param = {
    "size": 50,
    "query": {
        "bool": {
            "should": [
                {"match": {"lem": ""}},
                {"match": {"title_lem": ""}},
                {"match": {"args": ""}},
            ]
        }
    },
}

search(search_param, 'run_title_doc_arg', queries_lem, index)

## Add synonyms

In [106]:
def search_extended(search_param, filename, q, ind):
    """Run one synonym-expanded Elasticsearch query per row of ``q`` and write a run file.

    Parameters
    ----------
    search_param : dict
        Query template containing ``query.bool.should``, a list of ``match``
        clauses. A clause whose field value is a dict with a ``"boost"`` key gets
        its ``"query"`` set to the row's ``'query'`` value; any other field value
        is replaced by the row's ``'syn'`` value. The template itself is left
        untouched.
    filename : str
        Base name of the run file; it is written to
        ``<output_dir>/<filename>.txt``.
    q : pandas.DataFrame
        One row per topic, read through the ``'query'`` and ``'syn'`` columns.
    ind : str
        Name of the index to search.

    The run file is space-separated without header, one line per hit, with the
    columns: topic number (1-based row position), the literal ``Q0``, document
    id, rank (1-based), score, and the tag ``uh-t2-thor``.
    """
    import copy

    # Work on a private copy so the caller's template is not left holding the
    # text of the last topic.
    query_body = copy.deepcopy(search_param)
    clauses = query_body['query']['bool']['should']

    columns = ['qid', 'Q0', 'doc', 'rank', 'score', 'tag']
    rows = []

    for num, (_, row) in enumerate(q.iterrows(), start=1):
        for clause in clauses:
            for field, value in clause['match'].items():
                # Only a dict can be a boosted clause. After the first topic the
                # plain clauses hold a synonym string, and a bare
                # `"boost" in value` would then be a substring test on that text.
                if isinstance(value, dict) and "boost" in value:
                    value["query"] = row['query']
                else:
                    clause['match'][field] = row['syn']

        response = es.search(index=ind, body=query_body)
        for rank, hit in enumerate(response['hits']['hits'], start=1):
            rows.append({
                'qid': num,
                'Q0': 'Q0',
                'doc': hit['_id'],
                'rank': rank,
                'score': hit['_score'],
                'tag': 'uh-t2-thor',
            })

    # Explicit columns keep the file layout stable even when there are no hits.
    df = pd.DataFrame(rows, columns=columns)

    # Make sure the target directory exists before writing.
    os.makedirs(output_dir, exist_ok=True)
    df.to_csv((output_dir+'/'+filename+'.txt'), sep = ' ', index = False, header = False)

In [107]:
df_syn = pd.read_csv('data/q_for_syn.tsv', sep = '\t')

In [108]:
# Boosted clause: receives the original query text (boost 5).
# Plain clause: receives the synonym text. Its placeholder is an empty string
# rather than a set literal, which is not JSON-serializable and only worked
# because search_extended overwrites it before the request is sent.
search_param = {
    "size": 50,
    "query": {
        "bool": {
            "should": [
                {"match": {"title_lem": {"query": "", "boost": 5}}},
                {"match": {"title_lem": ""}},
            ]
        }
    },
}
search_extended(search_param, 'run_title_syn', df_syn, index)

In [109]:
# Boosted clause: receives the original query text (boost 5).
# Plain clause: receives the synonym text. Its placeholder is an empty string
# rather than a set literal, which is not JSON-serializable and only worked
# because search_extended overwrites it before the request is sent.
search_param = {
    "size": 50,
    "query": {
        "bool": {
            "should": [
                {"match": {"lem": {"query": "", "boost": 5}}},
                {"match": {"lem": ""}},
            ]
        }
    },
}
search_extended(search_param, 'run_doc_syn', df_syn, index)

In [110]:
# Boosted clause: receives the original query text (boost 5).
# Plain clause: receives the synonym text. Its placeholder is an empty string
# rather than a set literal, which is not JSON-serializable and only worked
# because search_extended overwrites it before the request is sent.
search_param = {
    "size": 50,
    "query": {
        "bool": {
            "should": [
                {"match": {"args": {"query": "", "boost": 5}}},
                {"match": {"args": ""}},
            ]
        }
    },
}
search_extended(search_param, 'run_args_syn', df_syn, index)

In [111]:
# Boosted clauses: receive the original query text (boost 5).
# Plain clauses: receive the synonym text. Their placeholders are empty strings
# rather than set literals, which are not JSON-serializable and only worked
# because search_extended overwrites them before the request is sent.
search_param = {
    "size": 50,
    "query": {
        "bool": {
            "should": [
                {"match": {"title_lem": {"query": "", "boost": 5}}},
                {"match": {"args": {"query": "", "boost": 5}}},
                {"match": {"title_lem": ""}},
                {"match": {"args": ""}},
            ]
        }
    },
}
search_extended(search_param, 'run_title_arg_syn', df_syn, index)

In [112]:
# Boosted clauses: receive the original query text (boost 5).
# Plain clauses: receive the synonym text. Their placeholders are empty strings
# rather than set literals, which are not JSON-serializable and only worked
# because search_extended overwrites them before the request is sent.
search_param = {
    "size": 50,
    "query": {
        "bool": {
            "should": [
                {"match": {"title_lem": {"query": "", "boost": 5}}},
                {"match": {"lem": {"query": "", "boost": 5}}},
                {"match": {"title_lem": ""}},
                {"match": {"lem": ""}},
            ]
        }
    },
}
search_extended(search_param, 'run_title_doc_syn', df_syn, index)

In [113]:
# Boosted clauses: receive the original query text (boost 5).
# Plain clauses: receive the synonym text. Their placeholders are empty strings
# rather than set literals, which are not JSON-serializable and only worked
# because search_extended overwrites them before the request is sent.
search_param = {
    "size": 50,
    "query": {
        "bool": {
            "should": [
                {"match": {"lem": {"query": "", "boost": 5}}},
                {"match": {"args": {"query": "", "boost": 5}}},
                {"match": {"lem": ""}},
                {"match": {"args": ""}},
            ]
        }
    },
}
search_extended(search_param, 'run_doc_arg_syn', df_syn, index)

In [114]:
# Boosted clauses: receive the original query text (boost 5).
# Plain clauses: receive the synonym text. Their placeholders are empty strings
# rather than set literals, which are not JSON-serializable and only worked
# because search_extended overwrites them before the request is sent.
search_param = {
    "size": 50,
    "query": {
        "bool": {
            "should": [
                {"match": {"title_lem": {"query": "", "boost": 5}}},
                {"match": {"args": {"query": "", "boost": 5}}},
                {"match": {"lem": {"query": "", "boost": 5}}},
                {"match": {"title_lem": ""}},
                {"match": {"args": ""}},
                {"match": {"lem": ""}},
            ]
        }
    },
}
search_extended(search_param, 'run_title_doc_arg_syn', df_syn, index)

## Evaluate and compare results

In [115]:
def evaluate_run(run_path, qrel_path, n_topics=50):
    """Score one run file against relevance judgments with pytrec_eval.

    Parameters
    ----------
    run_path : str
        Space-separated run file with the columns topic, Q0, id, rank, score, team.
    qrel_path : str
        Space-separated qrels file with the columns topic, Q0, id, relevance.
    n_topics : int, optional
        Number of topics, numbered 1..n_topics. It is used both to pre-create the
        per-topic dictionaries and as the divisor of the averages. Defaults to 50.

    Returns
    -------
    tuple
        ``(ndcg, recall, precision, f1)``: the per-topic ``ndcg_cut_10``,
        ``recall_10`` and ``P_10`` values summed and divided by ``n_topics``, and
        the harmonic mean of the averaged recall and precision (0 when both are 0).

    Side effect: the per-topic ``ndcg_cut_10`` values are written as JSON to
    ``<run_path without extension>_evaluated.json``.
    """
    run = pd.read_csv(run_path, sep = ' ', names = ['topic','Q0','id','rank','score','team'])

    qrel = pd.read_csv(qrel_path, sep = ' ', names = ['topic','Q0','id','relevance'])

    # Pre-create an entry for every topic so topics without judgments or hits
    # are still present.
    qrels = {str(i): {} for i in range(1, n_topics + 1)}
    for idx, row in qrel.iterrows():
        qrels[str(row['topic'])][row['id']] = row['relevance']

    runs = {str(i): {} for i in range(1, n_topics + 1)}
    for idx, row in run.iterrows():
        runs[str(row['topic'])][row['id']] = row['score']

    evaluator = pytrec_eval.RelevanceEvaluator(
        qrels,{'map_cut', 'ndcg_cut', 'recall', 'P'})

    res = evaluator.evaluate(runs)

    results = {key: measures['ndcg_cut_10'] for key, measures in res.items()}
    recall = {key: measures['recall_10'] for key, measures in res.items()}
    pres = {key: measures['P_10'] for key, measures in res.items()}

    # Name the output after the run being evaluated. sys.argv[1] belongs to the
    # process launcher (in a notebook kernel it is not the run file), so using it
    # made every run overwrite the same unrelated file.
    out_path = os.path.splitext(run_path)[0] + '_evaluated.json'
    with open(out_path, 'w') as f:
        json.dump(results, f)

    s = sum(results.values())/n_topics
    r = sum(recall.values())/n_topics
    p = sum(pres.values())/n_topics
    if r+p != 0:
        f1 = (2*(r*p)/(r+p))
    else:
        f1 = 0

    return s, r, p, f1


In [116]:
# Evaluate every run file in the output directory and collect one value per
# run in parallel lists (turned into a DataFrame in the next cell).
run, ndcg, rec, prec, f1s = [], [], [], [], []

for filename in os.listdir(output_dir):
    # Only the .txt files are run files.
    if not filename.endswith(".txt"):
        continue
    s, r, p, f1 = evaluate_run(
        output_dir + "/" + filename,
        'touche2020-task2-relevance-withbaseline(1).qrels',
    )
    run.append(filename)
    ndcg.append(s)
    rec.append(r)
    prec.append(p)
    f1s.append(f1)

In [117]:
df = pd.DataFrame({'run':run,'ndcg_cut10':ndcg, 'recall_10':rec, 'precision':prec, 'f1-score':f1s})

### All results

In [118]:
df.sort_values(by = ['ndcg_cut10'], ascending = False)

Unnamed: 0,run,ndcg_cut10,recall_10,precision,f1-score
10,run_title_doc_syn.txt,0.444516,0.279059,0.426,0.337217
3,run_title_doc.txt,0.444189,0.270802,0.42,0.329289
12,run_title_arg.txt,0.384095,0.251524,0.374,0.300772
1,run_title_arg_syn.txt,0.379873,0.248743,0.368,0.296841
0,run_title_doc_arg.txt,0.375349,0.232472,0.35,0.279379
11,run_title_doc_arg_syn.txt,0.364893,0.227938,0.344,0.274193
2,run_title.txt,0.359595,0.222892,0.342,0.269889
13,run_title_syn.txt,0.350941,0.219608,0.34,0.266854
4,run_doc_syn.txt,0.322411,0.200815,0.302,0.241226
7,run_doc.txt,0.310644,0.190781,0.282,0.227591


In [119]:
df.sort_values(by = ['recall_10'], ascending = False)

Unnamed: 0,run,ndcg_cut10,recall_10,precision,f1-score
10,run_title_doc_syn.txt,0.444516,0.279059,0.426,0.337217
3,run_title_doc.txt,0.444189,0.270802,0.42,0.329289
12,run_title_arg.txt,0.384095,0.251524,0.374,0.300772
1,run_title_arg_syn.txt,0.379873,0.248743,0.368,0.296841
0,run_title_doc_arg.txt,0.375349,0.232472,0.35,0.279379
11,run_title_doc_arg_syn.txt,0.364893,0.227938,0.344,0.274193
2,run_title.txt,0.359595,0.222892,0.342,0.269889
13,run_title_syn.txt,0.350941,0.219608,0.34,0.266854
4,run_doc_syn.txt,0.322411,0.200815,0.302,0.241226
7,run_doc.txt,0.310644,0.190781,0.282,0.227591


In [120]:
df.sort_values(by = ['precision'], ascending = False)

Unnamed: 0,run,ndcg_cut10,recall_10,precision,f1-score
10,run_title_doc_syn.txt,0.444516,0.279059,0.426,0.337217
3,run_title_doc.txt,0.444189,0.270802,0.42,0.329289
12,run_title_arg.txt,0.384095,0.251524,0.374,0.300772
1,run_title_arg_syn.txt,0.379873,0.248743,0.368,0.296841
0,run_title_doc_arg.txt,0.375349,0.232472,0.35,0.279379
11,run_title_doc_arg_syn.txt,0.364893,0.227938,0.344,0.274193
2,run_title.txt,0.359595,0.222892,0.342,0.269889
13,run_title_syn.txt,0.350941,0.219608,0.34,0.266854
4,run_doc_syn.txt,0.322411,0.200815,0.302,0.241226
7,run_doc.txt,0.310644,0.190781,0.282,0.227591


In [121]:
df.sort_values(by = ['f1-score'], ascending = False)

Unnamed: 0,run,ndcg_cut10,recall_10,precision,f1-score
10,run_title_doc_syn.txt,0.444516,0.279059,0.426,0.337217
3,run_title_doc.txt,0.444189,0.270802,0.42,0.329289
12,run_title_arg.txt,0.384095,0.251524,0.374,0.300772
1,run_title_arg_syn.txt,0.379873,0.248743,0.368,0.296841
0,run_title_doc_arg.txt,0.375349,0.232472,0.35,0.279379
11,run_title_doc_arg_syn.txt,0.364893,0.227938,0.344,0.274193
2,run_title.txt,0.359595,0.222892,0.342,0.269889
13,run_title_syn.txt,0.350941,0.219608,0.34,0.266854
4,run_doc_syn.txt,0.322411,0.200815,0.302,0.241226
7,run_doc.txt,0.310644,0.190781,0.282,0.227591


### Results without doc

In [122]:
df_p = df[~df.run.str.contains("doc")]

In [123]:
df_p.sort_values(by = ['ndcg_cut10'], ascending = False)

Unnamed: 0,run,ndcg_cut10,recall_10,precision,f1-score
12,run_title_arg.txt,0.384095,0.251524,0.374,0.300772
1,run_title_arg_syn.txt,0.379873,0.248743,0.368,0.296841
2,run_title.txt,0.359595,0.222892,0.342,0.269889
13,run_title_syn.txt,0.350941,0.219608,0.34,0.266854
8,run_args_syn.txt,0.271034,0.182184,0.262,0.214921
9,run_arg.txt,0.263433,0.179797,0.262,0.213251


In [124]:
df_p.sort_values(by = ['recall_10'], ascending = False)

Unnamed: 0,run,ndcg_cut10,recall_10,precision,f1-score
12,run_title_arg.txt,0.384095,0.251524,0.374,0.300772
1,run_title_arg_syn.txt,0.379873,0.248743,0.368,0.296841
2,run_title.txt,0.359595,0.222892,0.342,0.269889
13,run_title_syn.txt,0.350941,0.219608,0.34,0.266854
8,run_args_syn.txt,0.271034,0.182184,0.262,0.214921
9,run_arg.txt,0.263433,0.179797,0.262,0.213251


In [125]:
df_p.sort_values(by = ['precision'], ascending = False)

Unnamed: 0,run,ndcg_cut10,recall_10,precision,f1-score
12,run_title_arg.txt,0.384095,0.251524,0.374,0.300772
1,run_title_arg_syn.txt,0.379873,0.248743,0.368,0.296841
2,run_title.txt,0.359595,0.222892,0.342,0.269889
13,run_title_syn.txt,0.350941,0.219608,0.34,0.266854
8,run_args_syn.txt,0.271034,0.182184,0.262,0.214921
9,run_arg.txt,0.263433,0.179797,0.262,0.213251


In [126]:
df_p.sort_values(by = ['f1-score'], ascending = False)

Unnamed: 0,run,ndcg_cut10,recall_10,precision,f1-score
12,run_title_arg.txt,0.384095,0.251524,0.374,0.300772
1,run_title_arg_syn.txt,0.379873,0.248743,0.368,0.296841
2,run_title.txt,0.359595,0.222892,0.342,0.269889
13,run_title_syn.txt,0.350941,0.219608,0.34,0.266854
8,run_args_syn.txt,0.271034,0.182184,0.262,0.214921
9,run_arg.txt,0.263433,0.179797,0.262,0.213251


### Results for pipeline

In [127]:
df_p = df[~df.run.str.contains("arg")]
df_p = df_p[~df_p.run.str.contains("syn")]

In [128]:
df_p.sort_values(by = ['ndcg_cut10'], ascending = False)

Unnamed: 0,run,ndcg_cut10,recall_10,precision,f1-score
3,run_title_doc.txt,0.444189,0.270802,0.42,0.329289
2,run_title.txt,0.359595,0.222892,0.342,0.269889
7,run_doc.txt,0.310644,0.190781,0.282,0.227591


In [129]:
df_p.sort_values(by = ['recall_10'], ascending = False)

Unnamed: 0,run,ndcg_cut10,recall_10,precision,f1-score
3,run_title_doc.txt,0.444189,0.270802,0.42,0.329289
2,run_title.txt,0.359595,0.222892,0.342,0.269889
7,run_doc.txt,0.310644,0.190781,0.282,0.227591


In [130]:
df_p.sort_values(by = ['precision'], ascending = False)

Unnamed: 0,run,ndcg_cut10,recall_10,precision,f1-score
3,run_title_doc.txt,0.444189,0.270802,0.42,0.329289
2,run_title.txt,0.359595,0.222892,0.342,0.269889
7,run_doc.txt,0.310644,0.190781,0.282,0.227591


In [131]:
df_p.sort_values(by = ['f1-score'], ascending = False)

Unnamed: 0,run,ndcg_cut10,recall_10,precision,f1-score
3,run_title_doc.txt,0.444189,0.270802,0.42,0.329289
2,run_title.txt,0.359595,0.222892,0.342,0.269889
7,run_doc.txt,0.310644,0.190781,0.282,0.227591


### Results for synonyms

In [132]:
# Keep only the runs whose file name does not contain "arg".
# NOTE(review): this rebinds df_syn, which earlier held the table read from
# 'data/q_for_syn.tsv' and passed to search_extended; re-running those search
# cells after this one would hand them this results frame instead.
df_syn = df[~df.run.str.contains("arg")]

In [133]:
df_syn.sort_values(by = ['ndcg_cut10'], ascending = False)

Unnamed: 0,run,ndcg_cut10,recall_10,precision,f1-score
10,run_title_doc_syn.txt,0.444516,0.279059,0.426,0.337217
3,run_title_doc.txt,0.444189,0.270802,0.42,0.329289
2,run_title.txt,0.359595,0.222892,0.342,0.269889
13,run_title_syn.txt,0.350941,0.219608,0.34,0.266854
4,run_doc_syn.txt,0.322411,0.200815,0.302,0.241226
7,run_doc.txt,0.310644,0.190781,0.282,0.227591


In [134]:
df_syn.sort_values(by = ['recall_10'], ascending = False)

Unnamed: 0,run,ndcg_cut10,recall_10,precision,f1-score
10,run_title_doc_syn.txt,0.444516,0.279059,0.426,0.337217
3,run_title_doc.txt,0.444189,0.270802,0.42,0.329289
2,run_title.txt,0.359595,0.222892,0.342,0.269889
13,run_title_syn.txt,0.350941,0.219608,0.34,0.266854
4,run_doc_syn.txt,0.322411,0.200815,0.302,0.241226
7,run_doc.txt,0.310644,0.190781,0.282,0.227591


In [135]:
df_syn.sort_values(by = ['precision'], ascending = False)

Unnamed: 0,run,ndcg_cut10,recall_10,precision,f1-score
10,run_title_doc_syn.txt,0.444516,0.279059,0.426,0.337217
3,run_title_doc.txt,0.444189,0.270802,0.42,0.329289
2,run_title.txt,0.359595,0.222892,0.342,0.269889
13,run_title_syn.txt,0.350941,0.219608,0.34,0.266854
4,run_doc_syn.txt,0.322411,0.200815,0.302,0.241226
7,run_doc.txt,0.310644,0.190781,0.282,0.227591


In [136]:
df_syn.sort_values(by = ['f1-score'], ascending = False)

Unnamed: 0,run,ndcg_cut10,recall_10,precision,f1-score
10,run_title_doc_syn.txt,0.444516,0.279059,0.426,0.337217
3,run_title_doc.txt,0.444189,0.270802,0.42,0.329289
2,run_title.txt,0.359595,0.222892,0.342,0.269889
13,run_title_syn.txt,0.350941,0.219608,0.34,0.266854
4,run_doc_syn.txt,0.322411,0.200815,0.302,0.241226
7,run_doc.txt,0.310644,0.190781,0.282,0.227591


### Results for arguments

In [137]:
df_arg = df[~df.run.str.contains("syn")]

In [138]:
df_arg.sort_values(by = ['ndcg_cut10'], ascending = False)

Unnamed: 0,run,ndcg_cut10,recall_10,precision,f1-score
3,run_title_doc.txt,0.444189,0.270802,0.42,0.329289
12,run_title_arg.txt,0.384095,0.251524,0.374,0.300772
0,run_title_doc_arg.txt,0.375349,0.232472,0.35,0.279379
2,run_title.txt,0.359595,0.222892,0.342,0.269889
7,run_doc.txt,0.310644,0.190781,0.282,0.227591
6,run_doc_arg.txt,0.283867,0.18876,0.274,0.223529
9,run_arg.txt,0.263433,0.179797,0.262,0.213251


In [139]:
df_arg.sort_values(by = ['recall_10'], ascending = False)

Unnamed: 0,run,ndcg_cut10,recall_10,precision,f1-score
3,run_title_doc.txt,0.444189,0.270802,0.42,0.329289
12,run_title_arg.txt,0.384095,0.251524,0.374,0.300772
0,run_title_doc_arg.txt,0.375349,0.232472,0.35,0.279379
2,run_title.txt,0.359595,0.222892,0.342,0.269889
7,run_doc.txt,0.310644,0.190781,0.282,0.227591
6,run_doc_arg.txt,0.283867,0.18876,0.274,0.223529
9,run_arg.txt,0.263433,0.179797,0.262,0.213251


In [140]:
df_arg.sort_values(by = ['precision'], ascending = False)

Unnamed: 0,run,ndcg_cut10,recall_10,precision,f1-score
3,run_title_doc.txt,0.444189,0.270802,0.42,0.329289
12,run_title_arg.txt,0.384095,0.251524,0.374,0.300772
0,run_title_doc_arg.txt,0.375349,0.232472,0.35,0.279379
2,run_title.txt,0.359595,0.222892,0.342,0.269889
7,run_doc.txt,0.310644,0.190781,0.282,0.227591
6,run_doc_arg.txt,0.283867,0.18876,0.274,0.223529
9,run_arg.txt,0.263433,0.179797,0.262,0.213251


In [141]:
df_arg.sort_values(by = ['f1-score'], ascending = False)

Unnamed: 0,run,ndcg_cut10,recall_10,precision,f1-score
3,run_title_doc.txt,0.444189,0.270802,0.42,0.329289
12,run_title_arg.txt,0.384095,0.251524,0.374,0.300772
0,run_title_doc_arg.txt,0.375349,0.232472,0.35,0.279379
2,run_title.txt,0.359595,0.222892,0.342,0.269889
7,run_doc.txt,0.310644,0.190781,0.282,0.227591
6,run_doc_arg.txt,0.283867,0.18876,0.274,0.223529
9,run_arg.txt,0.263433,0.179797,0.262,0.213251
