In [18]:
import os
import pandas as pd
import re
from functools import reduce
import glob

import seaborn as sns
import matplotlib.pyplot as plt
from matplotlib import colors

import numpy as np
import statsmodels.api as sm
from statsmodels.formula.api import ols
from statsmodels.stats.multicomp import pairwise_tukeyhsd
#from statsmodels.stats.multicomp import multipletests

from pptx import Presentation
from pptx.util import Inches, Pt

# Every path used below is relative, so switch to the project root first.
os.chdir("/Users/aklasnja/2021/")

# Dataset folder names; the per-dataset loops in the cells below iterate over this list.
dslist = ['gov2', 'robust04', 'cw09b', 'cw12b13', 'msmarco']

# Rank cutoffs (used further down as `rank <= n` filters and as column suffixes).
at_ranklist = [5, 10, 20, 30, 50]

# Reference tables: model classes (joined on `method` further down) and queries.
ref_models = pd.read_csv('reference_tables/ref_models.csv')
ref = pd.read_csv('reference_tables/ref_queries.csv')

# LIWC variables kept for the aggregations.
# Brackets already allow the list to span lines, so no backslash continuations are needed.
sel = [
    'female', 'male',
    'affect', 'posemo', 'negemo', 'anx', 'anger', 'sad',
    'social', 'family', 'friend',
    'cogproc', 'insight', 'cause', 'discrep', 'tentat', 'certain', 'differ',
    'percept', 'see', 'hear', 'feel', 'bio', 'body', 'health', 'sexual', 'ingest',
    'drives', 'affiliation', 'achieve', 'power', 'reward', 'risk',
    'focuspast', 'focuspresent', 'focusfuture',
    'relativ', 'motion', 'space', 'time', 'work', 'leisure', 'home', 'money', 'relig', 'death',
]

In [19]:
# join refs
# Stack every dataset's `<ds>_ref.csv` into one frame tagged with its dataset name,
# then write the combined table to retrievals/ref.csv.

ref_frames = []
for ds in dslist:
    ref_path = os.path.join('retrievals', ds, ds + '_ref.csv')
    ref_frame = pd.read_csv(ref_path).assign(dataset=ds)
    # Keep only the columns whose name does not start with "Unnamed".
    ref_frames.append(ref_frame.filter(regex=r'^(?!Unnamed).*$'))

refs = pd.concat(ref_frames)
refs.to_csv(
    os.path.join('retrievals', 'ref.csv'),
    index=True,
    header=True,
    sep=',',
)

In [2]:
#1 join agg per query
# Builds `r_qe`: per-query LIWC means at each rank cutoff, outer-joined with the
# per-query bias table and annotated with the model reference table.

# Key columns shared by the bias and LIWC inputs.
dsel = ['dataset','qid','method','query_no', 'qid_text', 'query_text']

bias_parts = []
liwc_parts = []
for ds in dslist:
    ds_dir = os.path.join('retrievals', ds)

    # Bias input: one CSV per dataset, tagged with the dataset name; helper columns dropped.
    xbias = pd.read_csv(os.path.join(ds_dir, 'r_bias_qe.csv'))
    xbias = xbias.assign(dataset=ds).filter(regex=r'^(?!Unnamed|level|index|class).*$')
    print('{0} - bias - unique qid: {1} - unique query: {2}'.format(
        ds,
        xbias.groupby(['dataset','qid']).ngroups,
        xbias.groupby(['dataset','qid','method']).ngroups,
    ))
    bias_parts.append(xbias)

    # LIWC input: same treatment as the bias file.
    xliwc = pd.read_csv(os.path.join(ds_dir, 'r_liwc_qe.csv'))
    xliwc = xliwc.assign(dataset=ds).filter(regex=r'^(?!Unnamed|level|index|class).*$')
    print('{0} - liwc - unique qid: {1} - unique query: {2}'.format(
        ds,
        xliwc.groupby(['dataset','qid']).ngroups,
        xliwc.groupby(['dataset','qid','method']).ngroups,
    ))

    # Collapse the rows to one row per key, once per rank cutoff.
    per_cutoff = []
    for n in at_ranklist:  # the query string below resolves `n` by name
        top_n = xliwc.query('rank <= @n').set_index(dsel).filter(sel)
        # Grouping on the index object yields tuple labels, hence the MultiIndex rebuild below.
        means = top_n.groupby(top_n.index).mean()
        means.columns = means.columns + '_' + str(n)
        means.index = pd.MultiIndex.from_tuples(means.index, names=dsel)
        per_cutoff.append(means.reset_index(drop=False))

    # Chain-merge the cutoff frames on the key columns so each cutoff adds its own columns.
    liwc_parts.append(
        reduce(lambda left, right: pd.merge(left, right, on=dsel), per_cutoff)
    )

bias = pd.concat(bias_parts)
print('bias - unique qid: {0} - unique query: {1}'.format(
    bias.groupby(['dataset','qid']).ngroups,
    bias.groupby(['dataset','qid','method']).ngroups,
))
liwc = pd.concat(liwc_parts)
print('liwc - unique qid: {0} - unique query: {1}'.format(
    liwc.groupby(['dataset','qid']).ngroups,
    liwc.groupby(['dataset','qid','method']).ngroups,
))

# Outer join keeps keys that appear in only one of the two tables.
bias = bias.set_index(dsel, drop=True)
r_qe = liwc.set_index(dsel, drop=True).join(bias, how='outer').reset_index(drop=False)

# Attach the model reference columns by method name.
models_by_method = ref_models.set_index(['method'])
r_qe = (
    r_qe.set_index(['method'], drop=True)
    .join(models_by_method, how='left')
    .reset_index(drop=False)
)
r_qe.to_csv(os.path.join('retrievals', 'r_qe.csv'), index=True, header=True, sep=',')

gov2 - bias - unique qid: 111 - unique query: 580
gov2 - liwc - unique qid: 111 - unique query: 580
robust04 - bias - unique qid: 225 - unique query: 1611
robust04 - liwc - unique qid: 225 - unique query: 1611
cw09b - bias - unique qid: 125 - unique query: 486
cw09b - liwc - unique qid: 125 - unique query: 486
cw12b13 - bias - unique qid: 64 - unique query: 284
cw12b13 - liwc - unique qid: 64 - unique query: 284
msmarco - bias - unique qid: 5496 - unique query: 40641
msmarco - liwc - unique qid: 5496 - unique query: 40641
bias - unique qid: 6021 - unique query: 43602
liwc - unique qid: 6021 - unique query: 43602


In [83]:
#2 join ratio per query
# Builds `ratio_qe`: the per-dataset ratio tables (bias + LIWC) stacked, outer-joined
# on the key columns, and annotated with the model reference table.

dsel = ['dataset','qid','method']

bias_parts = []
liwc_parts = []
for ds in dslist:
    ds_dir = os.path.join('retrievals', ds)

    xbias = pd.read_csv(os.path.join(ds_dir, 'ratio_bias_qe.csv'))
    xbias = xbias.assign(dataset=ds).filter(regex=r'^(?!Unnamed).*$')
    bias_parts.append(xbias)
    print('{0} - bias - unique qid: {1} - unique query: {2}'.format(
        ds,
        xbias.groupby(['dataset','qid']).ngroups,
        xbias.groupby(['dataset','qid','method']).ngroups,
    ))

    xliwc = pd.read_csv(os.path.join(ds_dir, 'ratio_liwc_qe.csv'))
    xliwc = xliwc.assign(dataset=ds).filter(regex=r'^(?!Unnamed).*$')
    print('{0} - liwc - unique qid: {1} - unique query: {2}'.format(
        ds,
        xliwc.groupby(['dataset','qid']).ngroups,
        xliwc.groupby(['dataset','qid','method']).ngroups,
    ))
    liwc_parts.append(xliwc)

bias = pd.concat(bias_parts)
print('bias - unique qid: {0} - unique query: {1}'.format(
    bias.groupby(['dataset','qid']).ngroups,
    bias.groupby(['dataset','qid','method']).ngroups,
))
liwc = pd.concat(liwc_parts)
print('liwc - unique qid: {0} - unique query: {1}'.format(
    liwc.groupby(['dataset','qid']).ngroups,
    liwc.groupby(['dataset','qid','method']).ngroups,
))

# Outer join keeps keys that appear in only one of the two tables.
bias = bias.set_index(dsel, drop=True)
ratio_qe = liwc.set_index(dsel, drop=True).join(bias, how='outer').reset_index(drop=False)

# Attach the model reference columns; `method` stays a regular column (drop=False)
# and the temporary index is discarded afterwards.
models_by_method = ref_models.set_index(['method'], drop=True)
ratio_qe = (
    ratio_qe.set_index(['method'], drop=False)
    .join(models_by_method, how='left')
    .reset_index(drop=True)
)

# NOTE(review): with drop=False plus index=True the key columns are written twice
# (once as the index, once as regular columns) - confirm downstream readers expect that.
ratio_qe.set_index(dsel, drop=False).to_csv(
    os.path.join('retrievals', 'ratio_qe.csv'),
    index=True,
    header=True,
    sep=',',
)


gov2 - bias - unique qid: 111 - unique query: 580
gov2 - liwc - unique qid: 111 - unique query: 580
robust04 - bias - unique qid: 225 - unique query: 1611
robust04 - liwc - unique qid: 225 - unique query: 1611
cw09b - bias - unique qid: 125 - unique query: 486
cw09b - liwc - unique qid: 125 - unique query: 486
cw12b13 - bias - unique qid: 64 - unique query: 284
cw12b13 - liwc - unique qid: 64 - unique query: 284
msmarco - bias - unique qid: 5496 - unique query: 40641
msmarco - liwc - unique qid: 5496 - unique query: 40641
bias - unique qid: 6021 - unique query: 43602
liwc - unique qid: 6021 - unique query: 43602


In [39]:
#r_qe.set_index(['dataset','qid','method'])
#ratio_qe.set_index(['dataset','qid','method'])
#ck = r_qe.loc[set(r_qe.index).symmetric_difference(set(ratio_qe.index))].dropna()

In [27]:
#1 just qrels for msmarco
# Builds `r_qr`: per-query LIWC means over the qrels rows, laid out with the same
# `<variable>_<n>` column naming that the rank-cutoff aggregation uses.

# NOTE(review): this rebinds the notebook-wide `dslist`; any per-dataset cell re-run
# after this one will only process msmarco.
dslist = ['msmarco']

dsel = ['dataset','qid','method']

liwc = list()
for ds in dslist:
    # key = query
    xliwc = pd.read_csv(os.path.join('retrievals',ds,'r_liwc_qr.csv')).assign(dataset = ds, method = "qrels").filter(regex=r'^(?!Unnamed|level|index|class).*$')
    print('{0} - liwc - unique qid: {1} - unique query: {2}'.format(ds, xliwc.groupby(['dataset','qid']).ngroups, xliwc.groupby(['dataset','qid','method']).ngroups))
    # key = doc -> query
    # No rank filter is applied here, so the per-key mean is identical for every
    # cutoff: compute it once instead of once per entry of `at_ranklist`.
    xxliwc = xliwc.set_index(dsel).filter(sel) #----------qid method index
    xxliwc = xxliwc.groupby(xxliwc.index).mean()
    # Grouping on the index object yields tuple labels; restore the named key levels.
    xxliwc.index = pd.MultiIndex.from_tuples(xxliwc.index, names=dsel)
    # Only the column suffix differs per cutoff, so the `_n` column groups are copies
    # of the same values.
    # NOTE(review): presumably intentional, to mirror the r_qe column layout - confirm.
    xliwc_df = list()
    for n in at_ranklist:
        xliwc_df.append(xxliwc.add_suffix('_' + str(n)).reset_index(drop=False))
    liwc.append(reduce(lambda df1, df2: pd.merge(df1, df2, on=dsel), xliwc_df))
r_qr = pd.concat(liwc)
r_qr.to_csv(os.path.join('retrievals','r_qr.csv'), index=True, header=True, sep=',')


msmarco - liwc - unique qid: 5496 - unique query: 5496
