In [None]:
import numpy as np
import pandas as pd
import os
import en_core_web_sm
from spacy import displacy
import matplotlib.pyplot as plt
from scipy.spatial import distance
from statistics import mode
import re
from scipy.interpolate import interp1d

In [None]:
filepath = os.getcwd() + "/Data/"

In [None]:
# WDAqua Data

# Each benchmark lives on its own sheet of the evaluation workbook. Sheets are read
# without a header row and cut down to columns 0 and 5 (later renamed 'Ques' / 'Ans').
wdaqua = pd.ExcelFile(filepath + 'Evaluation Section - WDAqua-Core1.xlsx')


def _read_benchmark_sheet(sheet, last_row=None):
    """Return columns 0 and 5 of `sheet`, limited to the first `last_row` rows (all rows if None)."""
    return pd.read_excel(wdaqua, sheet, header=None).iloc[:last_row, [0, 5]]


wdaqua_complex = _read_benchmark_sheet('ComplexQuestions')
# Only the first 5166 rows of the GraphQuestions sheet are kept.
# NOTE(review): the reason for this cut-off is not visible here — confirm against the workbook.
wdaqua_graph = _read_benchmark_sheet('GraphQuestions', last_row=5166)
wdaqua_web = _read_benchmark_sheet('WebQuestions')
wdaqua_temp = _read_benchmark_sheet('TempQuestions')
wdaqua_comqa = _read_benchmark_sheet('ComQA')
wdaqua_qald1 = _read_benchmark_sheet('QALD-1')
wdaqua_qald2 = _read_benchmark_sheet('QALD-2')
wdaqua_qald3 = _read_benchmark_sheet('QALD-3')
wdaqua_qald4 = _read_benchmark_sheet('QALD-4')
wdaqua_qald5 = _read_benchmark_sheet('QALD-5')
wdaqua_qald6 = _read_benchmark_sheet('QALD-6')
wdaqua_qald7 = _read_benchmark_sheet('QALD-7')
wdaqua_qald8 = _read_benchmark_sheet('QALD-8')
wdaqua_qald9 = _read_benchmark_sheet('QALD-9')
wdaqua_lcquad = _read_benchmark_sheet('LC-QUAD')

In [None]:
# Merge all benchmarks
wdaqua_all = pd.concat([
    wdaqua_complex, wdaqua_web, wdaqua_graph, wdaqua_temp, wdaqua_comqa, wdaqua_lcquad,
    wdaqua_qald1, wdaqua_qald2, wdaqua_qald3, wdaqua_qald4, wdaqua_qald5,
    wdaqua_qald6, wdaqua_qald7, wdaqua_qald8, wdaqua_qald9,
])
# Per-benchmark labels that could be passed to pd.concat (currently not used):
# keys=['complex', 'web', 'graph', 'temp', 'comqa', 'lcquad', 'qald1', 'qald2', 'qald3', 'qald4', 'qald5', 'qald6', 'qald7', 'qald8', 'qald9']
wdaqua_all.columns = ['Ques', 'Ans']
wdaqua_all = wdaqua_all[['Ans', 'Ques']]
# A question that occurs in several benchmarks is kept once (first occurrence wins).
wdaqua_all = wdaqua_all.drop_duplicates(subset="Ques").reset_index(drop=True)

# Clean up data to have 'Ans' either being correct (1) or incorrect (0):
# only the exact string 'F1 : 1.0' counts as correct, everything else (including
# missing values) becomes 0. Done in one vectorised comparison instead of a
# row-by-row .iloc write loop.
wdaqua_all['Ans'] = (wdaqua_all['Ans'] == 'F1 : 1.0').astype(int)

In [None]:
# Categorize type of question
# Sentence openers that mark a yes/no question. 'is ', 'are ' and 'do ' carry a
# trailing space in the original rule set; the others are matched as bare prefixes.
_YES_NO_OPENERS = ('is ', 'was', 'are ', 'were', 'do ', 'does', 'did')

# Sentence openers that mark an imperative / request-style question.
_REQUEST_OPENERS = (
    'name', 'list', 'find', 'identify', 'search', 'locate', 'enumerate', 'look for',
    'return', 'give', 'show', 'tell', 'can you', 'could you', 'describe', 'make',
    'please', 'count', 'state',
)


def _question_type(text):
    """Return the coarse question type ("wh", "how", "yes/no", "request" or "topicalized") of one question."""
    string = text.lower()
    # Offset of the second word; 0 when the question has no space, which simply
    # re-tests the first word.
    second_word = string.find(" ") + 1
    if string.startswith('wh') or string.startswith('wh', second_word):
        return "wh"
    if string.startswith('how') or string.startswith('how', second_word):
        return "how"
    if string.startswith(_YES_NO_OPENERS):
        return "yes/no"
    if string.startswith(_REQUEST_OPENERS):
        return "request"
    return "topicalized"


def qType(df):
    """Add a "Type" column with the coarse question type of every row.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame whose second column (position 1) holds the question text.

    Returns
    -------
    pandas.DataFrame
        The same object as `df`, modified in place: "Type" is inserted at
        position 1, which moves the question text to position 2.

    Raises
    ------
    ValueError
        If `df` already has a "Type" column (raised by `DataFrame.insert`).
    AttributeError
        If a question is not a string (e.g. a missing value).

    Notes
    -----
    All labels are computed before the column is inserted, so a failure on one
    row leaves `df` untouched instead of half-labelled.
    """
    labels = [_question_type(question) for question in df.iloc[:, 1]]
    # An object array is inserted positionally (no index alignment) and keeps the
    # column dtype the same as when it is filled cell by cell.
    df.insert(1, "Type", np.array(labels, dtype=object))
    return df

In [None]:
wdaqua_all = qType(wdaqua_all)

In [None]:
wdaqua_all

In [None]:
# Divide the questions according to their respective types of questions.
# Every subset gets a fresh 0..n-1 index so that rows can be addressed by position.
_question_types = wdaqua_all["Type"]
wdaqua_df_wh = wdaqua_all.loc[_question_types == "wh"].reset_index(drop=True)
wdaqua_df_how = wdaqua_all.loc[_question_types == "how"].reset_index(drop=True)
wdaqua_df_yn = wdaqua_all.loc[_question_types == "yes/no"].reset_index(drop=True)
wdaqua_df_req = wdaqua_all.loc[_question_types == "request"].reset_index(drop=True)
wdaqua_df_top = wdaqua_all.loc[_question_types == "topicalized"].reset_index(drop=True)

In [None]:
wdaqua_df_how

In [None]:
# POS Tagging

def posFreq(file, method):
    """Tag every question with spaCy and count how often each tag occurs per question.

    Parameters
    ----------
    file : pandas.DataFrame
        Questions to tag. Column 0 is copied to "Info", column 1 to "Type", and
        column 2 must hold the question text. The questions are almost always in
        the third column in the benchmarks used; if not, padding columns may be
        needed to keep these positions consistent.
    method : str
        "UPOS" to use `token.pos_` or "Detailed" to use `token.tag_`.

    Returns
    -------
    (list of list of str, pandas.DataFrame)
        The tag sequence of each question, and a frame indexed like `file` with
        "Info", "Type" and one count column per tag (0 where a tag is absent).

    Raises
    ------
    ValueError
        If `method` is neither "UPOS" nor "Detailed". Previously an unknown
        method silently produced empty tag lists.
    """
    tag_attribute = {"UPOS": "pos_", "Detailed": "tag_"}
    # Validate before loading the model so that a typo fails immediately.
    if method not in tag_attribute:
        raise ValueError(
            'method must be "UPOS" or "Detailed", got {!r}'.format(method)
        )
    attribute = tag_attribute[method]

    # Generate POS Tagging for each question
    nlp = en_core_web_sm.load()
    pos = [
        [getattr(token, attribute) for token in nlp(file.iloc[i, 2])]
        for i in range(len(file))
    ]
    # Convert to dataframe for easier usage (shorter questions are padded with missing values)
    pos_df = pd.DataFrame(pos, file.index)

    # Find the frequency of tags in each question; value_counts skips the padding
    freq = [pos_df.iloc[i, :].value_counts() for i in range(len(pos_df))]
    freq_df = pd.DataFrame(freq, file.index)
    freq_df = freq_df.fillna(0)
    freq_df.insert(0, "Info", file.iloc[:, 0])
    freq_df.insert(1, "Type", file.iloc[:, 1])

    return pos, freq_df

In [None]:
wdaqua_upos, wdaqua_upos_freq_df = posFreq(wdaqua_all, "UPOS")

In [None]:
wdaqua_upos_freq_df

In [None]:
# Find the k-Nearest Neighbors of each question in its own type of question
def knnQType(df, k):
    """Find the k nearest neighbours (Euclidean distance) of every row and their majority answer.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain an "Info" column with the answer label of each row. Every
        column from position 2 onwards is used as a numeric feature.
    k : int
        Number of neighbours to collect for each row; the row itself is excluded.

    Returns
    -------
    pandas.DataFrame
        One row per input row with three columns:
        0 - Series of the "Info" values of the k neighbours (indexed by their position),
        1 - `statistics.mode` of those values,
        2 - the row's own "Info" value.

    Raises
    ------
    ValueError
        If `df` is non-empty but has `k` or fewer rows, in which case k
        neighbours other than the row itself do not exist.
    """
    ans = df["Info"].reset_index(drop=True)
    features = df.iloc[:, 2:]
    eucDist = distance.cdist(features, features, 'euclidean')

    n_rows = len(eucDist)
    if 0 < n_rows <= k:
        # np.argpartition would fail here with an opaque "kth out of bounds" error.
        raise ValueError(
            "knnQType needs more than k={} rows to find k neighbours, got {}".format(k, n_rows)
        )

    rows = []
    for i in range(n_rows):
        # The first k+1 entries are the k+1 closest rows (unordered); partition once and reuse.
        closest = np.argpartition(eucDist[i], k)
        # Drop the row itself, then keep k of the remaining closest rows.
        neighbours = closest[closest != i][:k]
        knn_i = ans[neighbours]
        rows.append([knn_i, mode(knn_i), ans[i]])
    return pd.DataFrame(rows)

In [None]:
# Find 5 nearest neighbors of each question with the same question type
wdaqua_ans_how = knnQType(wdaqua_upos_freq_df[wdaqua_upos_freq_df["Type"] == "how"], 5)
wdaqua_ans_yn = knnQType(wdaqua_upos_freq_df[wdaqua_upos_freq_df["Type"] == "yes/no"], 5)
wdaqua_ans_req = knnQType(wdaqua_upos_freq_df[wdaqua_upos_freq_df["Type"] == "request"], 5)
wdaqua_ans_top = knnQType(wdaqua_upos_freq_df[wdaqua_upos_freq_df["Type"] == "topicalized"], 5)

In [None]:
wdaqua_ans_how

In [None]:
wdaqua_detailed, wdaqua_detailed_freq_df = posFreq(wdaqua_all, "Detailed")

In [None]:
wdaqua_ans_how_detailed = knnQType(wdaqua_detailed_freq_df[wdaqua_detailed_freq_df["Type"] == "how"], 5)
wdaqua_ans_yn_detailed = knnQType(wdaqua_detailed_freq_df[wdaqua_detailed_freq_df["Type"] == "yes/no"], 5)
wdaqua_ans_req_detailed = knnQType(wdaqua_detailed_freq_df[wdaqua_detailed_freq_df["Type"] == "request"], 5)
wdaqua_ans_top_detailed = knnQType(wdaqua_detailed_freq_df[wdaqua_detailed_freq_df["Type"] == "topicalized"], 5)

In [None]:
# Dependency Parse Tree

def parseTree(file):
    """Encode the dependency labels of every question as integer codes.

    `file` must hold the question text in column 2; columns 0 and 1 are copied
    to the result as "Info" and "Type". The result is indexed like `file` and
    has one integer column per token position: each column is converted to a
    pandas categorical on its own and replaced by its category codes, so token
    positions beyond the end of a shorter question get the code -1.
    """
    nlp = en_core_web_sm.load()

    # One list of dependency labels (token.dep_) per question.
    # (displacy.render(doc, style="dep") can be used on a parsed doc to visualise a tree.)
    dep_labels = [
        [token.dep_ for token in nlp(file.iloc[row, 2])]
        for row in range(len(file))
    ]

    # Labels -> categorical -> integer codes, column by column.
    coded = (
        pd.DataFrame(dep_labels, file.index)
        .astype('category')
        .apply(lambda column: column.cat.codes)
    )
    coded.insert(0, "Info", file.iloc[:, 0])
    coded.insert(1, "Type", file.iloc[:, 1])
    return coded

In [None]:
wdaqua_prstree_freq_df = parseTree(wdaqua_all)

In [None]:
wdaqua_ans_how_prstree = knnQType(wdaqua_prstree_freq_df[wdaqua_prstree_freq_df["Type"] == "how"], 5)
wdaqua_ans_yn_prstree = knnQType(wdaqua_prstree_freq_df[wdaqua_prstree_freq_df["Type"] == "yes/no"], 5)
wdaqua_ans_req_prstree = knnQType(wdaqua_prstree_freq_df[wdaqua_prstree_freq_df["Type"] == "request"], 5)
wdaqua_ans_top_prstree = knnQType(wdaqua_prstree_freq_df[wdaqua_prstree_freq_df["Type"] == "topicalized"], 5)

In [None]:
# Find the specific wh-type in wh-questions (what, when, where, which, who, whose, and whom, excluding why)
def specWh(df):
    """Refine the "Type" of wh-questions to the specific wh-word and drop the rest.

    Column 2 of `df` holds the question text and column 1 its type. A row is
    relabelled when its first or second word starts with one of the prefixes
    below (first match in the listed order wins). The relabelling is written
    into `df` itself; the returned frame is a re-indexed copy without the rows
    whose "Type" is still 'wh' (for example why-questions).
    """
    # (prefix to look for, label to assign). 'who ' keeps its trailing space so
    # that it does not also match "whom" and "whose".
    wh_prefixes = (
        ('what', "what"),
        ('when', "when"),
        ('where', "where"),
        ('which', "which"),
        ('who ', "who"),
        ('whom', "whom"),
        ('whose', "whose"),
    )
    for row in range(len(df)):
        text = df.iloc[row, 2].lower()
        second_word = text.find(" ") + 1
        for prefix, label in wh_prefixes:
            if text.startswith(prefix) or text.startswith(prefix, second_word):
                df.iloc[row, 1] = label
                break
    unresolved = df[df['Type'] == 'wh'].index
    return df.drop(unresolved).reset_index(drop=True)

In [None]:
wdaqua_df_wh = specWh(wdaqua_df_wh)

In [None]:
wdaqua_df_wh

In [None]:
# Add a dummy wh- variable into the PoS tag frequency dataframe, so that e.g. the value in the 'what' column 
# would be 1 if the question is a what-question, and 0 otherwise.
def whDummy(freq_df):
    """Append one 0/1 indicator column per wh-word to `freq_df`.

    Parameters
    ----------
    freq_df : pandas.DataFrame
        Frame whose column at position 1 holds the wh-type of each row.

    Returns
    -------
    pandas.DataFrame
        The same object as `freq_df`, with the columns "what", "when", "where",
        "which", "who", "whom" and "whose" added (or overwritten). A column is 1
        where the row's type equals the column name and 0 elsewhere; rows with
        any other type are 0 in all seven columns.

    Notes
    -----
    Each column is assigned as a whole. Writing single cells through
    `freq_df[col].iloc[i] = 1` is chained assignment, which pandas does not
    guarantee to write back to the frame (it is a no-op under Copy-on-Write).
    """
    wh_type = freq_df.iloc[:, 1]
    for wh_word in ("what", "when", "where", "which", "who", "whom", "whose"):
        freq_df[wh_word] = (wh_type == wh_word).astype("int64")
    return freq_df

In [None]:
wdaqua_wh_upos, wdaqua_wh_upos_freq_df = posFreq(wdaqua_df_wh, "UPOS")

In [None]:
wdaqua_wh_upos_freq_df = whDummy(wdaqua_wh_upos_freq_df)

In [None]:
wdaqua_wh_upos_freq_df

In [None]:
wdaqua_ans_wh = knnQType(wdaqua_wh_upos_freq_df, 5)

In [None]:
wdaqua_wh_detailed, wdaqua_wh_detailed_freq_df = posFreq(wdaqua_df_wh, "Detailed")

In [None]:
wdaqua_wh_detailed_freq_df = whDummy(wdaqua_wh_detailed_freq_df)

In [None]:
wdaqua_ans_wh_detailed = knnQType(wdaqua_wh_detailed_freq_df, 5)

In [None]:
wdaqua_ans_wh_detailed.iloc[1237,0]

In [None]:
# Print the text of question 1237 followed by the questions at rows 433, 308, 197, 451 and 177.
# NOTE(review): the row numbers are hard-coded — presumably read off the neighbour
# list shown by the previous cell; re-check them whenever the data changes.
for row in (1237, 433, 308, 197, 451, 177):
    print(wdaqua_df_wh.iloc[row, 2])

In [None]:
wdaqua_wh_prstree_freq_df = parseTree(wdaqua_df_wh)

In [None]:
wdaqua_wh_prstree_freq_df = whDummy(wdaqua_wh_prstree_freq_df)

In [None]:
wdaqua_ans_wh_prstree = knnQType(wdaqua_wh_prstree_freq_df, 5)

In [None]:
# Stack the nine QALD sheets, name the two columns, and keep each question once.
_qald_sheets = [
    wdaqua_qald1, wdaqua_qald2, wdaqua_qald3, wdaqua_qald4, wdaqua_qald5,
    wdaqua_qald6, wdaqua_qald7, wdaqua_qald8, wdaqua_qald9,
]
wdaqua_qaldAll = pd.concat(_qald_sheets)
wdaqua_qaldAll.columns = ['Ques', 'Ans']
wdaqua_qaldAll = wdaqua_qaldAll.drop_duplicates(subset="Ques").reset_index(drop=True)
# Remove the row at position 395 of the de-duplicated frame.
# NOTE(review): why this particular row is removed is not visible here — confirm it
# still points at the intended entry if the workbook changes.
_row_to_remove = wdaqua_qaldAll.index[395]
wdaqua_qaldAll = wdaqua_qaldAll.drop(_row_to_remove).reset_index(drop=True)

In [None]:
wdaqua_qaldAll

In [None]:
free917 = pd.read_csv(filepath + "Evaluation Section - WDAqua-Core1 - Fee917.csv", header=None)

In [None]:
# Read without a header row; the trailing .iloc[:-1, :] discards the final row of the file.
# NOTE(review): why the last row is dropped is not visible here — presumably a trailing non-question line; confirm.
simpledb = pd.read_csv(filepath + 'SimpleQuestionsDB-OnlyNLQs.csv', header=None).iloc[:-1, :]

In [None]:
# Read without a header row and keep only the columns from position 3 onwards, so the
# first kept column is the one quesLen() reads as the question text.
simple = pd.read_excel(filepath + 'SimpleQuestions.xlsx', header=None).iloc[:, 3:]

In [None]:
# Question Length Distribution

def quesLen(df):
    """Add a "Length" column holding the number of words in each question.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame whose first column (position 0) holds the question text.

    Returns
    -------
    pandas.DataFrame
        The same object as `df`, with "Length" added (or overwritten). A word is
        a maximal run of `\\w` characters, so "Who's" counts as two words.

    Raises
    ------
    TypeError
        If a question is not a string (e.g. a missing value).

    Notes
    -----
    The column is assigned in one step. The former `df['Length'][i] = ...` was
    chained assignment addressed by index *label*: it left every length at 0 for
    frames without a 0..n-1 index, and is a no-op under pandas Copy-on-Write.
    """
    word = re.compile(r'\w+')
    df['Length'] = [len(word.findall(question)) for question in df.iloc[:, 0]]
    return df

In [None]:
wdaqua_qaldAll = quesLen(wdaqua_qaldAll)
wdaqua_web = quesLen(wdaqua_web)
wdaqua_graph = quesLen(wdaqua_graph)
wdaqua_lcquad = quesLen(wdaqua_lcquad)
wdaqua_temp = quesLen(wdaqua_temp)
wdaqua_complex = quesLen(wdaqua_complex)
wdaqua_comqa = quesLen(wdaqua_comqa)
free917 = quesLen(free917)
simple = quesLen(simple)
simpledb = quesLen(simpledb)

In [None]:
def _length_distribution(df):
    """Return (lengths, percentages, interpolant) describing df['Length'].

    `percentages` is the share of questions (in percent) for every observed
    length, sorted by length; `lengths` is its index; `interpolant` is a cubic
    `interp1d` through those points.
    """
    percentages = df['Length'].value_counts(normalize=True).sort_index() * 100
    lengths = percentages.index
    return lengths, percentages, interp1d(lengths, percentages, kind='cubic')


# One (x, y, f, xnew) quadruple per benchmark; the names are used by the plotting cell.
# Each xnew grid is hard-coded per benchmark.
# NOTE(review): the grid bounds presumably match the smallest and largest observed
# length — interp1d raises for points outside the fitted range by default, so
# re-check them when the data changes.
# The trailing numbers are the original author's annotations (the first one is
# labelled "90%"); presumably the length that covers 90% of the questions — confirm.

x, y, f = _length_distribution(wdaqua_qaldAll)     # 90%: 12
xnew = np.linspace(2, 21, num=41, endpoint=True)

x1, y1, f1 = _length_distribution(wdaqua_web)     # 10
xnew1 = np.linspace(3, 15, num=41, endpoint=True)

x2, y2, f2 = _length_distribution(wdaqua_graph)     # 14
xnew2 = np.linspace(2, 25, num=41, endpoint=True)

x3, y3, f3 = _length_distribution(wdaqua_lcquad)     # 11
xnew3 = np.linspace(2, 26, num=41, endpoint=True)

x4, y4, f4 = _length_distribution(wdaqua_temp)     # 11
xnew4 = np.linspace(4, 15, num=41, endpoint=True)

x5, y5, f5 = _length_distribution(wdaqua_complex)     # 12
xnew5 = np.linspace(5, 19, num=41, endpoint=True)

x6, y6, f6 = _length_distribution(wdaqua_comqa)     # 11
xnew6 = np.linspace(2, 21, num=41, endpoint=True)

x7, y7, f7 = _length_distribution(free917)     # 11
xnew7 = np.linspace(3, 18, num=41, endpoint=True)

x8, y8, f8 = _length_distribution(simple)     # 11
xnew8 = np.linspace(1, 34, num=41, endpoint=True)

x9, y9, f9 = _length_distribution(simpledb)     # 10
xnew9 = np.linspace(1, 34, num=41, endpoint=True)

In [None]:
# One entry per benchmark:
# (observed lengths, observed percentages, interpolation grid, interpolant,
#  colour, line style, marker, legend label)
# A single colour / line style / marker triple drives the points, the curve and the
# legend entry, so the legend always matches what is drawn. Previously the QALD
# legend used the format string '-.', which is the dash-dot line style rather than
# a solid line with a point marker, and the legend colours 'cyan', 'magenta' and
# 'yellow' differed from the plotted single-letter shades 'c', 'm' and 'y'.
length_series = [
    (x,  y,  xnew,  f,  'b', '-',  '.', 'QALD'),
    (x1, y1, xnew1, f1, 'g', '--', '+', 'Web'),
    (x2, y2, xnew2, f2, 'r', '-',  'o', 'Graph'),
    (x3, y3, xnew3, f3, 'c', '--', 'x', 'LC-QuAD'),
    (x4, y4, xnew4, f4, 'm', '-',  'v', 'Temp'),
    (x5, y5, xnew5, f5, 'y', '--', '|', 'Complex'),
    (x6, y6, xnew6, f6, 'k', '-',  'd', 'ComQA'),
    # Free917 is currently left out of the figure:
    # (x7, y7, xnew7, f7, 'r', '--', '1', 'Free917'),
    (x8, y8, xnew8, f8, 'g', '-',  's', 'Simple'),
    (x9, y9, xnew9, f9, 'm', '--', '*', 'SimpleDB'),
]

fig, ax = plt.subplots()

for lengths, percentages, grid, interpolant, colour, line_style, marker, label in length_series:
    # Observed distribution as points, cubic interpolation as a curve.
    ax.scatter(lengths, percentages, c=colour, marker=marker)
    ax.plot(grid, interpolant(grid), linestyle=line_style, color=colour)
    # Empty proxy artist: gives the legend one entry showing line and marker together.
    ax.plot([], [], linestyle=line_style, marker=marker, color=colour, label=label)

ax.set_xlabel("No. of words in question", fontsize=14)
ax.set_ylabel("Percentage of questions", fontsize=14)
ax.legend(loc='best')
plt.show()