In [1]:
import pandas as pd
import nltk

## Statistics of the knowledge base

In [14]:
# Read the crawl snapshot (JSON Lines: one JSON record per line) and preview the first rows.
documents_path = '../data/crawls/20250317/data.jsonl'
df_documents = pd.read_json(documents_path, lines=True)
df_documents.head()

Unnamed: 0,url,status,html,content,og
0,https://www.uni-marburg.de/en/studying/degree-...,200,"<!DOCTYPE html>\n<html xmlns=""http://www.w3.or...",###### Main Content\n\n# Data Science (Master ...,{'og:site_name': 'Philipps-Universität Marburg...
1,https://www.mathematik.uni-marburg.de/modulhan...,200,"\n\n\n\n<!DOCTYPE html>\n<html xmlns:xlink=""ht...",###### Main content\n\n# **M.Sc. Data Science*...,{'og:site_name': 'Philipps-Universität Marburg...
2,https://studentenwerk-marburg.de/en/living/,200,"<!DOCTYPE html>\n<html lang=""en-US"" class=""no-...",Welcome to Marburg! The Studentenwerk Marburg ...,{}
3,https://www.uni-marburg.de/en/studying/admissi...,200,"<!DOCTYPE html>\n<html xmlns=""http://www.w3.or...",# Admissions\n\nFor a successful start to your...,{'og:site_name': 'Philipps-Universität Marburg...
4,https://studentenwerk-marburg.de/en/living/faq/,200,"<!DOCTYPE html>\n<html lang=""en-US"" class=""no-...",# FAQ\n\n**How expensive is an apartment / sin...,{}


In [15]:
# Number of rows in the knowledge-base frame.
df_documents.shape[0]

249

In [17]:
# Distribution of document lengths, measured in NLTK word tokens of the `content` column.
content_token_counts = df_documents['content'].apply(
    lambda text: len(nltk.word_tokenize(text))
)
content_token_counts.describe()

count     249.000000
mean      712.955823
std       486.027713
min        13.000000
25%       479.000000
50%       688.000000
75%       816.000000
max      3678.000000
Name: content, dtype: float64

## Statistics of test queries

Reported separately for all / answerable / unanswerable queries:

- n (number of questions)
- question length
- answer length
- number of sources

In [6]:
def _count_tokens(texts):
    """Return, for each string in the Series `texts`, its number of NLTK word tokens."""
    return texts.apply(nltk.word_tokenize).apply(len)


# Load the test queries and derive the length features plus the `answerable` flag.
# A query counts as answerable when it lists at least one source.
df_queries = pd.read_json('../data/queries/20250317-email.json').assign(
    question_length=lambda q: _count_tokens(q['question']),
    intent_length=lambda q: _count_tokens(q['intent']),
    reference_answer_length=lambda q: _count_tokens(q['reference_answer']),
    sources_length=lambda q: q['sources'].apply(len),
    answerable=lambda q: q['sources'].apply(len) >= 1,
)
df_queries.head()

Unnamed: 0,id,category,intent,question,reference_answer,sources,question_length,intent_length,reference_answer_length,sources_length,answerable
0,faq-0000,01 - Admission Criteria,"Given my background, will I be accepted?",I want to inquire about the acceptance criteri...,Your bachelor degree needs to be in Data Scien...,[https://www.uni-marburg.de/en/studying/degree...,103,9,81,1,True
1,faq-0001,01 - Admission Criteria,"Given my background, will I be accepted?",I recently completed my undergraduate degree i...,Your bachelor degree needs to be in Data Scien...,[https://www.uni-marburg.de/en/studying/degree...,64,9,81,1,True
2,faq-0002,01 - Admission Criteria,"Given my background, will I be accepted?",My cgpa is 6.4 but while I pursuing my BCA I ...,Your bachelor degree needs to be in Data Scien...,[https://www.uni-marburg.de/en/studying/degree...,60,9,81,1,True
3,faq-0003,01 - Admission Criteria,"Given my background, will I be accepted?",I have completed my Bachelor's degree in Compu...,Your bachelor degree needs to be in Data Scien...,[https://www.uni-marburg.de/en/studying/degree...,27,9,81,1,True
4,faq-0004,01 - Admission Criteria,"Given my background, will I be accepted?",I would like to inquire about the eligibility ...,Your bachelor degree needs to be in Data Scien...,[https://www.uni-marburg.de/en/studying/degree...,27,9,81,1,True


In [10]:
def _summarize_queries(queries):
    """Compute the summary metrics for one subset of the test queries.

    Parameters
    ----------
    queries : pd.DataFrame
        Subset of `df_queries`. Must contain the columns `intent`,
        `intent_length`, `question_length`, `reference_answer_length`
        and `sources_length`.

    Returns
    -------
    pd.Series
        One value per table row, keyed by the row label shown in the table.
    """
    return pd.Series({
        '# of Questions': len(queries),
        '# of FAQs': queries['intent'].nunique(),
        'Avg. FAQ Length': queries['intent_length'].mean(),
        'Avg. Question length': queries['question_length'].mean(),
        'Avg. Answer length': queries['reference_answer_length'].mean(),
        'Avg. # of Sources': queries['sources_length'].mean(),
    })


# Rows that hold integer counts; every other row is a mean shown with one decimal.
COUNT_ROWS = ['# of Questions', '# of FAQs']

# The same summarising function is applied to every subset, so the "All" column
# cannot drift from the per-class columns. A class without any queries yields an
# (empty) subset instead of a KeyError when the columns are selected.
is_answerable = df_queries['answerable']
query_subsets = {
    'All': df_queries,
    'Answerable': df_queries[is_answerable],
    'Unanswerable': df_queries[~is_answerable],
}
stats_numeric = pd.DataFrame({
    label: _summarize_queries(subset) for label, subset in query_subsets.items()
})

# Render everything as text for the table: means with one decimal ...
stats = stats_numeric.round(1).astype(str)
# ... and counts as plain integers, formatted from the numeric values directly
# rather than re-parsing the already stringified floats.
for row_label in COUNT_ROWS:
    stats.loc[row_label] = stats_numeric.loc[row_label].astype(int).astype(str)
stats

Unnamed: 0,All,Answerable,Unanswerable
# of Questions,95.0,76.0,19.0
# of FAQs,36.0,25.0,11.0
Avg. FAQ Length,9.7,8.9,12.8
Avg. Question length,43.3,40.5,54.3
Avg. Answer length,45.5,46.0,43.6
Avg. # of Sources,1.0,1.3,0.0


In [12]:
# Export the summary table as a LaTeX `table` environment:
# column headers are set in bold, and both header axes are LaTeX-escaped
# (the row labels contain `#`).
tex = (
    stats.style
    .format_index("\\textbf{{{}}}", escape="latex", axis=1)
    .format_index(escape="latex", axis=0)
    .to_latex(
        hrules=True,
        position='t',
        caption='caption',
        label='tab:',
    )
)
# print() rather than a bare expression so the newlines are rendered, not shown as "\n".
print(tex)

\begin{table}[t]
\caption{caption}
\label{tab:}
\begin{tabular}{llll}
\toprule
 & \textbf{All} & \textbf{Answerable} & \textbf{Unanswerable} \\
\midrule
\# of Questions & 95 & 76 & 19 \\
\# of FAQs & 36 & 25 & 11 \\
Avg. FAQ Length & 9.7 & 8.9 & 12.8 \\
Avg. Question length & 43.3 & 40.5 & 54.3 \\
Avg. Answer length & 45.5 & 46.0 & 43.6 \\
Avg. \# of Sources & 1.0 & 1.3 & 0.0 \\
\bottomrule
\end{tabular}
\end{table}



In [5]:
# Number of test questions per FAQ intent, most frequent first.
intent_counts = df_queries['intent'].value_counts()
intent_counts

intent
What are the application deadlines?                                                                                                                  15
Given my background, will I be accepted?                                                                                                             11
What are the language requirements and do I meet them?                                                                                                6
How do I apply?                                                                                                                                       4
Do I need to submit the IELTS score or is a proof that the bachelor programme was taught in English enough?                                           4
How does the curriculum look like?                                                                                                                    3
Is the Master Data Science given entirely in English?                            