In [1]:
import pandas as pd
from convokit import Corpus, download

## Parliament corpus

In [2]:
# Fetch the Parliament corpus via ConvoKit's download helper, then load it
parliament_path = download("parliament-corpus")
parliament_corpus = Corpus(filename=parliament_path)

# Flatten all utterances into a single dataframe
parliament_us = parliament_corpus.get_utterances_dataframe()

Dataset already exists at C:\Users\polin\.convokit\downloads\parliament-corpus


In [17]:
# Get useful columns.
# .copy() makes an independent frame, so adding 'idx' below does not write into
# a slice of the raw utterance frame (avoids pandas' SettingWithCopyWarning).
parliament_us = parliament_us[['text', 'meta.is_question', 'meta.is_answer', 'meta.pair_idx']].copy()
parliament_us['idx'] = parliament_us.index  # keep the utterance id as a regular column

# Get dfs for Qs and As.
# The first utterance of each pair is taken as the question and the second as the
# answer, using the utterance's position inside its 'meta.pair_idx' group.
# cumcount() is used instead of groupby.nth(), whose return shape differs between
# pandas 1.x and 2.x.
# NOTE(review): selection is purely positional -- 'meta.is_question' / 'meta.is_answer'
# are loaded but never consulted. Confirm that frame order always puts the question first.
parliament_paired = parliament_us.dropna(subset=['meta.pair_idx'])
parliament_pos = parliament_paired.groupby('meta.pair_idx').cumcount()
parliament_q = parliament_paired[parliament_pos == 0]
parliament_a = parliament_paired[parliament_pos == 1]

# Merge dfs into one df for QA-pairs.
# Join on the pair key rather than concatenating side by side: a positional concat
# silently shifts every later answer onto the wrong question as soon as one pair
# has no second utterance.
parliament_qa = (
    pd.merge(
        parliament_q[['meta.pair_idx', 'text', 'idx']],
        parliament_a[['meta.pair_idx', 'text', 'idx']],
        on='meta.pair_idx',
        how='inner',
        suffixes=('_q', '_a'),
    )
    .rename(columns={'idx_q': 'id_q', 'idx_a': 'id_a'})
    .dropna()
    # Same row order as the previous groupby-based result (sorted by pair key),
    # so seeded sampling further down keeps picking rows from the same ordering.
    .sort_values('meta.pair_idx')
    .reset_index(drop=True)
)
parliament_qa.head(2)

Unnamed: 0,meta.pair_idx,text_q,id_q,text_a,id_a
0,1979-05-21.0.0,asked the Secretary of State for Trade if it i...,1979-05-21a.669.13,Does my right hon Friend think that it is a co...,1979-05-21a.670.2
1,1979-05-21.1.0,"In 1978 , the United Kingdom had a visible tra...",1979-05-21a.672.3,Does my hon Friend agree that the trading defi...,1979-05-21a.673.0


In [5]:
# Size check: (number of QA-pair rows, number of columns)
parliament_qa.shape

(216893, 5)

## Tennis corpus

In [14]:
# Fetch the Tennis corpus via ConvoKit's download helper, then load it
tennis_path = download("tennis-corpus")
tennis_corpus = Corpus(filename=tennis_path)

# Flatten all utterances into a single dataframe
tennis_us = tennis_corpus.get_utterances_dataframe()

Dataset already exists at C:\Users\polin\.convokit\downloads\tennis-corpus


In [21]:
# Get useful columns.
# .copy() makes an independent frame, so adding 'idx' below does not write into
# a slice of the raw utterance frame (avoids pandas' SettingWithCopyWarning).
tennis_us = tennis_us[['text', 'meta.is_question', 'meta.is_answer', 'meta.pair_idx']].copy()
tennis_us['idx'] = tennis_us.index  # keep the utterance id as a regular column

# Get dfs for Qs and As.
# The first utterance of each pair is taken as the question and the second as the
# answer, using the utterance's position inside its 'meta.pair_idx' group.
# cumcount() is used instead of groupby.nth(), whose return shape differs between
# pandas 1.x and 2.x.
# NOTE(review): selection is purely positional -- 'meta.is_question' / 'meta.is_answer'
# are loaded but never consulted. Confirm that frame order always puts the question first.
tennis_paired = tennis_us.dropna(subset=['meta.pair_idx'])
tennis_pos = tennis_paired.groupby('meta.pair_idx').cumcount()
tennis_q = tennis_paired[tennis_pos == 0]
tennis_a = tennis_paired[tennis_pos == 1]

# Merge dfs into one df for QA-pairs.
# Join on the pair key rather than concatenating side by side: a positional concat
# silently shifts every later answer onto the wrong question as soon as one pair
# has no second utterance.
tennis_qa = (
    pd.merge(
        tennis_q[['meta.pair_idx', 'text', 'idx']],
        tennis_a[['meta.pair_idx', 'text', 'idx']],
        on='meta.pair_idx',
        how='inner',
        suffixes=('_q', '_a'),
    )
    .rename(columns={'idx_q': 'id_q', 'idx_a': 'id_a'})
    .dropna()
    # Same row order as the previous groupby-based result (sorted by pair key),
    # so seeded sampling further down keeps picking rows from the same ordering.
    .sort_values('meta.pair_idx')
    .reset_index(drop=True)
)
tennis_qa.head(2)

Unnamed: 0,meta.pair_idx,text_q,id_q,text_a,id_a
0,0_0,That last set seemed like a faultless performa...,0_0.q,"Yeah, I served extremely well, and then the tw...",0_0.a
1,0_1,"Did playing the semifinal, finishing that off ...",0_1.q,"No, I don't think so. You know, it was an unfo...",0_1.a


In [22]:
# Size check: (number of QA-pair rows, number of columns)
tennis_qa.shape

(81974, 5)

## Reddit Coarse Discourse corpus

In [30]:
# Fetch the Reddit Coarse Discourse corpus via ConvoKit's download helper, then load it
coarse_path = download("reddit-coarse-discourse-corpus")
coarse_corpus = Corpus(filename=coarse_path)

# Flatten all utterances into a single dataframe
coarse_us = coarse_corpus.get_utterances_dataframe()

Dataset already exists at C:\Users\polin\.convokit\downloads\reddit-coarse-discourse-corpus


In [31]:
# Get useful columns.
# .copy() makes an independent frame, so adding 'idx' below does not write into
# a slice of the raw utterance frame (avoids pandas' SettingWithCopyWarning).
coarse_us = coarse_us[['text', 'meta.majority_type', 'meta.majority_link']].copy()
coarse_us['idx'] = coarse_us.index  # keep the utterance id as a regular column

# Get dfs for Qs and As, split on the majority-vote discourse label
coarse_q = coarse_us[coarse_us['meta.majority_type'] == 'question']
coarse_a = coarse_us[coarse_us['meta.majority_type'] == 'answer']

# Merge dfs into one df for QA-pairs.
# An answer is attached to the question whose id equals the answer's
# 'meta.majority_link'. One question can have several answers, so a question
# may appear in more than one row.
# Columns are renamed before the merge and the join uses the explicit id column
# instead of the index-level name, so the output schema does not depend on
# column positions.
coarse_qa = (
    coarse_q[['text', 'idx']]
    .rename(columns={'text': 'text_q', 'idx': 'id_q'})
    .merge(
        coarse_a[['text', 'idx', 'meta.majority_link']]
        .rename(columns={'text': 'text_a', 'idx': 'id_a'}),
        left_on='id_q',
        right_on='meta.majority_link',
        how='inner',
    )
    .drop(columns='meta.majority_link')
)
coarse_qa.head(2)

Unnamed: 0,text_q,id_q,text_a,id_a
0,I can't find them anywhere! I just want the mo...,t3_16h61h,"I saw several new ones yesterday, including a ...",t1_c7w4iik
1,I can't find them anywhere! I just want the mo...,t3_16h61h,"lots of pawn shops around me, Augusta GA, have...",t1_c7w8hlf


In [32]:
# Size check: (number of QA-pair rows, number of columns)
coarse_qa.shape

(39749, 4)

## Wikipedia Conversations Gone Awry corpus 

In [33]:
# Fetch the Conversations Gone Awry corpus via ConvoKit's download helper, then load it
awry_path = download("conversations-gone-awry-corpus")
awry_corpus = Corpus(filename=awry_path)

# Flatten all utterances into a single dataframe
awry_us = awry_corpus.get_utterances_dataframe()

Dataset already exists at C:\Users\polin\.convokit\downloads\conversations-gone-awry-corpus


In [34]:
# Drop section headers.
# The guard keeps this cell re-runnable: after the first run the header flag column
# has already been dropped (and the headers with it), so filtering again would
# raise a KeyError.
if 'meta.is_section_header' in awry_us.columns:
    awry_us = awry_us[awry_us['meta.is_section_header'] == False]

# Get useful columns.
# .copy() makes an independent frame, so adding 'idx' below does not write into
# a slice of the filtered frame (avoids pandas' SettingWithCopyWarning).
awry_us = awry_us[['text', 'reply_to', 'meta.comment_has_personal_attack']].copy()
awry_us['idx'] = awry_us.index  # keep the utterance id as a regular column

# Get dfs for candidate As
# Hypothesis: If a comment has a personal attack and replies to a Q, it could be A avoiding the Q.
# NOTE(review): only the personal-attack flag is checked here; nothing verifies that the
# parent comment is actually a question, so 'text_q' is simply the comment replied to.
awry_a = awry_us[awry_us['meta.comment_has_personal_attack']]

# Merge dfs into one df for QA-pairs.
# Each attacking comment is paired with the comment it replies to (parent id == 'reply_to').
# Columns are renamed before the merge and the join uses the explicit id column
# instead of the index-level name, so the output schema does not depend on
# column positions.
awry_qa = (
    awry_us[['text', 'idx']]
    .rename(columns={'text': 'text_q', 'idx': 'id_q'})
    .merge(
        awry_a[['text', 'idx', 'reply_to']]
        .rename(columns={'text': 'text_a', 'idx': 'id_a'}),
        left_on='id_q',
        right_on='reply_to',
        how='inner',
    )
    .drop(columns='reply_to')
)
awry_qa.head(2)

Unnamed: 0,text_q,id_q,text_a,id_a
0,Please stop removing and altering other editor...,144643838.1260.1236,Bullshit. I am correcting a simple mistake. I...,144645449.1479.1479
1,But it certainly blows holes in the argument b...,68188977.25580.25580,Get a grip Ron. First you confuse Iran and Ir...,68254097.25708.25708


In [35]:
# Size check: (number of candidate QA-pair rows, number of columns)
awry_qa.shape

(1493, 4)

## Custom corpus

In [44]:
# Create a fixed-seed sample from each dataset and tag every row with its source
# corpus (200 parliament + 150 tennis + 150 coarse-discourse pairs).
parliament_sample = parliament_qa.sample(n=200, random_state=1).assign(dataset='PQTC')
tennis_sample = tennis_qa.sample(n=150, random_state=1).assign(dataset='TI')
coarse_sample = coarse_qa.sample(n=150, random_state=1).assign(dataset='CDC')

# Combine into one dataset and shuffle.
# pd.concat replaces DataFrame.append, which was removed in pandas 2.0.
# coarse_qa has no 'meta.pair_idx' column, so that column is NaN for the CDC rows.
dataset = pd.concat(
    [parliament_sample, tennis_sample, coarse_sample],
    ignore_index=True,
    sort=False,
)
# Seed the shuffle as well, so the exported row order is reproducible across runs.
dataset = dataset.sample(frac=1, random_state=1).reset_index(drop=True)

In [46]:
# Write the combined dataset to a CSV file (relative path, so it lands in the
# current working directory); index=False leaves the row index out of the file.
dataset.to_csv('question_avoidance_dataset.csv', index=False)