In [12]:
import numpy as np
import pandas as pd
import rpy2

%load_ext rpy2.ipython

# Clipping bounds for responses: values are clipped into [ZERO, ONE] before
# the Beta regression, so that no response is exactly 0 or exactly 1.
ZERO = 1e-5
ONE = 1 - ZERO

The rpy2.ipython extension is already loaded. To reload it, use:
  %reload_ext rpy2.ipython


In [2]:
# Load the norming responses from the tab-separated data file.
norming = pd.read_csv(
    "mega-intensionality-norming-v1/mega-intensionality-norming-v1.tsv",
    sep="\t",
)

In [3]:
# Add columns to norming data that appear only in validation data
# cols_not_in_norming = set(validation.columns) - set(norming.columns)
# for c in cols_not_in_norming:
#     norming[c] = None

# Add a column to indicate the task (not technically needed)
norming["task"] = "norming"

# Clip responses for the Beta regression
norming["response"] = norming["response"].clip(ZERO, ONE)

# Scenario identifier of the form "<subject>-<verb>".
# Built with vectorized string concatenation instead of a row-wise
# `apply(..., axis=1)` that indexed each row as r[0] / r[1]: integer keys on a
# label-indexed Series are a deprecated positional lookup in recent pandas.
norming["scenario"] = norming["subject"] + "-" + norming["verb"]

# Just verifying that participant numbering starts at 0 for norming data
assert norming["participant"].min() == 0

norming.head()

Unnamed: 0,listid,participant,verb,subject,consequent,valence,question,response,task,scenario
0,0,96,notify,mechanic,want,negative,What proportion of mechanics generally want ca...,1e-05,norming,mechanic-notify
1,0,33,notify,mechanic,want,negative,What proportion of mechanics generally want ca...,1e-05,norming,mechanic-notify
2,0,2,notify,mechanic,want,negative,What proportion of mechanics generally want ca...,0.17,norming,mechanic-notify
3,0,59,notify,mechanic,want,negative,What proportion of mechanics generally want ca...,0.1,norming,mechanic-notify
4,0,29,notify,mechanic,want,negative,What proportion of mechanics generally want ca...,0.06,norming,mechanic-notify


In [4]:
# Load the validation responses from the tab-separated data file and preview them.
validation = pd.read_csv(
    "mega-intensionality-validation-v1/mega-intensionality-validation-v1.tsv",
    sep="\t",
)
validation.head()

Unnamed: 0,listid,participant,verb,subject,target,consequent,polarity,tense,valence,question,transitivity,response
0,4,210,know,executive,subject,want,positive,future,positive,The executive knew that his deal would go thro...,intransitive,0.9
1,4,319,know,executive,subject,want,positive,future,positive,The executive knew that his deal would go thro...,intransitive,0.88
2,4,7,know,executive,subject,want,positive,future,positive,The executive knew that his deal would go thro...,intransitive,0.97
3,4,259,know,executive,subject,want,positive,future,positive,The executive knew that his deal would go thro...,intransitive,0.81
4,4,308,know,executive,subject,want,positive,future,positive,The executive knew that his deal would go thro...,intransitive,0.9


In [5]:
# We need to treat norming and validation participants as distinct;
# Even though this is not strictly true, we have no way of knowing
# based on the data. Norming participant IDs are [0...99] and validation
# participant IDs are [0...419]. To enforce uniqueness across the
# datasets, we just add 1 + max(norming participant ID) to each participant
# ID in the validation data. We could do something similar for the list IDs,
# but we don't use them, and so they can be ignored.
# NOTE(review): the participant-count assertions later in this notebook total
# 420 across norming + validation, which suggests the validation IDs may span
# [0...319] rather than [0...419] -- confirm against the raw data.
max_norming_participant_id = norming["participant"].max()

# Guard against shifting the IDs twice: once this cell has run, the minimum is
# no longer 0, so re-running it without reloading `validation` fails here
# instead of silently offsetting the IDs again.
assert validation["participant"].min() == 0, (
    "expected validation participant IDs to start at 0 "
    "(reload `validation` before re-running this cell)"
)

# Vectorized offset (replaces an element-wise apply with the same arithmetic)
validation["participant"] = validation["participant"] + (max_norming_participant_id + 1)

# Add "task" column to easily distinguish task (contentful vs. templatic validation)
# (Again, not technically needed)
validation["task"] = np.where(validation["valence"] == "neutral", "templatic", "contentful")

# Column to identify the scenario, same as for norming: "<subject>-<verb>".
# Built with vectorized string concatenation instead of a row-wise apply that
# indexed each row as r[0] / r[1] (deprecated positional lookup on a
# label-indexed Series).
# (Does not apply in the templatic case --- hence the None values for these data)
validation["scenario"] = validation["subject"] + "-" + validation["verb"]
validation.loc[validation["task"] == "templatic", "scenario"] = None

# Clip responses for the Beta regression
validation["response"] = validation["response"].clip(ZERO, ONE)
validation.head()

Unnamed: 0,listid,participant,verb,subject,target,consequent,polarity,tense,valence,question,transitivity,response,task,scenario
0,4,310,know,executive,subject,want,positive,future,positive,The executive knew that his deal would go thro...,intransitive,0.9,contentful,executive-know
1,4,419,know,executive,subject,want,positive,future,positive,The executive knew that his deal would go thro...,intransitive,0.88,contentful,executive-know
2,4,107,know,executive,subject,want,positive,future,positive,The executive knew that his deal would go thro...,intransitive,0.97,contentful,executive-know
3,4,359,know,executive,subject,want,positive,future,positive,The executive knew that his deal would go thro...,intransitive,0.81,contentful,executive-know
4,4,408,know,executive,subject,want,positive,future,positive,The executive knew that his deal would go thro...,intransitive,0.9,contentful,executive-know


In [6]:
# Contentful half of the validation data: rows whose task is "contentful".
# The row count is printed, then the first rows are displayed.
validation_c = validation.loc[validation["task"] == "contentful"]
print(validation_c.shape[0])
validation_c.head()

7680


Unnamed: 0,listid,participant,verb,subject,target,consequent,polarity,tense,valence,question,transitivity,response,task,scenario
0,4,310,know,executive,subject,want,positive,future,positive,The executive knew that his deal would go thro...,intransitive,0.9,contentful,executive-know
1,4,419,know,executive,subject,want,positive,future,positive,The executive knew that his deal would go thro...,intransitive,0.88,contentful,executive-know
2,4,107,know,executive,subject,want,positive,future,positive,The executive knew that his deal would go thro...,intransitive,0.97,contentful,executive-know
3,4,359,know,executive,subject,want,positive,future,positive,The executive knew that his deal would go thro...,intransitive,0.81,contentful,executive-know
4,4,408,know,executive,subject,want,positive,future,positive,The executive knew that his deal would go thro...,intransitive,0.9,contentful,executive-know


In [7]:
# Templatic half of the validation data: rows whose task is "templatic".
# The row count is printed, then the first rows are displayed.
validation_t = validation.loc[validation["task"] == "templatic"]
print(validation_t.shape[0])
validation_t.head()

7680


Unnamed: 0,listid,participant,verb,subject,target,consequent,polarity,tense,valence,question,transitivity,response,task,scenario
7680,10,252,persuade,A,object,believe,negative,future,neutral,A didn't persuade B that C would happen.How li...,transitive,0.19,templatic,
7681,10,252,persuade,A,object,want,positive,future,neutral,A persuaded B that C would happen.How likely i...,transitive,0.42,templatic,
7682,10,362,persuade,A,object,believe,negative,future,neutral,A didn't persuade B that C would happen.How li...,transitive,0.26,templatic,
7683,10,362,persuade,A,object,want,positive,future,neutral,A persuaded B that C would happen.How likely i...,transitive,0.34,templatic,
7684,10,312,persuade,A,object,want,positive,future,neutral,A persuaded B that C would happen.How likely i...,transitive,0.5,templatic,


In [8]:
# Split the contentful, templatic, and norming dataframes by consequent
# ("want" vs. "believe").
w_c = validation_c.loc[validation_c["consequent"] == "want"]
w_t = validation_t.loc[validation_t["consequent"] == "want"]
w_n = norming.loc[norming["consequent"] == "want"]

b_c = validation_c.loc[validation_c["consequent"] == "believe"]
b_t = validation_t.loc[validation_t["consequent"] == "believe"]
b_n = norming.loc[norming["consequent"] == "believe"]

# Sanity check: for each consequent, the unique participant counts of the
# norming, templatic, and contentful subsets must add up to 420.
assert sum(frame["participant"].nunique() for frame in (w_n, w_t, w_c)) == 420
assert sum(frame["participant"].nunique() for frame in (b_n, b_t, b_c)) == 420

# Write each subset to its own CSV file (row index omitted).
outputs = {
    "want_contentful.csv": w_c,
    "want_templatic.csv": w_t,
    "want_norming.csv": w_n,
    "believe_contentful.csv": b_c,
    "believe_templatic.csv": b_t,
    "believe_norming.csv": b_n,
}
for filename, frame in outputs.items():
    frame.to_csv(filename, index=False)