
Commit

adding causal amortized topic model
Dhanya Sridhar authored and committed on Mar 5, 2020
1 parent 4b46faf commit 2966493
Showing 19 changed files with 1,919 additions and 0 deletions.
61 changes: 61 additions & 0 deletions src/supervised_lda/add_split_to_simulations.ipynb
@@ -0,0 +1,61 @@
{
"cells": [
{
"cell_type": "code",
"execution_count": 4,
"metadata": {},
"outputs": [],
"source": [
"import os\n",
"import numpy as np\n",
"import pandas as pd"
]
},
{
"cell_type": "code",
"execution_count": 8,
"metadata": {},
"outputs": [],
"source": [
"base_sim_dir = '../../dat/sim/'\n",
"datasets = ['reddit_subreddit_based/subreddits[13, 6, 8]', 'peerread_buzzytitle_based']\n",
"mode = 'modesimple'\n",
"\n",
"for dataset in datasets:\n",
" simdir = os.path.join(base_sim_dir, dataset, mode)\n",
" for simfile in os.listdir(simdir):\n",
" df = pd.read_csv(os.path.join(simdir, simfile), sep='\\t')\n",
" df['split'] = np.random.randint(0, 10, size=df.shape[0])\n",
" df.to_csv(os.path.join(simdir, simfile),sep='\\t')"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": []
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.6.8"
}
},
"nbformat": 4,
"nbformat_minor": 2
}
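As a reading aid, here is a minimal sketch (not part of the commit) of what the notebook cell above does, rewritten against the assign_split helper that this same commit adds in src/supervised_lda/helpers.py. The simulation directory shown is one of the two listed in the notebook, and the sketch assumes src/ is on the Python path so supervised_lda is importable.

# Sketch, not part of the commit: the split-assignment cell above, expressed via
# the assign_split helper added in src/supervised_lda/helpers.py by this commit.
import os
import pandas as pd
from supervised_lda.helpers import assign_split

simdir = '../../dat/sim/peerread_buzzytitle_based/modesimple'  # one of the dirs in the notebook
for simfile in os.listdir(simdir):
    path = os.path.join(simdir, simfile)
    df = pd.read_csv(path, sep='\t')
    df = assign_split(df, num_splits=10)  # adds a random 'split' column with values 0-9
    df.to_csv(path, sep='\t')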
51 changes: 51 additions & 0 deletions src/supervised_lda/compute_estimates.py
@@ -0,0 +1,51 @@
from semi_parametric_estimation.att import att_estimates
import numpy as np
import os
import argparse
import pandas as pd

def main():
outdir = os.path.join('..', 'out', args.data, args.experiment)
for sim in os.listdir(outdir):
mean_estimates = {'very_naive': [], 'q_only': [], 'plugin': [], 'one_step_tmle': [], 'aiptw': []}
for split in os.listdir(os.path.join(outdir, sim)):
            # If --num-splits is given, skip split directories beyond that count.
            if args.num_splits is not None and int(split) >= int(args.num_splits):
                continue
array = np.load(os.path.join(outdir, sim, split, 'predictions.npz'))
g = array['g']
q0 = array['q0']
q1 = array['q1']
y = array['y']
t = array['t']
estimates = att_estimates(q0, q1, g, t, y, t.mean(), truncate_level=0.03)
for est, att in estimates.items():
mean_estimates[est].append(att)

if args.data == 'reddit':
sim = sim.replace('beta01.0.', '')
options = sim.split('.0.')
p2 = options[0].replace('beta1', '')
p3 = options[1].replace('gamma', '')

print("------ Simulation setting: Confounding strength =", p2, "; Variance:", p3, "------")
print("True effect = 1.0")
else:
            ground_truth_map = {'1.0': 0.06, '5.0': 0.06, '25.0': 0.03}
            print("------ Simulation setting: Confounding strength =", sim, "------")
            print("True effect =", ground_truth_map[sim])


for est, atts in mean_estimates.items():
print('\t', est, np.round(np.mean(atts), 3), "+/-", np.round(np.std(atts),3))


if __name__ == '__main__':
parser = argparse.ArgumentParser()
parser.add_argument("--data", action="store", default="reddit")
parser.add_argument("--experiment", action="store", default="base_model")
parser.add_argument("--num-splits", action="store", default=None)
args = parser.parse_args()

main()
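For reference, a minimal sketch (not part of the commit) of the per-split predictions.npz file that compute_estimates.py consumes; peerread_output_att.py below writes exactly these keys, and the arrays here are random placeholders rather than real model output.

# Sketch, not part of the commit: the file layout read by compute_estimates.py.
# The arrays are random stand-ins; real values come from the model scripts below.
import numpy as np

n = 100
g = np.random.uniform(0.05, 0.95, n)   # propensity scores, P(treatment | text)
q0 = np.random.uniform(0.0, 1.0, n)    # predicted outcome under no treatment
q1 = np.random.uniform(0.0, 1.0, n)    # predicted outcome under treatment
t = np.random.binomial(1, g)           # treatment labels
y = np.where(t == 1, q1, q0)           # observed outcomes (noiseless placeholder)

np.savez_compressed('predictions.npz', g=g, q0=q0, q1=q1, t=t, y=y)
loaded = np.load('predictions.npz')
print(sorted(loaded.files))  # ['g', 'q0', 'q1', 't', 'y']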
87 changes: 87 additions & 0 deletions src/supervised_lda/helpers.py
@@ -0,0 +1,87 @@
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
from nltk.corpus import stopwords
from sklearn.feature_extraction.text import CountVectorizer
import numpy as np
import pandas as pd
from sklearn.decomposition import LatentDirichletAllocation

class LemmaTokenizer(object):
def __init__(self):
self.wnl = WordNetLemmatizer()
def __call__(self, articles):
stop = stopwords.words('english')
return [self.wnl.lemmatize(t) for t in word_tokenize(articles) if t.isalpha() and t not in stop]

def filter_by_subreddit(reddit, subs=None):
if not subs:
return reddit.index.values
else:
return reddit[reddit.subreddit.isin(subs)].index.values

def tokenize_documents(documents, max_df0=0.9, min_df0=0.0005):
    '''
    From a list of raw document texts, build a D x V count matrix, where
    D is the number of docs and V is the size of the vocabulary, i.e. the
    number of unique terms found in the whole set of docs.
    '''
    count_vect = CountVectorizer(tokenizer=LemmaTokenizer(), max_df=max_df0, min_df=min_df0)
    corpus = count_vect.fit_transform(documents)
    vocabulary = count_vect.get_feature_names()

    return corpus, vocabulary, count_vect

def assign_dev_split(num_docs, percentage=0.05):
indices = np.arange(num_docs)
np.random.shuffle(indices)
size = int(indices.shape[0]*percentage)
dev = indices[:size]
return dev

def learn_topics(X, X_dev, K=50):
lda = LatentDirichletAllocation(n_components=K, learning_method='online', verbose=1)
print("Fitting", K, "topics...")
lda.fit(X)
score = lda.perplexity(X_dev)
print("Log likelihood:", score)
topics = lda.components_
return score, lda, topics

def show_topics(vocab, topics, n_words=20):
topic_keywords = []
for topic_weights in topics:
top_keyword_locs = (-topic_weights).argsort()[:n_words]
topic_keywords.append(vocab.take(top_keyword_locs))

df_topic_keywords = pd.DataFrame(topic_keywords)
df_topic_keywords.columns = ['Word '+str(i) for i in range(df_topic_keywords.shape[1])]
df_topic_keywords.index = ['Topic '+str(i) for i in range(df_topic_keywords.shape[0])]
return df_topic_keywords

def filter_document_embeddings(filtered_df, doc_embeddings, index_mapping, on='post_index'):
filtered_indices = filtered_df[on].values
doc_idx = [index_mapping[idx] for idx in filtered_indices]
embeddings = doc_embeddings[doc_idx, :]
return embeddings

def filter_document_terms(filtered_df, counts, index_mapping, on='post_index'):
filtered_indices = filtered_df[on].values
doc_idx = [index_mapping[idx] for idx in filtered_indices]
filtered_counts = counts[doc_idx, :]
return filtered_counts

def make_index_mapping(df, on='post_index', convert_to_int=True):
if on=='index':
indices = df.index.values
else:
indices = df[on].values

if convert_to_int:
return {int(ind):i for (i,ind) in enumerate(indices)}

return {ind:i for (i,ind) in enumerate(indices)}

def assign_split(df, num_splits=10, col_to_add='split'):
df[col_to_add] = np.random.randint(0, num_splits, size=df.shape[0])
return df
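A minimal usage sketch (not part of the commit) chaining the helpers above on a toy corpus. The documents are placeholders, and the sketch assumes the NLTK corpora used by LemmaTokenizer (punkt, wordnet, stopwords) have already been downloaded.

# Sketch, not part of the commit: chaining tokenize_documents, assign_dev_split,
# learn_topics, and show_topics. Real inputs are PeerRead abstracts or Reddit posts.
import numpy as np
from supervised_lda.helpers import (
    tokenize_documents, assign_dev_split, learn_topics, show_topics)

docs = ["the cat sat on the mat"] * 50 + ["stock markets rallied after the report"] * 50
counts, vocab, _ = tokenize_documents(docs, max_df0=1.0, min_df0=0.0)

dev_idx = assign_dev_split(counts.shape[0], percentage=0.1)
score, lda, topics = learn_topics(counts, counts[dev_idx], K=2)
print(show_topics(np.array(vocab), topics, n_words=5))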
133 changes: 133 additions & 0 deletions src/supervised_lda/peerread_output_att.py
@@ -0,0 +1,133 @@
from semi_parametric_estimation.att import att_estimates
from supervised_lda.helpers import filter_document_terms, make_index_mapping, assign_split, tokenize_documents
import numpy as np
import pandas as pd
import os
from sklearn.metrics import mean_squared_error as mse
import argparse
import sys
from supervised_lda.supervised_topic_model import SupervisedTopicModel
from supervised_lda import run_supervised_tm
from scipy import sparse
from sklearn.linear_model import LogisticRegression, Ridge
from scipy.special import logit

def load_peerread(path='../dat/PeerRead/'):
return pd.read_csv(path + 'proc_abstracts.csv')

def load_term_counts(df, path='../dat/PeerRead/', force_redo=False, text_col='abstract_text'):
count_filename = path + 'term_counts'
vocab_filename = path + 'vocab'

if os.path.exists(count_filename + '.npz') and not force_redo:
return sparse.load_npz(count_filename + '.npz').toarray(), np.load(vocab_filename + '.npy')

post_docs = df[text_col].values
counts, vocab, _ = tokenize_documents(post_docs)
sparse.save_npz(count_filename, counts)
np.save(vocab_filename, vocab)
return counts.toarray(), np.array(vocab)

def compute_ground_truth_treatment_effect(df):
y1 = df['y1']
y0 = df['y0']
return y1.mean() - y0.mean()

def load_simulated_data():
sim_df = pd.read_csv(simulation_file, delimiter='\t')
return sim_df

def fit_model(doc_embeddings, labels, is_binary=False):
if is_binary:
model = LogisticRegression(solver='liblinear')
else:
model = Ridge()
model.fit(doc_embeddings, labels)
return model

def main():
if dat_dir:
peerread = load_peerread(path=dat_dir)
counts,vocab = load_term_counts(peerread,path=dat_dir)
else:
peerread = load_peerread()
counts,vocab = load_term_counts(peerread)

indices = peerread['paper_id'].values
index_mapping = make_index_mapping(peerread, on='index')

sim_df = load_simulated_data()

train_df = sim_df[sim_df.split != split]
predict_df = sim_df[sim_df.split == split]
tr_treatment_labels = train_df.treatment.values
tr_outcomes = train_df.outcome.values
predict_treatment = predict_df.treatment.values
predict_outcomes = predict_df.outcome.values

tr_counts = filter_document_terms(train_df, counts, index_mapping, on='id')
predict_counts = filter_document_terms(predict_df, counts, index_mapping, on='id')

num_documents = tr_counts.shape[0]
vocab_size = tr_counts.shape[1]
model = SupervisedTopicModel(num_topics, vocab_size, num_documents, outcome_linear_map=linear_outcome_model)

run_supervised_tm.train(model, tr_counts, tr_treatment_labels, tr_outcomes, dtype='binary',
num_epochs=num_iters, use_recon_loss=use_recon_loss, use_sup_loss=use_supervised_loss)

if use_supervised_loss:
propensity_score, expected_outcome_treat, expected_outcome_no_treat = run_supervised_tm.predict(model, predict_counts, dtype='binary')
else:
tr_doc_embeddings = run_supervised_tm.get_representation(model, tr_counts)
treated = tr_treatment_labels == 1
out_treat = tr_outcomes[treated]
out_no_treat = tr_outcomes[~treated]
q0_embeddings = tr_doc_embeddings[~treated,:]
q1_embeddings = tr_doc_embeddings[treated,:]
q0_model = fit_model(q0_embeddings, out_no_treat, is_binary=True)
q1_model = fit_model(q1_embeddings, out_treat, is_binary=True)
g_model = fit_model(tr_doc_embeddings, tr_treatment_labels, is_binary=True)

pred_doc_embeddings = run_supervised_tm.get_representation(model, predict_counts)
propensity_score = g_model.predict_proba(pred_doc_embeddings)[:,1]
expected_outcome_no_treat = q0_model.predict_proba(pred_doc_embeddings)[:,1]
expected_outcome_treat = q1_model.predict_proba(pred_doc_embeddings)[:,1]

out = os.path.join(outdir, str(split))
os.makedirs(out, exist_ok=True)
outfile = os.path.join(out, 'predictions')
np.savez_compressed(outfile, g=propensity_score, q0=expected_outcome_no_treat, q1=expected_outcome_treat, t=predict_treatment, y=predict_outcomes)

if __name__ == '__main__':
parser = argparse.ArgumentParser()
parser.add_argument("--dat-dir", action="store", default=None)
parser.add_argument("--outdir", action="store", default='../out/')
parser.add_argument("--sim-dir", action="store", default='../dat/sim/peerread_buzzytitle_based/')
parser.add_argument("--mode", action="store", default="simple")
parser.add_argument("--params", action="store", default="1.0")
parser.add_argument("--verbose", action='store_true')
parser.add_argument("--split", action='store', default=0)
parser.add_argument("--num-iters", action="store", default=3000)
parser.add_argument("--num-topics", action='store', default=100)
parser.add_argument("--linear-outcome-model", action='store', default="t")
parser.add_argument("--use-recon-loss", action='store', default="t")
parser.add_argument("--use-supervised-loss", action='store', default="t")
args = parser.parse_args()

sim_dir = args.sim_dir
outdir = args.outdir
dat_dir = args.dat_dir
verbose = args.verbose
params = args.params
sim_setting = 'beta00.25' + '.beta1' + params + '.gamma0.0'
mode = args.mode
simulation_file = sim_dir + '/mode' + mode + '/' + sim_setting + ".tsv"
num_topics = args.num_topics
split = int(args.split)
    linear_outcome_model = args.linear_outcome_model == "t"
    use_supervised_loss = args.use_supervised_loss == "t"
    use_recon_loss = args.use_recon_loss == "t"
num_iters = int(args.num_iters)
print(use_supervised_loss, use_recon_loss, linear_outcome_model)

main()
