
Commit

adding causal amortized topic model
Dhanya Sridhar authored and committed on Mar 5, 2020
1 parent 4b46faf commit 2966493
Showing 19 changed files with 1,919 additions and 0 deletions.
61 changes: 61 additions & 0 deletions src/supervised_lda/add_split_to_simulations.ipynb
@@ -0,0 +1,61 @@
{
"cells": [
{
"cell_type": "code",
"execution_count": 4,
"metadata": {},
"outputs": [],
"source": [
"import os\n",
"import numpy as np\n",
"import pandas as pd"
]
},
{
"cell_type": "code",
"execution_count": 8,
"metadata": {},
"outputs": [],
"source": [
"base_sim_dir = '../../dat/sim/'\n",
"datasets = ['reddit_subreddit_based/subreddits[13, 6, 8]', 'peerread_buzzytitle_based']\n",
"mode = 'modesimple'\n",
"\n",
"for dataset in datasets:\n",
" simdir = os.path.join(base_sim_dir, dataset, mode)\n",
" for simfile in os.listdir(simdir):\n",
" df = pd.read_csv(os.path.join(simdir, simfile), sep='\\t')\n",
" df['split'] = np.random.randint(0, 10, size=df.shape[0])\n",
" df.to_csv(os.path.join(simdir, simfile),sep='\\t')"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": []
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.6.8"
}
},
"nbformat": 4,
"nbformat_minor": 2
}
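As a reading aid, here is a minimal sketch (not part of the commit) of what the notebook cell above does, rewritten against the assign_split helper that this same commit adds in src/supervised_lda/helpers.py. The simulation directory shown is one of the two listed in the notebook, and the sketch assumes src/ is on the Python path so supervised_lda is importable.

# Sketch, not part of the commit: the split-assignment cell above, expressed via
# the assign_split helper added in src/supervised_lda/helpers.py by this commit.
import os
import pandas as pd
from supervised_lda.helpers import assign_split

simdir = '../../dat/sim/peerread_buzzytitle_based/modesimple'  # one of the dirs in the notebook
for simfile in os.listdir(simdir):
    path = os.path.join(simdir, simfile)
    df = pd.read_csv(path, sep='\t')
    df = assign_split(df, num_splits=10)  # adds a random 'split' column with values 0-9
    df.to_csv(path, sep='\t')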
51 changes: 51 additions & 0 deletions src/supervised_lda/compute_estimates.py
@@ -0,0 +1,51 @@
from semi_parametric_estimation.att import att_estimates
import numpy as np
import os
import argparse
import pandas as pd

def main():
outdir = os.path.join('..', 'out', args.data, args.experiment)
for sim in os.listdir(outdir):
mean_estimates = {'very_naive': [], 'q_only': [], 'plugin': [], 'one_step_tmle': [], 'aiptw': []}
for split in os.listdir(os.path.join(outdir, sim)):
            # If --num-splits is given, skip split directories beyond that count.
            if args.num_splits is not None and int(split) >= int(args.num_splits):
                continue
array = np.load(os.path.join(outdir, sim, split, 'predictions.npz'))
g = array['g']
q0 = array['q0']
q1 = array['q1']
y = array['y']
t = array['t']
estimates = att_estimates(q0, q1, g, t, y, t.mean(), truncate_level=0.03)
for est, att in estimates.items():
mean_estimates[est].append(att)

if args.data == 'reddit':
sim = sim.replace('beta01.0.', '')
options = sim.split('.0.')
p2 = options[0].replace('beta1', '')
p3 = options[1].replace('gamma', '')

print("------ Simulation setting: Confounding strength =", p2, "; Variance:", p3, "------")
print("True effect = 1.0")
else:
            ground_truth_map = {'1.0': 0.06, '5.0': 0.06, '25.0': 0.03}
            print("------ Simulation setting: Confounding strength =", sim, "------")
            print("True effect =", ground_truth_map[sim])


for est, atts in mean_estimates.items():
print('\t', est, np.round(np.mean(atts), 3), "+/-", np.round(np.std(atts),3))


if __name__ == '__main__':
parser = argparse.ArgumentParser()
parser.add_argument("--data", action="store", default="reddit")
parser.add_argument("--experiment", action="store", default="base_model")
parser.add_argument("--num-splits", action="store", default=None)
args = parser.parse_args()

main()
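For reference, a minimal sketch (not part of the commit) of the per-split predictions.npz file that compute_estimates.py consumes; peerread_output_att.py below writes exactly these keys, and the arrays here are random placeholders rather than real model output.

# Sketch, not part of the commit: the file layout read by compute_estimates.py.
# The arrays are random stand-ins; real values come from the model scripts below.
import numpy as np

n = 100
g = np.random.uniform(0.05, 0.95, n)   # propensity scores, P(treatment | text)
q0 = np.random.uniform(0.0, 1.0, n)    # predicted outcome under no treatment
q1 = np.random.uniform(0.0, 1.0, n)    # predicted outcome under treatment
t = np.random.binomial(1, g)           # treatment labels
y = np.where(t == 1, q1, q0)           # observed outcomes (noiseless placeholder)

np.savez_compressed('predictions.npz', g=g, q0=q0, q1=q1, t=t, y=y)
loaded = np.load('predictions.npz')
print(sorted(loaded.files))  # ['g', 'q0', 'q1', 't', 'y']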
87 changes: 87 additions & 0 deletions src/supervised_lda/helpers.py
@@ -0,0 +1,87 @@
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
from nltk.corpus import stopwords
from sklearn.feature_extraction.text import CountVectorizer
import numpy as np
import pandas as pd
from sklearn.decomposition import LatentDirichletAllocation

class LemmaTokenizer(object):
def __init__(self):
self.wnl = WordNetLemmatizer()
def __call__(self, articles):
stop = stopwords.words('english')
return [self.wnl.lemmatize(t) for t in word_tokenize(articles) if t.isalpha() and t not in stop]

def filter_by_subreddit(reddit, subs=None):
if not subs:
return reddit.index.values
else:
return reddit[reddit.subreddit.isin(subs)].index.values

def tokenize_documents(documents, max_df0=0.9, min_df0=0.0005):
    '''
    From a list of raw document texts, build a D x V count matrix, where
    D is the number of docs and V is the size of the vocabulary, i.e. the
    number of unique terms found in the whole set of docs.
    '''
    count_vect = CountVectorizer(tokenizer=LemmaTokenizer(), max_df=max_df0, min_df=min_df0)
    corpus = count_vect.fit_transform(documents)
    vocabulary = count_vect.get_feature_names()

    return corpus, vocabulary, count_vect

def assign_dev_split(num_docs, percentage=0.05):
indices = np.arange(num_docs)
np.random.shuffle(indices)
size = int(indices.shape[0]*percentage)
dev = indices[:size]
return dev

def learn_topics(X, X_dev, K=50):
lda = LatentDirichletAllocation(n_components=K, learning_method='online', verbose=1)
print("Fitting", K, "topics...")
lda.fit(X)
score = lda.perplexity(X_dev)
print("Log likelihood:", score)
topics = lda.components_
return score, lda, topics

def show_topics(vocab, topics, n_words=20):
topic_keywords = []
for topic_weights in topics:
top_keyword_locs = (-topic_weights).argsort()[:n_words]
topic_keywords.append(vocab.take(top_keyword_locs))

df_topic_keywords = pd.DataFrame(topic_keywords)
df_topic_keywords.columns = ['Word '+str(i) for i in range(df_topic_keywords.shape[1])]
df_topic_keywords.index = ['Topic '+str(i) for i in range(df_topic_keywords.shape[0])]
return df_topic_keywords

def filter_document_embeddings(filtered_df, doc_embeddings, index_mapping, on='post_index'):
filtered_indices = filtered_df[on].values
doc_idx = [index_mapping[idx] for idx in filtered_indices]
embeddings = doc_embeddings[doc_idx, :]
return embeddings

def filter_document_terms(filtered_df, counts, index_mapping, on='post_index'):
filtered_indices = filtered_df[on].values
doc_idx = [index_mapping[idx] for idx in filtered_indices]
filtered_counts = counts[doc_idx, :]
return filtered_counts

def make_index_mapping(df, on='post_index', convert_to_int=True):
if on=='index':
indices = df.index.values
else:
indices = df[on].values

if convert_to_int:
return {int(ind):i for (i,ind) in enumerate(indices)}

return {ind:i for (i,ind) in enumerate(indices)}

def assign_split(df, num_splits=10, col_to_add='split'):
df[col_to_add] = np.random.randint(0, num_splits, size=df.shape[0])
return df
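A minimal usage sketch (not part of the commit) chaining the helpers above on a toy corpus. The documents are placeholders, and the sketch assumes the NLTK corpora used by LemmaTokenizer (punkt, wordnet, stopwords) have already been downloaded.

# Sketch, not part of the commit: chaining tokenize_documents, assign_dev_split,
# learn_topics, and show_topics. Real inputs are PeerRead abstracts or Reddit posts.
import numpy as np
from supervised_lda.helpers import (
    tokenize_documents, assign_dev_split, learn_topics, show_topics)

docs = ["the cat sat on the mat"] * 50 + ["stock markets rallied after the report"] * 50
counts, vocab, _ = tokenize_documents(docs, max_df0=1.0, min_df0=0.0)

dev_idx = assign_dev_split(counts.shape[0], percentage=0.1)
score, lda, topics = learn_topics(counts, counts[dev_idx], K=2)
print(show_topics(np.array(vocab), topics, n_words=5))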
133 changes: 133 additions & 0 deletions src/supervised_lda/peerread_output_att.py
@@ -0,0 +1,133 @@
from semi_parametric_estimation.att import att_estimates
from supervised_lda.helpers import filter_document_terms, make_index_mapping, assign_split, tokenize_documents
import numpy as np
import pandas as pd
import os
from sklearn.metrics import mean_squared_error as mse
import argparse
import sys
from supervised_lda.supervised_topic_model import SupervisedTopicModel
from supervised_lda import run_supervised_tm
from scipy import sparse
from sklearn.linear_model import LogisticRegression, Ridge
from scipy.special import logit

def load_peerread(path='../dat/PeerRead/'):
return pd.read_csv(path + 'proc_abstracts.csv')

def load_term_counts(df, path='../dat/PeerRead/', force_redo=False, text_col='abstract_text'):
count_filename = path + 'term_counts'
vocab_filename = path + 'vocab'

if os.path.exists(count_filename + '.npz') and not force_redo:
return sparse.load_npz(count_filename + '.npz').toarray(), np.load(vocab_filename + '.npy')

post_docs = df[text_col].values
counts, vocab, _ = tokenize_documents(post_docs)
sparse.save_npz(count_filename, counts)
np.save(vocab_filename, vocab)
return counts.toarray(), np.array(vocab)

def compute_ground_truth_treatment_effect(df):
y1 = df['y1']
y0 = df['y0']
return y1.mean() - y0.mean()

def load_simulated_data():
sim_df = pd.read_csv(simulation_file, delimiter='\t')
return sim_df

def fit_model(doc_embeddings, labels, is_binary=False):
if is_binary:
model = LogisticRegression(solver='liblinear')
else:
model = Ridge()
model.fit(doc_embeddings, labels)
return model

def main():
if dat_dir:
peerread = load_peerread(path=dat_dir)
counts,vocab = load_term_counts(peerread,path=dat_dir)
else:
peerread = load_peerread()
counts,vocab = load_term_counts(peerread)

indices = peerread['paper_id'].values
index_mapping = make_index_mapping(peerread, on='index')

sim_df = load_simulated_data()

train_df = sim_df[sim_df.split != split]
predict_df = sim_df[sim_df.split == split]
tr_treatment_labels = train_df.treatment.values
tr_outcomes = train_df.outcome.values
predict_treatment = predict_df.treatment.values
predict_outcomes = predict_df.outcome.values

tr_counts = filter_document_terms(train_df, counts, index_mapping, on='id')
predict_counts = filter_document_terms(predict_df, counts, index_mapping, on='id')

num_documents = tr_counts.shape[0]
vocab_size = tr_counts.shape[1]
model = SupervisedTopicModel(num_topics, vocab_size, num_documents, outcome_linear_map=linear_outcome_model)

run_supervised_tm.train(model, tr_counts, tr_treatment_labels, tr_outcomes, dtype='binary',
num_epochs=num_iters, use_recon_loss=use_recon_loss, use_sup_loss=use_supervised_loss)

if use_supervised_loss:
propensity_score, expected_outcome_treat, expected_outcome_no_treat = run_supervised_tm.predict(model, predict_counts, dtype='binary')
else:
tr_doc_embeddings = run_supervised_tm.get_representation(model, tr_counts)
treated = tr_treatment_labels == 1
out_treat = tr_outcomes[treated]
out_no_treat = tr_outcomes[~treated]
q0_embeddings = tr_doc_embeddings[~treated,:]
q1_embeddings = tr_doc_embeddings[treated,:]
q0_model = fit_model(q0_embeddings, out_no_treat, is_binary=True)
q1_model = fit_model(q1_embeddings, out_treat, is_binary=True)
g_model = fit_model(tr_doc_embeddings, tr_treatment_labels, is_binary=True)

pred_doc_embeddings = run_supervised_tm.get_representation(model, predict_counts)
propensity_score = g_model.predict_proba(pred_doc_embeddings)[:,1]
expected_outcome_no_treat = q0_model.predict_proba(pred_doc_embeddings)[:,1]
expected_outcome_treat = q1_model.predict_proba(pred_doc_embeddings)[:,1]

out = os.path.join(outdir, str(split))
os.makedirs(out, exist_ok=True)
outfile = os.path.join(out, 'predictions')
np.savez_compressed(outfile, g=propensity_score, q0=expected_outcome_no_treat, q1=expected_outcome_treat, t=predict_treatment, y=predict_outcomes)

if __name__ == '__main__':
parser = argparse.ArgumentParser()
parser.add_argument("--dat-dir", action="store", default=None)
parser.add_argument("--outdir", action="store", default='../out/')
parser.add_argument("--sim-dir", action="store", default='../dat/sim/peerread_buzzytitle_based/')
parser.add_argument("--mode", action="store", default="simple")
parser.add_argument("--params", action="store", default="1.0")
parser.add_argument("--verbose", action='store_true')
parser.add_argument("--split", action='store', default=0)
parser.add_argument("--num-iters", action="store", default=3000)
parser.add_argument("--num-topics", action='store', default=100)
parser.add_argument("--linear-outcome-model", action='store', default="t")
parser.add_argument("--use-recon-loss", action='store', default="t")
parser.add_argument("--use-supervised-loss", action='store', default="t")
args = parser.parse_args()

sim_dir = args.sim_dir
outdir = args.outdir
dat_dir = args.dat_dir
verbose = args.verbose
params = args.params
sim_setting = 'beta00.25' + '.beta1' + params + '.gamma0.0'
mode = args.mode
simulation_file = sim_dir + '/mode' + mode + '/' + sim_setting + ".tsv"
num_topics = args.num_topics
split = int(args.split)
    linear_outcome_model = args.linear_outcome_model == "t"
    use_supervised_loss = args.use_supervised_loss == "t"
    use_recon_loss = args.use_recon_loss == "t"
num_iters = int(args.num_iters)
print(use_supervised_loss, use_recon_loss, linear_outcome_model)

main()
