Commit
Dhanya Sridhar authored and committed on Mar 5, 2020
1 parent 4b46faf · commit 2966493
Showing 19 changed files with 1,919 additions and 0 deletions.
@@ -0,0 +1,61 @@
{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 4,
   "metadata": {},
   "outputs": [],
   "source": [
    "import os\n",
    "import numpy as np\n",
    "import pandas as pd"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 8,
   "metadata": {},
   "outputs": [],
   "source": [
    "base_sim_dir = '../../dat/sim/'\n",
    "datasets = ['reddit_subreddit_based/subreddits[13, 6, 8]', 'peerread_buzzytitle_based']\n",
    "mode = 'modesimple'\n",
    "\n",
    "# Tag every simulated dataset with a random 10-fold split assignment.\n",
    "for dataset in datasets:\n",
    "    simdir = os.path.join(base_sim_dir, dataset, mode)\n",
    "    for simfile in os.listdir(simdir):\n",
    "        df = pd.read_csv(os.path.join(simdir, simfile), sep='\\t')\n",
    "        df['split'] = np.random.randint(0, 10, size=df.shape[0])\n",
    "        # index=False keeps repeated runs from accumulating index columns\n",
    "        df.to_csv(os.path.join(simdir, simfile), sep='\\t', index=False)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.6.8"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}
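The split column written by the notebook above is what the per-split training runs key on: each run holds out one fold for prediction and trains on the rest. A minimal sketch of that downstream pattern (the TSV path shown is just the default simulation file assembled by the training script further down):

import pandas as pd

# Example path: default sim-dir, mode, and params of the training script below.
sim_file = '../dat/sim/peerread_buzzytitle_based/modesimple/beta00.25.beta11.0.gamma0.0.tsv'
sim_df = pd.read_csv(sim_file, delimiter='\t')

held_out = 0  # which of the 10 folds to predict on
train_df = sim_df[sim_df.split != held_out]
predict_df = sim_df[sim_df.split == held_out]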
@@ -0,0 +1,51 @@
from semi_parametric_estimation.att import att_estimates
import numpy as np
import os
import argparse


def main():
    outdir = os.path.join('..', 'out', args.data, args.experiment)
    for sim in os.listdir(outdir):
        mean_estimates = {'very_naive': [], 'q_only': [], 'plugin': [], 'one_step_tmle': [], 'aiptw': []}
        for split in os.listdir(os.path.join(outdir, sim)):
            # Optionally restrict aggregation to the first --num-splits splits.
            if args.num_splits is not None and int(split) >= int(args.num_splits):
                continue
            array = np.load(os.path.join(outdir, sim, split, 'predictions.npz'))
            g = array['g']
            q0 = array['q0']
            q1 = array['q1']
            y = array['y']
            t = array['t']
            estimates = att_estimates(q0, q1, g, t, y, t.mean(), truncate_level=0.03)
            for est, att in estimates.items():
                mean_estimates[est].append(att)

        if args.data == 'reddit':
            # Directory names look like 'beta01.0.beta1<strength>.0.gamma<variance>.0';
            # strip the fixed prefix and recover the two varying settings.
            sim = sim.replace('beta01.0.', '')
            options = sim.split('.0.')
            p2 = options[0].replace('beta1', '')
            p3 = options[1].replace('gamma', '')

            print("------ Simulation setting: Confounding strength =", p2, "; Variance:", p3, "------")
            print("True effect = 1.0")
        else:
            ground_truth_map = {'1.0': 0.06, '5.0': 0.06, '25.0': 0.03}
            print("------ Simulation setting: Confounding strength =", sim, "------")
            print("True effect =", ground_truth_map[sim])

        for est, atts in mean_estimates.items():
            print('\t', est, np.round(np.mean(atts), 3), "+/-", np.round(np.std(atts), 3))


if __name__ == '__main__':
    parser = argparse.ArgumentParser()
    parser.add_argument("--data", action="store", default="reddit")
    parser.add_argument("--experiment", action="store", default="base_model")
    parser.add_argument("--num-splits", action="store", default=None)
    args = parser.parse_args()

    main()
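Each predictions.npz that this script aggregates is written by the per-split training run (the final file in this commit). For reference, a minimal sketch of producing a file with a compatible layout — the arrays here are random placeholders, not model output:

import numpy as np

n = 100
g = np.random.uniform(0.05, 0.95, size=n)  # propensity scores, P(T=1 | document)
q0 = np.random.uniform(size=n)             # predicted outcome under no treatment
q1 = np.random.uniform(size=n)             # predicted outcome under treatment
t = np.random.binomial(1, g)               # observed treatment assignments
y = np.where(t == 1, q1, q0)               # observed outcomes

np.savez_compressed('predictions.npz', g=g, q0=q0, q1=q1, t=t, y=y)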
@@ -0,0 +1,87 @@
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
from nltk.corpus import stopwords
from sklearn.feature_extraction.text import CountVectorizer
import numpy as np
import pandas as pd
from sklearn.decomposition import LatentDirichletAllocation


class LemmaTokenizer(object):
    def __init__(self):
        self.wnl = WordNetLemmatizer()

    def __call__(self, articles):
        stop = stopwords.words('english')
        return [self.wnl.lemmatize(t) for t in word_tokenize(articles) if t.isalpha() and t not in stop]


def filter_by_subreddit(reddit, subs=None):
    if not subs:
        return reddit.index.values
    else:
        return reddit[reddit.subreddit.isin(subs)].index.values


def tokenize_documents(documents, max_df0=0.9, min_df0=0.0005):
    '''
    From a list of raw-text documents, build a D x V count matrix.
    D: number of docs
    V: size of the vocabulary, i.e. number of unique terms found in the whole set of docs
    '''
    count_vect = CountVectorizer(tokenizer=LemmaTokenizer(), max_df=max_df0, min_df=min_df0)
    corpus = count_vect.fit_transform(documents)
    vocabulary = count_vect.get_feature_names()  # renamed get_feature_names_out() in scikit-learn >= 1.0

    return corpus, vocabulary, count_vect


def assign_dev_split(num_docs, percentage=0.05):
    indices = np.arange(num_docs)
    np.random.shuffle(indices)
    size = int(indices.shape[0] * percentage)
    dev = indices[:size]
    return dev


def learn_topics(X, X_dev, K=50):
    lda = LatentDirichletAllocation(n_components=K, learning_method='online', verbose=1)
    print("Fitting", K, "topics...")
    lda.fit(X)
    score = lda.perplexity(X_dev)
    print("Held-out perplexity:", score)
    topics = lda.components_
    return score, lda, topics


def show_topics(vocab, topics, n_words=20):
    topic_keywords = []
    for topic_weights in topics:
        top_keyword_locs = (-topic_weights).argsort()[:n_words]
        topic_keywords.append(vocab.take(top_keyword_locs))

    df_topic_keywords = pd.DataFrame(topic_keywords)
    df_topic_keywords.columns = ['Word ' + str(i) for i in range(df_topic_keywords.shape[1])]
    df_topic_keywords.index = ['Topic ' + str(i) for i in range(df_topic_keywords.shape[0])]
    return df_topic_keywords


def filter_document_embeddings(filtered_df, doc_embeddings, index_mapping, on='post_index'):
    filtered_indices = filtered_df[on].values
    doc_idx = [index_mapping[idx] for idx in filtered_indices]
    embeddings = doc_embeddings[doc_idx, :]
    return embeddings


def filter_document_terms(filtered_df, counts, index_mapping, on='post_index'):
    filtered_indices = filtered_df[on].values
    doc_idx = [index_mapping[idx] for idx in filtered_indices]
    filtered_counts = counts[doc_idx, :]
    return filtered_counts


def make_index_mapping(df, on='post_index', convert_to_int=True):
    if on == 'index':
        indices = df.index.values
    else:
        indices = df[on].values

    if convert_to_int:
        return {int(ind): i for (i, ind) in enumerate(indices)}

    return {ind: i for (i, ind) in enumerate(indices)}


def assign_split(df, num_splits=10, col_to_add='split'):
    df[col_to_add] = np.random.randint(0, num_splits, size=df.shape[0])
    return df
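A short usage sketch chaining the helpers above — tokenize a toy corpus, carve out a dev set, fit LDA, and display the top words per topic. The documents are placeholders, and the NLTK 'punkt', 'wordnet', and 'stopwords' data must already be downloaded:

import numpy as np

docs = [
    "the cat sat on the mat",
    "dogs chase cats around the yard",
    "a short paper about topic models",
]  # placeholder corpus

corpus, vocab, count_vect = tokenize_documents(docs, min_df0=0.0)
dev = assign_dev_split(corpus.shape[0], percentage=0.34)
train_mask = np.ones(corpus.shape[0], dtype=bool)
train_mask[dev] = False

score, lda, topics = learn_topics(corpus[train_mask], corpus[dev], K=2)
print(show_topics(np.array(vocab), topics, n_words=3))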
@@ -0,0 +1,133 @@
from supervised_lda.helpers import filter_document_terms, make_index_mapping, tokenize_documents
import numpy as np
import pandas as pd
import os
import argparse
from supervised_lda.supervised_topic_model import SupervisedTopicModel
from supervised_lda import run_supervised_tm
from scipy import sparse
from sklearn.linear_model import LogisticRegression, Ridge


def load_peerread(path='../dat/PeerRead/'):
    return pd.read_csv(path + 'proc_abstracts.csv')


def load_term_counts(df, path='../dat/PeerRead/', force_redo=False, text_col='abstract_text'):
    count_filename = path + 'term_counts'
    vocab_filename = path + 'vocab'

    # Reuse cached term counts and vocabulary unless a rebuild is forced.
    if os.path.exists(count_filename + '.npz') and not force_redo:
        return sparse.load_npz(count_filename + '.npz').toarray(), np.load(vocab_filename + '.npy')

    post_docs = df[text_col].values
    counts, vocab, _ = tokenize_documents(post_docs)
    sparse.save_npz(count_filename, counts)
    np.save(vocab_filename, vocab)
    return counts.toarray(), np.array(vocab)


def compute_ground_truth_treatment_effect(df):
    y1 = df['y1']
    y0 = df['y0']
    return y1.mean() - y0.mean()


def load_simulated_data():
    sim_df = pd.read_csv(simulation_file, delimiter='\t')
    return sim_df


def fit_model(doc_embeddings, labels, is_binary=False):
    if is_binary:
        model = LogisticRegression(solver='liblinear')
    else:
        model = Ridge()
    model.fit(doc_embeddings, labels)
    return model


def main():
    if dat_dir:
        peerread = load_peerread(path=dat_dir)
        counts, vocab = load_term_counts(peerread, path=dat_dir)
    else:
        peerread = load_peerread()
        counts, vocab = load_term_counts(peerread)

    index_mapping = make_index_mapping(peerread, on='index')

    sim_df = load_simulated_data()

    # Hold out one split for prediction; train on the rest.
    train_df = sim_df[sim_df.split != split]
    predict_df = sim_df[sim_df.split == split]
    tr_treatment_labels = train_df.treatment.values
    tr_outcomes = train_df.outcome.values
    predict_treatment = predict_df.treatment.values
    predict_outcomes = predict_df.outcome.values

    tr_counts = filter_document_terms(train_df, counts, index_mapping, on='id')
    predict_counts = filter_document_terms(predict_df, counts, index_mapping, on='id')

    num_documents = tr_counts.shape[0]
    vocab_size = tr_counts.shape[1]
    model = SupervisedTopicModel(num_topics, vocab_size, num_documents, outcome_linear_map=linear_outcome_model)

    run_supervised_tm.train(model, tr_counts, tr_treatment_labels, tr_outcomes, dtype='binary',
                            num_epochs=num_iters, use_recon_loss=use_recon_loss, use_sup_loss=use_supervised_loss)

    if use_supervised_loss:
        propensity_score, expected_outcome_treat, expected_outcome_no_treat = run_supervised_tm.predict(model, predict_counts, dtype='binary')
    else:
        # Without the supervised loss, fit separate outcome (q0, q1) and propensity (g)
        # models on top of the learned document representations.
        tr_doc_embeddings = run_supervised_tm.get_representation(model, tr_counts)
        treated = tr_treatment_labels == 1
        out_treat = tr_outcomes[treated]
        out_no_treat = tr_outcomes[~treated]
        q0_embeddings = tr_doc_embeddings[~treated, :]
        q1_embeddings = tr_doc_embeddings[treated, :]
        q0_model = fit_model(q0_embeddings, out_no_treat, is_binary=True)
        q1_model = fit_model(q1_embeddings, out_treat, is_binary=True)
        g_model = fit_model(tr_doc_embeddings, tr_treatment_labels, is_binary=True)

        pred_doc_embeddings = run_supervised_tm.get_representation(model, predict_counts)
        propensity_score = g_model.predict_proba(pred_doc_embeddings)[:, 1]
        expected_outcome_no_treat = q0_model.predict_proba(pred_doc_embeddings)[:, 1]
        expected_outcome_treat = q1_model.predict_proba(pred_doc_embeddings)[:, 1]

    out = os.path.join(outdir, str(split))
    os.makedirs(out, exist_ok=True)
    outfile = os.path.join(out, 'predictions')
    np.savez_compressed(outfile, g=propensity_score, q0=expected_outcome_no_treat, q1=expected_outcome_treat, t=predict_treatment, y=predict_outcomes)


if __name__ == '__main__':
    parser = argparse.ArgumentParser()
    parser.add_argument("--dat-dir", action="store", default=None)
    parser.add_argument("--outdir", action="store", default='../out/')
    parser.add_argument("--sim-dir", action="store", default='../dat/sim/peerread_buzzytitle_based/')
    parser.add_argument("--mode", action="store", default="simple")
    parser.add_argument("--params", action="store", default="1.0")
    parser.add_argument("--verbose", action='store_true')
    parser.add_argument("--split", action='store', default=0)
    parser.add_argument("--num-iters", action="store", default=3000)
    parser.add_argument("--num-topics", action='store', default=100)
    parser.add_argument("--linear-outcome-model", action='store', default="t")
    parser.add_argument("--use-recon-loss", action='store', default="t")
    parser.add_argument("--use-supervised-loss", action='store', default="t")
    args = parser.parse_args()

    sim_dir = args.sim_dir
    outdir = args.outdir
    dat_dir = args.dat_dir
    verbose = args.verbose
    params = args.params
    sim_setting = 'beta00.25' + '.beta1' + params + '.gamma0.0'
    mode = args.mode
    simulation_file = os.path.join(sim_dir, 'mode' + mode, sim_setting + '.tsv')
    num_topics = int(args.num_topics)
    split = int(args.split)
    linear_outcome_model = args.linear_outcome_model == "t"
    use_supervised_loss = args.use_supervised_loss == "t"
    use_recon_loss = args.use_recon_loss == "t"
    num_iters = int(args.num_iters)
    if verbose:
        print('Flags:', use_supervised_loss, use_recon_loss, linear_outcome_model)

    main()
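Runs are launched once per held-out split; a hypothetical driver loop for the script above (the filename run_peerread.py is an assumption — substitute whatever name this file has in the repo):

import subprocess

# One training run per fold; each writes <outdir>/<split>/predictions.npz,
# which the aggregation script earlier in this commit then averages.
for split in range(10):
    subprocess.run(
        ['python', 'run_peerread.py',
         '--split', str(split),
         '--params', '1.0',
         '--use-supervised-loss', 't'],
        check=True,
    )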