In [None]:
#clone scholar repo, set up new environment, install packages
git clone git@github.com:dallascard/scholar.git

Open file

In [None]:
import os
import json
# Read the training file into a list of raw lines. `lines` is kept around
# because a later cell scans it to recover a document's text by id.
train_path = os.path.join('df_train.json')
with open(train_path) as train_file:
    lines = list(train_file)

# Each line is parsed as its own JSON object; show every field of the first one.
first_doc = json.loads(lines[0])
for field in first_doc:
    print(field, ':', first_doc[field])

Preprocessing
- `--min-doc-count`: minimum number of times a word has to appear across documents to be included
- `--label`: metadata fields to use as labels (see the paper)
- `--keep-alphanum True`: keep words that mix numbers and letters
- `--test df_test.json`: preprocess the test set simultaneously with the training set

In [None]:
# preprocess_data must be imported before its main() can be called; without
# this import the cell raises NameError on a fresh kernel.
import preprocess_data

script = 'preprocess_data.py'
# The argument string is split on whitespace below, so the --label fields must
# be comma-separated with NO spaces. Written as "type, star_rating, ..." only
# the token "type," would follow --label and the remaining fields would be
# passed as separate, unrelated tokens.
# NOTE(review): "True" after --keep-alphanum / --strip-html is also passed as
# its own token; confirm these options actually take a value.
args = ('df_train.json our_data --min-doc-count 4 '
        '--label type,star_rating,brand,usefulness '
        '--keep-alphanum True --strip-html True --test df_test.json')

# Echo the equivalent command line, then run the script's main() in-process.
print("python", script, args)
preprocess_data.main(args.split())

We can look at the vocab

In [None]:
# Read the training vocabulary from the preprocessing output folder
vocab_path = os.path.join('our_data', 'train.vocab.json')
with open(vocab_path) as vocab_file:
    vocab = json.load(vocab_file)

# Preview the first 20 entries, with '...' marking the truncation
print("First few words in the vocbulary:")
print([*vocab[:20], '...'])

Running model without meta-data

In [None]:
import run_scholar

script = 'run_scholar.py'
# One option per entry for readability; joined with single spaces this is the
# same command-line string as before.
args = ' '.join([
    'our_data/',
    '-k 9',
    '--epochs 100',
    '--dev-folds 10',
    '--seed 70',
])

# Echo the equivalent command line, then run the script's main() in-process
# and keep whatever it returns.
print("python", script, args)
model = run_scholar.main(args.split())

Check out most common words and their background frequencies



In [None]:
import json
import numpy as np

# Background log-frequencies stored under the 'bg' key of the archive
bg = np.load('output/bg.npz')['bg']

# Vocabulary saved in the output folder; indexed with the same positions as bg
with open('output/vocab.json') as vocab_file:
    vocab = json.load(vocab_file)

# Term indices ordered from lowest to highest log-frequency
order = np.argsort(bg)

# Walk backwards from the end of the ordering to list the 24 most common
# words; exponentiating turns each log-frequency back into a frequency.
for index in (order[-rank] for rank in range(1, 25)):
    print(vocab[index], np.exp(bg[index]))

Look at topics

In [None]:
from run_scholar import print_top_words

# Fetch the stored (K x V) topic matrix from its compressed numpy archive
beta_path = os.path.join('output', 'beta.npz')
beta = np.load(beta_path)['beta']

# n_pos / n_neg presumably set how many top positive / negative words are
# listed per topic -- confirm against run_scholar.print_top_words.
print_top_words(beta, vocab, n_pos=15, n_neg=5);

Run model with metadata

In [None]:
import run_scholar

script = 'run_scholar.py'
# One option per entry for readability; joined with single spaces this is the
# same command-line string as before.
args = ' '.join([
    'our_data/',
    '-k 9',
    '--epochs 100',
    '--dev-folds 10',
    '--seed 42',
    '--labels type',
    '--topic-covars star_rating',
    '--interaction star_rating',
    '--test-prefix test',
])

# Echo the equivalent command line, then run the script's main() in-process.
print("python", script, args)
run_scholar.main(args.split())

Print words

In [None]:
# Re-read the topic matrix from the output folder (presumably rewritten by the
# run above -- confirm) and list each topic's top words again.
beta_path = 'output/beta.npz'
beta = np.load(beta_path)['beta']
print_top_words(beta, vocab, n_pos=15, n_neg=5);

Vectors learned for each covariate level (star rating)



In [None]:
# np.load on an .npz returns an NpzFile that keeps the underlying file open;
# the context manager closes it as soon as the two arrays have been read.
with np.load(os.path.join('output', 'beta_c.npz')) as topic_covars:
    weights = topic_covars['beta']
    names = topic_covars['names']

# `names` is passed as the row labels for `weights`
# (presumably one row per covariate level -- confirm).
print_top_words(weights, vocab, topic_names=names, n_pos=10, n_neg=5);

Visualize things, first the topic distributions in documents

In [None]:

%matplotlib inline
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

# Load the matrix with topic proportions for each document (note that this
# excludes those in the dev set). The archive is closed as soon as the arrays
# are read so the file handle is not left open.
with np.load(os.path.join('output', 'theta.train.npz')) as npz:
    ids = npz['ids']
    theta = npz['theta']
n_docs, n_topics = theta.shape

# Row of theta (and position in ids) of the document to inspect
index = 45

# Plot the proportion of each topic in the selected document
fig, ax = plt.subplots()
ax.bar(range(n_topics), theta[index, :])
ax.set_xlabel('topic')
ax.set_ylabel('proportion')
ax.set_title('Topic proportions for document %s' % ids[index])

# Find the original line corresponding to this document, and display the text
print(ids[index])
for line in lines:
    doc = json.loads(line)
    if doc['id'] == ids[index]:
        print(doc['text'])
        break
else:
    # Make a failed lookup visible (e.g. an id type mismatch) instead of
    # silently printing nothing.
    print('No document with id', ids[index], 'found in the loaded lines')

Visualize topics over variables, starting with types


In [None]:
import pandas as pd
# Read the per-document type columns (each is tested with `== 1` below)
types_df = pd.read_csv(os.path.join('our_data', 'train.type.csv'), header=0, index_col=0)
types = types_df.columns

# Restrict to the rows whose ids were loaded with theta, in the same order
train_subset = types_df.loc[ids]
n_docs, n_types = train_subset.shape

# Stacked horizontal bars: one bar per type, one segment per topic
fig, ax = plt.subplots()
lefts = np.zeros(n_types)
bar_rows = range(n_types)
for k in range(n_topics):
    # mean proportion of topic k over the documents flagged with each type
    vals = [np.mean(theta[train_subset[typer] == 1, k]) for typer in types]
    ax.barh(bar_rows, vals, left=lefts)
    # advance the offsets so the next topic's segment starts where this one ends
    lefts += np.array(vals)

ax.set_yticks(bar_rows)
ax.set_yticklabels(types)
plt.show();

Plot topics across star rating



In [None]:
# Read the per-document star-rating columns (each is tested with `== 1` below)
stars_df = pd.read_csv(os.path.join('our_data', 'train.star_rating.csv'), header=0, index_col=0)
stars = stars_df.columns

# Restrict to the rows whose ids were loaded with theta, in the same order
train_subset = stars_df.loc[ids]
n_docs, n_stars = train_subset.shape

# Stacked horizontal bars: one bar per star rating, one segment per topic
fig, ax = plt.subplots()
lefts = np.zeros(n_stars)
bar_rows = range(n_stars)
for k in range(n_topics):
    # mean proportion of topic k over the documents flagged with each rating
    vals = [np.mean(theta[train_subset[star] == 1, k]) for star in stars]
    ax.barh(bar_rows, vals, left=lefts)
    # advance the offsets so the next topic's segment starts where this one ends
    lefts += np.array(vals)

ax.set_yticks(bar_rows)
ax.set_yticklabels(stars)
plt.show();

Topics over usefulness

In [None]:
# Read the per-document usefulness columns (each is tested with `== 1` below)
usefulness_df = pd.read_csv(os.path.join('our_data', 'train.usefulness.csv'), header=0, index_col=0)
usefulness = usefulness_df.columns

# Restrict to the rows whose ids were loaded with theta, in the same order
train_subset = usefulness_df.loc[ids]
n_docs, n_usefulness = train_subset.shape

# Stacked horizontal bars: one bar per usefulness level, one segment per topic
fig, ax = plt.subplots()
lefts = np.zeros(n_usefulness)
bar_rows = range(n_usefulness)
for k in range(n_topics):
    # mean proportion of topic k over the documents flagged with each level
    vals = [np.mean(theta[train_subset[useful] == 1, k]) for useful in usefulness]
    ax.barh(bar_rows, vals, left=lefts)
    # advance the offsets so the next topic's segment starts where this one ends
    lefts += np.array(vals)

ax.set_yticks(bar_rows)
ax.set_yticklabels(usefulness)
plt.show();

Plot topics across brands

In [None]:
# Read the per-document brand columns (each is tested with `== 1` below)
brands_df = pd.read_csv(os.path.join('our_data', 'train.brand.csv'), header=0, index_col=0)
brands = brands_df.columns

# Restrict to the rows whose ids were loaded with theta, in the same order
train_subset = brands_df.loc[ids]
n_docs, n_brands = train_subset.shape

# Stacked horizontal bars: one bar per brand, one segment per topic
fig, ax = plt.subplots()
lefts = np.zeros(n_brands)
bar_rows = range(n_brands)
for k in range(n_topics):
    # mean proportion of topic k over the documents flagged with each brand
    vals = [np.mean(theta[train_subset[brand] == 1, k]) for brand in brands]
    ax.barh(bar_rows, vals, left=lefts)
    # advance the offsets so the next topic's segment starts where this one ends
    lefts += np.array(vals)

ax.set_yticks(bar_rows)
ax.set_yticklabels(brands)
plt.show();

Look at which topics predict which type

In [None]:
# Read the topic-to-label probabilities and the label names. The NpzFile
# returned by np.load keeps the file open, so close it via the context manager
# once both arrays have been extracted.
with np.load('output/topics_to_labels.npz') as npz:
    probs = npz['probs']
    label_names = npz['label']
n_topics, n_labels = probs.shape

# Header row with the label names, then one row per topic: the topic index
# followed by its value for each label, formatted to four decimals.
print("Labels:", ' '.join([name for name in label_names]))
for k in range(n_topics):
    output = str(k) + ': ' + ''.join('%.4f ' % probs[k, i] for i in range(n_labels))
    print(output)