In [266]:
import pandas as pd
from sklearn.cluster import DBSCAN
from sklearn.externals import joblib
from collections import Counter
from sklearn.metrics.pairwise import cosine_similarity
import numpy as np
from sklearn.cluster import AgglomerativeClustering
from sklearn.decomposition import NMF
from pymongo import MongoClient
from sklearn.feature_extraction.text import TfidfVectorizer 
import gensim
from gensim.models import phrases

def to_similarity_matrix(fitted):
    member_list = []
    for doc, topic in enumerate(list(fitted)):
        member_list.append([doc, topic, 1])
    df = pd.DataFrame(member_list, columns = ['doc', 'topic', 'member'])
    return df.pivot_table(df, index=['doc'], columns=['topic']).fillna(0)        

def compute_topic_keyword_correlation(topic_space, keyword_vector):
    sim = to_similarity_matrix(topic_space)
    cs_doc = cosine_similarity(sim)
    cs_key = cosine_similarity(keyword_vector)
    ut_doc = np.triu(cs_doc, k=1).flatten()
    ut_key = np.triu(cs_key, k=1).flatten()
    print(np.corrcoef(ut_doc, ut_key)[0,1])
    
def compute_similarities(doc_pkl, keyword_pkl):
    """ takes doc vectors and keyword vectors, computes cosine similarity and returns pearson correlation 
    between similarities. The higher the correltion, the better the topic model"""
    dtm_doc = joblib.load(doc_pkl)
    dtm_key = joblib.load(keyword_pkl)
    cs_doc = cosine_similarity(dtm_doc)
    cs_key = cosine_similarity(dtm_key)
    # in order to maintain single correlations and eliminate correlations with itself + flatten for calculating pearson
    ut_doc = np.triu(cs_doc, k=1).flatten()
    ut_key = np.triu(cs_key, k=1).flatten()
    return np.corrcoef(ut_doc, ut_key)[0,1]

In [3]:
dtm_count = joblib.load('countvector.pkl')
dtm_tfidf = joblib.load('tfidf.pkl')
dtm_lda = joblib.load('lda_counts.pkl')
lda_dict = joblib.load('lda_dict.pkl')
count_keywords = joblib.load('countvector_keywords.pkl')
tfidf_keywords = joblib.load('tfidf_keywords.pkl')

In [22]:
print(compute_similarities('countvector.pkl', 'countvector_keywords.pkl'))
print(compute_similarities('tfidf.pkl', 'tfidf_keywords.pkl'))

0.612346784837
0.408467061703


In [73]:
client = MongoClient()
db = client.lingbuzz
papers = db.get_collection('papers')
db_with_papers = papers.find({ 'paper': { '$exists': True } })
df = pd.DataFrame(list(papers.find({ 'paper': { '$exists': True } }, {'paper':1, 'title':1, 'keywords':1})))

In [75]:
indices_to_eliminate = sorted([223, 251, 253, 257, 260, 273, 462], reverse=True)
for index in indices_to_eliminate:
    df.drop(index, inplace=True)
df.head()

Unnamed: 0,_id,keywords,paper,title
0,598b44c407d7df07719383e2,"[czech passives, passive vs. past participles,...",ANALYTIC PASSIVES IN CZECH Ludmila Veselovs...,Analytic Passives in Czech
1,598b44c407d7df07719383e5,"[czech dp; universal dp, determiners; function...",UNIVERSAL DP-ANALYSIS IN ARTICLELESS LANGUAGE:...,The Universal DP Analysis in Articleless Langu...
2,598b44c407d7df07719383e8,"[sign language, strong pronouns, pointing, foc...",Strong Pronominals in ASL and LSF* Philippe ...,Strong Pronominals ASL and LSF (squib)
3,598b44c407d7df07719383f0,"[syntax, morphology, extended projections, sel...",THE UNIVERSITY OF CHICAGO INFLECTIONAL DEPEND...,Inflectional Dependencies. A study of complex ...
4,598b44c407d7df07719383fc,"[sluicing, ellipsis licensing, pair-list readi...","Multiple Sluicing, Scope, and Superiority: Con...","Multiple Sluicing, Scope, and Superiority: Con..."


## 1. DBSCAN

In [23]:
db = DBSCAN(eps = 0.6, min_samples=2, metric='cosine', algorithm='brute')
fit_db = db.fit_predict(dtm_count)
Counter(fit_db)

Counter({-1: 128, 0: 598, 1: 2, 2: 2, 3: 2, 4: 4, 5: 3, 6: 2, 7: 2})

In [24]:
compute_topic_keyword_correlation(fit_db, count_keywords)

0.465488775695


In [19]:
db2 = DBSCAN(eps = 0.8, min_samples=2, metric='cosine', n_jobs=-1, algorithm='brute')
fit_db2 = db2.fit_predict(dtm_tfidf)
Counter(fit_db2)

Counter({-1: 142,
         0: 541,
         1: 9,
         2: 3,
         3: 2,
         4: 3,
         5: 2,
         6: 2,
         7: 3,
         8: 2,
         9: 2,
         10: 6,
         11: 3,
         12: 3,
         13: 3,
         14: 2,
         15: 2,
         16: 2,
         17: 2,
         18: 2,
         19: 3,
         20: 2,
         21: 2})

In [20]:
compute_topic_keyword_correlation(fit_db2, count_keywords)

0.395949602801


## 2. Agglomerative clustering

In [163]:
hc = AgglomerativeClustering(n_clusters = 4, affinity = 'cosine', linkage='complete')

In [164]:
fitted = hc.fit_predict(dtm_tfidf.toarray())
Counter(fitted)

Counter({0: 5, 1: 7, 2: 59, 3: 672})

In [165]:
compute_topic_keyword_correlation(fitted, count_keywords)

0.58607942071


In [166]:
fitted2 = hc.fit_predict(dtm_count.toarray())
Counter(fitted2)

Counter({0: 3, 1: 723, 2: 15, 3: 2})

In [167]:
compute_topic_keyword_correlation(fitted2, count_keywords)

0.574515079469


Tfidf gives more balanced topics, but most papers are allocated to the same cluster.

## 3. NMF
50 topics:

In [46]:
nmf = NMF(n_components=50)
nmf.fit(dtm_tfidf)
transformed = nmf.transform(dtm_tfidf)

In [47]:
joblib.dump(transformed, 'nmf.pkl')

['nmf.pkl']

In [49]:
nmf = NMF(n_components=20)
nmf.fit(dtm_tfidf)
transformed = nmf.transform(dtm_tfidf)
joblib.dump(transformed, 'nmf_reduced.pkl')

['nmf_reduced.pkl']

In [53]:
feature_names = joblib.load('feature_names')

In [54]:
words = [(i, v) for i, v in enumerate(feature_names)]
for r in nmf.components_:
    a = sorted([(v, i) for i,v in enumerate(r)])[-7:]
    print([words[j[1]] for j in a])

[(19849, 'event'), (49752, 'structure'), (53839, 'unaccusative'), (55733, 'vp'), (10980, 'causative'), (6768, 'argument'), (55101, 'verb')]
[(40400, 'phonological'), (40433, 'phonology'), (13373, 'consonant'), (49625, 'stress'), (56919, 'word'), (50586, 'syllable'), (55706, 'vowel')]
[(26023, 'icelandic'), (36912, 'nom'), (22672, 'genitive'), (14936, 'dative'), (4695, 'accusative'), (36986, 'nominative'), (10807, 'case')]
[(48954, 'spell'), (24564, 'head'), (34229, 'merge'), (55733, 'vp'), (54675, 'v'), (35313, 'movement'), (40270, 'phase')]
[(49752, 'structure'), (21583, 'form'), (12972, 'compound'), (55101, 'verb'), (41771, 'prefix'), (36959, 'nominal'), (45615, 'root')]
[(8053, 'bantu'), (5341, 'agreement'), (35782, 'n'), (23427, 'grammatical_gender'), (37332, 'noun'), (20921, 'feature'), (22581, 'gender')]
[(54545, 'use'), (25649, 'human'), (31972, 'linguistic'), (31268, 'learner'), (34806, 'model'), (4785, 'acquisition'), (11438, 'child')]
[(41231, 'position'), (37522, 'null_subje

In [55]:
topic_space = [list(in_list).index(max(in_list)) for in_list in transformed]

In [56]:
Counter(topic_space)

Counter({0: 38,
         1: 56,
         2: 34,
         3: 64,
         4: 46,
         5: 13,
         6: 75,
         7: 25,
         8: 63,
         9: 31,
         10: 10,
         11: 30,
         12: 40,
         13: 33,
         14: 24,
         15: 15,
         16: 14,
         17: 54,
         18: 64,
         19: 14})

Looks pretty good. Next step: classify in 4 topics. Then classify again. Look into LDA.

In [315]:
nmf_super = NMF(n_components=6)
nmf_super.fit(dtm_tfidf)
transformed_super = nmf_super.transform(dtm_tfidf)

In [429]:
words = [(i, v) for i, v in enumerate(feature_names)]
relevant_words_super = []
for r in nmf_super.components_:
    a = sorted([(v, i) for i,v in enumerate(r)])[-20:]
    print([words[j[1]] for j in a])
    print('\n')
    relevant_words_super.append([words[j[1]] for j in a])
    

[(18911, 'english'), (10807, 'case'), (46607, 'scope'), (27542, 'interpretation'), (42537, 'property'), (31972, 'linguistic'), (47281, 'set'), (46360, 'say'), (52286, 'time'), (33983, 'meaning'), (19849, 'event'), (36334, 'negation'), (13521, 'context'), (48776, 'speaker'), (42487, 'pronoun'), (31840, 'like'), (54545, 'use'), (34794, 'modal'), (47048, 'semantic'), (47163, 'sentence')]


[(17594, 'e'), (49384, 'stem'), (35782, 'n'), (21583, 'form'), (51925, 'theory'), (13413, 'constraint'), (34806, 'model'), (52561, 'tone'), (28521, 'japanese'), (55555, 'voice'), (40373, 'phonetic'), (43585, 'r'), (20202, 'experiment'), (49625, 'stress'), (13373, 'consonant'), (40400, 'phonological'), (50586, 'syllable'), (40433, 'phonology'), (56919, 'word'), (55706, 'vowel')]


[(40078, 'person'), (55101, 'verb'), (17166, 'dp'), (4600, 'acc'), (6768, 'argument'), (14898, 'dat'), (37779, 'object'), (54675, 'v'), (5327, 'agree'), (36912, 'nom'), (19234, 'ergative'), (22672, 'genitive'), (20921, 'feature

In [201]:
# function for assigning topics to docs:
def assign_topic(df, transformed, fitted):
    doctopic = transformed
    # scale the document-component matrix such that the component values associated with each document sum to one
    doctopic = doctopic / np.sum(doctopic, axis=1, keepdims=True)
    titles = np.array(df.title)
    num_groups = len(set(titles))
    # make matrix with zeros of the right size
    doctopic_grouped = np.zeros((num_groups, len(fitted.components_)))
    for i, title in enumerate(sorted(set(titles))):
        doctopic_grouped[i, :] = np.mean(doctopic[titles == title], axis = 0)
    return doctopic_grouped

In [202]:
doc_topic = assign_topic(df, transformed_super, nmf_super)

In [203]:
topic_assignment = []
for i, title in enumerate(df.title):
    topic_assignment.append([title]+ list(doc_topic[i])) 
topic_df = pd.DataFrame(topic_assignment, columns = ['title']+ ['topic'+str(n) for n in range(doc_topic.shape[1])])

In [204]:
topic_df

Unnamed: 0,title,topic0,topic1,topic2,topic3,topic4,topic5
0,Analytic Passives in Czech,0.239846,0.219854,0.000000,0.279531,0.044093,0.216676
1,The Universal DP Analysis in Articleless Langu...,0.050512,0.000000,0.623094,0.326395,0.000000,0.000000
2,Strong Pronominals ASL and LSF (squib),0.440461,0.000000,0.086795,0.153694,0.000000,0.319050
3,Inflectional Dependencies. A study of complex ...,0.069382,0.002147,0.013085,0.867917,0.047469,0.000000
4,"Multiple Sluicing, Scope, and Superiority: Con...",0.276230,0.000000,0.430447,0.112341,0.000000,0.180982
5,Quantifier Domain Restriction as Ellipsis,0.087037,0.033598,0.094693,0.061134,0.244577,0.478960
6,Two disjunctions in Mandarin Chinese,0.362156,0.031027,0.000000,0.271608,0.000000,0.335208
7,Iconic Pragmatics,0.044344,0.164368,0.000000,0.446883,0.071364,0.273041
8,Moraic Onsets in Arrernte,0.000000,0.000000,0.000000,0.000000,0.015398,0.984602
9,Revolutionary New Ideas Appear Infrequently,0.268715,0.129339,0.080005,0.269500,0.252441,0.000000


Waaaw... This is great! I can recognise fields based on the most relevant words, and the titles match them pretty well...

In [205]:
joblib.dump(topic_df, 'topic_space_papers')

['topic_space_papers']

## Refining topic 0 and 3

### Creating the tfidf vectors for topic0 and topic3

In [267]:
import spacy

nlp = spacy.load('en_core_web_sm')
my_stopwords = joblib.load('stopwords')
bigrams = phrases.Phrases.load('bigrams.pkl', 'rb')
trigrams = phrases.Phrases.load('trigrams.pkl', 'rb')

def punct_space(token):
    """
    helper function to eliminate punctuation, spaces and numbers.
    """
    return token.is_punct or token.is_space or token.like_num

#def remove_stopwords(tigrammized):
#    no_stop = [[term for term in sent if term not in my_stopwords] for sent in trigrammized]
 #   return no_stop
    
def remove_stopwords(stuff):
    # gives list of strings. Vectorizer needs this.
    out = []
    for sent in stuff:
        for term in sent:
            if term not in my_stopwords:
                out.append(term)
    return out


def trigrammer(doc):
    tokens = nlp(doc)
    lemmas = lemmer(tokens)
    tokens_ = bigrams[lemmas]
    trigrammized = trigrams[tokens_]
    return [j for j in trigrammized]

def lemmer(tokens):
    """
    lemmatize words
    """
    word_space = []
    for sent in tokens.sents:
        sentence = []
        for token in sent:
            if not punct_space(token):
                if token.lemma_=='-PRON-':
                    sentence.append(token.lower_)
                else:
                    sentence.append(token.lemma_.strip('-'))
        word_space.append(sentence)
    return word_space

def my_tokenizer(doc):
    trigrammized = trigrammer(doc)
    no_stop = remove_stopwords(trigrammized)
    return no_stop

In [258]:
topic0 = []
for i, row in topic_df.iterrows():
    max_prob = row[1:].max()
    if row.topic0 == max_prob:
        topic0.append(row.title)

In [259]:
topic3 = []
for i, row in topic_df.iterrows():
    max_prob = row[1:].max()
    if row.topic3 == max_prob:
        topic3.append(row.title)

In [261]:
df_topic0 = df[df['title'].isin(topic0)]

In [262]:
df_topic3 = df[df['title'].isin(topic3)]

In [281]:
vectorizer = TfidfVectorizer(tokenizer=my_tokenizer)
tfidf_topic0_fit = vectorizer.fit(df_topic0.paper)
feature_names0 = tfidf_topic0_fit.get_feature_names()
tfidf_topic0 = tfidf_topic0_fit.transform(df_topic0.paper)

In [282]:
tfidf_topic3_fit = vectorizer.fit(df_topic3.paper)
feature_names3 = tfidf_topic3_fit.get_feature_names()
tfidf_topic3 = tfidf_topic3_fit.transform(df_topic3.paper)

### NFM topic 0

In [299]:
nmf_sub0 = NMF(n_components=6)
nmf_sub0.fit(tfidf_topic0)
transformed_sub0 = nmf_sub0.transform(tfidf_topic0)

In [430]:
words = [(i, v) for i, v in enumerate(feature_names0)]
relevant_words0 = []
for r in nmf_sub0.components_:
    a = sorted([(v, i) for i,v in enumerate(r)])[-20:]
    print([words[j[1]] for j in a])
    print('\n')
    relevant_words0.append([words[j[1]] for j in a])

[(30126, 'syntax'), (23759, 'phase_head'), (7908, 'cp'), (30107, 'syntactic'), (8895, 'derivation'), (5799, 'case'), (6227, 'chomsky'), (5479, 'c'), (32707, 'verb'), (22573, 'operation'), (29017, 'spell'), (9825, 'dp'), (33108, 'vp'), (20818, 'movement'), (29526, 'structure'), (14304, 'head'), (12033, 'feature'), (20117, 'merge'), (23755, 'phase'), (32457, 'v')]


[(27145, 'rule'), (17857, 'l'), (23831, 'phonetic'), (21068, 'n'), (7358, 'conjugation'), (18496, 'level'), (11588, 'experiment'), (12461, 'form'), (30787, 'theory'), (29284, 'stem'), (9964, 'duration'), (30057, 'syllable'), (27036, 'root'), (32994, 'voice'), (25755, 'r'), (33776, 'word'), (23854, 'phonological'), (7442, 'consonant'), (23883, 'phonology'), (33087, 'vowel')]


[(23326, 'past'), (19963, 'meaning'), (32707, 'verb'), (16187, 'interpretation'), (26991, 'role_shift'), (29214, 'state'), (5902, 'causation'), (28891, 'speaker'), (27461, 'say'), (7537, 'context'), (20959, 'mwangi'), (18673, 'like'), (32399, 'use'), (25

In [301]:
doc_topic0 = assign_topic(df_topic0, transformed_sub0, nmf_sub0)
topic_assignment0 = []
for i, title in enumerate(df_topic0.title):
    topic_assignment0.append([title]+ list(doc_topic0[i])) 
topi0c_df = pd.DataFrame(topic_assignment0, columns = ['title']+ ['topic'+str(n) for n in range(doc_topic0.shape[1])])

In [302]:
topi0c_df

Unnamed: 0,title,topic0,topic1,topic2,topic3,topic4,topic5
0,Strong Pronominals ASL and LSF (squib),0.242146,0.000000,0.000000,0.418682,0.060101,0.279070
1,Two disjunctions in Mandarin Chinese,0.080660,0.137321,0.099223,0.569188,0.075028,0.038580
2,Strict and non-strict negative concord in Hung...,0.000000,0.158650,0.104324,0.026457,0.674073,0.036495
3,On the unavailability of argument ellipsis in ...,0.018527,0.000000,0.000000,0.137416,0.048819,0.795239
4,Nominal ellipses,0.000000,0.000000,0.000000,0.422454,0.577546,0.000000
5,The loi de position and the acoustics of Frenc...,0.730148,0.000000,0.000000,0.000000,0.269852,0.000000
6,An Argument for Zwart’s Merge. Quotation as a ...,0.033654,0.134698,0.337066,0.224740,0.161457,0.108385
7,Why the null complementizer is special in the ...,0.000000,0.000000,0.000000,1.000000,0.000000,0.000000
8,"Events, States and Times. An essay on narrativ...",0.000000,0.056454,0.000000,0.000000,0.943546,0.000000
9,On the (un)interpretability of phi-agreement,0.633239,0.000000,0.000000,0.000000,0.329882,0.036879


In [313]:
joblib.dump(topi0c_df, 'topic_space_topic0')

['topic_space_topic0']

In [309]:
nmf_sub3 = NMF(n_components=5)
nmf_sub3.fit(tfidf_topic3)
transformed_sub3 = nmf_sub3.transform(tfidf_topic3)

In [431]:
words3 = [(i, v) for i, v in enumerate(feature_names3)]
relevant_words3 = []
for r in nmf_sub3.components_:
    a = sorted([(v, i) for i,v in enumerate(r)])[-20:]
    print([words3[j[1]] for j in a])
    print('\n')
    relevant_words3.append([words3[j[1]] for j in a])

[(22846, 'object'), (2275, 'agreement'), (3207, 'argument'), (9773, 'dp'), (7786, 'cp'), (6456, 'clause'), (2720, 'analysis'), (32425, 'tp'), (30549, 'structure'), (5324, 'c'), (12090, 'feature'), (24385, 'phase'), (11815, 'extraction'), (25005, 'position'), (14420, 'head'), (33725, 'v'), (34402, 'vp'), (33996, 'verb'), (30680, 'subject'), (21377, 'movement')]


[(18869, 'level'), (25561, 'principle'), (25627, 'probability'), (11449, 'evolution'), (31191, 'syntax'), (6128, 'chomsky'), (18673, 'learning'), (15140, 'human'), (5953, 'change'), (31926, 'theory'), (13684, 'grammar'), (31167, 'syntactic'), (30549, 'structure'), (19172, 'linguistic'), (33638, 'use'), (35157, 'word'), (6084, 'child'), (23745, 'parameter'), (1915, 'acquisition'), (21016, 'model')]


[(22740, 'numeral'), (9773, 'dp'), (16989, 'japanese'), (15331, 'icelandic'), (3207, 'argument'), (8284, 'dat'), (12090, 'feature'), (13158, 'gen'), (31154, 'syncretism'), (33725, 'v'), (20156, 'mark'), (2275, 'agreement'), (22317, 

In [311]:
doc_topic3 = assign_topic(df_topic3, transformed_sub3, nmf_sub3)
topic_assignment3 = []
for i, title in enumerate(df_topic3.title):
    topic_assignment3.append([title]+ list(doc_topic3[i])) 
topic3_df = pd.DataFrame(topic_assignment3, columns = ['title']+ ['topic'+str(n) for n in range(doc_topic3.shape[1])])
topic3_df

Unnamed: 0,title,topic0,topic1,topic2,topic3,topic4
0,Analytic Passives in Czech,0.780837,0.022183,0.005608,0.089887,0.101485
1,Inflectional Dependencies. A study of complex ...,0.102522,0.000000,0.212420,0.387286,0.297772
2,Iconic Pragmatics,0.338990,0.083755,0.000000,0.113451,0.463805
3,Revolutionary New Ideas Appear Infrequently,0.244317,0.201997,0.129545,0.385048,0.039092
4,"Back to the Future: Non-generation, filtration...",0.053216,0.000000,0.440742,0.310992,0.195050
5,AxParts and Case in Complex PPs: Microvariatio...,0.741820,0.000000,0.258180,0.000000,0.000000
6,Ma non era rosso? (But wasn’t it red?): On cou...,0.373946,0.133963,0.018506,0.421879,0.051705
7,Modeling syntactic acquisition,0.120584,0.017049,0.118059,0.639357,0.104951
8,The scope of alternatives: Indefiniteness and ...,0.165062,0.023044,0.388935,0.088507,0.334452
9,Iconic Plurality,0.092438,0.017918,0.142365,0.747279,0.000000


In [314]:
joblib.dump(topic3_df, 'topic_space_topic3')

['topic_space_topic3']

### JSON for visualisation: hierarchy

In [360]:
from collections import defaultdict
def lists_of_words(topic_words):
    out = defaultdict(list)
    for j, topic in enumerate(topic_words):
        out[j] = [topic[i][1] for i in range(len(topic))]
    return out


In [444]:
words_0 = []
for topic in relevant_words0: 
    new_value = []
    size = 20
    for word in topic:
        di = {'name': word[1], 'size': size}
        size -= 1
        new_value.append(di)
    words_0.append(new_value)

In [445]:
words_3 = []
for topic in relevant_words3: 
    new_value = []
    size = 20
    for word in topic:
        di = {'name': word[1], 'size': size}
        size -= 1
        new_value.append(di)
    words_3.append(new_value)

In [446]:
super_words = []
for topic in relevant_words_super: 
    new_value = []
    size = 20
    for word in topic:
        di = {'name': word[1], 'size': size}
        size -= 1
        new_value.append(di)
    super_words.append(new_value)

In [448]:
word_spaces = {'name': 'topics', 'children': [{'name':'interfaces', 'children': [{'name':'minimalism', 'children': words_0[0]},
                {'name':'experimental phonology', 'children': words_0[1]},
                {'name': 'tense & aspect', 'children': words_0[2]},
                {'name':'case', 'children': words_0[3]},
                {'name': 'quantification', 'children': words_0[4]},
                        {'name': 'ellipsis', 'children': words_0[5]}]},
               {'name':'phonology & phonetics', 'children': super_words[1]},
               {'name':'morphosyntax', 'children': super_words[2]},
               {'name':'syntax', 'children':  [{'name': 'syntax', 'children': words_3[0]},
                          {'name': 'generative framework', 'children': words_3[1]},
                          {'name':'nanosyntax', 'children': words_3[2]},
                          {'name':'semantics', 'children': words_3[3]},
                          {'name':'DP', 'children': words_3[4]}]},
               {'name':'distributed morphology', 'children': super_words[4]},
               {'name':'morphology', 'children': super_words[5]}]}

In [449]:
word_spaces

{'children': [{'children': [{'children': [{'name': 'syntax', 'size': 20},
      {'name': 'phase_head', 'size': 19},
      {'name': 'cp', 'size': 18},
      {'name': 'syntactic', 'size': 17},
      {'name': 'derivation', 'size': 16},
      {'name': 'case', 'size': 15},
      {'name': 'chomsky', 'size': 14},
      {'name': 'c', 'size': 13},
      {'name': 'verb', 'size': 12},
      {'name': 'operation', 'size': 11},
      {'name': 'spell', 'size': 10},
      {'name': 'dp', 'size': 9},
      {'name': 'vp', 'size': 8},
      {'name': 'movement', 'size': 7},
      {'name': 'structure', 'size': 6},
      {'name': 'head', 'size': 5},
      {'name': 'feature', 'size': 4},
      {'name': 'merge', 'size': 3},
      {'name': 'phase', 'size': 2},
      {'name': 'v', 'size': 1}],
     'name': 'minimalism'},
    {'children': [{'name': 'rule', 'size': 20},
      {'name': 'l', 'size': 19},
      {'name': 'phonetic', 'size': 18},
      {'name': 'n', 'size': 17},
      {'name': 'conjugation', 'size': 16

In [450]:
import json
with open('words_per_topic.json', 'w') as f:
    json.dump(word_spaces, f)