# Preparing the data for topic modelling

* Tokenize
* phrase formation for frequent collocations
* remove stopwords
* vectorize

This will be done for both the papers and the keywords, in order to be able to evaluate the recommendation system later.

In [44]:
from sklearn.feature_extraction.text import TfidfVectorizer 
from sklearn.feature_extraction.text import CountVectorizer 
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.preprocessing import Normalizer
from pymongo import MongoClient
import pandas as pd
import spacy
import gensim
from gensim.models import phrases
from gensim.utils import lemmatize
import en_core_web_sm
from sklearn.feature_extraction.stop_words import ENGLISH_STOP_WORDS
from nltk.corpus import stopwords
import pickle
from sklearn.externals import joblib
from nltk.stem.snowball import SnowballStemmer

In [45]:
client = MongoClient()
db = client.lingbuzz
papers = db.get_collection('papers')

In [83]:
db_with_papers = papers.find({ 'paper': { '$exists': True } })
df = pd.DataFrame(list(papers.find({ 'paper': { '$exists': True } }, {'paper':1, 'title':1})))

In [84]:
df.head()

Unnamed: 0,_id,paper,title
0,598b44c407d7df07719383e2,ANALYTIC PASSIVES IN CZECH Ludmila Veselovs...,Analytic Passives in Czech
1,598b44c407d7df07719383e5,UNIVERSAL DP-ANALYSIS IN ARTICLELESS LANGUAGE:...,The Universal DP Analysis in Articleless Langu...
2,598b44c407d7df07719383e8,Strong Pronominals in ASL and LSF* Philippe ...,Strong Pronominals ASL and LSF (squib)
3,598b44c407d7df07719383f0,THE UNIVERSITY OF CHICAGO INFLECTIONAL DEPEND...,Inflectional Dependencies. A study of complex ...
4,598b44c407d7df07719383fc,"Multiple Sluicing, Scope, and Superiority: Con...","Multiple Sluicing, Scope, and Superiority: Con..."


I want to maintain frequent collocations of terms (to keep things like 'distributed morphology', 'relativized minimality' etc. In order to do that:
* make a gensim phrase model that can identify bigrams and trigrams, based on all of the data.
* make a personalized tokenizer that tokenizes the docs and removes stopwords. This list of tokens is then run through the gensim phrase model to recover bi and trigrams.
* specify in the vectorizer that this personalized tokenizer should be used.

## Tokenize and vectorize the papers

In [5]:
nlp = en_core_web_sm.load()

__NOTE__: in linguistics papers, several 'stopwords' should not be considered stopwords, because they are meaningful in combination with other words. Ex:
* do-support
* it-cleft
* inversion around be

Solution: create my personalized linguistic paper tokenizer:
* create trigramer on lemmatized corpus with all stopwords included that makes bigrams our trigrams out of frequent collocations with gensim.Phrases. Set threshold high enough in order to avoid 'it is' and other frequent bigrams to be considered bigrams.
* tokenize, lemmatize each document
* run the trigrammer on document
* remove stopwords

This personalized tokenizer will be used to make the countvectors and tfidf vectors.

In [136]:
def build_trigram_model(corpus):
    corpus = lemmer(nlp(corpus))
    corpus = remove_stopwords2(corpus)
    bigram = phrases.Phrases(corpus, min_count=20, threshold=17)
    bigram_phraser = phrases.Phraser(bigram)
    trigram = phrases.Phrases(bigram_phraser[corpus], min_count=20, threshold=17)
    trigram_phraser = phrases.Phraser(trigram)
    return bigram_phraser, trigram_phraser

def punct_space(token):
    """
    helper function to eliminate punctuation, spaces and numbers.
    """
    return token.is_punct or token.is_space or token.like_num

#def remove_stopwords(tigrammized):
#    no_stop = [[term for term in sent if term not in my_stopwords] for sent in trigrammized]
 #   return no_stop
    
def remove_stopwords(stuff):
    # gives list of strings. Vectorizer needs this.
    out = []
    for sent in stuff:
        for term in sent:
            if term not in my_stopwords:
                out.append(term)
    return out

def remove_stopwords2(stuff):
    # gives list of list of strings. Phraser needs this.
    out = []
    for sent in stuff:
        out.append([term for term in sent if term not in my_stopwords])
    return out

def trigrammer(doc):
    tokens = nlp(doc)
    lemmas = lemmer(tokens)
    tokens_ = bigrams[lemmas]
    trigrammized = trigrams[tokens_]
    return [j for j in trigrammized]

def lemmer(tokens):
    """
    lemmatize words
    """
    word_space = []
    for sent in tokens.sents:
        sentence = []
        for token in sent:
            if not punct_space(token):
                if token.lemma_=='-PRON-':
                    sentence.append(token.lower_)
                else:
                    sentence.append(token.lemma_.strip('-'))
        word_space.append(sentence)
    return word_space

def my_tokenizer(doc):
    trigrammized = trigrammer(doc)
    no_stop = remove_stopwords(trigrammized)
    return no_stop

In [7]:
# for building the trigram model
corpus = str()
for text in df.paper:
    corpus += text

## Stopwords

In [106]:
my_stopwords = joblib.load('stopwords')

In [107]:
len(my_stopwords)

280906

In [None]:
corpus = str()
for text in df.paper:
    corpus += text

In [None]:
counts = Counter(remove_stopwords(lemmer(nlp(corpus))))

In [None]:
counts.most_common(200)[:50]

In [None]:
# based on common gibberish
stopwords2 = ['a.', 'b.', '’s', '=', 'john', 'c.', '+', '>', 'e.g.', 'i.e.', 'cf', '<', 'd.', 'e.', 'f.']

In [None]:
for w, c in list(counts.items()):
    if counts[w] < 10:
        stopwords2.append(w)

In [None]:
my_stopwords = set(list(my_stopwords) + stopwords2)

In [None]:
joblib.dump(my_stopwords, 'stopwords')

In [133]:
my_stopwords

{'benahm]i',
 'tr’al',
 '374a',
 'contraejemplo',
 'http://dx.doi.org/10.3765/sp.5.8',
 'r:.i',
 'sikerült',
 'ohtaki',
 '\u242a3',
 'denominadas',
 'circle.of.walls.f.pl',
 'f.sg.ptc)’',
 'jinling',
 'nobilitate',
 'v)(λv.↓kv',
 '1[full',
 'pp.31',
 '22thus',
 '205–217',
 '271-\xad‐311',
 'deﬁnitely.12',
 '111(4',
 'mischpisch',
 'é∑l\x00⇤',
 'report48',
 'ctudenti',
 'di1:i1',
 'submitted',
 'goldsmith-',
 'overwrite’.',
 'deleted.4',
 '他们全家人都有着相同的发质。',
 'closure.’',
 'change.1',
 '-referentially',
 'gêneront/rendent',
 'iszik',
 'exc(a)(w',
 'l=',
 'kuvas',
 '’polysemy',
 'meselor',
 'a775',
 'stell-=as',
 'fundamentalism',
 'ludovici',
 'h/she',
 'φ-defectiveness',
 'jedes',
 'udviklingen',
 'omitted',
 'shuﬄed',
 'author57',
 'romani',
 'generalization(s',
 'mężczyznom',
 'additive/inclusive',
 '2004:275–279',
 'volbrengen',
 '-n+ých',
 'p´otnik',
 'features.6',
 'di.xuan@yahoo.com',
 'phonoloc;y',
 'cewi',
 'algae',
 'interbreeding',
 'contração-!.<.',
 'occasion.26',
 'minimalit

## Phraser

In [8]:
bigram_phraser, trigram_phraser = build_trigram_model(corpus)

In [138]:
with open('trigrams.pkl', 'wb') as pkl:
    trigram_phraser.save(pkl)
with open('bigrams.pkl', 'wb') as pkl2:
    bigram_phraser.save(pkl2)

In [141]:
bigrams = phrases.Phrases.load('bigrams.pkl', 'rb')
trigrams = phrases.Phrases.load('trigrams.pkl', 'rb')

In [142]:
bigrams.phrasegrams

{(b'1a', b'1b'): (48, 249.00439312162672),
 (b'truth', b'value'): (246, 45.069528771397465),
 (b'ii', b'iii'): (146, 28.988531447433214),
 (b'non', b'finite'): (356, 28.81303841766284),
 (b'lexical', b'item'): (1319, 58.272768231972144),
 (b'verb', b'b\xc3\xbdt'): (33, 21.589685474834962),
 (b'b\xc3\xbdt', b'be\xe2\x80\x99'): (19, 1284.1169447640966),
 (b'linear', b'order'): (340, 40.57069800805949),
 (b'20th', b'century'): (40, 1185.9265901482545),
 (b'generative', b'grammar'): (530, 91.73478489061478),
 (b'czech', b'analytic'): (25, 109.6019933481398),
 (b'analytic', b'passive'): (41, 77.83961825925715),
 (b'linguistic', b'theory'): (1656, 19.991949916797463),
 (b'v1', b'v2'): (155, 338.9950951507752),
 (b'\xef\x83\x9e', b'b'): (18, 21.96007195240072),
 (b'subordinate', b'clause'): (449, 54.15121718913696),
 (b'detailed', b'discussion'): (181, 74.88769656115117),
 (b'widely', b'accept'): (56, 169.60436390670012),
 (b'prepositional', b'phrase'): (200, 30.46935485957073),
 (b'remain', 

In [12]:
num_partitions = 15 #number of partitions to split dataframe
num_cores = 15 #number of cores on your machine

def parallelize_dataframe(df, func):
    df_split = np.array_split(df, num_partitions)
    pool = Pool(num_cores)
    df = pd.concat(pool.map(func, df_split))
    pool.close()
    pool.join()
    return df

## Vectorizing etc.

During EDA, it became clear that some papers were not properly converted (probably due to encodings). These will be eliminated from further analysis.

In [93]:
indices_to_eliminate = [223, 251, 253, 257, 260, 273, 462]

These indices are positional. So if we drop them, we want to start with the highest number, otherwise we will start dropping the wrong rows.

In [94]:
indices_to_eliminate = sorted(indices_to_eliminate, reverse=True)

In [95]:
for index in indices_to_eliminate:
    df.drop(index, inplace=True)

In [137]:
my_tokenizer(df.paper[450])

['appear',
 'natural',
 'linguistic_theory',
 'morpho_syntax',
 'silent',
 'wh',
 'expression',
 'wolof',
 'harold',
 'torrence',
 'abstract',
 'paper',
 'analyze',
 'morphology',
 'syntax',
 'wh',
 'expression',
 'agree_complementizer',
 'wolof',
 'atlantic',
 'argue',
 'wolof',
 'possess',
 'set',
 'null',
 'wh',
 'expression',
 'addition',
 'set',
 'overt',
 'wh',
 'expression',
 'null',
 'wh',
 'expression',
 'occur',
 'relative_clause',
 'like',
 'construction',
 'trigger',
 'agreement',
 'complementizer',
 'examine',
 'property',
 'null',
 'wh',
 'compare',
 'overt',
 'wh',
 'wolof',
 'provide_evidence',
 'null',
 'wh',
 'like',
 'overt',
 'wh',
 'successive_cyclically',
 'trigger',
 'agreement',
 'intermediate',
 'complementizer',
 'occur',
 'movement',
 'pathway',
 'compare',
 'wolof',
 'construction',
 'superficially_similar',
 'complementizer_agreement',
 'construction',
 'bantu',
 'kinande',
 'null_operator',
 'german',
 'wh',
 'drop',
 'dutch',
 'keywords',
 'wolof',
 'wh_m

In [148]:
my_tokenizer('wh_movement 0.24 language')

['wh_movement']

In [144]:
len(my_stopwords)

280906

In [145]:
list(my_stopwords).index('wh')

ValueError: 'wh' is not in list

In [174]:
vectorizer = CountVectorizer(tokenizer=my_tokenizer)
dtm_fit = vectorizer.fit_transform(df.paper)

In [149]:
vectorizer

CountVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(1, 1), preprocessor=None, stop_words=None,
        strip_accents=None, token_pattern='(?u)\\b\\w\\w+\\b',
        tokenizer=<function my_tokenizer at 0x7f849198ac80>,
        vocabulary=None)

In [177]:
dtm_fit.shape
#vocab_[7000:7050]

(743, 60441)

In [186]:
vectorizer = CountVectorizer(tokenizer=my_tokenizer)
dtm = vectorizer.fit(df.paper)

In [187]:
from sklearn.externals import joblib
joblib.dump(dtm, 'update_vectorizer.pkl') 

['update_vectorizer.pkl']

In [219]:
joblib.dump(dtm.get_feature_names(), 'feature_names')

['feature_names']

In [188]:
dtm_count = dtm.transform(df.paper)

In [190]:
dtm_count.shape

(743, 60441)

In [112]:
#joblib.dump(dtm_count, 'countvector.pkl') 

['countvector.pkl']

In [178]:
joblib.dump(dtm_fit, 'countvector.pkl')

['countvector.pkl']

In [182]:
transformer = TfidfTransformer()
tfidf = transformer.fit_transform(dtm_fit)
joblib.dump(tfidf, 'tfidf.pkl') 

['tfidf.pkl']

In the algorithm above, what counts as a bi- or trigram is decided on the corpus as a whole. It might be interesting to just use only the paper in question for deciding on what is a frequent collocation, because some authors invent their own collocations (I do for sure). However, this will not be very informative when it comes to topic modelling: if a frequent collocation only appears in one paper, it will not be informative for the topic model. Invented collocations often come into being to specify a sub-class of a well discussed linguistic object. We want to maintain the relation to this well known class, and for this to be possible, the well-known class should not be hidden in a constructed bi- or trigram.

## Keyword vectors
__Evaluation of the models__: http://sujitpal.blogspot.com/2016/12/document-similarity-using-various-text.html  
I have the keywords for each paper (about 8 keywords for each), which should summarize the topics pretty well. Also, they give us an indication of the similarity between different papers. If we calculate the similarity between the keyword vectors for each document, and compare it to the similarity between the papers, we get an idea of how well the model is doing.

In [154]:
db_with_papers = papers.find({ 'paper': { '$exists': True } })
df2 = pd.DataFrame(list(papers.find({ 'paper': { '$exists': True } }, {'keywords':1, 'title':1})))

In [155]:
df2.head()

Unnamed: 0,_id,keywords,title
0,598b44c407d7df07719383e2,"[czech passives, passive vs. past participles,...",Analytic Passives in Czech
1,598b44c407d7df07719383e5,"[czech dp; universal dp, determiners; function...",The Universal DP Analysis in Articleless Langu...
2,598b44c407d7df07719383e8,"[sign language, strong pronouns, pointing, foc...",Strong Pronominals ASL and LSF (squib)
3,598b44c407d7df07719383f0,"[syntax, morphology, extended projections, sel...",Inflectional Dependencies. A study of complex ...
4,598b44c407d7df07719383fc,"[sluicing, ellipsis licensing, pair-list readi...","Multiple Sluicing, Scope, and Superiority: Con..."


In [156]:
for index in indices_to_eliminate:
    df2.drop(index, inplace=True)

In [157]:
df2['keywords2'] = [row[0] for row in df2.keywords]

In [158]:
df2.head()

Unnamed: 0,_id,keywords,title,keywords2
0,598b44c407d7df07719383e2,"[czech passives, passive vs. past participles,...",Analytic Passives in Czech,"czech passives, passive vs. past participles, ..."
1,598b44c407d7df07719383e5,"[czech dp; universal dp, determiners; function...",The Universal DP Analysis in Articleless Langu...,"czech dp; universal dp, determiners; functiona..."
2,598b44c407d7df07719383e8,"[sign language, strong pronouns, pointing, foc...",Strong Pronominals ASL and LSF (squib),"sign language, strong pronouns, pointing, focu..."
3,598b44c407d7df07719383f0,"[syntax, morphology, extended projections, sel...",Inflectional Dependencies. A study of complex ...,"syntax, morphology, extended projections, sele..."
4,598b44c407d7df07719383fc,"[sluicing, ellipsis licensing, pair-list readi...","Multiple Sluicing, Scope, and Superiority: Con...","sluicing, ellipsis licensing, pair-list readin..."


In [159]:
keywords = str()
for row in df2.keywords2:
    keywords += ', ' + row

In [161]:
tokenized_keywords = my_tokenizer(keywords)

In [162]:
len(set(tokenized_keywords))

1929

In [163]:
tokenized_keywords
joblib.dump(tokenized_keywords, 'tokenized_keywords')

['tokenized_keywords']

The lemmatizer is not ideal.... for instance, it turns 'left' (as in left vs. right) into 'leave', so we get leave_periphery instead of left periphery. The phraser does a good job though.  
There are 2226 distinct tokenized keywords. There are 2436 distinct non-tokenized keywords. How many topics is that?

In [165]:
vectorizer = CountVectorizer(tokenizer=my_tokenizer)
dtm_keywords = vectorizer.fit_transform(df2.keywords2)

In [166]:
joblib.dump(dtm_keywords, 'countvector_keywords.pkl') 

['countvector_keywords.pkl']

In [167]:
tfidf_keywords = transformer.fit_transform(dtm_keywords)
joblib.dump(tfidf_keywords, 'tfidf_keywords.pkl') 

['tfidf_keywords.pkl']

In [169]:
vectorizer.get_feature_names()

['',
 "'s",
 "'s_law",
 "a'-dependency",
 "a'-head",
 "a'-movement",
 'a/a_bar_distinction',
 'aae',
 'aba',
 'absentive',
 'absolutive',
 'abstract',
 'accent',
 'acceptability_judgment',
 'acceptability_judgment_experiment',
 'accidental_homophony',
 'accusative',
 'acd',
 'achain',
 'acoustic',
 'acquisition',
 'action',
 'activeness',
 'activity_condition',
 'acyclic',
 'additive',
 'adjacency',
 'adjectival',
 'adjectival_modification',
 'adjective',
 'adjunct',
 'adjunct_island',
 'adjunction',
 'adnominal',
 'adposition',
 'adverb',
 'adverbial',
 'adverbial_clause',
 'adverbs',
 'adversity_causatif',
 'affix',
 'affix_hopping',
 'affixation',
 'affrication',
 'agency',
 'agent',
 'agentivity',
 'agglutinative',
 'agree',
 'agree_possessor',
 'agreement',
 'ai',
 'air',
 'akmajian',
 'aktionsart',
 'albanian',
 'algebra',
 'algebraic_semantic',
 'algonquian',
 'algorithm',
 'alignment',
 'allomorph_selection',
 'allomorphy',
 'allomorphy_allosemy',
 'altaic',
 'alternate',
 'alt

## LDA pickles
For LDA, we need different stuff...

In [192]:
counts = dtm_count.transpose()
id2word = dict((v, k) for k, v in dtm.vocabulary_.items())

In [193]:
joblib.dump(counts, 'lda_counts.pkl')

['lda_counts.pkl']

In [194]:
with open('lda_dict.pkl', 'wb') as f:
    pickle.dump(id2word, f)

LDA allows for updating. This is useful if you want an online recommender system that is based on topic modeling. However, for the LDA to be updated, the document needs to be in the exact same format as the documents that were used for the previous training. So, we need a count vector of the same format. To get this: fit a model: learn a vocabulary dictionary of all tokens in the raw documents. Pickle this one and use it for transformation of new documents, which were previously run trhough the tokenizer.

In [131]:
my_tokenizer('wh_movement 1.24 1.534 \x06ibedab\x03ve.a\x00\x00e\x01bedd')

['wh_movement', '\x06ibedab\x03ve.a\x00\x00e\x01bedd']

In [215]:
len(set(my_tokenizer(corpus)))

65239

In [None]:
# set of words in tokenized corpus is bigger than in dtm. Weird. 