## Calculate keyword similarity

In [1]:
from gensim.models import Word2Vec
from gensim.models import phrases
from pymongo import MongoClient
import pandas as pd
import en_core_web_sm
from sklearn.feature_extraction.stop_words import ENGLISH_STOP_WORDS
from sklearn.feature_extraction.text import TfidfVectorizer
from nltk.corpus import stopwords
from sklearn.externals import joblib
import spacy
import re
import string
from multiprocessing import Pool
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity

In [6]:
standard_stopwords = set(list(ENGLISH_STOP_WORDS)+list(stopwords.words('english')))

In [2]:
client = MongoClient()
db = client.lingbuzz
keywords = db.get_collection('keywords')

I have a file with all the vocabulary in my corpus and their wordvectors calculated by fastText. I will convert this into a dictionary for easy lookup.

In [3]:
voc_vectors = {}
with open('voc_vectors.txt', 'rb') as f:
    content = f.readlines()

In [4]:
for line in content:
    line = line.decode("utf-8").split(" ", 1)
    voc_vectors[line[0]] = {'vector': np.fromstring(line[1].strip(), sep=' '), 'sentenceIDs' : []}

In [13]:
# the formula used here is wrong. According to the paper it should be the sum of the similarities 
# between the top n most similar words...

# def create_df_rel_cs(vectors, ids):
#     """calculates relative cosine distance between two sentences and returns df with sentenceids and their distance"""
#     cos_sim = cosine_similarity(np.asarray(vectors))
#     sum_cs = np.sum(cos_sim, 1)[0]
#     rel_cs = cos_sim / sum_cs
#     df = pd.DataFrame(rel_cs, index = ids, columns = ids)
#     return df

def create_df_cs(vectors, ids):
    """calculates relative cosine distance between two sentences and returns df with sentenceids and their distance"""
    cos_sim = cosine_similarity(np.asarray(vectors))
    df = pd.DataFrame(cos_sim, index = ids, columns = ids)
    return df

## Keyword similarity

Calculate similarities and set similarity threshold.

In [7]:
kwords = []
keyword_vec = []
for entry in voc_vectors:
    if entry not in standard_stopwords:
        kwords.append(entry)
        keyword_vec.append(voc_vectors[entry]['vector'])

In [9]:
keyword_similarities = create_df_cs(keyword_vec, kwords)

In [10]:
keyword_similarities.head()

Unnamed: 0,analytic,passives,czech,ludmila,veselovská,petr,introduction,defining,problem,like,...,aame,rsp,aitem,bitem,diagonals,lpassives,npassives,boty,kingnom,zmar
analytic,1.0,0.327499,0.136207,0.118241,0.201351,0.161108,0.318767,0.409865,0.340559,0.178999,...,0.066546,0.091149,0.200329,0.139,0.391649,0.331104,0.341165,0.159959,0.107133,0.120273
passives,0.327499,1.0,0.07121,0.030514,0.099147,0.060225,0.174664,0.196529,0.269647,0.322238,...,0.18151,0.193179,0.215486,0.189093,0.283702,0.851886,0.869041,0.058934,0.143894,0.155694
czech,0.136207,0.07121,1.0,0.473511,0.636377,0.609529,0.162274,0.229002,0.172338,0.134997,...,0.117394,0.081306,0.090537,0.075196,0.084607,0.078563,0.104193,0.315703,0.200767,0.312162
ludmila,0.118241,0.030514,0.473511,1.0,0.584787,0.578897,0.099382,0.09197,0.133546,0.058865,...,0.063211,-0.007784,0.037557,0.120576,0.142093,0.037525,0.033915,0.324045,0.244663,0.250185
veselovská,0.201351,0.099147,0.636377,0.584787,1.0,0.614322,0.121936,0.17176,0.171601,0.141154,...,0.113671,0.093452,0.252779,0.137046,0.155786,0.124676,0.132634,0.320539,0.291361,0.34253


In [11]:
synonyms = []
headers = list(keyword_similarities.columns)
for i, row in keyword_similarities.head(20).iterrows():
    most_similar = sorted(row.drop(i))[-10:]
    columns = list(row.isin(most_similar))
    synonyms.append((i, list(compress(headers, columns)), most_similar))

0.6 seems like a reasonable threshold: words with a similarity higher than this number will be included in the most-similar entry in the keyword database.

Very high similarity are related words, true synonyms have lower similarity. For list of related keywords: eliminate stopwords.

In [32]:
from itertools import compress
synonyms = {}
headers = list(keyword_similarities.columns)
for i, row in keyword_similarities.iterrows():
    most_similar = [x for x in row.drop(i) if x >= 0.635]
    columns = list(row.isin(most_similar))
    synonyms[i] =  list(compress(headers, columns))

In [34]:
synonyms['czech']

['veselovská', 'prague', 'brno', 'praha', 'olomouc', 'ladislav', 'slovak']

In [10]:
joblib.dump(synonyms, 'similar_keywords')

['similar_keywords']

In [36]:
similar_keywords = joblib.load('similar_keywords')

In [None]:
voc_vectors_dict = joblib.load('voc_vectors_dict')

In [41]:
id_keywords = {}
for keyword in keywords.find():
    id_keywords[keyword['word']]=keyword['_id']

for k, v in similar_keywords.items():
    keyword_ids = [id_keywords[kw] for kw in v]
    keywords.update_one({'word': k}, {'$set': {'similar_words': keyword_ids}})

In [30]:
keywords.find().count()

51301

In [14]:
for doc in keywords.find():
    keywords.update_one({'_id': doc['_id']}, {'$set': {'frequency': len(doc['sentenceIDs'])} })

In [7]:
frequencies['islands']

733

In [48]:
for doc in papers.find({'paper': {'$exists': True}})[:5]:
    print(doc['authors'], doc['updated_keywords'])

['Lida Veselovska'] ['czech', 'passives', 'passive', 'vs', 'past_participles', 'czech', 'clitics', 'past', 'czech', 'auxiliary', 'grammatical', 'morphemes', 'instrumental', 'case', 'postsyntactic', 'insertion', 'postsyntactic', 'derivation', 'semantics', 'morphology', 'syntax', 'morphology', 'syntax']
['Lida Veselovska'] ['czech', 'dp', 'universal', 'dp', 'determiners', 'functional', 'domain', 'word_order', 'in', 'dps', 'semantics', 'of', 'dp', 'modifiers', 'of', 'n', 'semantics', 'morphology', 'syntax']
['Philippe Schlenker'] ['sign_language', 'strong', 'pronouns', 'pointing', 'focus', 'semantics', 'morphology', 'syntax']
['Asia Pietraszko'] ['syntax', 'morphology', 'extended_projections', 'selection', 'agreement', 'multiverbal', 'constructions', 'auxiliaries', 'light_verbs', 'ndebele', 'bantu']
['Hadas Kotek', 'Matthew Barros'] ['sluicing', 'ellipsis', 'licensing', 'pairlist', 'readings', 'scope', 'parallelism', 'semantics', 'syntax']


In [51]:
papers = db.get_collection('papers')
sentences = db.get_collection('sentences')

In [66]:
a = [lambda doc: doc['_id'] for doc in sentences.find({}, {'_id': 1}).sort([('score',-1)]).limit(1)]

In [81]:
q = ['wrote', 'syntax']
answer = 'These are some papers you might want to read: \n\n'
for w in q:
    for candidate in papers.find({'updated_keywords': w}):
        answer += candidate['title'] + ', by '+ ', '.join(c for c in candidate['authors']) + '\n'
        answer += 'You can download the paper here: ling.auf.net/' + candidate['url'] + '\n\n'
print(answer)

wrote
syntax
These are some papers you might want to read: 

The clause-mate condition on resumption: Evidence from Kaqchikel, by Yusuke Imanishi
You can download the paper here: ling.auf.net//lingbuzz/003606

The Preliminary Material of 'Natural Language and Possible Minds: How Language Uncovers the Cognitive Landscape of Nature', by Prakash Mondal
You can download the paper here: ling.auf.net//lingbuzz/003607

Analytic Passives in Czech, by Lida Veselovska
You can download the paper here: ling.auf.net//lingbuzz/003608

Stylistic Fronting in corpora, by Halldor Armann Sigurdsson
You can download the paper here: ling.auf.net//lingbuzz/002635

Topicality in Icelandic: Null arguments and Narrative Inversion, by Halldor Armann Sigurdsson
You can download the paper here: ling.auf.net//lingbuzz/003611

The Universal DP Analysis in Articleless Languages: A Case Study in Czech, by Lida Veselovska
You can download the paper here: ling.auf.net//lingbuzz/003609

Proxy control: extending the typo

In [67]:
a

[<function __main__.<listcomp>.<lambda>>]