# Sentence similarity

In [1]:
from pymongo import MongoClient
import pandas as pd
from sklearn.externals import joblib
from multiprocessing import Pool
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity
from pyspark.mllib.linalg.distributed import RowMatrix, IndexedRow, IndexedRowMatrix
from pyspark import SparkContext
import pandas as pd
import re
import string
from sklearn.externals import joblib

The sentence matrix has over 600K rows. Because of memory limitations, it is impossible to compute cosine similarity the regular way. I will have to partition the computation, using Spark. Even with Spark, I need to increase the default driver memory.

In [2]:
from pyspark import SparkConf, SparkContext

# Spark settings for the similarity computation: a 4M JVM thread stack for the
# driver, 4G per executor, a 55G driver heap and a 10G cap on collected results.
conf = SparkConf().setAll([
    ("spark.driver.extraJavaOptions", "-Xss4M"),
    ('spark.executor.memory', '4G'),
    ('spark.driver.memory', '55G'),
    ('spark.driver.maxResultSize', '10G'),
])
# Reuse a running context if there is one, otherwise start one with `conf`.
sc = SparkContext.getOrCreate(conf=conf)

## With average sentence vectors

In [3]:
# Load the sentence vectors together with their ids. The next cell zips the two
# together, so on a fresh kernel both loads must run (a commented-out `vectors`
# load leaves the name undefined).
vectors = joblib.load('sentence_vectors')
id_s = joblib.load('sentence_ids')

In [4]:
df = pd.DataFrame({str(k):v for k, v in zip(id_s, vectors)})

In [5]:
#df.shape

In [5]:
rows = sc.parallelize(df.values)
matrix = RowMatrix(rows)

In [7]:
# print(matrix.numCols())
# print(matrix.numRows())

In [6]:
exact = matrix.columnSimilarities()

In [8]:
exact.numCols()

631709

Ok, so even Spark can't handle computing all pairwise similarities for a matrix of this size (631,709 sentence columns).  
Options:
* figure out how to inspect this matrix in Spark. What I need is a similarity threshold.
* calculate sentence similarities on subsets of sentences, where each subset contains the sentences that share a given word.

In [3]:
client = MongoClient()
db = client.lingbuzz

In [18]:
keywords = db.get_collection('keywords')

In [138]:
for doc in keywords.find()[:2]:
    print(doc)

{'_id': ObjectId('59aea9b2b18b14a4ed174039'), 'word': 'analytic', 'sentenceIDs': [ObjectId('59a8676db18b14085b6c6114'), ObjectId('59a9ec55b18b14085f6d09d9'), ObjectId('59aa9037b18b1408616cda08'), ObjectId('59a85f7cb18b14085d6c5eac'), ObjectId('59aa90aeb18b14085b6d5c45'), ObjectId('59aa9324b18b14085b6d5dff'), ObjectId('59ab3cddb18b1408636caa45'), ObjectId('59ab6726b18b1408616d569c'), ObjectId('59a9ea16b18b14085f6d055b'), ObjectId('59a92491b18b1408646d1307'), ObjectId('59a9ea53b18b14085f6d05e8'), ObjectId('59aac8cdb18b14085d6d183a'), ObjectId('59a97134b18b1408626ce30e'), ObjectId('59a86752b18b14085b6c6107'), ObjectId('59aa279eb18b1408626d1412'), ObjectId('59a990b0b18b14085d6cd9e8'), ObjectId('59aa27b0b18b1408626d142f'), ObjectId('59a87cbab18b1408646c78c4'), ObjectId('59a9a6e7b18b1408606d1b43'), ObjectId('59a9a8ddb18b1408606d1d48'), ObjectId('59a990bab18b14085d6cd9ea'), ObjectId('59a86767b18b14085b6c6111'), ObjectId('59ab42f5b18b14085c6d00fa'), ObjectId('59a8db52b18b14085d6c9c21'), Object

In [4]:
sentences = db.get_collection('sentences')

In [142]:
# Cursor.count() is deprecated and no longer available in current PyMongo;
# count_documents({}) counts every document in the collection.
sentences.count_documents({})

631676

In [81]:
# One list of sentence ids per keyword document, for the first two keywords.
test_cases = [list(doc['sentenceIDs']) for doc in keywords.find()[:2]]

In [141]:
# One list of sentence ids per keyword document whose word is 'focus'.
test_cases = [list(doc['sentenceIDs']) for doc in keywords.find({'word':'focus'})]

In [82]:
df = pd.DataFrame({k:v for k, v in zip(id_s, vectors)})

In [83]:
df.head()

Unnamed: 0,59a85acdb18b146ddb84ff2b,59a85aceb18b146ddb84ff2c,59a85aceb18b146ddc84ff2b,59a85acfb18b146dda84ff2b,59a85acfb18b146ddb84ff2d,59a85acfb18b146ddb84ff2e,59a85ad0b18b146ddb84ff2f,59a85ad0b18b146ddc84ff2c,59a85ad1b18b146dda84ff2c,59a85ad1b18b146ddb84ff30,...,59ac7df9b18b1408636d30ff,59ac7dfab18b1408636d3100,59ac7dfbb18b1408636d3101,59ac7dfcb18b1408636d3102,59ac7dfdb18b1408636d3103,59ac7dfeb18b1408636d3104,59ac7dffb18b1408636d3105,59ac7e00b18b1408636d3106,59ac7e01b18b1408636d3107,59ac7e02b18b1408636d3108
0,0.003749,-0.029787,-0.04009,-0.090344,-0.058753,-0.096282,-0.052853,0.023564,-0.017688,-0.073697,...,-0.049024,-0.085572,-0.126967,-0.110327,-0.11313,-0.071988,-0.092915,-0.049901,-0.084928,-0.081315
1,-0.132782,-0.058256,0.035099,-0.044283,-0.060248,-0.001526,-0.111382,-0.027992,-0.076817,-0.050179,...,0.120914,0.038315,-0.021265,-0.017821,-0.069592,-0.028474,0.012315,0.02486,0.005246,0.190046
2,-0.03316,-0.062912,-0.043438,-0.115031,-0.100944,-0.102565,-0.120707,-0.057257,-0.064446,-0.061498,...,-0.211657,-0.125537,-0.137405,-0.152097,-0.1315,-0.159043,-0.12608,-0.097961,-0.081945,-0.234975
3,0.089882,0.198107,0.140203,0.178266,0.224087,0.188279,0.198009,0.102116,0.267208,0.180776,...,0.139599,0.055086,0.124506,0.092474,0.048542,0.05107,0.23065,0.192495,0.171357,0.21002
4,-0.180051,-0.162674,-0.065769,-0.006834,-0.104424,-0.109378,-0.07469,-0.054721,-0.093111,-0.087788,...,-0.079781,-0.0694,-0.058689,-0.071539,-0.065976,-0.04278,-0.061852,-0.064228,-0.125309,-0.224094


In [119]:
test_cases[0]

[ObjectId('59ab1d7cb18b1408616d3833'),
 ObjectId('59a8ba5eb18b1408626c9561'),
 ObjectId('59a916a5b18b1408606cc0c5'),
 ObjectId('59a9a806b18b14085c6c80ed'),
 ObjectId('59a9e723b18b14085b6d2964'),
 ObjectId('59a8ec1db18b14085f6cb5f0'),
 ObjectId('59a9a4e8b18b14085c6c80ad'),
 ObjectId('59a8fe5ab18b14085c6c73e7'),
 ObjectId('59a95ce9b18b14085d6ccd5a'),
 ObjectId('59abe735b18b1408636d02c0'),
 ObjectId('59aa0a2ab18b14085b6d417d'),
 ObjectId('59abe967b18b1408636d030f'),
 ObjectId('59a94021b18b1408606ceae0'),
 ObjectId('59a8dde5b18b1408646cd92b'),
 ObjectId('59ac190db18b1408636d09d4'),
 ObjectId('59a87a27b18b1408646c76d6'),
 ObjectId('59aa7505b18b14085c6c9853'),
 ObjectId('59a8e1ffb18b1408606c9871'),
 ObjectId('59a8c8e7b18b14085b6c7d58'),
 ObjectId('59aa5f1fb18b14085c6c9315'),
 ObjectId('59ab40d5b18b1408616d4610'),
 ObjectId('59a86b57b18b1408616c6919'),
 ObjectId('59a92beab18b14085c6c773a'),
 ObjectId('59ac4a0db18b1408636d10e1'),
 ObjectId('59a8f677b18b14085f6cb7a1'),
 ObjectId('59a88b03b18b14

In [120]:
df_test1 = df[test_cases[0]]

In [28]:
def create_df_cs(vectors, ids):
    """Return the pairwise cosine similarity of `vectors` as a labelled DataFrame.

    vectors: array-like with one row per sentence.
    ids: labels used for both the rows and the columns of the result, in the
        same order as the rows of `vectors`.
    """
    similarity = cosine_similarity(np.asarray(vectors))
    return pd.DataFrame(data=similarity, index=ids, columns=ids)

In [121]:
df_test1.shape

(300, 7287)

In [122]:
cs_test1 = create_df_cs(df_test1.values.T, list(df_test1))

In [125]:
cs_test1.shape

(7287, 7287)

In [126]:
sorted(list(cs_test1.iloc[1]), reverse=True)

[0.99999999999999978,
 0.94115676124243741,
 0.93975032437512174,
 0.93778992934703109,
 0.93730012821538589,
 0.93702055761230985,
 0.93544038373675997,
 0.93526386297255004,
 0.93497472471654697,
 0.93417396665109886,
 0.93324933369750762,
 0.93244398474427759,
 0.93132665435673789,
 0.93108138935256468,
 0.93093515318938158,
 0.92954735011747425,
 0.92946273922272149,
 0.92946258913316937,
 0.92941135769567074,
 0.92929266093425933,
 0.92915403051995837,
 0.92888847109047967,
 0.92877954355696146,
 0.92828290666501201,
 0.92816326077292866,
 0.92802498746470663,
 0.92761554493589227,
 0.92760331526587569,
 0.92747888838572257,
 0.92734510556247285,
 0.92685305276107488,
 0.92670190824020138,
 0.9266764220330449,
 0.92634036023903543,
 0.92625077872089068,
 0.92604859226252167,
 0.9257351189997004,
 0.92572583868527836,
 0.92562996870910563,
 0.92556347998142141,
 0.92550737771870073,
 0.92534234758267697,
 0.92504290377882747,
 0.92495843732065697,
 0.92495218227299236,
 0.924937324

In [134]:
similar_sent = cs_test1[cs_test1.iloc[:,0] > 0.93]

In [135]:
similar_sent.head()

Unnamed: 0,59ab1d7cb18b1408616d3833,59a8ba5eb18b1408626c9561,59a916a5b18b1408606cc0c5,59a9a806b18b14085c6c80ed,59a9e723b18b14085b6d2964,59a8ec1db18b14085f6cb5f0,59a9a4e8b18b14085c6c80ad,59a8fe5ab18b14085c6c73e7,59a95ce9b18b14085d6ccd5a,59abe735b18b1408636d02c0,...,59ab5d3fb18b14085c6d23bf,59a8a848b18b14085c6c6da6,59ac21b2b18b1408636d0b0e,59a8c4ceb18b1408616c8051,59a9735bb18b1408626ce5d4,59a97573b18b1408626ce78e,59ac071ab18b1408636d0741,59ab4977b18b14085c6d0874,59a9820bb18b14085c6c7dff,59ab8f67b18b1408616d5cb8
59ab1d7cb18b1408616d3833,1.0,0.891731,0.810523,0.894205,0.720729,0.836695,0.792145,0.888333,0.896683,0.821673,...,0.897806,0.865796,0.882149,0.850376,0.872939,0.843214,0.831936,0.541229,0.871662,0.879649
59a88c67b18b14085f6c8a05,0.930264,0.904257,0.792026,0.919981,0.749302,0.899114,0.843609,0.912083,0.939487,0.871797,...,0.930611,0.909548,0.917866,0.863308,0.928727,0.900189,0.86696,0.581405,0.898551,0.883359
59aaf167b18b14085e6d0aaf,0.930975,0.911427,0.824114,0.914294,0.742569,0.850231,0.808511,0.902911,0.90775,0.857797,...,0.92836,0.886061,0.89262,0.882964,0.891682,0.874243,0.866444,0.545621,0.886993,0.903368


In [136]:
# Map each retained sentence id to its similarity with the first sentence
# (column 0 of the similarity frame).
similar_sent_indexes = similar_sent.index.tolist()
similarity_score = similar_sent.iloc[:, 0]
id_to_score = dict(zip(similar_sent_indexes, similarity_score))

In [137]:
# Show every retained sentence next to its similarity score.
for sent in sentences.find({'_id': {'$in': similar_sent_indexes}}):
    sent_id = sent['_id']
    print(sent_id, id_to_score[sent_id], '\n', sent['sentence'], '\n\n')

59a88c67b18b14085f6c8a05 0.930264098443 
 Thus again we have a situation where the focus can be varied within the clause,  but the form of the verb does not vary accordingly as predicted.    Because  of  these  problems  we  are  led  to  abandon  the  Verb  Focus  Hypothesis, just as we abandoned the Postverbal Term Focus Hypothesis. We  are  thus  left  with  the  correlation  between  the  conjoint/disjoint  alternation  and  syntactic  constituency  established  earlier,  but  with  no  direct  correlation  with  focus.    4    


59aaf167b18b14085e6d0aaf 0.930975283016 
 In so doing, I focus on four empirical domains, namely restrictions on the subject,  there-expletives, object clitics and the use of the third person singular neuter pronoun 't  'it'.261 In the final section, I point out how these findings strengthen the hypothesis that  conjugated instances of 'yes' and 'no' are related to SDRs.     


59ab1d7cb18b1408616d3833 1.0 
 Recall that I focus here on adjectival resultat

## With idf weighted sentence vectors

The similar sentences aren't really similar to each other.   
__Different way of calculating sentence vectors:__ normalize the word vectors weighted with idf of words in my own corpus. 

In [14]:
def is_english_sentence(sent, known_authors=None):
    """Return the lower-cased tokens of a sentence that are author names or ASCII words.

    Parameters
    ----------
    sent : str, list or tuple
        A raw sentence (split on whitespace here) or an already tokenized
        sentence. Lists/tuples are iterated item by item instead of being
        stringified, so their tokens can still match author names.
    known_authors : container of str, optional
        Lower-cased author names that are kept verbatim. Defaults to the
        notebook-level `authors` collection.

    Returns
    -------
    list of str
        For each token, in order: the lower-cased token if it is a known
        author; otherwise the lower-cased token with punctuation removed,
        provided the token is pure ASCII and what remains is alphabetic.
        All other tokens are dropped.
    """
    if known_authors is None:
        known_authors = authors
    tokens = sent if isinstance(sent, (list, tuple)) else str(sent).split()
    punctuation = re.compile('[%s]' % re.escape(string.punctuation))
    sentence = []
    for token in tokens:
        token = str(token).lower()
        if token in known_authors:
            sentence.append(token)
            continue
        try:
            # Non-ASCII tokens (other languages, symbols) are skipped.
            token.encode(encoding='utf-8').decode('ascii')
        except UnicodeDecodeError:
            continue
        word = punctuation.sub('', token)
        if word.isalpha():
            sentence.append(word)
    return sentence

def calculate_sentence_vector(words, word_to_vec, num_features = 300, n_sentences = 631676):
    """Sum the idf-weighted word vectors of one sentence.

    Parameters
    ----------
    words : iterable of str
        Tokens of the sentence.
    word_to_vec : mapping
        Maps a word to a dict that holds its embedding under the 'vector' key.
    num_features : int
        Length of the zero vector the sum starts from.
    n_sentences : int
        Corpus size N used in the idf formula (default: the 631676 sentences
        this notebook works with).

    Returns
    -------
    numpy.ndarray
        Sum of ``vector * idf`` over the words found in `word_to_vec`; the
        float32 zero vector when no word is found.

    Notes
    -----
    idf = log((1 + N) / (1 + df)), where df is the number of sentence ids
    stored for the word in the `keywords` collection. A word without a
    keyword document counts as df = 0 instead of raising a TypeError.
    """
    featureVec = np.zeros((num_features,), dtype="float32")
    for word in words:
        if word not in word_to_vec:
            continue
        keyword_doc = keywords.find_one({'word': word})
        doc_freq = len(keyword_doc['sentenceIDs']) if keyword_doc is not None else 0
        idf = np.log((1 + n_sentences) / (1 + doc_freq))
        featureVec = np.add(featureVec, word_to_vec[word]['vector'] * idf)
    return featureVec


# voc_vectors = {}
# with open('voc_vectors.txt', 'rb') as f:
#     content = f.readlines()
# 
# for line in content:
#     line = line.decode("utf-8").split(" ", 1)
#     voc_vectors[line[0]] = {'vector': np.fromstring(line[1].strip(), sep=' ')}
#     
# authors = joblib.load('authors')
# bigrams = joblib.load('bigrams_model')
# 
# # udpate word vectors dict with sentence IDs and calculate sentence vectors
# vectors = []
# for sent in sentences.find():
#     sentence = bigrams[is_english_sentence(sent['sentence'].split())]
#     vectors.append(calculate_sentence_vector(sentence, voc_vectors))
# joblib.dump(vectors, 'sentence_vectors_idf')

In [None]:
df = pd.DataFrame({k:v for k, v in zip(id_s, vectors)})

In [None]:
def inspect_sentence_similarity(word, threshold = 0.93):
    """Print the sentences that are most similar to the first sentence containing `word`.

    Looks up the sentence ids stored for `word` in the `keywords` collection,
    computes the pairwise cosine similarity of their vectors (columns of the
    notebook-level `df`), keeps the sentences whose similarity to the first
    of them exceeds `threshold`, and prints id, score and text for each.

    Parameters
    ----------
    word : str
        Keyword to look up.
    threshold : float
        Minimum cosine similarity (exclusive) to the first sentence.

    Returns
    -------
    None. If no sentence is indexed under `word`, a message is printed and
    nothing else happens.
    """
    keyword_doc = keywords.find_one({'word': word})
    if keyword_doc is None or not keyword_doc['sentenceIDs']:
        print('No sentences are indexed under the keyword %r' % word)
        return
    sentence_ids = list(keyword_doc['sentenceIDs'])
    df_word = df[sentence_ids]
    # df holds one column per sentence, so transpose to get one row per sentence.
    cs_word = create_df_cs(df_word.values.T, list(df_word))
    similar_sent = cs_word[cs_word.iloc[:, 0] > threshold]
    similar_sent_indexes = list(similar_sent.index)
    id_to_score = dict(zip(similar_sent_indexes, similar_sent.iloc[:, 0]))
    for sent in sentences.find({'_id': { '$in':  similar_sent_indexes }}):
        print(sent['_id'],id_to_score[sent['_id']], '\n', sent['sentence'], '\n\n')

## With self-trained word2vec vectors

In [25]:
word2vec = joblib.load('word2vec')

In [12]:
bigrams = joblib.load('bigrams_model')

In [16]:
authors = joblib.load('authors')

In [6]:
def avg_feature_vector(words, word_to_vec, id_, num_features = 300):
    """Return the average word vector of a sentence and index the sentence under its words.

    Parameters
    ----------
    words : iterable of str
        Tokens of the sentence.
    word_to_vec : mapping
        Maps a word to a dict holding its embedding under 'vector'. The dict
        of every word found is updated in place: `id_` is appended to its
        'sentenceIDs' list, which is created when the entry does not have one.
    id_ : object
        Identifier of the sentence.
    num_features : int
        Length of the zero vector the sum starts from.

    Returns
    -------
    numpy.ndarray
        Mean of the vectors of the words found in `word_to_vec`; the float32
        zero vector when none is found.
    """
    featureVec = np.zeros((num_features,), dtype="float32")
    nwords = 0
    for word in words:
        if word not in word_to_vec:
            continue
        entry = word_to_vec[word]
        nwords += 1
        featureVec = np.add(featureVec, entry['vector'])
        # Entries built with only a 'vector' key have no id list yet.
        entry.setdefault('sentenceIDs', []).append(id_)
    if nwords > 0:
        featureVec = np.divide(featureVec, nwords)
    return featureVec

In [None]:
# Build one idf-weighted vector per sentence, in the order the collection returns them.
vectors = []
for sent in sentences.find():
    # Pass the raw sentence string, as the fastText export cell does:
    # is_english_sentence does its own whitespace tokenization.
    sentence = bigrams[is_english_sentence(sent['sentence'])]
    vectors.append(calculate_sentence_vector(sentence, word2vec))

## With sentence vectors calculated by fastText:

In [44]:
# Export the first 2000 sentences, one cleaned sentence per line, as input for fastText.
with open('sentences_for_ft.txt', 'wb') as f:
    for sent in sentences.find()[:2000]:
        text = sent['sentence']
        print(text)
        line = ' '.join(is_english_sentence(text)) + '\n'
        f.write(line.encode("utf-8"))

To Appear in H. Harley ed.
The Proceedings of the Penn/MIT Workshop on Aspect, Argument  Structure, and Events, May 1997, MITWPL  Voice Systems and the Syntax/Morphology Interface   David Embick, University of Pennsylvania/MIT  1  
Polish Stress: looking for phonetic evidence of a bidirectional system   Luiza Newlin- Łukowicz   New York University   luiza.lukowicz@nyu.edu      Note: This is a pre-publication manuscript.
ANALYTIC PASSIVES IN CZECH    Ludmila Veselovská & Petr Karlík   INTRODUCTION: DEFINING THE PROBLEM   Like other languages, Czech also has pairs of sentences clearly related both in their  form and in their meaning.
Introduction  The topic of this paper is the manner in which syntactic voice alternations (and syntactic conﬁgurations more generally) relate to voice morphology, and the implications of this for theories of voice and theories of morphology/syntax in- teractions.
I argue based on the voice system of Modern Greek for three primary points: ﬁrst, that treatment

In the spirit of Katz and Postal (1964, 133-134), Kayne (2004) proposed that sentences like:    (1)  John went there.
203-219, Current Research in the Semantics/Pragmatics Interface 32, Brill.   
Furthermore, there are instances of what looks like the indeﬁnite article (e.g. German ein or its Dutch etc.
In this paper I show that on closer inspection, implementing a carto- graphic approach to clausal architecture as a derivational minimalist analysis proves to be problematic.
Such voice mismatches are not allowed in any other kind of ellipsis, such as sluicing and other clausal ellipses.
Abstract:    It  is  well-known  that  untyped  systems  with  property-denoting  expressions  that  can  appear  in  argument and predicate positions are prone to paradox.
counterpart) which occur in contexts from which the indeﬁnite article is supposed to be excluded: with plural nouns, with non-count nouns, in deﬁnite noun phrases, etc.
contain an unpronounced noun corresponding to overt place.  
The

But from the perspective of theoretical linguistics, it stands in the way of a better understanding of the nature of, e.g., German ein.
This naïve theory of properties has many virtues, but it seems to have been shattered by (the property version  of)  Russell’s  paradox.  ‘
Gwi=hl  what=CNC  gub-i=s eat-TR=DNC  Lisa?
the nature of language’ in some sense, and ultimately understand what  those other reasons are.  
with overt reason.  
In  other  words,  how  does  syntax  know  such  is  (or   must be) the case?                         
2.1 einem: ein+em Consider (1).
Seems  to’  have  been  shattered?  
They are dealt with within the framework of Distributed Morphology.
I  provide  evidence  that  the  null  wh’s,  like  the  overt  wh’s,  move  successive  cyclically  and  may  trigger  agreement  on  intermediate  complementizers  that  occur  in  the  movement  pathway.    
(The unpronounced noun present with thereby may be WAY.)      
Lisa  ‘
The aim of this paper is two-fold.
mit

bear  ‘
One possible   answer  to  this  question  might  be  to  assume  that  syntax  ‘
But fortunately, I can safely assume that Ede Zimmermann will find  them all, and will probably go on to correct them too, as he regularly has in the past.    
(3)  a.  b.  c.  Ein Hund a dog (*Eine) (a) (*Ein) (a)  hat has Hunde dogs Wissen knowledge  mich me haben have ist is  angebellt.
In this sense, to (informally) call there and where (and here) ‘
It is in this light that the contrasts between voice mismatches in varying kinds of ellipses loom large.
somehow’  knows  that  the   derivation will crash (or will be interpreted as gibberish) if it makes other choices than V   since  they  will  all  eventually  create  a  structure  that  would  lead  to  a  violation  of  the   Extension Condition defined as in (3):      Extension Condition (Chomsky 1995)   (3)             Merge must extend the root of the structure it applies to.      
Lisa saw the bear’  b. Naa who  an=t AN=3.I  gya’a=hl see=

The proposal in this section is that the indeﬁnite article does not exist.3 What exists is a set of zero- operators which constitute a subset of operators that occur to the left of ein.
Nominalization is also a fairly common ‘
(3)  because  merging  V  with  the  head  v*  does  not   extend  the  root  of  the  previously-generated  structure.  
Consider the examples in (5).4 (5)  ‘no book’  b.  a.  m-ein d-ein s-ein welch which  Buch Buch Buch ein a  Buch Buch  book’ ‘my book’ ‘your ‘his/her book’ ‘what a book’  c.  k-ein no  Buch book  d. was what  f¨ur for  ein a  Buch book  ‘what kind of book’  The element ein can be preceded by a person element, as in (5a), which has referential properties and is associated with possessor semantics.
where vs. place.      
2011).
Individual-denoting Quantifiers and Pronouns   2.1  Quine famously argued that 'to be is to be the value of a variable bound by an existential quantifier'.  
Assuming,  however,  that  syntax   thus  somehow  knows  ‘in  

I will try and answer one aspect of that question, by looking at other occurrences of ein.
First, I assume a group of features of a head H distinct from the rest of   the  features  of  it,  calling  the  former  ‘selectional  features’  of  H.  To  be  more  specific,   selectional features of H include: 1) features for (thematic) argument(s) that H takes and   2)  features  for  another  head  that  H  subcategorizes  for.5  
Section 5 oﬀers a solution for the problem of complementisers  preceding left-peripheral material.  2  2  
The main theoretical implication is that apparent extraction asymmetries in head-marking languages cannot be straightforwardly analysed as wh-agreement (contra Deal (2016)).  
The  fact that here and mi- share the status of being ‘the odd man out’ in turn recalls the fact that here has, interpretively  speaking, something in common with first person, and suggests a possible link between the indefinite wh- and the s-  morpheme of Romance, which might then b

A representative claim is that of Sag 1976:17, who give the following examples with the judgments indicated.  
If  we  consider, however, the pivotal roles  of phase heads in current minimalism as the initiator of NS operations (e.g. Feature- Inheritance, Transfer), the idea is not much of a stipulation.    8  The  idea  behind  the  Summoning  Condition  is  not  identical  but  similar  to  that  of  Feature-Inheritance  (Chomsky  2007,  2008) where T, which is not an inherent probe, is assumed to be able to function as a probe only after it inherits φ-features  from C.      9.  
This  piece  of  trickery  will  come  in  handy  in  our  discussion of property talk, where we won't be able to appeal to reflexive pronouns to circumvent  Condition B.     2.2  Property-denoting Quantifiers and Pronouns  Let us now turn to properties.
3SG.III  (6) Dependent Nee=dii=n
Thus Czech linguistics in  the second half of the 20th century approaches the topic accentuating the functional  aspects of

2.2 High/Big ellipses:
In  the  same  vein,  V  can  undergo  EM  with   DMary because V has its own unsatisfied selectional feature, whereas DMary does not bear   any.15  
This is reflected, I think, in the fact that the following are (to me) less natural  with someplace than with somewhere (it may be that else would obscure the difference here):    (27)  They must have hidden it somewhere/?someplace.    (28)  You’ve got to take your vacation somewhere/?someplace, after all As an initial approximation, let us say that somewhere is less specific than someplace.      
Section  10  presents  conclusions and open issues for future research.     2.    
Furthermore, in this case as well different occurrences of the same property-denoting variable may  occur  in predicate and in argument positions:  (10)  a. Ann is both intelligent and hard-working, which1 is useful but which1 unfortunately her daughter isn't.    
Before I turn to the Gitksan data I will provide a quick background on ergativ

Although it is unlikely that  the  choice  (or  order)  between  v*  and  C  will  make  any  difference,  I  will  limit  my  discussion to  the  v*P-domain  because  space does not permit us to discuss it here.    14.
There was unity, simplicity and some sort of rationality underlying the seemingly  unconstrained surface variation.    
Therefore:  This inference has the following form (we represent this by a free variable or a constant y):  (13)  a. g(y)   b. There is something1 [, namely green,] which1 each thing2 is if and only if it2 is green.   
notes that in French pas un NP is the subject counterpart of object pas de NP.
(32)  *?
decl’. Optimally, Forceo agrees with [wh] before it satisﬁes the feature speciﬁcation of the higher verb; otherwise, sentence type marking would take place independently of the observed wh-movement.
The man who took the gun.’  (West Greenlandic, Bittner 1994)  To express the intended meaning in (13), an antipassive sufﬁx -si must appear on the predicat

This contrast within non-standard English recalls a more familiar one concerning adjectives in standard English:  36ff.). 11    Cf.
A possible alternative is to assume that Force has no sentence type information, and that the C domain is transparent in a way that a higher selector can agree with the relevant sentence  type feature across other left-peripheral material, obeying minimality.
References Baunaz, Lena.
Dryer showed that some of Greenberg’s universals do hold up (as statistical  universals), under these more stringent conditions, others do not, but are shown to be areal effects,  not detectable in Greenberg’s small sample.  
Deal extends case discrimination to syntactic ergativity/ ¯A-movement in this way: (i) ¯A-movement  of XP to Spec,CP requires Agree between XP and the C head in an operator feature — [WH], [REL], or [FOC], (ii) the operations Agree-[WH], Agree-[REL], and Agree-[FOC] are case discriminating: elements in dependent and lexical/oblique case are not accessible

9The extraction of indirect objects and adjuncts triggers a cleft-like construction involving the complementizer wil/win, which, like agent extraction, triggers a dependent clause (Davis and Brown 2011).  
This has meant that the grammatical properties that are  investigated/compared, are by necessity all easily observable ‘
To tackle this timing  dilemma,   he  argues  that  valuation  must  be  part  of Transfer  (Premise  1).  
Cambridge: MIT Press.
Throughout, I refer to the different noun classes by the form of the proximal definite article.  
This apple is something1 which1 is identical to something2 which2 it1 isn't.    
They went everywheres else.      
Gitksan has largely been overlooked.
Before developing the argument, let me brieﬂy explore  how syntactic relevance is deﬁned for a given feature.
b. *Lilies are brought by some and others roses.  
Haegeman, Liliane.
Since there is an ambiguity in English be, which could be understood as 'identity be' or 'predicative  be', we a

The  advantage of this reductive procedure is that it is straightforward to construct models for a language  that contains its own satisfaction predicate.
Celtic is perhaps a clearer example than Germanic or Slavic, both of which  have a time depth considerably less than 3500 years”.
Importantly, the trivial formalisation of T1 in  (3) is not able to capture the process in  which the active variant  does not contain a transitive verb and some kind of process  of passivisation still takes place.
18    English often writes sometime as one word, though it (for me) does not share the behavior of  someplace, given someplace else vs. *sometime else.      
The dog (near)’                 ‘
examined  ihn him  wurde was hat has  (13) Nonelliptical counterparts to gapping  a. Some bring roses and lilies are brought by others.
Syntax 11:1–25.
In the next section I discuss what an might be, and why it is showing up in these two constructions.  4 Why is some movement illicit, and what is an?  
But 

In [47]:
with open('sentence_vecs_ft.txt', 'rb') as f:
    vecs_ft = f.readlines()

In [50]:
vecs_ft[0]

b'to appear in h harley ed 0.00091069 -0.031988 -0.0093131 0.030825 -0.050137 -0.011179 0.017665 -0.065541 -0.020922 0.092864 0.0145 0.0046519 0.013968 -0.010203 -0.0090246 -0.057939 -0.0038059 0.023035 -0.015181 0.014366 -0.015606 0.059225 -0.045705 -0.029831 -0.015993 -0.010342 0.019091 -0.018952 -0.002487 0.043615 -0.013636 0.09237 -0.040965 0.06406 0.002343 -0.051998 -0.013492 -0.012124 0.020477 0.019623 0.015607 -0.032985 -0.047967 0.033901 0.0033726 -0.016314 0.03216 -0.025047 0.0081658 -0.015947 0.0068944 -0.045445 0.02953 0.0007294 -0.058982 0.038561 0.039426 0.023136 -0.0062196 0.048536 0.041632 -0.049816 0.062894 0.013915 0.0095622 -0.025751 -0.027594 -0.016293 -0.064508 0.015596 -0.02551 0.019528 0.042635 -0.040491 -0.0056161 0.0049826 0.059544 0.037459 -0.020611 -0.0090099 0.013277 0.018367 -0.026042 -0.03081 -0.0071219 0.0091794 -0.0073045 -0.035404 -0.033743 -0.032654 -0.039046 0.0096159 0.032295 -0.026135 0.026623 -0.013982 0.01108 0.0039216 -0.021254 -0.0098182 0.014456

In [101]:
# Matches every character that cannot belong to a number: anything other than
# whitespace, digits, '.' and '-', unless it directly follows a digit (which
# keeps the 'e' of exponents such as 1e-05).
non_numeric = re.compile(r'(?<!\d)[^\s0-9.-]')

featureVec = []
for raw_line in vecs_ft:
    # Decode the bytes instead of parsing their repr (str(b'...') yields "b'...\\n'").
    tokens = non_numeric.sub('', raw_line.decode('utf-8', errors='ignore')).split()
    components = []
    for token in tokens:
        try:
            components.append(float(token))
        except ValueError:
            # Leftovers of the sentence text (e.g. a lone '-' or '.') are not vector components.
            continue
    featureVec.append(np.array(components))
featureVec = np.array(featureVec)

In [102]:
featureVec.shape

(2000, 300)

In [103]:
def create_df_rel_cs(vectors, ids):
    """Return the pairwise cosine similarity of `vectors` as a DataFrame labelled by `ids`.

    Despite the name, no relative scaling is applied: dividing the
    similarities by their row sum was tried and left disabled, so the values
    are plain cosine similarities.
    """
    similarity = cosine_similarity(np.asarray(vectors))
    return pd.DataFrame(data=similarity, index=ids, columns=ids)

In [104]:
df = create_df_rel_cs(featureVec, id_s[:2000])

In [105]:
df.head()

Unnamed: 0,59a85acdb18b146ddb84ff2b,59a85aceb18b146ddb84ff2c,59a85aceb18b146ddc84ff2b,59a85acfb18b146dda84ff2b,59a85acfb18b146ddb84ff2d,59a85acfb18b146ddb84ff2e,59a85ad0b18b146ddc84ff2c,59a85ad0b18b146ddb84ff2f,59a85ad1b18b146ddb84ff30,59a85ad1b18b146dda84ff2c,...,59a85f43b18b1408616c5d57,59a85f43b18b1408646c5dff,59a85f43b18b1408626c5d83,59a85f43b18b1408606c5d87,59a85f43b18b14085e6c5d98,59a85f44b18b14085d6c5e45,59a85f44b18b14085f6c5e12,59a85f44b18b1408646c5e00,59a85f44b18b1408636c5df2,59a85f44b18b14085e6c5d99
59a85acdb18b146ddb84ff2b,1.0,0.706784,0.695822,0.70714,0.712469,0.725256,0.604472,0.683565,0.708998,0.626079,...,0.694904,0.55667,0.639916,0.72557,0.733691,0.723354,0.705326,0.493257,0.677109,0.672588
59a85aceb18b146ddb84ff2c,0.706784,1.0,0.865374,0.863174,0.928125,0.908253,0.710484,0.910315,0.886147,0.880903,...,0.880518,0.74469,0.836193,0.897635,0.889918,0.88849,0.89072,0.419284,0.884308,0.722054
59a85aceb18b146ddc84ff2b,0.695822,0.865374,1.0,0.838591,0.866213,0.895829,0.79408,0.806598,0.846118,0.812577,...,0.871774,0.68494,0.774827,0.886633,0.882811,0.864853,0.872054,0.418448,0.82685,0.778491
59a85acfb18b146dda84ff2b,0.70714,0.863174,0.838591,1.0,0.912881,0.917702,0.720567,0.904928,0.88475,0.842693,...,0.857786,0.643765,0.847546,0.901601,0.898338,0.916486,0.893284,0.359778,0.889997,0.779054
59a85acfb18b146ddb84ff2d,0.712469,0.928125,0.866213,0.912881,1.0,0.958465,0.752579,0.936764,0.930308,0.921224,...,0.893524,0.71611,0.866366,0.9278,0.916798,0.930342,0.918746,0.36026,0.920086,0.77677


In [133]:
# Keep the sentences whose similarity to the fifth sentence (column 4) lies in
# (0.95, 1.1); the upper bound is above any cosine similarity.
similar_sent = df.loc[(df.iloc[:, 4] < 1.1) & (df.iloc[:, 4] > 0.95)]
similar_sent_indexes = similar_sent.index.tolist()
similarity_score = similar_sent.iloc[:, 4]
id_to_score = dict(zip(similar_sent_indexes, similarity_score))
for sent in sentences.find({'_id': {'$in': similar_sent_indexes}}):
    sent_id = sent['_id']
    print(sent_id, id_to_score[sent_id], '\n', sent['sentence'], '\n\n')

59a85acfb18b146ddb84ff2d 1.0 
 Introduction  The topic of this paper is the manner in which syntactic voice alternations (and syntactic conﬁgurations more generally) relate to voice morphology, and the implications of this for theories of voice and theories of morphology/syntax in- teractions. 


59a85acfb18b146ddb84ff2e 0.958464975464 
 I argue based on the voice system of Modern Greek for three primary points: ﬁrst, that treatments taking voice morphology as corresponding to a syn- tactic argument of the verb (see references below) will not work for voice systems like Greek; second, that voice morphology in Greek must be analyzed as related to a morphological feature which is added post-syntactically in speciﬁc syntactic conﬁgurations; and, third, that this analysis has consequences for the study of syn- tax/morphology interactions more generally. 


59a85adab18b146ddb84ff3b 0.95460879196 
 Beginning with the question of how non-active voice  be treated as making reference to a morph

The results are better. The sentence that is most similar to the base sentence is basically an augmented version of the base sentence. We do get sentences with the same PC. However, the gap between the most similar sentence and the other ones (which are not similar at all) is way too small.  
Possible solutions: 
* compare sentence similarities maintaining only nouns, lexical verbs and adjectives. Maybe make a set of them.
* find another way of teasing apart the similarities.

In [137]:
import spacy
nlp = spacy.load('en_core_web_sm')

In [172]:
def is_meaningfull(sent):
    """Reduce a sentence to its content-bearing tokens.

    The text is run through the notebook-level spaCy pipeline ``nlp`` and each
    token is kept at most once, in sentence order:

    * a token whose lowercase form is in ``authors`` (defined elsewhere in the
      notebook) is kept lowercased, exactly as spaCy produced it;
    * otherwise, a token tagged VERB, ADJ or NOUN is kept lowercased with all
      ``string.punctuation`` characters removed, provided its original text is
      pure ASCII and what remains is purely alphabetic.

    Parameters
    ----------
    sent : str
        Raw sentence text.

    Returns
    -------
    list of str
        The retained tokens; empty if nothing qualifies.
    """
    content_pos = ('VERB', 'ADJ', 'NOUN')
    # Loop-invariant: one character class matching any punctuation character.
    punctuation = re.compile('[%s]' % re.escape(string.punctuation))

    sentence = []
    for w in nlp(sent):
        lowered = str(w.lower_)
        if lowered in authors:
            sentence.append(lowered)
        # `elif`, not a second `if`: an author name that the tagger also labels
        # NOUN/VERB/ADJ must not be emitted twice.
        elif w.pos_ in content_pos:
            try:
                # Raises UnicodeDecodeError when the token holds non-ASCII text.
                str(w).encode(encoding='utf-8').decode('ascii')
            except UnicodeDecodeError:
                continue
            word = punctuation.sub('', lowered)
            if word.isalpha():
                sentence.append(word)
    return sentence

In [173]:
# Dump every stored sentence, reduced by is_meaningfull(), as one
# space-separated UTF-8 line per document, in cursor order.
with open('sentences_for_ft.txt', 'wb') as f:
    for sent in sentences.find():
        f.write((' '.join(is_meaningfull(sent['sentence'])) + '\n').encode("utf-8"))

In [4]:
# Read the vector file as raw bytes, one list entry per line (line endings kept).
# NOTE(review): presumably the fastText output for sentences_for_ft.txt - confirm.
with open('sentence_vecs_ft.txt', 'rb') as f:
    vecs_ft = list(f)

In [5]:
# Turn every raw line of `vecs_ft` into a numeric vector.
#
# The pattern deletes each character that cannot be part of a number, unless it
# directly follows a digit (which keeps the exponent marker in values such as
# 1.5e-05). It is a raw string so `\d` and `\s` are not treated as (invalid)
# string escapes.
non_numeric = re.compile(r'(?<!\d)[^\s0-9.-]')

featureVec = []
for vec in vecs_ft:
    # Decode the bytes instead of parsing their repr: str(b'...') yields
    # "b'...\\n'", whose escaped newline can glue a backslash onto the last
    # value (dropping it) and whose \xNN escapes can leak stray digits.
    text = vec.decode('utf-8', errors='replace') if isinstance(vec, bytes) else str(vec)
    vec_final = []
    for v in non_numeric.sub('', text).split():
        try:
            vec_final.append(float(v))
        except ValueError:
            # Leftover fragment (e.g. a lone '-' or '.') that is not a number.
            pass
    featureVec.append(np.array(vec_final))
# One row per input line; rows of unequal length will not form a 2-D float array.
featureVec = np.array(featureVec)

In [156]:
# Rebuild `df` from the parsed vectors and the first 2000 sentence ids.
# NOTE(review): create_df_rel_cs is defined elsewhere in the notebook; judging by
# the preview that follows it returns a sentence-by-sentence similarity matrix
# labelled by sentence id on both axes - confirm.
df = create_df_rel_cs(featureVec, id_s[:2000])

In [162]:
# Preview the first six rows of the frame.
df.head(6)

Unnamed: 0,59a85acdb18b146ddb84ff2b,59a85aceb18b146ddb84ff2c,59a85aceb18b146ddc84ff2b,59a85acfb18b146dda84ff2b,59a85acfb18b146ddb84ff2d,59a85acfb18b146ddb84ff2e,59a85ad0b18b146ddc84ff2c,59a85ad0b18b146ddb84ff2f,59a85ad1b18b146ddb84ff30,59a85ad1b18b146dda84ff2c,...,59a85f43b18b1408616c5d57,59a85f43b18b1408646c5dff,59a85f43b18b1408626c5d83,59a85f43b18b1408606c5d87,59a85f43b18b14085e6c5d98,59a85f44b18b14085d6c5e45,59a85f44b18b14085f6c5e12,59a85f44b18b1408646c5e00,59a85f44b18b1408636c5df2,59a85f44b18b14085e6c5d99
59a85acdb18b146ddb84ff2b,1.0,0.36272,0.428012,0.341371,0.410288,0.442421,0.322088,0.409449,0.415485,0.389737,...,0.356865,0.130191,0.332139,0.390515,0.442029,0.425499,0.45804,0.0,0.402155,0.394705
59a85aceb18b146ddb84ff2c,0.36272,1.0,0.662042,0.532051,0.72631,0.705784,0.466389,0.63033,0.617331,0.638661,...,0.51083,0.245273,0.512479,0.60454,0.625589,0.650886,0.63178,0.0,0.70056,0.433699
59a85aceb18b146ddc84ff2b,0.428012,0.662042,1.0,0.680434,0.771045,0.831076,0.635428,0.692476,0.735275,0.746095,...,0.722995,0.252603,0.703289,0.783689,0.796741,0.78354,0.825546,0.0,0.760937,0.675423
59a85acfb18b146dda84ff2b,0.341371,0.532051,0.680434,1.0,0.728967,0.774742,0.472388,0.74049,0.690191,0.682485,...,0.661517,0.291157,0.724794,0.753531,0.774335,0.769343,0.770693,0.0,0.723366,0.659675
59a85acfb18b146ddb84ff2d,0.410288,0.72631,0.771045,0.728967,1.0,0.931067,0.610977,0.814192,0.783892,0.828405,...,0.690288,0.279949,0.712068,0.826652,0.828507,0.821307,0.792163,0.0,0.843543,0.628478
59a85acfb18b146ddb84ff2e,0.442421,0.705784,0.831076,0.774742,0.931067,1.0,0.62892,0.829301,0.845492,0.827963,...,0.767241,0.276751,0.757154,0.874204,0.895057,0.865071,0.839864,0.0,0.879903,0.721663


In [171]:
# Keep the rows whose score in column 4 (the base sentence's column in the
# preview above) lies strictly between 0.80 and 1.1.
similar_sent = df.loc[(df.iloc[:, 4] > 0.80) & (df.iloc[:, 4] < 1.1)]
similarity_score = similar_sent.iloc[:, 4]
similar_sent_indexes = list(similar_sent.index)
# Row label (sentence _id) -> similarity to the base sentence.
id_to_score = dict(zip(similar_sent_indexes, similarity_score))
# Fetch the matching documents and print each id, its score and its text.
for sent in sentences.find({'_id': {'$in': similar_sent_indexes}}):
    print(sent['_id'], id_to_score[sent['_id']], '\n', sent['sentence'], '\n\n')

59a85acfb18b146ddb84ff2d 1.0 
 Introduction  The topic of this paper is the manner in which syntactic voice alternations (and syntactic conﬁgurations more generally) relate to voice morphology, and the implications of this for theories of voice and theories of morphology/syntax in- teractions. 


59a85acfb18b146ddb84ff2e 0.931067231871 
 I argue based on the voice system of Modern Greek for three primary points: ﬁrst, that treatments taking voice morphology as corresponding to a syn- tactic argument of the verb (see references below) will not work for voice systems like Greek; second, that voice morphology in Greek must be analyzed as related to a morphological feature which is added post-syntactically in speciﬁc syntactic conﬁgurations; and, third, that this analysis has consequences for the study of syn- tax/morphology interactions more generally. 


59a85ad0b18b146ddb84ff2f 0.814191864542 
 These points and their implications are discussed within the context of the theory of Distrib

Ok, this makes much more sense. Now I can actually set a threshold. Let's do this for the whole corpus, and set the threshold to 0.92.

In [9]:
# Persist the parsed vectors to the file 'featurevec' via joblib.
joblib.dump(featureVec, 'featurevec')

['featurevec']