In [68]:
import os
import pandas as pd
from string import punctuation as punct

import numpy as np
from lxml import html
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.decomposition import TruncatedSVD, NMF
from sklearn.metrics.pairwise import cosine_similarity

In [30]:
# Extend the ASCII punctuation set with guillemets and the en dash so that
# normalize() strips them from token edges as well.
punct = punct + '«»–'

In [15]:
# Load the corpus from a JSON-lines file (one JSON record per line).
df = pd.read_json(path_or_buf='ng_0.jsonlines', lines=True)

In [42]:
# Parse the paraphrase corpus. The context manager guarantees the file handle
# is closed as soon as its bytes have been read.
with open('paraphrases.xml', 'rb') as xml_file:
    corpus_xml = html.fromstring(xml_file.read())

texts_1 = []
texts_2 = []
classes = []

# Every <paraphrase> element carries <value name="..."> children; `[0]` takes
# the first text node and raises IndexError if a value is missing.
for p in corpus_xml.xpath('//paraphrase'):
    texts_1.append(p.xpath('./value[@name="text_1"]/text()')[0])
    texts_2.append(p.xpath('./value[@name="text_2"]/text()')[0])
    classes.append(p.xpath('./value[@name="class"]/text()')[0])

# One row per paraphrase pair; `label` still holds the raw class strings here.
PARA = pd.DataFrame({'text_1': texts_1, 'text_2': texts_2, 'label': classes})

In [46]:
# Recode the class strings as integers ('0' -> 0, '1' -> 1, '-1' -> 2).
# NOTE: not idempotent -- re-running this cell on the already-mapped column
# yields NaN everywhere, because the integer values are not keys of the mapping.
PARA['label'] = PARA['label'].map({'0': 0, '1': 1, '-1': 2})

In [60]:
PARA['text_norm1'] = PARA.text_1.apply(normalize)
PARA['text_norm2'] = PARA.text_2.apply(normalize)

In [58]:
def normalize(x):
    """Lower-case `x` and trim punctuation characters from both of its ends.

    Only leading and trailing characters found in the module-level `punct`
    string are removed; punctuation inside the string and any whitespace are
    left untouched.
    """
    lowered = x.lower()
    return lowered.strip(punct)

In [17]:
# Raw document texts taken from the `content` column, as a plain Python list.
DATA = list(df['content'].values)

In [31]:
# Normalise every document token by token: whitespace-split, drop tokens that
# consist solely of punctuation, normalise the rest and re-join with spaces.
DATA_NORM = [
    ' '.join(normalize(token) for token in doc.split() if token.strip(punct))
    for doc in DATA
]

In [35]:
# Bag-of-words and TF-IDF vectorizers, both with default settings.
CVECT, TFIDF = CountVectorizer(), TfidfVectorizer()

In [37]:
# Learn the vocabulary (and, for TF-IDF, the IDF weights) from the normalised
# corpus; the result of fit() is bound back to the same names.
CVECT, TFIDF = CVECT.fit(DATA_NORM), TFIDF.fit(DATA_NORM)

In [34]:
def fit_models(data, vectorizer, models):
    """Fit every supported decomposition model on the vectorized corpus.

    Parameters
    ----------
    data
        Documents, in whatever form ``vectorizer.transform`` accepts.
    vectorizer
        Object exposing ``transform``; only ``transform`` is called here, so it
        must already be fitted.
    models
        Iterable of estimator instances. ``TruncatedSVD`` instances are stored
        under ``'SVD'`` and ``NMF`` instances under ``'NMF'``; instances of any
        other type are skipped without an error. If several instances of the
        same type are given, the last one wins.

    Returns
    -------
    dict
        Maps ``'SVD'`` / ``'NMF'`` to the corresponding fitted model.
    """
    fitted = dict()

    # Vectorize once and share the matrix between all models. The corpus is
    # deliberately not tokenized here: nothing below needs token lists.
    data_vectorized = vectorizer.transform(data)

    for model in models:
        if isinstance(model, TruncatedSVD):
            key = 'SVD'
        elif isinstance(model, NMF):
            key = 'NMF'
        else:
            # Unsupported model types are ignored.
            continue

        model.fit(data_vectorized)
        fitted[key] = model

    return fitted



In [39]:
%%time
# Seed both factorizations: TruncatedSVD's default solver is randomized and
# NMF accepts a random_state as well, so without fixed seeds the components
# (and every feature derived from them) can change between runs.
MODELS = fit_models(
    DATA_NORM,
    TFIDF,
    [TruncatedSVD(2, random_state=0), NMF(2, random_state=0)],
)

CPU times: user 5.97 s, sys: 219 ms, total: 6.19 s
Wall time: 3.94 s


In [82]:
def cosine(x, y):
    """Return the cosine similarity of two vectors.

    Parameters
    ----------
    x, y : array-like
        Vectors of equal length; plain lists and tuples are accepted as well
        as NumPy arrays.

    Returns
    -------
    float
        ``dot(x, y) / (||x|| * ||y||)``. If either vector has zero norm the
        similarity is defined as ``0.0`` instead of dividing by zero, which is
        also what ``sklearn.metrics.pairwise.cosine_similarity`` yields for
        all-zero rows.
    """
    x = np.asarray(x)
    y = np.asarray(y)

    denominator = np.linalg.norm(x) * np.linalg.norm(y)
    if denominator == 0:
        return 0.0

    return np.dot(x, y) / denominator

In [92]:
def _unit_rows(matrix):
    """Scale every row of `matrix` to unit L2 norm; all-zero rows stay zero."""
    matrix = np.asarray(matrix, dtype=float)
    norms = np.linalg.norm(matrix, axis=1, keepdims=True)
    norms[norms == 0] = 1.0  # avoid 0/0; a zero row then simply remains zero
    return matrix / norms


def compute_features(para, model, vectorizer=None, text_norm=True):
    """Cosine similarity between the two texts of every pair, in model space.

    Parameters
    ----------
    para
        Object with the attributes ``text_norm1`` / ``text_norm2`` (read when
        `text_norm` is true) or ``text_1`` / ``text_2`` (read otherwise), e.g.
        a DataFrame with those columns.
    model
        Fitted model exposing ``transform`` (such as the ``TruncatedSVD`` or
        ``NMF`` instances produced by ``fit_models``).
    vectorizer
        Fitted vectorizer exposing ``transform``. It is required: the model
        consumes the vectorizer's output.
    text_norm : bool
        Selects the normalised text columns (default) or the raw ones.

    Returns
    -------
    numpy.ndarray
        1-D array with one cosine similarity per pair. A pair whose projection
        is all zeros on either side gets similarity 0.

    Raises
    ------
    ValueError
        If `vectorizer` is None.
    TypeError
        If `model` has no ``transform`` method.
    """
    if vectorizer is None:
        raise ValueError('compute_features requires a fitted vectorizer')
    if not hasattr(model, 'transform'):
        raise TypeError(
            'unsupported model type: {}'.format(type(model).__name__)
        )

    TEXT1 = para.text_norm1 if text_norm else para.text_1
    TEXT2 = para.text_norm2 if text_norm else para.text_2

    v1 = model.transform(vectorizer.transform(TEXT1))
    v2 = model.transform(vectorizer.transform(TEXT2))

    # Only the similarity of row i with row i is needed, so compute it row-wise
    # instead of building the full n x n similarity matrix and reading its
    # diagonal.
    return np.einsum('ij,ij->i', _unit_rows(v1), _unit_rows(v2))

In [93]:
%%time
# Similarity feature for every paraphrase pair in the SVD space.
# NOTE(review): MODELS['SVD'] was fitted on TFIDF output, while the texts are
# vectorized with CVECT (raw counts) here -- confirm this mix is intentional.
X = compute_features(PARA, MODELS['SVD'], vectorizer=CVECT)

CPU times: user 641 ms, sys: 719 ms, total: 1.36 s
Wall time: 1.29 s
