In [None]:
import torch.nn as nn
import torch
import pandas as pd
import numpy as np
import os, sys
from collections import Counter
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from nltk.corpus import stopwords 
from nltk.tokenize import word_tokenize 
from scipy.spatial import distance
import re
from itertools import product
from scipy.spatial.distance import euclidean

import scipy
import seaborn as sns
from collections import defaultdict

import gensim
import matplotlib.pyplot as plt
from gensim.models import Word2Vec
import pulp

In [None]:
# NOTE(review): `pd.read_csv()` is called without a file path, so this cell raises a
# TypeError as written — supply the CSV path before running.
df = pd.read_csv()

In [None]:
# Pull one column of `df` out as a plain Python list; the similarity functions below
# receive this list as their candidate corpus.
# NOTE(review): the column name is an empty string — presumably a placeholder; verify.
data = list(df[''])

# Jaccard Similarity
$J(A, B) = \dfrac{|A \cap B|}{|A \cup B|}$ = number of common words / number of unique words across both texts

In [None]:
def jaccard_similarity(new, df):
    """Find the candidate with the highest word-level Jaccard similarity to ``new``.

    J(A, B) = |A ∩ B| / |A ∪ B|, where A and B are the sets of unique words of
    the two texts.

    Args:
        new: Query text. A string is split on whitespace into words; any other
            iterable is treated as an already-tokenised sequence.
        df: Indexable sequence of candidates (strings or token sequences).

    Returns:
        Tuple ``(best_score, best_candidate)``; the first candidate wins ties.

    Raises:
        ValueError: If ``df`` is empty.
    """
    def _word_set(text):
        # set("a string") yields *characters*; split so the overlap is measured on words.
        return set(text.split()) if isinstance(text, str) else set(text)

    query_words = _word_set(new)
    scores = []
    for i in range(len(df)):
        candidate_words = _word_set(df[i])
        union = query_words | candidate_words
        # Two empty texts share nothing measurable: score 0.0 instead of dividing by zero.
        scores.append(len(query_words & candidate_words) / len(union) if union else 0.0)

    if not scores:
        raise ValueError("jaccard_similarity() requires at least one candidate")
    best = max(scores)
    return best, df[scores.index(best)]

In [None]:
# Show the best (score, candidate) pair for the query.
# NOTE(review): `sentence` is not assigned in any visible cell — define the query text before running.
jaccard_similarity(sentence, data)

# Cosine Similarity

Jaccard similarity considers only the set of unique words in each sentence or document, whereas cosine similarity compares the texts as vectors and normalises by the lengths (norms) of those vectors.

In [None]:
def cosine_similarity(new, df):
    """Find the candidate whose stop-word-filtered word set is closest to ``new``.

    Both texts are tokenised with NLTK's ``word_tokenize``, English stop words
    are dropped, and the cosine of the two binary bag-of-words vectors is
    computed, which reduces to ``|X ∩ Y| / sqrt(|X| * |Y|)``.

    NOTE(review): this definition shadows ``cosine_similarity`` imported from
    ``sklearn.metrics.pairwise`` at the top of the file.

    Args:
        new: Query sentence (string).
        df: Indexable sequence of candidate sentences (strings).

    Returns:
        Tuple ``(best_score, best_candidate)``; the first candidate wins ties.

    Raises:
        ValueError: If ``df`` is empty.
    """
    # Loop-invariant: load the stop-word list once, as a set for O(1) membership tests.
    stop_words = set(stopwords.words('english'))
    query_words = {w for w in word_tokenize(new) if w not in stop_words}

    scores = []
    for idx in range(len(df)):
        candidate_words = {w for w in word_tokenize(df[idx]) if w not in stop_words}
        norm = float((len(query_words) * len(candidate_words)) ** 0.5)
        # A text with no content words is a zero vector: score 0.0 instead of dividing by zero.
        scores.append(len(query_words & candidate_words) / norm if norm else 0.0)

    if not scores:
        raise ValueError("cosine_similarity() requires at least one candidate")
    best = max(scores)
    return best, df[scores.index(best)]

In [None]:
# Best (score, candidate) pair under the binary bag-of-words cosine; this resolves to the
# notebook's own `cosine_similarity`, which shadows the sklearn import of the same name.
cosine_similarity(sentence, data)

In [None]:
def cosine_distance_countvectorizer_method(s1, df):
    """Return the candidate in ``df`` most similar to ``s1`` by count-vector cosine.

    For every candidate a fresh ``CountVectorizer`` is fitted on the pair
    ``[s1, candidate]``; the similarity is ``1 - cosine distance`` of the two
    count vectors, expressed as a percentage rounded to two decimals.

    Returns:
        Tuple ``(best_percentage, best_candidate)``; the first candidate wins ties.
    """
    percentages = []
    for position in range(len(df)):
        pair = [s1, df[position]]
        # Row 0 is the query, row 1 the candidate, over the vocabulary of this pair only.
        counts = CountVectorizer().fit_transform(pair)
        query_counts = counts.toarray()[0].tolist()
        candidate_counts = counts.toarray()[1].tolist()
        gap = distance.cosine(query_counts, candidate_counts)
        percentages.append(round((1 - gap) * 100, 2))
    top = max(percentages)
    return top, df[percentages.index(top)]

In [None]:

# Best (percentage, candidate) pair using per-pair CountVectorizer count vectors.
cosine_distance_countvectorizer_method(sentence, data)

# With GloVe

In [None]:
# Location of the embedding text file, relative to the notebook's working directory.
# Presumably the 50-dimensional GloVe 6B vectors, judging by the file name — confirm.
gloveFile = "../data/glove.6B.50d.txt"
def loadGloveModel(gloveFile):
    """Load word vectors from a whitespace-separated text file into a dict.

    Every non-blank line is expected to look like ``word v1 v2 ... vN``.

    Args:
        gloveFile: Path to the UTF-8 encoded vector file.

    Returns:
        Dict mapping each word to a 1-D ``numpy`` array of floats. If a word
        occurs on several lines, the last occurrence wins.

    Raises:
        ValueError: If a value after the first token cannot be parsed as a float
            (e.g. when the word itself contains whitespace).
    """
    model = {}
    with open(gloveFile, encoding="utf8" ) as f:
        # Stream line by line rather than readlines(), so the raw text is never
        # held in memory alongside the parsed vectors.
        for line in f:
            splitLine = line.split()
            if not splitLine:
                # Blank (or whitespace-only) line: nothing to parse.
                continue
            model[splitLine[0]] = np.array([float(val) for val in splitLine[1:]])
    print ("Done.",len(model)," words loaded!")
    return model
# Module-level word -> vector lookup; the embedding-based helpers below read it as a global.
model = loadGloveModel(gloveFile)

In [None]:
def preprocess(raw_text):
    """Return the distinct content words of ``raw_text``.

    Every non-letter character is replaced by a space, the text is lower-cased
    and split on whitespace, English stop words are removed, and duplicates are
    dropped. The result comes from a ``set``, so its order is arbitrary.

    NOTE(review): a later cell defines another ``preprocess`` that replaces
    this one once it has been executed.
    """
    alphabetic_text = re.sub("[^a-zA-Z]", " ", raw_text)
    tokens = alphabetic_text.lower().split()
    english_stopwords = set(stopwords.words("english"))
    unique_tokens = set()
    for token in tokens:
        if token not in english_stopwords:
            unique_tokens.add(token)
    return list(unique_tokens)

def cosine_distance_between_two_words(word1, word2):
    """Return ``1 - cosine distance`` between the embeddings of two words.

    Despite the name, the value is a similarity: larger means closer. Both
    words are looked up in the module-level ``model``; a missing word raises
    ``KeyError``.
    """
    first_vector = model[word1]
    second_vector = model[word2]
    gap = scipy.spatial.distance.cosine(first_vector, second_vector)
    return 1 - gap

def cosine_distance_wordembedding_method(s1, s2):
    """Find the sentence in ``s2`` whose mean word embedding is closest to that of ``s1``.

    Each sentence is reduced to the mean of the embeddings (looked up in the
    module-level ``model``) of the words returned by ``preprocess``. Sentences
    are compared by ``1 - cosine distance``, expressed as a percentage rounded
    to two decimals.

    Candidates that cannot be embedded are skipped: a word missing from
    ``model``, an entry ``preprocess`` cannot handle, or no words left after
    preprocessing.

    Args:
        s1: Query sentence (string).
        s2: Indexable sequence of candidate sentences.

    Returns:
        Tuple ``(best_percentage, best_sentence)``; the first candidate wins ties.

    Raises:
        KeyError: If a word of ``s1`` is missing from ``model``.
        ValueError: If no candidate could be scored.
    """
    vector_1 = np.mean([model[word] for word in preprocess(s1)], axis=0)

    # Keep each score paired with its sentence, so that a skipped candidate
    # can never shift the score -> sentence lookup.
    scored = []
    for i in range(len(s2)):
        try:
            vectors = [model[word] for word in preprocess(s2[i])]
        except (KeyError, TypeError, AttributeError):
            # Out-of-vocabulary word, or a non-text entry that preprocess() rejects.
            continue
        if not vectors:
            # Nothing left after preprocessing: there is no vector to compare.
            continue
        cosine = scipy.spatial.distance.cosine(vector_1, np.mean(vectors, axis=0))
        scored.append((round((1 - cosine) * 100, 2), s2[i]))

    if not scored:
        raise ValueError("no candidate sentence could be embedded")
    return max(scored, key=lambda pair: pair[0])

In [None]:
# Best (percentage, sentence) pair using averaged word embeddings from `model`.
cosine_distance_wordembedding_method(sentence,data)

# Try LSI

~ ~ ~ ~ ~ ~ ~

# Word Mover Distance


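When two texts use different but similar words (e.g. synonyms), their cosine similarity can be zero even though the texts are, in reality, similar. Word Mover's Distance (WMD) avoids this by taking into account how close the words of the two texts are in the embedding space.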

In [None]:
def tokens_to_fracdict(tokens):
    """Map every distinct token to its relative frequency within ``tokens``.

    Keys appear in first-occurrence order. An empty input yields an empty dict.
    """
    occurrences = {}
    for tok in tokens:
        occurrences[tok] = occurrences.get(tok, 0) + 1
    n_tokens = sum(occurrences.values())
    fractions = {}
    for tok, n in occurrences.items():
        fractions[tok] = float(n) / n_tokens
    return fractions

def word_mover_distance_probspec(first_sent_tokens, second_sent_tokens, wvmodel, lpFile=None):
    """Build and solve the Word Mover's Distance transport problem with PuLP.

    Each sentence is turned into a normalised bag of words. The linear program
    moves that mass from the words of the first sentence to the words of the
    second, paying the Euclidean distance between word vectors per unit moved.

    Args:
        first_sent_tokens: List of tokens of the first sentence.
        second_sent_tokens: List of tokens of the second sentence.
        wvmodel: Word-vector lookup supporting ``wvmodel[token]``, or an object
            exposing such a lookup as ``.wv`` (e.g. a gensim ``Word2Vec`` model).
        lpFile: Optional path; when given, the LP is written there before solving.

    Returns:
        The solved ``pulp.LpProblem``; ``pulp.value(prob.objective)`` is the distance.

    Raises:
        KeyError: If a token has no vector in ``wvmodel``.
    """
    # A full gensim model keeps its vectors on `.wv`; KeyedVectors and plain dicts are indexed directly.
    vectors = getattr(wvmodel, 'wv', wvmodel)

    first_sent_buckets = tokens_to_fracdict(first_sent_tokens)
    second_sent_buckets = tokens_to_fracdict(second_sent_tokens)
    wordvecs = {token: vectors[token]
                for token in set(first_sent_buckets) | set(second_sent_buckets)}

    # Only first -> second flows occur in the constraints, so only those pairs need a
    # variable. Any other pair has a non-negative cost and would be 0 at the optimum.
    flows = list(product(first_sent_buckets, second_sent_buckets))
    T = pulp.LpVariable.dicts('T_matrix', flows, lowBound=0)

    prob = pulp.LpProblem('WMD', sense=pulp.LpMinimize)
    # Objective: total transport cost.
    prob += pulp.lpSum([T[token1, token2]*euclidean(wordvecs[token1], wordvecs[token2])
                        for token1, token2 in flows])
    # Each word of the second sentence receives exactly its share of mass ...
    for token2 in second_sent_buckets:
        prob += pulp.lpSum([T[token1, token2] for token1 in first_sent_buckets])==second_sent_buckets[token2]
    # ... and each word of the first sentence ships out exactly its share.
    for token1 in first_sent_buckets:
        prob += pulp.lpSum([T[token1, token2] for token2 in second_sent_buckets])==first_sent_buckets[token1]

    if lpFile is not None:
        prob.writeLP(lpFile)

    prob.solve()

    return prob

In [None]:
def word_mover_distance(first_sent_tokens, second_sent_tokens, wvmodel, lpFile=None):
    """Return the objective value of the solved WMD linear program for two token lists.

    Thin wrapper around ``word_mover_distance_probspec``; all arguments are
    forwarded unchanged.
    """
    solved_problem = word_mover_distance_probspec(
        first_sent_tokens,
        second_sent_tokens,
        wvmodel,
        lpFile=lpFile,
    )
    objective = solved_problem.objective
    return pulp.value(objective)

In [None]:
# Vocabulary: the unique lower-cased, whitespace-split tokens across `data`, in
# first-seen order. dict.fromkeys de-duplicates in O(1) per token, whereas testing
# membership in a growing list is O(n) per token.
toks = list(dict.fromkeys(token for text in data for token in text.lower().split()))

In [None]:
# Train a 32-dimensional Word2Vec model on a single "sentence" made of all tokens in `toks`.
# NOTE(review): `size=` is the gensim 3.x keyword; gensim 4 renamed it to `vector_size` —
# confirm against the installed version.
wvmodel = gensim.models.Word2Vec([toks],min_count=1,size=32)

In [None]:
def preprocess(raw_text):
    """Return the distinct non-stop-word tokens of ``raw_text``.

    The text is lower-cased and split on whitespace only, the same
    tokenisation used to build ``toks``, so punctuation and digits stay
    attached to their tokens. English stop words are removed and duplicates
    dropped; the result comes from a ``set``, so its order is arbitrary.

    NOTE(review): this redefines the earlier ``preprocess`` in this file,
    which also stripped non-letter characters.
    """
    tokens = raw_text.lower().split()
    english_stopwords = set(stopwords.words("english"))
    unique_tokens = set()
    for token in tokens:
        if token not in english_stopwords:
            unique_tokens.add(token)
    return list(unique_tokens)
def find_wmd(s1, s2):
    """Find the sentence in ``s2`` with the smallest Word Mover's Distance to ``s1``.

    Both sides are tokenised with ``preprocess`` and compared using the
    module-level ``wvmodel``.

    Args:
        s1: Query sentence (string).
        s2: Indexable sequence of candidate sentences.

    Returns:
        Tuple ``(smallest_distance, closest_sentence)``; the first candidate wins ties.

    Raises:
        ValueError: If ``s2`` is empty.
    """
    query_tokens = preprocess(s1)
    distances = [
        word_mover_distance(query_tokens, preprocess(s2[i]), wvmodel)
        for i in range(len(s2))
    ]
    # WMD is a distance, so the best match is the candidate with the *smallest*
    # value; taking the maximum would return the least similar sentence.
    closest = min(distances)
    return closest, s2[distances.index(closest)]

In [None]:
# Run the Word Mover's Distance search for the query over every sentence in `data`.
find_wmd(sentence, data)

# lda

~ ~ ~ ~ ~ ~ ~ ~ ~ 

# VAE

~ ~ ~ ~ ~ ~ ~ ~ ~ ~ 

# InferSent

InferSent is a sentence embeddings method that provides semantic sentence representations. It is trained on natural language inference data and generalizes well to many different tasks. \
<b>Original paper</b>: https://research.fb.com/wp-content/uploads/2017/09/emnlp2017.pdf \
<b>Architecture</b>: Bi-LSTM sentence encoder with max pooling (the original paper also compares other encoders, including a self-attentive one) \
<b>Codes</b>: https://github.com/facebookresearch/InferSent \
<b>Data</b>: The SNLI corpus (version 1.0) is a collection of 570k human-written English sentence pairs manually labeled for balanced classification with the labels entailment, contradiction, and neutral, supporting the task of natural language inference (NLI), also known as recognizing textual entailment (RTE).


(An <b>entailment</b> is a deduction or implication, that is, something that follows logically from or is implied by something else. In logic, an entailment is the relationship between sentences whereby one sentence will be true if all the others are also true.)

In [None]:
import nltk
# Fetch the 'punkt' tokenizer data used by NLTK tokenisation.
# NOTE(review): `word_tokenize` is already called in an earlier cell, so on a fresh kernel
# this download presumably runs too late unless the data is already installed. The
# `stopwords` corpus used earlier is not downloaded in any visible cell — confirm.
nltk.download('punkt')

In [None]:
# `models` is a project-local module, not a package import (presumably models.py from the
# InferSent repository — confirm it is on the path).
from models import InferSent
# Model version: interpolated into the checkpoint file name and passed on in the config.
V = 1
MODEL_PATH = '../models/infersent%s.pkl' % V
# Encoder configuration handed to the InferSent constructor.
params_model = {'bsize': 64, 'word_emb_dim': 300, 'enc_lstm_dim': 2048,
                'pool_type': 'max', 'dpout_model': 0.0, 'version': V}
infersent = InferSent(params_model)
# NOTE(review): torch.load unpickles the file — only load checkpoints from a trusted source.
infersent.load_state_dict(torch.load(MODEL_PATH))

In [None]:
# Word-vector file handed to the encoder. Presumably 300-d GloVe vectors matching
# `word_emb_dim` in the model config, judging by the file name — confirm.
W2V_PATH = '../data/glove.840B.300d.txt'
infersent.set_w2v_path(W2V_PATH)

In [None]:
# Build the encoder's vocabulary from the corpus sentences in `data`.
infersent.build_vocab(data, tokenize=True)

In [None]:
# Encode every sentence in `data`; the result is kept in `embeddings`, which no later
# visible cell reads.
embeddings = infersent.encode(data, tokenize=True)

In [None]:
def get_max(no=3, dict_=None, scores=None):
    """Print the ``no`` highest-scoring tokens of a sentence, one ``token score`` pair per line.

    The sentence-boundary markers ``<s>`` and ``</s>`` are ignored. When a
    token occurs more than once, the score of its last occurrence is used.

    Args:
        no: Maximum number of tokens to print; values below 1 print nothing.
        dict_: Sequence of tokens. Defaults to the module-level ``sent``, looked
            up when the function is called rather than when it is defined, so
            this cell can run before ``sent`` exists.
        scores: Per-token scores aligned with the tokens. Defaults to the
            module-level ``y``.

    Returns:
        None. The result is printed.
    """
    tokens = sent if dict_ is None else dict_
    weights = y if scores is None else scores

    sent_importance = {}
    for token, weight in zip(tokens, weights):
        if token in ('<s>', '</s>'):
            # Boundary markers are not words of the sentence.
            continue
        sent_importance[token] = weight

    ranked = sorted(sent_importance, key=sent_importance.get, reverse=True)
    for token in ranked[:max(no, 0)]:
        print(token, sent_importance[token])

In [None]:
# Unpack the four values returned by `infersent.visualize`; `y` and `sent` are the
# module-level names that get_max() reads.
# NOTE(review): `sentence` is not assigned in any visible cell.
vector, index, y, sent = infersent.visualize(sentence, tokenize=True) 

In [None]:
# Print the top tokens (three by default) together with their scores.
get_max()