# <font color='pink'>Sentence Distinctions</font>

By: Ameer Syedibrahim

### Import Statements

In [4]:
from collections import defaultdict
import numpy as np
import time, math, nltk, gensim, string
from gensim.models.keyedvectors import KeyedVectors
from sklearn.decomposition import TruncatedSVD
from nltk import word_tokenize



### Loading in Word Embeddings Model

In [5]:
# Load the GloVe vectors (word2vec text format, hence binary=False) and report
# how long the load took in wall-clock seconds.
path = "glove.6B.50d.txt.w2v"
t0 = time.time()
glove = KeyedVectors.load_word2vec_format(path, binary=False)
t1 = time.time()
print("Time Elapsed: ", t1 - t0, "s", sep="")

Time Elapsed 22.985545873641968s


#### Example of a word's numerical, conceptually-informed representation
In this example, the vector of 50 numbers printed below represents the word 'pink'.

In [12]:
# Index the KeyedVectors object directly: `.wv` on a KeyedVectors instance was
# only a deprecated alias for the object itself and no longer exists in gensim 4.
print("PINK")
print(glove["pink"])

PINK
[ -1.72710001e-01   9.54169989e-01  -8.44640017e-01   1.18179999e-01
   8.76410007e-02   8.43370020e-01  -7.33470023e-01  -1.10930002e+00
  -2.56430000e-01   9.72340032e-02   1.29079996e-02   1.37009993e-01
   7.26750016e-01   3.31180006e-01   2.56839991e-01  -2.03779992e-02
  -5.47829986e-01  -6.91029988e-03  -3.81839991e-01  -1.36829996e+00
  -8.07910025e-01  -2.22609997e-01   6.60849988e-01  -6.94389999e-01
  -9.76220012e-01  -2.33370006e-01  -1.06280005e+00   2.00449991e+00
   4.27769989e-01  -1.44570005e+00   1.62930000e+00   7.37240016e-02
  -3.53700012e-01   5.58449984e-01  -2.82240003e-01   3.14090014e-01
   2.52099991e-01  -3.71149987e-01  -1.98949993e-01  -1.25520003e+00
   1.08089998e-01   8.82899985e-02  -4.71709996e-01  -7.04760015e-01
   8.46409976e-01  -4.31089997e-01   1.90270005e-03  -1.71399999e+00
  -1.19319998e-01  -5.12709975e-01]


In [6]:
# Build a float64 matrix holding every embedding, one row per vocabulary entry.
# `glove.vectors` already is the (n_words, vector_size) matrix, so it is copied
# in one step instead of being filled word by word; this also avoids
# `glove.vocab` and `glove.wv`, which gensim 4 removed.
n, d = glove.vectors.shape
# np.array (not np.asarray) always copies, so X_glove never aliases the model's
# own storage; float64 matches the dtype np.zeros produced in the original cell.
X_glove = np.array(glove.vectors, dtype=np.float64)
print(X_glove.nbytes)

160000000


In [7]:
# Fit a 2-component truncated SVD on the embedding matrix and time the fit.
t0 = time.time()
# TruncatedSVD uses a randomized solver by default; fixing random_state makes
# the fitted components identical across "Restart & Run All" executions.
svd = TruncatedSVD(n_components=2, random_state=0)
svd.fit(X_glove)
t1 = time.time()
print("elapsed " + str(t1 - t0) + "s")

elapsed 1.9745051860809326s


### Sentence Relation Calculator
If the outputted number is less than 14, the model recognizes the two sentences as related. If the number is greater than 14, the model recognizes them as unrelated.

In [10]:
def the_sent_threshold(str1=None, str2=None):
    """Score how far apart two sentences are using their GloVe word vectors.

    Each sentence is split with ``nltk.word_tokenize`` and every token is
    replaced by its vector from the module-level ``glove`` model.  The two
    resulting (tokens x dimensions) arrays are subtracted element-wise and the
    score is ``sqrt(21 * abs(sum of all differences))``.  The notebook text
    treats scores below 14 as "related" and scores above 14 as "unrelated".

    Parameters
    ----------
    str1, str2 : str, optional
        The sentences to compare.  Any sentence left as ``None`` is requested
        from the user with ``input`` (the original interactive behavior), so
        calling the function with no arguments works exactly as before.

    Returns
    -------
    float
        The score described above.

    Raises
    ------
    KeyError
        If a token is missing from the embedding vocabulary, both as typed and
        lower-cased.
    ValueError
        If the two token arrays cannot be subtracted, i.e. the sentences have
        different numbers of tokens (or one of them has none).
    """

    def _embed(sentence):
        # One embedding row per token of `sentence`.
        rows = []
        for token in nltk.word_tokenize(sentence):
            if token in glove:
                key = token
            elif token.lower() in glove:
                # Fallback so capitalized input still resolves; presumably this
                # GloVe vocabulary is lower-case only -- TODO confirm.
                key = token.lower()
            else:
                raise KeyError(
                    "word {!r} is not in the embedding vocabulary".format(token)
                )
            # Index the model directly; the `.wv` alias is gone in gensim 4.
            rows.append(glove[key])
        return np.array(rows)

    # Prompt-then-embed order is kept so a bad first sentence fails before the
    # second prompt, as in the original.
    if str1 is None:
        str1 = input("Enter the first sentence: ")
    vector1_array = _embed(str1)

    if str2 is None:
        str2 = input("Enter the second sentence: ")
    vector2_array = _embed(str2)

    try:
        diff = vector1_array - vector2_array
    except ValueError as exc:
        # numpy reports this as a broadcasting failure; say what actually went
        # wrong while keeping the same exception type.
        raise ValueError(
            "sentences must contain the same number of tokens "
            "(got {} and {}); use sent_threshold1 for sentences of "
            "different lengths".format(len(vector1_array), len(vector2_array))
        ) from exc

    # 21 is the scaling factor used by the original notebook; its derivation is
    # not documented there.
    thresh = math.sqrt(abs(np.sum(diff)) * 21)
    return thresh

#### Testing Sentence Relations

In [None]:
# Interactive demo: prompts for two sentences and shows the returned score as the cell output.
the_sent_threshold()

#### Tokenization and Filtration of Text

In [14]:
def tokenize(text, preserve_case=True, filter_text=True, stopwords=None, punc=string.punctuation):
    """Split ``text`` into word tokens, optionally dropping punctuation and stopwords.

    Parameters
    ----------
    text : str
        The text to tokenize (passed to ``nltk.word_tokenize``).
    preserve_case : bool
        If True the tokens are returned as they appear in ``text``; otherwise
        they are lower-cased.
    filter_text : bool
        If True, tokens found in ``punc`` or whose lower-cased form is in
        ``stopwords`` are removed.
    stopwords : container of str, optional
        Words to remove.  When omitted (and ``filter_text`` is True) they are
        read from ``stopwords.txt`` in the working directory, which holds
        tab-separated words on each line.
    punc : container of str
        Punctuation to remove; defaults to ``string.punctuation``.

    Returns
    -------
    list of str
        The remaining tokens, in their original order.
    """
    tokens = word_tokenize(text)

    if filter_text:
        if stopwords is None:
            # Only read the file when the caller supplied no list.  The original
            # assigned `stopwords = stops` unconditionally, which raised
            # NameError whenever `stopwords` was given or filtering was off.
            stopwords = set()
            with open("stopwords.txt", 'r') as r:
                for line in r:
                    stopwords.update(field.strip() for field in line.split('\t'))
        # A plain comprehension replaces np.delete, which fails on an empty
        # index array (i.e. whenever no token needed removing).
        tokens = [
            word for word in tokens
            if not (word in punc or word.lower() in stopwords)
        ]

    if preserve_case:
        return tokens
    # The original iterated over an undefined name here.
    return [word.lower() for word in tokens]

### Sentence Relation Calculator 2.0
An improved version of the sentence relation calculator that can accept sentences with different numbers of words.

In [15]:
def sent_threshold1(str1=None, str2=None):
    """Score how far apart two sentences are, allowing different lengths.

    Each sentence is split with this notebook's ``tokenize`` (which by default
    drops punctuation and stopwords) and every remaining token is replaced by
    its vector from the module-level ``glove`` model.  The shorter sentence's
    array is padded with copies of its own mean vector until both arrays have
    the same number of rows; the arrays are then subtracted element-wise and
    the score is ``sqrt(21 * abs(sum of all differences))``.

    Parameters
    ----------
    str1, str2 : str, optional
        The sentences to compare.  Any sentence left as ``None`` is requested
        from the user with ``input`` (the original interactive behavior), so
        calling the function with no arguments works exactly as before.

    Returns
    -------
    float
        The score described above; ``0.0`` when both sentences have no tokens
        left after filtering.

    Raises
    ------
    KeyError
        If a token is missing from the embedding vocabulary, both as typed and
        lower-cased.
    ValueError
        If exactly one of the sentences has no tokens left after filtering, so
        there is nothing to compare the other one against.
    """

    def _embed(sentence):
        # One embedding row per token of `sentence` that survives filtering.
        rows = []
        for token in tokenize(sentence):
            if token in glove:
                key = token
            elif token.lower() in glove:
                # Fallback so capitalized input still resolves; presumably this
                # GloVe vocabulary is lower-case only -- TODO confirm.
                key = token.lower()
            else:
                raise KeyError(
                    "word {!r} is not in the embedding vocabulary".format(token)
                )
            # Index the model directly; the `.wv` alias is gone in gensim 4.
            rows.append(glove[key])
        return np.array(rows)

    def _pad_with_mean(array, n_rows):
        # Extend `array` to `n_rows` rows using copies of its mean row.
        # Appending the mean leaves the mean unchanged (up to float rounding),
        # so it is computed once instead of once per appended row.
        missing = n_rows - array.shape[0]
        if missing <= 0:
            return array
        centroid = np.mean(array, axis=0)
        return np.vstack([array] + [centroid] * missing)

    # Prompt-then-embed order is kept so a bad first sentence fails before the
    # second prompt, as in the original.
    if str1 is None:
        str1 = input("Enter the first sentence: ")
    vector1_array = _embed(str1)

    if str2 is None:
        str2 = input("Enter the second sentence: ")
    vector2_array = _embed(str2)

    dim1 = vector1_array.shape[0]
    dim2 = vector2_array.shape[0]

    if dim1 == 0 or dim2 == 0:
        if dim1 == dim2:
            # Nothing on either side: the differences sum to zero.
            return 0.0
        # An empty array has no mean vector to pad with.
        raise ValueError(
            "one sentence has no tokens left after filtering, "
            "so it cannot be compared with the other"
        )

    target_rows = max(dim1, dim2)
    vector1_array = _pad_with_mean(vector1_array, target_rows)
    vector2_array = _pad_with_mean(vector2_array, target_rows)

    difference = vector1_array - vector2_array

    # 21 is the scaling factor used by the original notebook; its derivation is
    # not documented there.
    thresh = math.sqrt(abs(np.sum(difference)) * 21)
    return thresh

#### Testing Sentence Relations 2.0

In [None]:
# Interactive demo: prompts for two sentences and shows the returned score as the cell output.
sent_threshold1()