In [183]:
import numpy as np
import pandas as pd
import math

from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import SnowballStemmer

from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer
from sklearn.metrics.pairwise import cosine_similarity

In [5]:
stop_words = list(set(stopwords.words('english')))
sno = SnowballStemmer('english')

## Training Data

In [3]:
train_df = pd.read_csv('train.csv')
train_df.drop(['id'],1,inplace=True)
train_df = train_df.dropna()

In [123]:
train_question1 = [word_tokenize(i.lower().decode('utf8')) for i in train_df['question1'][0:10000]]
train_question2 = [word_tokenize(i.lower().decode('utf8')) for i in train_df['question2'][0:10000]]

In [124]:
train_q1 = [' '.join(w) for w in train_question1 if w not in stop_words]
train_q2 = [' '.join(w) for w in train_question2 if w not in stop_words]

In [125]:
train_set = zip(train_q1,train_q2)

## Testing Data

In [52]:
test_df = pd.read_csv('test.csv')
test_df.drop(['test_id'],1,inplace=True)
test_df = test_df.dropna()

In [126]:
test_question1 = [word_tokenize(i.lower().decode('utf8')) for i in test_df['question1'][0:10000]]
test_question2 = [word_tokenize(i.lower().decode('utf8')) for i in test_df['question2'][0:10000]]

In [127]:
test_q1 = [' '.join(w) for w in test_question1 if w not in stop_words]
test_q2 = [' '.join(w) for w in test_question2 if w not in stop_words]

In [128]:
test_set = zip(test_q1,test_q2)

## Transformation

In [130]:
quora = zip(train_set,test_set)

In [129]:
count_vectorizer = CountVectorizer()

In [141]:
matrix_transfrom = []
tf_idf_transform = []
for i in quora:
    train,test = i[0],i[1]
    count_vectorizer.fit_transform(train)
    freq_term_matrix = count_vectorizer.transform(test)
    matrix_transfrom.append(freq_term_matrix.todense())
    tfidf = TfidfTransformer(norm='l2')
    tfidf.fit(freq_term_matrix)
    #print 'IDF: ',tfidf.idf_
    tf_idf_matrix = tfidf.transform(freq_term_matrix)
    tf_idf_transform.append(tf_idf_matrix.todense())

In [180]:
for i in tf_idf_transform[:5]:
    #print(i)
    print(cosine_similarity(i))

[[ 1.  0.]
 [ 0.  0.]]
[[ 1.  0.]
 [ 0.  0.]]
[[ 1.  0.]
 [ 0.  0.]]
[[ 0.  0.]
 [ 0.  0.]]
[[ 0.  0.]
 [ 0.  0.]]


In [184]:
import math
def cosine_similarity_vectors(v1,v2):
    "compute cosine similarity of v1 to v2: (v1 dot v2)/{||v1||*||v2||)"
    sumxx, sumxy, sumyy = 0, 0, 0
    for i in range(len(v1)):
        x = v1[i]; y = v2[i]
        sumxx += x*x
        sumyy += y*y
        sumxy += x*y
    return sumxy/math.sqrt(sumxx*sumyy)

v1,v2 = [3, 45, 7, 2], [2, 54, 13, 15]
print(v1, v2, cosine_similarity_vectors(v1,v2))

([3, 45, 7, 2], [2, 54, 13, 15], 0.97228425171235)


In [186]:
try:
    for i in tf_idf_transform:
        s = i.tolist()
        print(cosine_similarity_vectors(s[0],s[1]))
except Exception as e:
    print('ZeroDivisionError')

ZeroDivisionError
