In [1]:
import nltk, string
from sklearn.feature_extraction.text import TfidfVectorizer

# Tokenizer data for nltk.word_tokenize. Older NLTK releases read the 'punkt'
# package, while recent releases (3.9+) read 'punkt_tab' instead, so fetch both
# to keep the notebook runnable on either. Already-present packages are skipped.
nltk.download('punkt') # if necessary...
nltk.download('punkt_tab') # if necessary...


# Single Porter stemmer instance reused by stem_tokens().
stemmer = nltk.stem.porter.PorterStemmer()
# Translation table for str.translate(): each ASCII punctuation code point is
# mapped to None, which deletes that character from the translated string.
remove_punctuation_map = {ord(symbol): None for symbol in string.punctuation}

def stem_tokens(tokens):
    """Return a list holding the stem of every token in ``tokens``.

    Stemming is delegated to the module-level ``stemmer``; order is preserved.
    """
    return list(map(stemmer.stem, tokens))

def normalize(text):
    '''Remove punctuation, lowercase, tokenize and stem ``text``.

    The text is lowercased, stripped of every character listed in
    ``remove_punctuation_map``, split with ``nltk.word_tokenize`` and each
    resulting token is passed through ``stem_tokens``.

    Returns the list of stemmed tokens.
    '''
    cleaned = text.lower().translate(remove_punctuation_map)
    return stem_tokens(nltk.word_tokenize(cleaned))

# TF-IDF vectorizer that tokenizes with normalize() and drops English stop words.
# token_pattern=None: the custom tokenizer replaces regex tokenization, and
# leaving token_pattern at its default makes scikit-learn warn that it is unused.
vectorizer = TfidfVectorizer(tokenizer=normalize, stop_words='english', token_pattern=None)

def cosine_sim(text1, text2):
    """Return the TF-IDF cosine similarity between two texts.

    The module-level ``vectorizer`` is re-fitted on just these two texts, so
    the IDF weights reflect only this pair.

    Parameters
    ----------
    text1, text2 : str
        The documents to compare.

    Returns
    -------
    float
        Entry [0, 1] of the 2x2 matrix ``tfidf @ tfidf.T``. With the
        vectorizer's default L2 row normalisation this is the cosine
        similarity of the two documents.
    """
    tfidf = vectorizer.fit_transform([text1, text2])
    # Use `@` and .toarray() rather than `*` and `.A`: the `.A` shortcut has been
    # removed from SciPy sparse matrices, and `*` is only a matrix product for
    # the legacy spmatrix type.
    return (tfidf @ tfidf.T).toarray()[0, 1]


# Compare the reference sentence with an identical, an extended and an
# unrelated sentence, printing one similarity score per line.
for other_text in ('a little bird', 'a little bird chirps', 'a big dog barks'):
    print(cosine_sim('a little bird', other_text))

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\adira\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


0.9999999999999998
0.7092972666062738
0.0




In [2]:
# Toy corpus: three short documents used for the bag-of-words example.
corpus = [
    'data science is one of the most important fields of science',
    'this is one of the best data science courses',
    'data science courses are in demand',
]

In [3]:
from sklearn.feature_extraction.text import CountVectorizer

# Bag-of-words representation of the corpus: fit a CountVectorizer on the
# documents and densify the resulting sparse count matrix
# (one row per document, one column per vocabulary term).
X = (
    CountVectorizer()
    .fit_transform(corpus)
    .toarray()
)

print(X)

[[0 0 0 1 0 1 1 0 1 1 2 1 2 1 0]
 [0 1 1 1 0 0 0 0 1 0 1 1 1 1 1]
 [1 0 1 1 1 0 0 1 0 0 0 0 1 0 0]]


In [4]:
import numpy as np

def cosine_similarity(x, y):
    """Compute the cosine similarity between two equal-length vectors.

    Parameters
    ----------
    x, y : array-like of numbers
        One-dimensional vectors (NumPy arrays, lists or tuples).

    Returns
    -------
    float or None
        ``dot(x, y) / (||x|| * ||y||)``.
        ``0.0`` if either vector has zero magnitude; the cosine is undefined
        there, and returning 0.0 avoids a nan result and a division warning.
        ``None`` if the two inputs have different lengths.
    """
    # Ensure length of x and y are the same
    if len(x) != len(y):
        return None

    # Work in floating point: accepts plain Python sequences and prevents
    # wrap-around when the inputs use a narrow integer dtype.
    x = np.asarray(x, dtype=float)
    y = np.asarray(y, dtype=float)

    # Compute the dot product between x and y
    dot_product = np.dot(x, y)

    # Compute the L2 norms (magnitudes) of x and y
    magnitude_x = np.sqrt(np.sum(x**2))
    magnitude_y = np.sqrt(np.sum(y**2))

    # A zero vector has no direction; report "no similarity" instead of nan.
    denominator = magnitude_x * magnitude_y
    if denominator == 0:
        return 0.0

    # Compute the cosine similarity
    return dot_product / denominator

In [5]:
# Pairwise cosine similarities between the three rows (documents) of X,
# computed for the row-index pairs (0, 1), (0, 2) and (1, 2).
cos_sim_1_2, cos_sim_1_3, cos_sim_2_3 = (
    cosine_similarity(X[first], X[second])
    for first, second in ((0, 1), (0, 2), (1, 2))
)

# Report the scores using 1-based document numbers.
print('Cosine Similarity between: ')
print('\tDocument 1 and Document 2: ', cos_sim_1_2)
print('\tDocument 1 and Document 3: ', cos_sim_1_3)
print('\tDocument 2 and Document 3: ', cos_sim_2_3)

Cosine Similarity between: 
	Document 1 and Document 2:  0.6885303726590962
	Document 1 and Document 3:  0.31622776601683794
	Document 2 and Document 3:  0.4082482904638631
