In [1]:
# Importing libraries
import numpy as np
import pandas as pd

import nltk
from nltk import word_tokenize
from nltk.corpus import stopwords

from sklearn.metrics.pairwise import cosine_similarity
from sklearn.feature_extraction.text import TfidfVectorizer

# Fetch the NLTK resources this notebook relies on.
nltk.download('stopwords')   # word lists behind stopwords.words('english')
nltk.download('punkt')       # tokenizer models used by word_tokenize on older NLTK
# Newer NLTK releases (3.9+) make word_tokenize look up 'punkt_tab' instead of
# 'punkt'; without it the tokenizing cell fails with a LookupError on a fresh kernel.
nltk.download('punkt_tab')

[nltk_data] Downloading package stopwords to /Users/druk/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to /Users/druk/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


#### Creating a corpus of sentences.

In [2]:
# Five short example sentences used as the documents to compare.
corpus = [
    "The sun is the largest celestial body in the solar system",
    "The solar system consists of the sun and eight revolving planets",
    "Ra was the Egyptian Sun God",
    "The Pyramids were the pinnacle of Egyptian architecture",
    "The quick brown fox jumps over the lazy dog",
]

#### Lowercasing, tokenizing, and removing the stopwords.

In [3]:
# Lower-case and tokenize every sentence, drop the English stopwords, and
# write the space-joined remainder back into the same `corpus` list.
stop = set(stopwords.words('english'))
corpus[:] = [
    ' '.join(token for token in word_tokenize(sentence.lower()) if token not in stop)
    for sentence in corpus
]

In [4]:
corpus

['sun largest celestial body solar system',
 'solar system consists sun eight revolving planets',
 'ra egyptian sun god',
 'pyramids pinnacle egyptian architecture',
 'quick brown fox jumps lazy dog']

#### Creating the TF-IDF matrix

In [5]:
# Build a TF-IDF vectorizer with its default settings.
tfidf_vectorizer = TfidfVectorizer()

# Learn the vocabulary from the cleaned corpus and produce one TF-IDF row
# per sentence in a single step.
tfidf_matrix = tfidf_vectorizer.fit_transform(raw_documents=corpus)

#### Print the TF-IDF matrix

In [6]:
# Lay the TF-IDF matrix out as a table: one row per sentence, one column per
# vocabulary term. `get_feature_names_out()` replaces `get_feature_names()`,
# which was deprecated in scikit-learn 1.0 and removed in 1.2.
df = pd.DataFrame(
    tfidf_matrix.toarray(),
    columns=tfidf_vectorizer.get_feature_names_out(),
)
print(df)

   architecture      body     brown  celestial  consists       dog  egyptian  \
0      0.000000  0.458815  0.000000   0.458815  0.000000  0.000000  0.000000   
1      0.000000  0.000000  0.000000   0.000000  0.417016  0.000000  0.000000   
2      0.000000  0.000000  0.000000   0.000000  0.000000  0.000000  0.458270   
3      0.523358  0.000000  0.000000   0.000000  0.000000  0.000000  0.422242   
4      0.000000  0.000000  0.408248   0.000000  0.000000  0.408248  0.000000   

      eight       fox       god  ...      lazy  pinnacle   planets  pyramids  \
0  0.000000  0.000000  0.000000  ...  0.000000  0.000000  0.000000  0.000000   
1  0.417016  0.000000  0.000000  ...  0.000000  0.000000  0.417016  0.000000   
2  0.000000  0.000000  0.568014  ...  0.000000  0.000000  0.000000  0.000000   
3  0.000000  0.000000  0.000000  ...  0.000000  0.523358  0.000000  0.523358   
4  0.000000  0.408248  0.000000  ...  0.408248  0.000000  0.000000  0.000000   

      quick        ra  revolving     s



#### Compute the Cosine Similarity scores.

In [7]:
# Pairwise cosine similarity between every pair of sentences; with a single
# argument the matrix is compared against itself.
cosine_sim = cosine_similarity(tfidf_matrix)
print(cosine_sim)

[[1.         0.33489933 0.11688861 0.         0.        ]
 [0.33489933 1.         0.10623994 0.         0.        ]
 [0.11688861 0.10623994 1.         0.19350098 0.        ]
 [0.         0.         0.19350098 1.         0.        ]
 [0.         0.         0.         0.         1.        ]]
