In [3]:
# import the required packages
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity, euclidean_distances, manhattan_distances

# Function that tests different similarity metric algorithms
def test_algorithms(test_text):
    """Print TF-IDF based scores of ``test_text`` against three fixed sentences.

    The three built-in reference sentences and ``test_text`` are vectorized
    together with a TF-IDF vectorizer (English stop words removed). Three
    pairwise matrices are then computed: cosine similarity, Euclidean
    distance and Manhattan distance.

    For each metric, a list of ``(reference_index, score)`` tuples is printed,
    one tuple per reference sentence, in the order the sentences are defined.
    The scores are NOT ranked. Each list is followed by ``print('\\n')``.

    Args:
        test_text: The query text to compare against the reference sentences.

    Returns:
        None. Results are written to standard output only.
    """
    sentence_1 = "I love telling comic stories with my family"
    sentence_2 = "I dislike watching and playing soccer"
    sentence_3 = "When I was much younger, I used to read novels with my siblings"
    references = [sentence_1, sentence_2, sentence_3]

    # The query is always appended after the references, so its row index is
    # known up front. Looking the row up by text value instead breaks when the
    # query is identical to a reference sentence: the label is then duplicated
    # and the lookup yields several rows rather than one.
    texts = references + [test_text]
    idx = len(texts) - 1

    # Define a tfidf vectorizer that removes English stop words, then fit it
    # on all four texts so they share one vocabulary.
    tfidf = TfidfVectorizer(stop_words="english")
    tfidf_matrix = tfidf.fit_transform(texts)

    # Pairwise matrices; row ``idx`` holds the query's score against each text.
    cosine_sim = cosine_similarity(tfidf_matrix, tfidf_matrix)
    ed_sim = euclidean_distances(tfidf_matrix, tfidf_matrix)
    md_sim = manhattan_distances(tfidf_matrix, tfidf_matrix)

    # Same report for every metric: keep only the reference columns, which
    # drops the trailing comparison of the query with itself.
    for pairwise in (cosine_sim, ed_sim, md_sim):
        scores = list(enumerate(pairwise[idx]))[:len(references)]
        print(scores)
        print('\n')

# Example run: score a one-word query against the built-in sentences.
test_text = 'read'
test_algorithms(test_text=test_text)

[(0, 0.0), (1, 0.0), (2, 0.3667390112974172)]


[(0, 1.4142135623730951), (1, 1.4142135623730951), (2, 1.1253985860152684)]


[(0, 3.23606797749979), (1, 3.0), (2, 2.49390872279154)]


