In [1]:
import pandas as pd
import numpy as np

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

import spacy
import re

In [2]:
#!python -m spacy download en_core_web_md

In [80]:
sentences = ["President Obama visited Beijing last week", "The first African-American US President arrived at Peking a few days ago",
             "Hamilton beats Button and wins the game", "Button beats Hamilton and wins the game", "Some women in white perform on a stage.",
             "Women who look alike are performing on a stage."]

In [None]:
ex_sentences = ["President Obama visited Beijing last week", "The first African-American US President arrived at Peking a few days ago",
             "Hamilton beats Button and wins the game", "Button beats Hamilton and wins the game", "Some women in white perform on a stage.",
             "Women who look alike are performing on a stage."]

In [81]:
def Jaccard(text1, text2, prints):
  intersection = set(text1.split()) & set(text2.split())
  union = len(set(text1.split()) | set(text2.split()))
  if prints:
    print(set(text1.split()))
    print(set(text2.split()))
    print("overlap: ", intersection)
    print("Total words: ", union)
  return(len(intersection)/union)

In [82]:
Jaccard(sentences[0], sentences[1], True)

{'Beijing', 'visited', 'Obama', 'last', 'President', 'week'}
{'first', 'US', 'days', 'African-American', 'Peking', 'at', 'few', 'a', 'arrived', 'ago', 'President', 'The'}
overlap:  {'President'}
Total words:  17


0.058823529411764705

In [83]:
Jaccard(sentences[2], sentences[3], True)

{'Hamilton', 'and', 'beats', 'Button', 'game', 'wins', 'the'}
{'Hamilton', 'and', 'beats', 'Button', 'game', 'wins', 'the'}
overlap:  {'Hamilton', 'and', 'beats', 'Button', 'game', 'wins', 'the'}
Total words:  7


1.0

In [84]:
Jaccard(sentences[4], sentences[5], True)

{'on', 'perform', 'a', 'stage.', 'white', 'women', 'Some', 'in'}
{'on', 'look', 'are', 'alike', 'Women', 'a', 'stage.', 'performing', 'who'}
overlap:  {'on', 'stage.', 'a'}
Total words:  14


0.21428571428571427

In [85]:
sentences = [sentence.lower() for sentence in sentences]

In [86]:
sentences = [re.sub(r'[^\w\s]',' ',s) for s in sentences]

In [87]:
sentences

['president obama visited beijing last week',
 'the first african american us president arrived at peking a few days ago',
 'hamilton beats button and wins the game',
 'button beats hamilton and wins the game',
 'some women in white perform on a stage ',
 'women who look alike are performing on a stage ']

In [88]:
# Load the spaCy model
nlp = spacy.load("en_core_web_md")
for i in range(len(sentences)):
  print(f"Original sentence: {sentences[i]}")

  # Lemmatize a sentence
  doc = nlp(sentences[i])
  sentences[i] = " ".join([word.lemma_ for word in doc])

  print(f"Lemmatized sentence: {sentences[i]}")
  print("")

Original sentence: president obama visited beijing last week
Lemmatized sentence: president obama visit beijing last week

Original sentence: the first african american us president arrived at peking a few days ago
Lemmatized sentence: the first african american us president arrive at peke a few day ago

Original sentence: hamilton beats button and wins the game
Lemmatized sentence: hamilton beats button and win the game

Original sentence: button beats hamilton and wins the game
Lemmatized sentence: button beat hamilton and win the game

Original sentence: some women in white perform on a stage 
Lemmatized sentence: some woman in white perform on a stage

Original sentence: women who look alike are performing on a stage 
Lemmatized sentence: woman who look alike be perform on a stage



In [77]:
Jaccard(sentences[4], sentences[5], True)

{'on', 'some', 'perform', 'a', 'white', 'women', 'stage', 'in'}
{'on', 'look', 'are', 'alike', 'a', 'stage', 'performing', 'women', 'who'}
overlap:  {'on', 'stage', 'a', 'women'}
Total words:  13


0.3076923076923077

In [63]:
import nltk
from nltk.corpus import stopwords

nltk.download('stopwords')
print(stopwords.words('english'))

['i', 'me', 'my', 'myself', 'we', 'our', 'ours', 'ourselves', 'you', "you're", "you've", "you'll", "you'd", 'your', 'yours', 'yourself', 'yourselves', 'he', 'him', 'his', 'himself', 'she', "she's", 'her', 'hers', 'herself', 'it', "it's", 'its', 'itself', 'they', 'them', 'their', 'theirs', 'themselves', 'what', 'which', 'who', 'whom', 'this', 'that', "that'll", 'these', 'those', 'am', 'is', 'are', 'was', 'were', 'be', 'been', 'being', 'have', 'has', 'had', 'having', 'do', 'does', 'did', 'doing', 'a', 'an', 'the', 'and', 'but', 'if', 'or', 'because', 'as', 'until', 'while', 'of', 'at', 'by', 'for', 'with', 'about', 'against', 'between', 'into', 'through', 'during', 'before', 'after', 'above', 'below', 'to', 'from', 'up', 'down', 'in', 'out', 'on', 'off', 'over', 'under', 'again', 'further', 'then', 'once', 'here', 'there', 'when', 'where', 'why', 'how', 'all', 'any', 'both', 'each', 'few', 'more', 'most', 'other', 'some', 'such', 'no', 'nor', 'not', 'only', 'own', 'same', 'so', 'than', '

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [65]:
from gensim.parsing.preprocessing import remove_stopwords
for sentence in sentences:
  print(sentence)
  print(remove_stopwords(sentence))

president obama visit beijing last week
president obama visit beijing week
the first african american us president arrive at peke a few day ago
african american president arrive peke day ago
hamilton beats button and win the game
hamilton beats button win game
button beat hamilton and win the game
button beat hamilton win game
some woman in white perform on a stage
woman white perform stage
woman who look alike be perform on a stage
woman look alike perform stage


In [26]:
vectorizer = TfidfVectorizer()
tfidf_matrix = vectorizer.fit_transform(sentences)

In [27]:
tfidf_matrix.todense()

matrix([[0.        , 0.        , 0.        , 0.        , 0.        ,
         0.        , 0.        , 0.        , 0.        , 0.        ,
         0.4198708 , 0.        , 0.        , 0.        , 0.        ,
         0.        , 0.        , 0.        , 0.4198708 , 0.        ,
         0.4198708 , 0.        , 0.        , 0.        , 0.34430007,
         0.        , 0.        , 0.        , 0.        , 0.4198708 ,
         0.4198708 , 0.        , 0.        , 0.        , 0.        ],
        [0.29945328, 0.29945328, 0.        , 0.29945328, 0.        ,
         0.29945328, 0.29945328, 0.        , 0.        , 0.        ,
         0.        , 0.        , 0.29945328, 0.29945328, 0.29945328,
         0.        , 0.        , 0.        , 0.        , 0.        ,
         0.        , 0.        , 0.29945328, 0.        , 0.24555597,
         0.        , 0.        , 0.20731522, 0.29945328, 0.        ,
         0.        , 0.        , 0.        , 0.        , 0.        ],
        [0.        , 0.        ,

# New section

In [None]:
cosine_similarity(tfidf_matrix[0], tfidf_matrix[1])

array([[0.07894199]])

In [None]:
cosine_similarity(tfidf_matrix[2], tfidf_matrix[3])

array([[1.]])

In [None]:
# Load the model with word embeddings
nlp = spacy.load('en_core_web_md')

In [None]:
nlp(sentences[0]).similarity(nlp(sentences[1]))

0.6559949871590907

In [None]:
nlp(sentences[2]).similarity(nlp(sentences[3]))

0.9999999729939644