In [3]:
# Toy corpus: three short sentences that the rest of the notebook compares.
str1, str2, str3 = (
    "Karachi is one of the most congested city in the world.",
    "Karachi is the financial capital of Pakistan.",
    "Islamabad is the capital of Pakistan.",
)

In [4]:
# Collect the three sentences into one list so they can be processed together.
strlist = [str1,str2,str3]
print(strlist)

['Karachi is one of the most congested city in the world.', 'Karachi is the financial capital of Pakistan.', 'Islamabad is the capital of Pakistan.']


In [5]:
# Hand-picked stop words (a small custom set rather than a library-provided list).
stopword = {"of", "the", "a", "is", "in"}
# Show the set; element order is arbitrary because sets are unordered.
print(stopword)

{'the', 'a', 'in', 'is', 'of'}


In [None]:
# Tokenise the first sentence by splitting on whitespace, word by word
# (punctuation stays attached to the word it follows).
str1.split()

In [8]:
# Strip the stop words from every sentence and collect the cleaned sentences in `s`.
s = []
# The loop variable is called `sentence` rather than `str`, so the built-in `str`
# type is not shadowed for the rest of the kernel session.
for sentence in strlist:
    # Whitespace-tokenise and keep only tokens that are not in the stop-word set.
    # The membership test is exact and case-sensitive ("The" or "the." would be kept).
    s_list = [word for word in sentence.split() if word not in stopword]
    print(s_list)
    # Rebuild a single-space-separated sentence from the surviving tokens.
    str_ = ' '.join(s_list)
    s.append(str_)
print(s)

['Karachi', 'one', 'most', 'congested', 'city', 'world.']
['Karachi', 'financial', 'capital', 'Pakistan.']
['Islamabad', 'capital', 'Pakistan.']
['Karachi one most congested city world.', 'Karachi financial capital Pakistan.', 'Islamabad capital Pakistan.']


In [9]:
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.feature_extraction.text import CountVectorizer

In [7]:
# Learn a unigram vocabulary from `s`, the sentences with stop words removed.
# CountVectorizer encodes each document as raw term counts.
cnt_vectorizer = CountVectorizer().fit(s)
# Mapping of term -> column index: ten distinct terms, indexed 0-9.
print(cnt_vectorizer.vocabulary_)

{'karachi': 5, 'one': 7, 'most': 6, 'congested': 2, 'city': 1, 'world': 9, 'financial': 3, 'capital': 0, 'pakistan': 8, 'islamabad': 4}


In [8]:
# Vocabulary terms listed in column order.
# `get_feature_names()` was deprecated in scikit-learn 1.0 and removed in 1.2;
# `get_feature_names_out()` is its replacement (requires scikit-learn >= 1.0).
# It returns a NumPy array, so convert to a plain list to keep the printed form.
print(cnt_vectorizer.get_feature_names_out().tolist())

['capital', 'city', 'congested', 'financial', 'islamabad', 'karachi', 'most', 'one', 'pakistan', 'world']


In [9]:
# Encode the three training sentences as a dense document-term count matrix.
vec1 = (
    cnt_vectorizer
    .transform(s)
    .toarray()
)
# Shape is (number of sentences, number of vocabulary terms).
print(vec1.shape)

(3, 10)


In [10]:
# Show the count matrix: one row per sentence, one column per vocabulary term.
# Column 0 is 'capital' in the vocabulary printed earlier; it is 0 in the first
# row because sentence 1 does not contain that word.
print(vec1)

[[0 1 1 0 0 1 1 1 0 1]
 [1 0 0 1 0 1 0 0 1 0]
 [1 0 0 0 1 0 0 0 1 0]]


In [11]:
# Pairwise cosine similarity between the rows of vec1 (sentence-by-sentence matrix).
cs1 = cosine_similarity(vec1)
# The largest off-diagonal value is about 0.58, between sentences 2 and 3.
print(cs1)

[[1.         0.20412415 0.        ]
 [0.20412415 1.         0.57735027]
 [0.         0.57735027 1.        ]]


In [12]:
import math

# Convert the cosine similarity between sentences 1 and 2 into an angle in degrees.
# The value is read from cs1 instead of being pasted (rounded) from printed output,
# so the angle stays correct if the corpus or the vectoriser changes.
cos_1_2 = float(cs1[0, 1])
# Floating-point round-off can push a cosine marginally outside [-1, 1], which
# would make math.acos raise ValueError; clamp before taking the arc cosine.
cos_1_2 = max(-1.0, min(1.0, cos_1_2))
angle_in_radians = math.acos(cos_1_2)
print(math.degrees(angle_in_radians))

78.22176756638345


In [13]:
# Unseen sentence: the goal is to find which training sentence it resembles most.
new_str = "Lahore is the second largest city of Pakistan and an important financial center."

# Drop stop words with the same whitespace split and exact-match test used for
# the training sentences, then glue the remaining tokens back together.
n_list = list(filter(lambda word: word not in stopword, new_str.split()))
new_str = ' '.join(n_list)

# The sentence after stop-word removal.
print(new_str)

Lahore second largest city Pakistan and an important financial center.


In [14]:
# Vectorise the new sentence with the already-fitted vocabulary. Only transform()
# is called (no refit), so words outside that vocabulary get no column.
response = cnt_vectorizer.transform([new_str])
# The sparse output lists the non-zero entries: zero-based columns 1, 3 and 8
# ('city', 'financial', 'pakistan' in the vocabulary printed earlier), each with count 1.
print(response)

  (0, 1)	1
  (0, 3)	1
  (0, 8)	1


In [15]:
# Dense form of the new sentence's vector; only terms from the fitted vocabulary
# have a column, so similarity is computed on those terms alone.
response.toarray()

array([[0, 1, 0, 1, 0, 0, 0, 0, 1, 0]], dtype=int64)

In [16]:
# Cosine similarity of the new sentence against each of the three training sentences.
answer = cosine_similarity(response,vec1)
# The second value is the largest, so the new sentence is closest to sentence 2.
print(answer)

[[0.23570226 0.57735027 0.33333333]]


In [17]:
import numpy as np

# `answer` holds one row (a single query sentence); take it as a 1-D array of scores.
ans = answer[0]
# Position of the best-scoring training sentence.
b = np.argmax(ans)

# Highest similarity score.
print("the maximum is ", max(ans))
# Index and original text of the closest training sentence.
print(b, strlist[b])

the maximum is  0.5773502691896258
1 Karachi is the financial capital of Pakistan.


In [10]:
# Unigrams plus bigrams (pairs of consecutive words), fitted on the sentences
# with stop words removed.
bigram_vectorizer = CountVectorizer(ngram_range=(1, 2))
bigram_vectorizer.fit(s)
# Feature names in column order (sorted).
# `get_feature_names()` was deprecated in scikit-learn 1.0 and removed in 1.2;
# `get_feature_names_out()` is its replacement (requires scikit-learn >= 1.0).
# It returns a NumPy array, so convert to a plain list to keep the printed form.
print(bigram_vectorizer.get_feature_names_out().tolist())

['capital', 'capital pakistan', 'city', 'city world', 'congested', 'congested city', 'financial', 'financial capital', 'islamabad', 'islamabad capital', 'karachi', 'karachi financial', 'karachi one', 'most', 'most congested', 'one', 'one most', 'pakistan', 'world']


In [11]:
# Dense document-term count matrix over the unigram + bigram vocabulary.
vec2 = (
    bigram_vectorizer
    .transform(s)
    .toarray()
)
# Shape is (number of sentences, number of n-gram features).
print(vec2.shape)

(3, 19)


In [12]:
# Pairwise cosine similarity between the sentences using unigram + bigram counts.
cs2 = cosine_similarity(vec2)
print(cs2)

[[1.         0.11396058 0.        ]
 [0.11396058 1.         0.50709255]
 [0.         0.50709255 1.        ]]


In [15]:
# max_features: if not None, build a vocabulary that only considers the top
# max_features terms ordered by term frequency across the corpus.
# Here the unigram + bigram feature set is capped at 8 entries.
bigram_vectorizer2 = CountVectorizer(ngram_range=(1, 2), max_features=8)
bigram_vectorizer2.fit(s)
# `get_feature_names()` was deprecated in scikit-learn 1.0 and removed in 1.2;
# `get_feature_names_out()` is its replacement (requires scikit-learn >= 1.0).
# It returns a NumPy array, so convert to a plain list to keep the printed form.
print(bigram_vectorizer2.get_feature_names_out().tolist())

['capital', 'capital pakistan', 'karachi', 'most', 'most congested', 'one', 'one most', 'pakistan']


In [21]:
# Dense document-term count matrix restricted to the capped feature set.
vec3 = (
    bigram_vectorizer2
    .transform(s)
    .toarray()
)
# max_features=8 -> 8 n-gram features -> 8 columns (dimensions).
print(vec3.shape)

(3, 8)


In [23]:
# Pairwise cosine similarity between the sentences using only the 8 retained features.
cs3 = cosine_similarity(vec3)
print(cs3)

[[1.        0.2236068 0.       ]
 [0.2236068 1.        0.8660254]
 [0.        0.8660254 1.       ]]
