In [2]:
# !pip install scikit-learn
import sklearn
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer,TfidfVectorizer

In [3]:
def create_table(lst,vect):
  matrix = vect.fit_transform(lst)
  return pd.DataFrame(matrix.toarray(),columns=vect.get_feature_names_out())

In [4]:
test = ["he likes to play football so so much","he also eats alot"]

cv = CountVectorizer()
create_table(test,cv)

Unnamed: 0,alot,also,eats,football,he,likes,much,play,so,to
0,0,0,0,1,1,1,1,1,2,1
1,1,1,1,0,1,0,0,0,0,0


In [5]:
test = ["he likes to play football so so much","he also eats alot"]

tfv = TfidfVectorizer()
create_table(test,tfv)

# TF-IDF (Term Frequency-Inverse Document Frequency):

# Term Frequency: TF-IDF considers not only the frequency of words in a document but also how important they are in the context of the entire corpus. It calculates the term frequency within a document (TF).
# Inverse Document Frequency: It also considers the inverse document frequency (IDF), which measures the rarity of a word across the entire corpus. Rare words are weighted more heavily.
# Weighted Representation: TF-IDF represents text data as a matrix where each row is a document, and each column is a unique word, but the values are weighted based on the TF-IDF score.
# Importance of Words: TF-IDF gives higher scores to words that are important in a specific document but not too common across all documents.

# tf - idf is better than countvectrizor but tf-idf Not the best


Unnamed: 0,alot,also,eats,football,he,likes,much,play,so,to
0,0.0,0.0,0.0,0.324336,0.230768,0.324336,0.324336,0.324336,0.648673,0.324336
1,0.534046,0.534046,0.534046,0.0,0.379978,0.0,0.0,0.0,0.0,0.0


**Word2vec**

In [6]:
# !pip install gensim
import gensim
from gensim.models import Word2Vec
import gensim.downloader as api

In [7]:
model = api.load('word2vec-google-news-300')



In [8]:
w  = model["hello"]
print("length of vector:",len(w))
print(w)

length of vector: 300
[-0.05419922  0.01708984 -0.00527954  0.33203125 -0.25       -0.01397705
 -0.15039062 -0.265625    0.01647949  0.3828125  -0.03295898 -0.09716797
 -0.16308594 -0.04443359  0.00946045  0.18457031  0.03637695  0.16601562
  0.36328125 -0.25585938  0.375       0.171875    0.21386719 -0.19921875
  0.13085938 -0.07275391 -0.02819824  0.11621094  0.15332031  0.09082031
  0.06787109 -0.0300293  -0.16894531 -0.20800781 -0.03710938 -0.22753906
  0.26367188  0.012146    0.18359375  0.31054688 -0.10791016 -0.19140625
  0.21582031  0.13183594 -0.03515625  0.18554688 -0.30859375  0.04785156
 -0.10986328  0.14355469 -0.43554688 -0.0378418   0.10839844  0.140625
 -0.10595703  0.26171875 -0.17089844  0.39453125  0.12597656 -0.27734375
 -0.28125     0.14746094 -0.20996094  0.02355957  0.18457031  0.00445557
 -0.27929688 -0.03637695 -0.29296875  0.19628906  0.20703125  0.2890625
 -0.20507812  0.06787109 -0.43164062 -0.10986328 -0.2578125  -0.02331543
  0.11328125  0.23144531 -0.0441

In [9]:
import numpy as np
w1= model['king']
w2 = model["queen"]
print(np.linalg.norm(w1-w2))

2.4796925


In [11]:
model.most_similar(positive=["women","king"],negative=["man"])

[('queen', 0.4827326238155365),
 ('queens', 0.466781347990036),
 ('kumaris', 0.4653734564781189),
 ('kings', 0.4558638632297516),
 ('womens', 0.422832190990448),
 ('princes', 0.4176960587501526),
 ('Al_Anqari', 0.41725507378578186),
 ('concubines', 0.4011078476905823),
 ('monarch', 0.3962482810020447),
 ('monarchy', 0.39430150389671326)]

In [12]:
model.doesnt_match("house garage store sea".split())

'sea'

In [14]:
model.similarity("iphone","android")

0.5633577

In [15]:
model.similarity("iphone","ipad")

0.7574499

In [16]:
model.most_similar("cat")

[('cats', 0.8099379539489746),
 ('dog', 0.760945737361908),
 ('kitten', 0.7464985251426697),
 ('feline', 0.7326234579086304),
 ('beagle', 0.7150582671165466),
 ('puppy', 0.7075453400611877),
 ('pup', 0.6934291124343872),
 ('pet', 0.6891531348228455),
 ('felines', 0.6755931973457336),
 ('chihuahua', 0.6709762215614319)]