# Exemplo de Word2Vec com modelo pré-treinado
* Exemplo adaptado de https://colab.research.google.com/drive/1zuq1I_FudtB2W4OSOWff8ODqfqK8d9-G



# Download do modelo pré-treinado (~1.5 GB)

In [None]:
import numpy as np

In [None]:
#!wget -P /root/input/ -c "https://s3.amazonaws.com/dl4j-distribution/GoogleNews-vectors-negative300.bin.gz"
# https://drive.google.com/file/d/1YW1srqUZI9OhX34RMFGsq7jEcsZ1v31p/view?usp=sharing
!wget https://zenodo.org/api/files/ce27e83b-fa32-42a7-83bd-60f34ea1e318/GoogleNews-vectors-negative300.bin.gz


2024-09-13 13:59:35 (31.4 MB/s) - ‘GoogleNews-vectors-negative300.bin.gz’ saved [1647046227/1647046227]



In [None]:
!sleep 6000

^C


# Instalando a biblioteca "gensim" para manipular word vectors

In [None]:
!pip install gensim
from gensim.models import KeyedVectors



# Carregando o modelo pré-treinado

In [None]:
# Archive fetched by the download cell earlier in this notebook.
EMBEDDING_FILE = 'GoogleNews-vectors-negative300.bin.gz'

# binary=True: the archive is read as the binary word2vec format, not the text one.
word2vec = KeyedVectors.load_word2vec_format(
    fname=EMBEDDING_FILE,
    binary=True,
)

In [None]:
# Look up the embedding vector stored for the token "virus".
word2vec["virus"]

array([ 1.88476562e-01, -1.57470703e-02, -7.17163086e-03,  3.71093750e-02,
       -3.06640625e-01,  2.53906250e-01, -3.10546875e-01,  5.54199219e-02,
        2.57812500e-01, -3.27148438e-02,  1.22070312e-01, -5.58593750e-01,
        1.94335938e-01, -1.65039062e-01, -2.06054688e-01,  2.48046875e-01,
       -6.34765625e-02,  3.47656250e-01,  1.74804688e-01, -6.93359375e-02,
        6.29882812e-02,  1.17187500e-01,  1.87500000e-01, -1.19140625e-01,
        3.27148438e-02, -1.95312500e-01,  4.71191406e-02, -3.08593750e-01,
        3.59375000e-01,  6.54296875e-02, -1.39648438e-01, -3.59375000e-01,
        4.35546875e-01, -9.42382812e-02, -2.82287598e-03, -8.10546875e-02,
       -4.12109375e-01,  1.31835938e-01, -5.10253906e-02,  5.89843750e-01,
        7.47070312e-02, -1.47460938e-01, -1.89453125e-01,  2.28515625e-01,
        6.15234375e-02, -6.29882812e-02,  3.67187500e-01,  5.00488281e-03,
       -3.22265625e-01,  1.31835938e-01,  7.91015625e-02, -4.00390625e-02,
       -6.59179688e-02,  

In [None]:
# Check the dimensionality of a single word vector.
word2vec["smartphone"].shape

(300,)

In [None]:
# "covid" is not in the pretrained vocabulary, so the lookup raises KeyError.
# Catch it and print the message so the demonstration does not abort "Run All".
try:
  word2vec["covid"]
except KeyError as err:
  print(f"KeyError: {err}")

KeyError: "Key 'covid' not present"

# Vamos usar a similaridade de cosseno para calcular a proximidade entre palavras

Lembre-se de que $\cos(a,b) = \frac{a \cdot b}{\lVert a \rVert \, \lVert b \rVert}$

In [None]:
import numpy as np

def cos(x1, x2):
  """Return the cosine similarity between two vectors.

  Computes ``dot(x1, x2) / (||x1|| * ||x2||)``.

  Args:
    x1: First 1-D array-like vector.
    x2: Second 1-D array-like vector, with the same length as ``x1``.

  Returns:
    The cosine similarity as a scalar, or ``0.0`` when either vector has
    zero norm (the ratio is undefined there, and dividing would produce
    ``nan`` together with a runtime warning).
  """
  denominator = np.linalg.norm(x1) * np.linalg.norm(x2)
  if denominator == 0:
    return 0.0
  return np.dot(x1, x2) / denominator

# Testando similaridades...

In [None]:
# Singular vs. plural form of the same word.
cos(word2vec["smartphone"], word2vec["smartphones"])

0.8648069

In [None]:
# Compound word vs. its first component.
cos(word2vec["smartphone"], word2vec["smart"])

0.2744388

In [None]:
# Compound word vs. its second component.
cos(word2vec["smartphone"], word2vec["phone"])

0.4584843

In [None]:
# "smartphone" vs. the related term "mobile".
cos(word2vec["smartphone"], word2vec["mobile"])

0.6600653

In [None]:
# "smartphone" vs. a word from a different topic, for contrast.
cos(word2vec["smartphone"], word2vec["dog"])

0.1979003

In [None]:
# Two words from different topics.
cos(word2vec["pizza"], word2vec["cluster"])

-0.015944283

In [None]:
# Antonym pair.
cos(word2vec["easy"], word2vec["hard"])

0.4709632

# Similaridade de Textos com Word Mover's Distance

In [None]:
!pip install POT

Collecting POT
  Downloading POT-0.9.4-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (32 kB)
Downloading POT-0.9.4-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (835 kB)
[?25l   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/835.4 kB[0m [31m?[0m eta [36m-:--:--[0m[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m835.4/835.4 kB[0m [31m35.5 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: POT
Successfully installed POT-0.9.4


In [None]:
from nltk.corpus import stopwords
from nltk import download
download('stopwords')  # Fetch the NLTK stop-word corpus.
stop_words = stopwords.words('english')  # English stop words, read by preprocess().

def preprocess(sentence):
    """Lowercase ``sentence``, split it on whitespace and drop stop words.

    Punctuation is not stripped: tokens are produced by ``str.split`` only.

    Args:
        sentence: Raw text string.

    Returns:
        List of lowercase tokens that are not in the module-level
        ``stop_words`` collection.
    """
    # Set membership is O(1); testing every token against the list would
    # rescan the whole stop-word list each time.
    stop_set = set(stop_words)
    return [token for token in sentence.lower().split() if token not in stop_set]


# Tokenise both sentences (lowercased, stop words removed).
sentence_obama = preprocess('Obama speaks to the media in Illinois')
sentence_president = preprocess('The president greets the press in Chicago')

# Word Mover's Distance between the two token lists.
distance = word2vec.wmdistance(sentence_obama, sentence_president)
print('distance = %.4f' % distance)

distance = 1.0175


[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [None]:
# Compare the Obama sentence with a sentence about a different topic.
sentence_orange = preprocess('Oranges are my favorite fruit')
distance = word2vec.wmdistance(sentence_obama, sentence_orange)
print('distance = %.4f' % distance)


# Exemplo de Representação com Word2Vec

# Dataset

In [None]:
!wget https://raw.githubusercontent.com/rmarcacini/text-collections/master/complete_texts_csvs/Dmoz-Health.csv

--2024-09-13 14:10:40--  https://raw.githubusercontent.com/rmarcacini/text-collections/master/complete_texts_csvs/Dmoz-Health.csv
Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 185.199.108.133, 185.199.109.133, 185.199.110.133, ...
Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|185.199.108.133|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 1182768 (1.1M) [text/plain]
Saving to: ‘Dmoz-Health.csv’


2024-09-13 14:10:40 (111 MB/s) - ‘Dmoz-Health.csv’ saved [1182768/1182768]



In [None]:
import pandas as pd

# Load the downloaded Dmoz-Health collection into a DataFrame.
dataset = pd.read_csv('Dmoz-Health.csv')

In [None]:
import numpy as np
from sklearn.model_selection import train_test_split

# Hold out 33% of the rows for testing; the fixed random_state makes the split repeatable.
df_train, df_test = train_test_split(dataset, test_size=0.33, random_state=42)

In [None]:
# Display the training split.
df_train

Unnamed: 0,file_name,text,class
1647,and_Diseases_1594311.txt,National Eye Institute - The Cornea and Cornea...,Conditions
2589,Health_1614027.txt,Alison Muir - B.A. Hons. Psych. Psychologist l...,Mental
1700,and_Diseases_1601766.txt,Talk About Sleep A sleep health community prov...,Conditions
5201,Health_and_Safety_1625559.txt,Arizona Emergency Medical Systems For over 25 ...,Public
6004,Health_1628690.txt,Delaware Nursing Care Center Skilled nursing a...,Senior
...,...,...,...
3772,1619360.txt,Society for Nutrition Education (SNE) For prof...,Nutrition
5191,Health_and_Safety_1624499.txt,"Wabasha Fire Department Stations, equipment, f...",Public
5226,Health_and_Safety_1626228.txt,University of Saskatchewan: Department of Comm...,Public
5390,Health_and_Safety_1623932.txt,Gravenhurst Volunteer Fire Department The GVFD...,Public


In [None]:
# Display the test split.
df_test

Unnamed: 0,file_name,text,class
3106,1618765.txt,Nurse CEU.com: Pediatrics List of links to Con...,Nursing
6161,Health_1629048.txt,Dementia: A Guide For The Caregiver A guide an...,Senior
1867,and_Diseases_1596988.txt,"Congenital Toxoplasmosis Includes symptoms, di...",Conditions
3238,1618104.txt,MUSC College of Nursing Medical University of ...,Nursing
5509,Health_1627315.txt,"Malpani Infertility Clinic Located in Bombay, ...",Reproductive
...,...,...,...
2917,Health_1614367.txt,"Elizabeth Fadale, LMHC Private practice locate...",Mental
596,1583133.txt,Human Hand - PalmTherapy A review of Palm Ther...,Alternative
6443,Health_1628680.txt,Connecticut Elder Reference Guide Infoline 211...,Senior
1068,1583908.txt,Holistic Stock Health Holistic health care for...,Animal


In [None]:
# Keep only the Senior and Mental classes and encode them as 0 / 1.
# NOTE: run this cell once per split. After mapping, the class column holds
# 0/1, so running it again would filter out every row.
LABEL_BY_CLASS = {'Senior': 0, 'Mental': 1}

# .copy() gives each filtered frame its own data, so the column assignment
# that follows no longer triggers pandas' SettingWithCopyWarning.
df_train = df_train[df_train['class'].isin(list(LABEL_BY_CLASS))].copy()
df_train['class'] = df_train['class'].map(LABEL_BY_CLASS)

df_test = df_test[df_test['class'].isin(list(LABEL_BY_CLASS))].copy()
df_test['class'] = df_test['class'].map(LABEL_BY_CLASS)


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_train['class'] = df_train['class'].map({'Senior': 0, 'Mental': 1})
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_test['class'] = df_test['class'].map({'Senior': 0, 'Mental': 1})


In [None]:
# Display the training split after filtering and label encoding.
df_train

Unnamed: 0,file_name,text,class
2589,Health_1614027.txt,Alison Muir - B.A. Hons. Psych. Psychologist l...,1
6004,Health_1628690.txt,Delaware Nursing Care Center Skilled nursing a...,0
2582,Health_1617262.txt,Behavior OnLine Chat Events Topics of interest...,1
2939,Health_1613963.txt,Counsellingsolution.com Provides online counse...,1
6056,Health_1628543.txt,"Frontier Management Provides assisted living, ...",0
...,...,...,...
6396,Health_1628976.txt,National Family Caregivers Association (NFCA) ...,0
6420,Health_1628833.txt,Rockland Independent Living Center Provides se...,0
2919,Health_1614129.txt,Dr. Justin D'Arienzo Private practice located ...,1
6231,Health_1628916.txt,Individual Care of Texas Residential care and ...,0


In [None]:
# Build one embedding per training document by averaging its word vectors.
doc_embeddings = []
for text in df_train['text']:
  # Keep only tokens present in the word2vec vocabulary. An explicit membership
  # test replaces a bare `except:` that would also hide unrelated errors.
  vectors = [word2vec[token] for token in preprocess(text) if token in word2vec]
  if vectors:
    embedding = np.mean(vectors, axis=0)
  else:
    # No known token: fall back to a zero vector of the model's dimensionality.
    embedding = np.zeros(word2vec.vector_size)
  doc_embeddings.append(embedding)

In [None]:
# assign() returns a new frame with the extra column, which avoids the
# SettingWithCopyWarning raised when writing into a filtered slice.
df_train = df_train.assign(embeddings=doc_embeddings)
df_train

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_train['embeddings'] = doc_embeddings


Unnamed: 0,file_name,text,class,embeddings
2589,Health_1614027.txt,Alison Muir - B.A. Hons. Psych. Psychologist l...,1,"[-0.019878387, -0.08330536, 0.09169197, 0.0067..."
6004,Health_1628690.txt,Delaware Nursing Care Center Skilled nursing a...,0,"[-0.094615586, 0.04864155, -0.009098399, 0.023..."
2582,Health_1617262.txt,Behavior OnLine Chat Events Topics of interest...,1,"[-0.018705368, 0.08302307, 0.0301857, 0.070373..."
2939,Health_1613963.txt,Counsellingsolution.com Provides online counse...,1,"[0.00390625, -0.025928844, -0.0884174, 0.04351..."
6056,Health_1628543.txt,"Frontier Management Provides assisted living, ...",0,"[-0.012316895, 0.021087646, 0.0039985655, 0.00..."
...,...,...,...,...
6396,Health_1628976.txt,National Family Caregivers Association (NFCA) ...,0,"[-0.032889556, -0.113256834, -0.10136108, 0.07..."
6420,Health_1628833.txt,Rockland Independent Living Center Provides se...,0,"[-0.029052734, -0.048864745, 0.011779785, 0.05..."
2919,Health_1614129.txt,Dr. Justin D'Arienzo Private practice located ...,1,"[-0.09728088, -0.085064694, 0.019372558, 0.050..."
6231,Health_1628916.txt,Individual Care of Texas Residential care and ...,0,"[-0.049951173, 0.045435525, 0.01451912, 0.0636..."


In [None]:
# Inspect the document embedding stored for the row labelled 6265.
df_train.loc[6265].embeddings

array([-1.25805661e-01, -3.62792984e-02,  1.93969719e-02,  6.50634756e-03,
       -6.78375214e-02,  3.17382801e-04,  7.52929673e-02, -9.16015655e-02,
        4.52957153e-02, -2.58010868e-02,  2.42218021e-02, -1.17795564e-01,
       -4.11376953e-02, -3.38378921e-02,  1.67236328e-02, -5.60913095e-03,
       -2.77862549e-02,  1.77484125e-01,  1.85791012e-02,  3.82080078e-02,
        1.05004884e-01, -8.68530273e-02,  1.28588870e-01,  5.78613300e-03,
       -3.52905281e-02,  2.43591312e-02, -1.99719235e-01,  1.04331970e-01,
       -2.80761713e-04, -1.39450073e-01, -2.31323252e-03, -2.74047852e-02,
       -1.56884760e-01, -2.55371090e-02, -4.10156250e-02,  4.12910469e-02,
        3.29589844e-02,  1.35253910e-02, -1.57104488e-02,  6.38183579e-02,
        2.44873054e-02,  1.49047850e-02, -1.24877933e-02, -5.49850464e-02,
        1.17187500e-02, -1.15844727e-01,  6.21917732e-02,  9.39453095e-02,
        3.14941397e-03, -2.42797844e-02, -7.08251968e-02,  5.82519546e-02,
       -1.91406254e-02,  

In [None]:
# List the vocabulary entries most similar to the embedding of row 6265.
word2vec.similar_by_vector(df_train.loc[6265].embeddings)

[('care', 0.7064115405082703),
 ('Mary_Ann_Neureiter', 0.5920262336730957),
 ('adult', 0.5907431840896606),
 ('Developmentally_disabled', 0.5893610119819641),
 ('Each_SunLink', 0.5639613270759583),
 ('daycare', 0.5576798915863037),
 ('Dog_mauls', 0.5573028922080994),
 ('residential_habilitation', 0.552916944026947),
 ('Marci_Shatzman_writes', 0.5507700443267822),
 ('Volunteer_ombudsmen', 0.5498595237731934)]

In [None]:
# Build one embedding per test document by averaging its word vectors.
doc_embeddings = []
for text in df_test['text']:
  # Keep only tokens present in the word2vec vocabulary. An explicit membership
  # test replaces a bare `except:` that would also hide unrelated errors.
  vectors = [word2vec[token] for token in preprocess(text) if token in word2vec]
  if vectors:
    embedding = np.mean(vectors, axis=0)
  else:
    # No known token: fall back to a zero vector of the model's dimensionality.
    embedding = np.zeros(word2vec.vector_size)
  doc_embeddings.append(embedding)

# assign() returns a new frame with the extra column, which avoids the
# SettingWithCopyWarning raised when writing into a filtered slice.
df_test = df_test.assign(embeddings=doc_embeddings)
df_test

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_test['embeddings'] = doc_embeddings


Unnamed: 0,file_name,text,class,embeddings
6161,Health_1629048.txt,Dementia: A Guide For The Caregiver A guide an...,0,"[0.029683795, -0.15030344, -0.0672433, -0.0582..."
6167,Health_1628549.txt,Adventist Care Centers Providing long term car...,0,"[-0.12358941, 0.03048494, 0.0008884006, 0.0761..."
6047,Health_1628453.txt,Senior Health Week News and information for se...,0,"[-0.04621582, 0.04321289, -0.0037597655, -0.07..."
2952,Health_1617358.txt,Vista Continuing Education Online courses for ...,1,"[0.024510702, -0.0061149597, -0.019989014, 0.0..."
6341,Health_1628674.txt,Jefferson House Providing long-term care and r...,0,"[-0.06032715, 0.06895752, -0.026965331, 0.0742..."
...,...,...,...,...
6436,Health_1628662.txt,"Villa Mirage Elderly board and care, 6-bedhous...",0,"[-0.043999568, 0.060709637, 0.09236654, 0.0222..."
6070,Health_1628688.txt,Wilton Meadows Rehabilitation and Health Care ...,0,"[-0.10127397, 0.15101208, 0.06585138, 0.003129..."
6367,Health_1628559.txt,Mountain West Retirement Corporation Retiremen...,0,"[-0.02240843, 0.046975527, 0.018865248, 0.1053..."
2917,Health_1614367.txt,"Elizabeth Fadale, LMHC Private practice locate...",1,"[-0.040920258, -0.026824951, 0.045928955, 0.04..."


In [None]:
from sklearn.neighbors import KNeighborsClassifier

# Stack the per-document vectors into 2-D feature matrices.
X_train = np.array(df_train.embeddings.to_list())
X_test = np.array(df_test.embeddings.to_list())

# 3-nearest-neighbour classifier using the cosine metric.
knn = KNeighborsClassifier(n_neighbors=3, metric="cosine")
knn.fit(X_train, df_train['class'])
y_pred = knn.predict(X_test)
y_pred

array([0, 0, 0, 1, 0, 1, 0, 0, 0, 1, 1, 1, 1, 1, 0, 1, 0, 0, 0, 0, 1, 1,
       1, 1, 1, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 1, 0, 0, 0, 1, 0, 1, 0, 0,
       0, 1, 1, 0, 1, 0, 1, 0, 1, 0, 1, 1, 1, 0, 0, 1, 0, 1, 1, 0, 0, 1,
       1, 0, 1, 0, 0, 0, 0, 1, 1, 0, 0, 1, 0, 0, 1, 1, 0, 1, 1, 1, 0, 0,
       1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 1, 1, 1, 0, 0, 0, 0, 0, 1, 1,
       0, 0, 1, 0, 0, 1, 1, 0, 0, 0, 1, 1, 0, 1, 0, 0, 1, 1, 1, 1, 0, 0,
       1, 1, 1, 1, 0, 1, 0, 0, 0, 0, 1, 1, 0, 0, 1, 0, 1, 1, 0, 0, 1, 0,
       0, 1, 1, 0, 1, 0, 0, 1, 1, 1, 1, 0, 1, 1, 1, 0, 0, 0, 1, 1, 1, 0,
       1, 0, 0, 0, 0, 1, 1, 1, 1, 0, 1, 0, 0, 1, 0, 1, 1, 1, 0, 1, 1, 1,
       0, 0, 0, 1, 1, 0, 0, 1, 1, 1, 0, 1, 0, 1, 1, 0, 1, 0, 1, 1, 0, 0,
       1, 1, 0, 1, 1, 1, 0, 0, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0,
       1, 1, 0, 0, 0, 1, 1, 1, 1, 0, 1, 0, 1, 0, 1, 0, 0, 1, 0, 0, 1, 0,
       0, 0, 1, 0, 0, 0, 1, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 0,
       1, 1, 0, 0, 0, 0, 1, 1, 0, 1, 0, 0, 1, 0, 0,

In [None]:
from sklearn.metrics import classification_report
# Per-class precision, recall and F1 of the predictions against the true test labels.
print(classification_report(df_test['class'], y_pred))

              precision    recall  f1-score   support

           0       0.94      0.93      0.93       174
           1       0.92      0.94      0.93       163

    accuracy                           0.93       337
   macro avg       0.93      0.93      0.93       337
weighted avg       0.93      0.93      0.93       337



# Operações nas embeddings

In [None]:
# Fetch the vectors of the two words to be combined.
emb_king = word2vec['king']
emb_queen = word2vec['queen']

In [None]:
# Despite the name, this is the element-wise mean (midpoint) of the two vectors.
emb_sum = (emb_king+emb_queen)/2

In [None]:
# List the vocabulary entries most similar to the averaged king/queen vector.
word2vec.similar_by_vector(emb_sum)

[('queen', 0.9126338362693787),
 ('king', 0.904472827911377),
 ('monarch', 0.704138457775116),
 ('kings', 0.6757685542106628),
 ('princess', 0.6753551363945007),
 ('queens', 0.6710299253463745),
 ('prince', 0.6427620649337769),
 ('royal', 0.598995566368103),
 ('princes', 0.5936275720596313),
 ('crown_prince', 0.5886804461479187)]