# Class work. Word2Vec

In [1]:
from sklearn.datasets import fetch_20newsgroups
import numpy as np
import nltk
from pprint import pprint
from collections import defaultdict
import string
import gensim

## 20newsgroups dataset

In [2]:
newsgroups = fetch_20newsgroups(categories=['comp.graphics',
                                            'comp.os.ms-windows.misc',
                                            'comp.sys.ibm.pc.hardware',
                                            'comp.sys.mac.hardware',
                                            'comp.windows.x',
                                            'rec.autos',
                                            'rec.motorcycles',
                                            'rec.sport.baseball',
                                            'rec.sport.hockey',
                                            'sci.electronics',
                                            'sci.space',],
                                subset='all', shuffle=True, random_state=1)

Список категорий новостей

In [3]:
pprint(list(newsgroups.target_names))

['comp.graphics',
 'comp.os.ms-windows.misc',
 'comp.sys.ibm.pc.hardware',
 'comp.sys.mac.hardware',
 'comp.windows.x',
 'rec.autos',
 'rec.motorcycles',
 'rec.sport.baseball',
 'rec.sport.hockey',
 'sci.electronics',
 'sci.space']


Размерность корпуса

In [4]:
print("%d documents" % len(newsgroups.data))
print("%d categories" % len(newsgroups.target_names))

10841 documents
11 categories


## Tokenization

Токенизируем все тексты и записываем токены в отдельный лист

In [5]:
newsgroups_tokenized = [nltk.word_tokenize(text) for text in newsgroups.data]

In [6]:
news_corpus = []
for text in newsgroups_tokenized:
    lemms = []
    for word in text:
        if word not in string.punctuation:
            lemms.append(word.lower())
    news_corpus.append(lemms)

## Word2Vec

Обучаем модель Word2Vec

In [7]:
news_model = gensim.models.Word2Vec(news_corpus)

In [8]:
print(news_model['car'], '\n')
print('Vector dimension: ', len(news_model['car']))

[ 0.86284512  1.32959127 -0.63757163 -0.67391026  0.44544992  0.37657425
 -1.08244789  0.81196666 -1.28120613  0.53626126  0.47280663 -1.24560785
 -1.88047421  0.24283862  3.00426722  2.74162865  0.22118583  0.40537328
 -0.88165981 -1.51376581 -0.04827534  2.13019633 -0.25792608 -0.49964216
  0.99564785 -1.57279515  0.36676514 -0.38762116  0.71266955 -0.40570667
 -0.81460887  1.1182065   1.82321894 -1.97550213  2.33990288 -0.76092732
  1.19684482  0.22485575  0.81241393 -0.15121605 -1.31710064  1.74657273
 -0.08110818 -0.80954444  0.85473961 -1.37125468  0.12182475  0.59764308
 -1.84924841 -0.48604196 -0.19844404  1.12498403 -1.57744265  0.62643737
 -0.03319849 -1.65937066  0.55175132  0.59766781  0.85889471 -0.73636234
 -0.87767369 -0.18539508  1.9442811   0.98732394  2.80523229  0.17642368
  2.92957211 -1.32652199 -1.54979289 -0.25753602  1.80320477 -1.03620493
  0.62810361  0.90728986 -0.3382017   0.40551695 -0.8048231   1.85658944
 -0.66298455  1.31292534 -0.21618854 -0.83572096  1

## Word Similarity

Выведем схожесть некоторых слов

In [9]:
news_model.similarity('windows', 'mac')

0.62839250880873687

In [10]:
news_model.similarity('car', 'motorcycle')

0.72648798763671996

In [11]:
news_model.similarity('baseball', 'hockey')

0.83891897321527664

In [12]:
news_model.similarity('ball', 'puck')

0.91104751000107964

## Лишнее слово

In [13]:
news_model.doesnt_match(['car', 'motorcycle', 'bicycle'])

'bicycle'

In [14]:
news_model.doesnt_match(['baseball', 'hockey', 'telephone', 'basketball', 'athletics'])

'telephone'

In [15]:
news_model.doesnt_match(['earth', 'jupyter', 'mars', 'sun'])

'sun'

Word2Vec довольно хорошо справляется с определением лишнего слова

## Word operations

In [16]:
news_model.most_similar(positive=['man', 'crown'])

[('genius', 0.808940052986145),
 ('blood', 0.8032683730125427),
 ('bandit', 0.7956812977790833),
 ('dude', 0.7886248230934143),
 ('denizen', 0.7863993644714355),
 ('daughter', 0.7792149782180786),
 ('panther', 0.7787203788757324),
 ('blinn', 0.7695774435997009),
 ('batter-runner', 0.7688409090042114),
 ('slam', 0.7682461142539978)]

In [17]:
news_model.most_similar(positive=['earth', 'pluto', 'mars'], negative = ['planet'])

[('lunar', 0.9134441018104553),
 ('missions', 0.8533686399459839),
 ('orbit', 0.8526050448417664),
 ('jupiter', 0.8491303324699402),
 ('launch', 0.830557644367218),
 ('manned', 0.8266162872314453),
 ('mission', 0.8213434815406799),
 ('satellite', 0.8193739652633667),
 ('comet', 0.8170713782310486),
 ('venus', 0.7983837127685547)]

In [20]:
news_model.most_similar(positive=['seat', 'wheels', 'steer'])

[('warm', 0.854910135269165),
 ('rubber', 0.844850480556488),
 ('tire', 0.8221921920776367),
 ('pedal', 0.8221838474273682),
 ('arms', 0.809646487236023),
 ('brakes', 0.8081234693527222),
 ('wheel', 0.8060007095336914),
 ('panic', 0.8052929639816284),
 ('bicycle', 0.8042505979537964),
 ('train', 0.803473174571991)]