# Gensim Word2Vec

Experimenting with gensim's word2vec function.

Gensim offers a function to calculate word embeddings. Let's give it a go.

The function requires a list of sentences (or an iterator that provides sentences). We have one of these in our patentdata classes.

In [3]:
# Imports and logging setup
from gensim.models import word2vec

import logging
import os
import pickle
import random

from patentdata.corpus import USPublications
# Probably need to move the patentcorpus.py file into the main patentdata directory
from patentdata.models.patentcorpus import LazyPatentCorpus, CorpusSentenceIterator

# Configure root logging at INFO level with a timestamped message format.
logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO)

# Open the patent data source first: the cache-miss branch below calls
# ds.get_records(), so `ds` must exist before that branch can run on a
# fresh kernel (previously it was only created at the end of the cell).
path = '/media/SAMSUNG1/Patent_Downloads'
ds = USPublications(path)

# Load our list of G06 records, using a local pickle file as a cache
PIK = "G06records.data"

if os.path.isfile(PIK):
    # NOTE: pickle.load can execute arbitrary code - only load a cache file
    # that was written by this notebook, never one from an untrusted source.
    with open(PIK, "rb") as f:
        print("Loading data")
        records = pickle.load(f)
        print("{0} records loaded".format(len(records)))
else:
    # Cache miss: query the data source and store the result for next time.
    records = ds.get_records(["G", "06"])
    with open(PIK, "wb") as f:
        pickle.dump(records, f)

# Get data from 100 random descriptions across the data
records_random_sample = random.sample(records, 100)
print("Random sample of {0} records".format(len(records_random_sample)))
print(records_random_sample[0:5])

Loading data
554570 records loaded
Random sample of 100 records
[(1742828, '2008/I20080515.ZIP', 'I20080515/UTIL0115/US20080115098A1-20080515.ZIP'), (3522829, '2013/I20131010.tar', 'I20131010/UTIL0268/US20130268163A1-20131010.ZIP'), (361462, '2003/20030619.ZIP', '20030619/UTIL0115/US20030115545A1-20030619.ZIP'), (404248, '2003/20030227.ZIP', '20030227/UTIL0041/US20030041168A1-20030227.ZIP'), (1436460, '2007/I20070517.ZIP', './I20070517/UTIL0110/US20070110327A1-20070517.ZIP')]


In [4]:
# Wrap the data source and the sampled records in the project's sentence
# iterator; this is the sentence input passed to Word2Vec in the next cell.
sentences = CorpusSentenceIterator(ds, records_random_sample)

In [5]:
# Train word embeddings on the sampled patent sentences.
# NOTE(review): gensim 4.0 renamed the `size` keyword to `vector_size`; this
# call assumes an older gensim release - confirm the installed version.
model = word2vec.Word2Vec(
    sentences,
    size=100,
    window=5,
    min_count=5,
    workers=4,
)

# Persist the trained model to disk so it can be reloaded without retraining.
fname = "word2vec.gensim"
model.save(fname)

In [6]:
# Display the learned embedding vector for the token 'computer'.
model.wv['computer']

array([-0.08228544,  0.40827942,  0.38828045,  0.11766636, -1.33201134,
        0.39875981,  0.58450681,  1.64149189,  1.27749991, -0.56231797,
        1.29045498,  0.82686812, -1.55906987, -1.01263404,  0.23187172,
        2.047508  ,  0.92824793,  1.20077562,  1.01113987, -1.93312538,
       -0.5147112 , -0.07425889, -1.58366537,  0.56043446, -0.67028916,
        0.66357875, -1.33009863, -1.4741348 , -1.11817682, -0.68970478,
        1.13403773,  2.48949432,  0.71434945,  1.32812977, -1.94922531,
       -0.08405445, -0.47719201, -0.08551191,  0.51998705, -0.5025264 ,
       -0.98951584,  2.09389877, -0.80806983,  1.1638484 , -1.53077948,
       -2.08595943, -0.29854685,  0.35186401,  1.19228816, -0.34122112,
        0.35245067, -0.97954828, -0.17293379,  0.15272909, -1.75471675,
        1.71701062,  0.94516444,  1.0955714 , -0.3248772 , -1.35574436,
       -1.24880087, -0.94688541,  0.75589716,  1.8528229 ,  0.49957079,
        0.81192172, -0.87324172, -1.28950131, -0.72631752,  0.34

In [8]:
# Similarity score for a pair of words expected to be related.
model.wv.similarity("computer", "device")

0.52534036119191629

In [9]:
# Similarity score for a pair of words expected to be unrelated, for contrast.
model.wv.similarity("method", "five")

0.073458365134867781

In [10]:
# List the tokens ranked most similar to 'computer', with their scores.
model.wv.most_similar(positive=['computer'])

[('software', 0.808935821056366),
 ('program', 0.7681251764297485),
 ('medium', 0.7681076526641846),
 ('computing', 0.7648231983184814),
 ('firmware', 0.7577048540115356),
 ('machine-accessible', 0.7435612082481384),
 ('readable', 0.7297963500022888),
 ('processor', 0.7158001661300659),
 ('implemented', 0.7152796983718872),
 ('instructions', 0.7086238265037537)]

In [11]:
# List the tokens ranked most similar to 'network', with their scores.
model.wv.most_similar(positive=['network'])

[('Internet', 0.8809741139411926),
 ('wired', 0.8569992780685425),
 ('local', 0.8550925254821777),
 ('communications', 0.8124428987503052),
 ('PC', 0.7985388040542603),
 ('computers', 0.7909102439880371),
 ('wireless', 0.7896413207054138),
 ('protocol', 0.7892076373100281),
 ('carrier', 0.7855661511421204),
 ('communication', 0.7788077592849731)]

In [13]:
# List the tokens ranked most similar to 'phone', with their scores.
model.wv.most_similar(positive=['phone'])

[('telephone', 0.8736276626586914),
 ('tablet', 0.8427409529685974),
 ('PDA', 0.7894008755683899),
 ('camera', 0.7807283997535706),
 ('phones', 0.7773498296737671),
 ('disc', 0.7703143954277039),
 ('PC', 0.7701486945152283),
 ('CD-ROM', 0.7691835761070251),
 ('tape', 0.7659013867378235),
 ('diskette', 0.7655481696128845)]

In [12]:
# List the tokens ranked most similar to 'method', a very common word in patent text.
model.wv.most_similar(positive=['method'])

[('exemplary', 0.7581672072410583),
 ('further', 0.7566620707511902),
 ('procedure', 0.7311702370643616),
 ('purpose', 0.7186163067817688),
 ('architecture', 0.7134063243865967),
 ('automating', 0.7110243439674377),
 ('300', 0.7066522240638733),
 ('process', 0.7041938900947571),
 ('technique', 0.7028172612190247),
 ('controlling', 0.6997413039207458)]

In [14]:
# List the tokens ranked most similar to the technical acronym 'TCP/IP'.
model.wv.most_similar(positive=['TCP/IP'])

[('USB', 0.9584303498268127),
 ('workstations', 0.9379450678825378),
 ('Local', 0.934386134147644),
 ('CMD', 0.9338710904121399),
 ('radios', 0.9333294630050659),
 ('Telephone', 0.9323137402534485),
 ('Area', 0.9321101307868958),
 ('RTCP', 0.931330680847168),
 ('Networks', 0.9297774434089661),
 ('Unit', 0.9287588000297546)]

In [15]:
# List the tokens ranked most similar to 'memory', with their scores.
model.wv.most_similar(positive=['memory'])

[('address', 0.6125978827476501),
 ('storage', 0.6122298240661621),
 ('physical', 0.6072595119476318),
 ('addresses', 0.6007328033447266),
 ('virtualized', 0.592629075050354),
 ('read', 0.5909753441810608),
 ('write', 0.5892455577850342),
 ('204', 0.565220296382904),
 ('SNAPM', 0.5535624027252197),
 ('port', 0.5377624034881592)]

In [19]:
# List the tokens ranked most similar to the upper-case acronym 'RAM'.
model.wv.most_similar(positive=['RAM'])

[('ROM', 0.9718466401100159),
 ('EPROM', 0.9349870681762695),
 ('read-only', 0.9224880337715149),
 ('flash', 0.9007031321525574),
 ('CD-ROM', 0.8933768272399902),
 ('random', 0.8903102874755859),
 ('modem', 0.86822110414505),
 ('170', 0.8648702502250671),
 ('EEPROM', 0.863433301448822),
 ('erasable', 0.8533413410186768)]

In [20]:
# Combined query: tokens similar to 'computer' and 'smartphone' while
# dissimilar to 'network'.
model.wv.most_similar(positive=['computer', 'smartphone'], negative=['network'])

[('COTS', 0.7464540600776672),
 ('executing', 0.7392984628677368),
 ('adapted', 0.7382043600082397),
 ('remotely', 0.7283940315246582),
 ('700', 0.7258792519569397),
 ('support', 0.7201268672943115),
 ('Vietnamese', 0.7163515090942383),
 ('physically', 0.7157812714576721),
 ('Hindi', 0.7150687575340271),
 ('analytics', 0.7129673957824707)]

In [22]:
# Query with punctuation tokens to see what the model learned for them.
model.wv.most_similar(positive=['.', ';'])

[('suppressing', 0.6286207437515259),
 ('rational', 0.5948602557182312),
 ('dyeing', 0.5867568850517273),
 ('Publication', 0.5850182175636292),
 ('examination', 0.5842567682266235),
 ('angles', 0.5789593458175659),
 ('A2', 0.5785780549049377),
 ('near', 0.5708717703819275),
 ('inherent', 0.570466935634613),
 ('sorts', 0.5686711668968201)]

In [23]:
# Display the embedding vector for the '.' token, confirming punctuation is in the vocabulary.
model.wv['.']

array([ 0.14645183, -0.88413781, -0.08427817, -0.70994061,  0.27432841,
        0.13906902,  1.09764707,  0.14464587, -0.7989369 ,  0.44431078,
        0.50749767, -0.3057664 , -0.66854519, -0.51750523, -1.24868453,
        0.29888672, -0.99539739, -0.16994306,  1.21228981, -0.06382547,
       -0.26968187,  0.0783711 ,  0.30596015, -0.50161451,  0.67149734,
       -0.49142855,  0.05819498,  0.7170673 ,  0.07397896,  0.70932716,
        0.88001424, -0.05201377,  0.33719948,  1.48050964,  0.0558384 ,
        1.02984118,  0.13537061, -0.53974599, -0.25343663,  0.12290442,
        0.18904579, -1.12723589,  0.10540847,  0.6641826 , -1.05325246,
        0.7288596 , -0.58856255, -1.2173028 , -0.03364891, -0.87073344,
       -0.6921702 , -0.3142491 , -0.33011049,  0.43752089,  0.04724153,
       -0.70493829,  0.22155727, -0.11641531,  0.63251382,  0.42029899,
       -0.10483564,  0.06263605,  0.3923808 , -1.45434034, -0.22097151,
        0.64114952, -0.62414318,  0.71360415, -0.83703929,  0.67

### Observations

Removing reference numerals (e.g. '300', '204', '170' and '700', which all show up in the nearest-neighbour lists above) as a pre-processing step seems wise - they do not add any syntactic or semantic information.

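Lowercasing all words is also probably a good idea (currently 'Local'/'local' and 'Telephone'/'telephone' are treated as separate tokens), as is removing punctuation, since '.' and ';' currently get their own vectors.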