# Gensim Word2Vec

Experimenting with gensim's word2vec function.

Gensim offers a function to calculate word embeddings. Let's give it a go.

The function requires a list of sentences (or an iterator that provides sentences). We have one of these in our patentdata classes.

In [1]:
# Imports and logging setup
from gensim.models import word2vec

import logging
import os
import pickle
import random

from patentdata.corpus import USPublications
# Probably need to move the patentcorpus.py file into the main patentdata directory
from patentdata.models.patentcorpus import LazyPatentCorpus, CorpusSentenceIterator

logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO)

# Patent data source. It is created before the cache check because the
# cache-miss branch below calls ds.get_records(); defining it afterwards
# raises NameError on the first run, when no pickle file exists yet.
path = '/media/SAMSUNG1/Patent_Downloads'
ds = USPublications(path)

# Load our list of G06 records, using a local pickle file as a cache
PIK = "G06records.data"

if os.path.isfile(PIK):
    # NOTE: pickle.load can execute arbitrary code - only load a cache file
    # that was written by this notebook.
    with open(PIK, "rb") as f:
        print("Loading data")
        records = pickle.load(f)
        print("{0} records loaded".format(len(records)))
else:
    records = ds.get_records(["G", "06"])
    with open(PIK, "wb") as f:
        pickle.dump(records, f)

# Get data from a random sample of descriptions across the data.
# The sample is capped at the number of available records, since
# random.sample raises ValueError when asked for more items than exist.
SAMPLE_SIZE = 10000
records_random_sample = random.sample(records, min(SAMPLE_SIZE, len(records)))
print("Random sample of {0} records".format(len(records_random_sample)))
print(records_random_sample[0:5])

Loading data
554570 records loaded
Random sample of 10000 records
[(3891025, '2014/I20141204.tar', 'I20141204/UTIL0358/US20140358848A1-20141204.ZIP'), (3782411, '2014/I20140717.tar', 'I20140717/UTIL0200/US20140200915A1-20140717.ZIP'), (251503, '2002/20020620.ZIP', '20020620/UTIL0077/US20020077867A1-20020620.ZIP'), (2932151, '2011/I20111222.tar', 'I20111222/UTIL0310/US20110310422A1-20111222.ZIP'), (2868636, '2011/I20111006.tar', 'I20111006/UTIL0246/US20110246890A1-20111006.ZIP')]


In [2]:
# Sentence iterator over the sampled records, read through the data source;
# this is the "list of sentences (or iterator)" that Word2Vec is trained on below.
sentences = CorpusSentenceIterator(ds, records_random_sample)

In [3]:
# Training hyperparameters, per the gensim Word2Vec documentation:
#   size      - dimensionality of the word vectors
#   window    - maximum distance between the current and predicted word
#   min_count - words with a lower total frequency are ignored
#   workers   - number of worker threads used for training
# NOTE: gensim 4.0 renamed `size` to `vector_size`; this cell targets the older API.
w2v_params = dict(size=100, window=5, min_count=5, workers=4)
model = word2vec.Word2Vec(sentences, **w2v_params)

# Persist the trained model so it can be reloaded without retraining
fname = "word2vec_updated.gensim"
model.save(fname)

In [4]:
# Look up the learned embedding vector for 'computer'
model.wv['computer']

array([ 6.18598127,  3.9890554 , -0.46839917,  3.66654658, -2.26690531,
        2.90214014, -0.31382436, -1.86979079,  1.96319115,  0.20540179,
       -3.0171833 , -2.37898612, -0.36646962,  0.93313956, -3.04202533,
       -0.88237894, -0.89379066, -1.6861428 , -1.8028084 , -3.30925918,
        1.05269563, -0.25386897, -0.86152971,  2.7096839 , -0.77251232,
       -1.00207055, -1.65914679, -0.42832085,  3.18684125,  2.52308846,
       -2.42135239, -3.45506144, -3.66133738,  2.74374819,  0.81268018,
       -1.79889572, -0.64459193, -1.60908389, -2.17802167,  2.12680173,
        1.33405232,  2.11808944, -2.80494618, -2.380862  , -2.90254474,
        1.74873555, -4.16749001, -1.5162555 , -0.67727876,  1.34798527,
       -4.29174948, -0.03798132,  1.87599897, -0.30561879,  1.54951894,
        2.23940778,  0.87426442,  1.16053581,  5.36065149, -0.49637595,
       -0.05947032,  3.52158356,  0.18848637, -3.51641273, -5.15498924,
       -2.35806489,  2.78621316, -0.10848203, -2.40141487,  2.34

In [5]:
# Similarity score between the vectors for "computer" and "device"
model.wv.similarity("computer", "device")

0.49874192667440542

In [6]:
# Similarity score for a second word pair, "method" and "five"
model.wv.similarity("method", "five")

-0.17368886824397356

In [7]:
# Words whose vectors score as most similar to 'computer', with their scores
model.wv.most_similar(positive=['computer'])

[('computers', 0.671457827091217),
 ('computing', 0.6075847744941711),
 ('processor', 0.6024589538574219),
 ('machine', 0.5845490097999573),
 ('software', 0.5789114236831665),
 ('pc', 0.5552119612693787),
 ('operating', 0.5532264113426208),
 ('medium', 0.5397846102714539),
 ('instructions', 0.4993794560432434),
 ('device', 0.49874192476272583)]

In [8]:
# Words whose vectors score as most similar to 'network', with their scores
model.wv.most_similar(positive=['network'])

[('networks', 0.7367112040519714),
 ('internet', 0.7240222692489624),
 ('lan', 0.6588615775108337),
 ('communications', 0.6554214954376221),
 ('wireless', 0.6519482135772705),
 ('networking', 0.6365801095962524),
 ('infrastructure', 0.5901362895965576),
 ('communication', 0.5864831805229187),
 ('gateway', 0.5783470869064331),
 ('modem', 0.5718026161193848)]

In [9]:
# Words whose vectors score as most similar to 'phone', with their scores
model.wv.most_similar(positive=['phone'])

[('telephone', 0.8898583054542542),
 ('phones', 0.6942756175994873),
 ('telephones', 0.6722859144210815),
 ('handset', 0.6473312377929688),
 ('msisdn', 0.610964834690094),
 ('handsets', 0.5734683275222778),
 ('terminal', 0.5710985660552979),
 ('imei', 0.5617379546165466),
 ('treo', 0.5552546977996826),
 ('dialing', 0.5484899282455444)]

In [10]:
# Words whose vectors score as most similar to 'method', with their scores
model.wv.most_similar(positive=['method'])

[('technique', 0.6702439785003662),
 ('methodology', 0.6431148648262024),
 ('methods', 0.6230913400650024),
 ('process', 0.6001814603805542),
 ('procedure', 0.589292585849762),
 ('means', 0.551599383354187),
 ('apparatus', 0.5487009882926941),
 ('mechanism', 0.5267924666404724),
 ('system', 0.49841129779815674),
 ('scheme', 0.46753552556037903)]

In [11]:
# Upper-case token containing '/': this query raises KeyError because
# 'TCP/IP' is not in the model's vocabulary.
model.wv.most_similar(positive=['TCP/IP'])

KeyError: "word 'TCP/IP' not in vocabulary"

In [12]:
# Words whose vectors score as most similar to 'memory', with their scores
model.wv.most_similar(positive=['memory'])

[('storage', 0.7681612968444824),
 ('ram', 0.7249451279640198),
 ('dram', 0.7066590189933777),
 ('memories', 0.6994853019714355),
 ('disk', 0.6648283004760742),
 ('cache', 0.6513200998306274),
 ('sram', 0.6434590220451355),
 ('ssd', 0.5988369584083557),
 ('nand', 0.5959087610244751),
 ('buffer', 0.5841000080108643)]

In [14]:
# Words whose vectors score as most similar to 'ram', with their scores
model.wv.most_similar(positive=['ram'])

[('rom', 0.7914084196090698),
 ('memory', 0.7249451875686646),
 ('sram', 0.6853675842285156),
 ('nvram', 0.6719946265220642),
 ('dram', 0.663571834564209),
 ('fram', 0.6442307233810425),
 ('pram', 0.5991209745407104),
 ('memories', 0.5990630984306335),
 ('flash', 0.5886524319648743),
 ('rram', 0.5879623889923096)]

In [15]:
# Combined query: 'computer' and 'smartphone' contribute positively and
# 'network' negatively to the similarity ranking.
model.wv.most_similar(positive=['computer', 'smartphone'], negative=['network'])

[('pc', 0.5312662124633789),
 ('ipad', 0.4916309118270874),
 ('tablet', 0.4893582761287689),
 ('treo', 0.4776890277862549),
 ('pda', 0.4631950855255127),
 ('palm', 0.45669957995414734),
 ('zen', 0.4544898271560669),
 ('portrait', 0.43992456793785095),
 ('asus', 0.4397648870944977),
 ('netbook', 0.43885114789009094)]

In [16]:
# Punctuation query: raises KeyError because '.' is not in the model's vocabulary
model.wv.most_similar(positive=['.', ';'])

KeyError: "word '.' not in vocabulary"

In [None]:
# Direct vector lookup for '.' (cell not executed). The most_similar query on
# '.' reported it as missing from the vocabulary, so this presumably raises
# KeyError as well.
model.wv['.']

### Observations

Removing reference numerals as a pre-processing step seems wise - they do not add to the syntactic or semantic understanding.

Lowercasing all words is also probably a good idea, as is removing punctuation.