<div style="text-align: center; font-size: 24px;">Word Embeddings for Electrical Engineering</div>
<hr>

### Libraries

In [220]:
# !pip install wikipedia
# !pip install --upgrade gensim

In [221]:
import pandas as pd
import matplotlib
from matplotlib import pyplot as plt
%matplotlib inline
import warnings
warnings.filterwarnings("ignore")

In [222]:
import wikipedia

In [257]:
wikipedia.set_lang("en")

In [223]:
import gensim, logging
from gensim.parsing.preprocessing import remove_stopwords

In [224]:
logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO)

### Get corpus from Wikipedia

In [290]:
search_term = "power supply"

In [291]:
print(wikipedia.suggest(search_term))

None


In [294]:
wikipedia.search(search_term)

['Power supply',
 'Uninterruptible power supply',
 'Power supply unit (computer)',
 'Switched-mode power supply',
 'Welding power supply',
 'IC power-supply pin',
 'Regulated power supply',
 'Mains electricity',
 'AC adapter',
 'Electric power']

In [295]:
wikipedia.summary(search_term, sentences=3)

'A power supply is an electrical device that supplies electric power to an electrical load. The primary function of a power supply is to convert electric current from a source to the correct voltage, current, and frequency to power the load. As a result, power supplies are sometimes referred to as electric power converters.'

In [296]:
wikipedia.page(search_term).title

'Power supply'

In [297]:
wikipedia.page(search_term).url

'https://en.wikipedia.org/wiki/Power_supply'

In [298]:
wikipedia.page(search_term).content[:500] # full page text content

'A power supply is an electrical device that supplies electric power to an electrical load. The primary function of a power supply is to convert electric current from a source to the correct voltage, current, and frequency to power the load. As a result, power supplies are sometimes referred to as electric power converters. Some power supplies are separate standalone pieces of equipment, while others are built into the load appliances that they power. Examples of the latter include power supplies'

In [299]:
print(wikipedia.page(search_term).links) # links to other wikipedia pages

['AC/DC receiver', 'AC adapter', 'AC power supply', 'Alternating current', 'Alternator', 'Ampere', 'Arc welding', 'Arcing', 'Autotransformer', 'Battery (electricity)', 'Capacitive power supply', 'Capacitor', 'Cartesian coordinate system', 'Circuit breaker', 'Conduction (heat)', 'Consumer electronics', 'Convection', 'Crowbar (circuit)', 'Crystal growth', 'Current (electricity)', 'Current limiting', 'Current source', 'Desktop computer', 'Direct current', 'Domestic AC power plugs and sockets', 'Dummy load', 'Duty cycle', 'Electric current', 'Electric power', 'Electric power converter', 'Electric power grid', 'Electric vehicle', 'Electrical fault', 'Electrical generator', 'Electrical generators', 'Electrical load', 'Electrical outlet', 'Electricity generation', 'Electron microscope', 'Electronic filter', 'Electronic noise', 'Electrophoresis', 'Electrostatics', 'Energy storage', 'Focused ion beam', 'Foldback (power supply design)', 'Frequency', 'Fuel cell', 'Fuse (electrical)', 'Fusible lin

In [300]:
# wikipedia.summary("machine learning", sentences=3) # errors out. not sure why

In [301]:
# circuit_breaker.sections

### Train Word2Vec

Refer to following tutorial:
https://rare-technologies.com/word2vec-tutorial/

In particular, for how to train on datasets that don't fit in memory.

For now, we'll follow the simple example where data loaded in memory.

In [311]:
import re, string

pattern = re.compile('[^a-zA-Z0-9\., ]')

pages = [
    "Schneider Electric",
    "APC by Schneider Electric",
    "Schneider-Creusot",
    "SolarEdge",
    "Aveva",
    "Schneider Electric DMS",
    "Modbus",
    "Electric switchboard",
    "Circuit breaker",
    "Distribution board",
    "Electrical engineering",
    "power supply",
]

content = " ".join([wikipedia.page(page).content + " " for page in pages])

clean_content = pattern.sub(" " , content.lower().replace("\n", " ").replace(",", " "))
clean_content = re.sub(' +', ' ', clean_content) # remove double-spaces
clean_content = remove_stopwords(clean_content)

In [312]:
clean_content[:500]

'schneider electric european multinational company providing energy automation digital solutions efficiency sustainability. addresses homes buildings data centers infrastructure industries combining energy technologies real time automation software services. company operations 100 countries employs 135 000 people.schneider electric fortune global 500 company publicly traded euronext exchange component euro stoxx 50 stock market index. fy2019 company posted revenues 27.2 billion.schneider electric'

In [313]:
print(f"String length of clean_content: {len(clean_content)} characters.")

String length of clean_content: 121495 characters.


In [314]:
print(f"Number of words in clean_content: {len(clean_content.split(' '))} words.")

Number of words in clean_content: 15784 words.


In [315]:
print(f"Number of uniquewords in clean_content: {len(set(clean_content.split(' ')))} words.")

Number of uniquewords in clean_content: 4872 words.


In [316]:
sentences = [s.split(" ") for s in clean_content.split(".")]

In [317]:
print(len(sentences))

1368


In [318]:
model = gensim.models.Word2Vec(
    sentences,
    min_count=2, # min number of times word in corpus, otherwise ignored
    size=10, # number of layers in NN. Default=100
    workers=4, # for parallelization. Default=1. Requires Cython.
)

2020-11-11 11:42:41,493 : INFO : collecting all words and their counts
2020-11-11 11:42:41,494 : INFO : PROGRESS: at sentence #0, processed 0 words, keeping 0 word types
2020-11-11 11:42:41,499 : INFO : collected 4357 word types from a corpus of 17151 raw words and 1368 sentences
2020-11-11 11:42:41,500 : INFO : Loading a fresh vocabulary
2020-11-11 11:42:41,505 : INFO : effective_min_count=2 retains 2034 unique words (46% of original 4357, drops 2323)
2020-11-11 11:42:41,506 : INFO : effective_min_count=2 leaves 14828 word corpus (86% of original 17151, drops 2323)
2020-11-11 11:42:41,513 : INFO : deleting the raw counts dictionary of 4357 items
2020-11-11 11:42:41,513 : INFO : sample=0.001 downsamples 43 most-common words
2020-11-11 11:42:41,514 : INFO : downsampling leaves estimated 12482 word corpus (84.2% of prior 14828)
2020-11-11 11:42:41,519 : INFO : estimated required memory for 2034 words and 10 dimensions: 1179720 bytes
2020-11-11 11:42:41,520 : INFO : resetting layer weight

In [319]:
# model.most_similar(positive=['woman', 'king'], negative=['man'], topn=1)
model.wv.most_similar("distribution")

2020-11-11 11:42:42,140 : INFO : precomputing L2-norms of word weight vectors


[('function', 0.9993293285369873),
 ('16', 0.999035120010376),
 ('systems', 0.9989442229270935),
 ('high', 0.9988926649093628),
 ('circuits', 0.9988582134246826),
 ('power', 0.9986993074417114),
 ('breaker', 0.9986612796783447),
 ('world', 0.998583197593689),
 ('', 0.9985527992248535),
 ('input', 0.998307466506958)]