### Word2Vec
Pre-trained vectors trained on part of the Google News dataset (about 100 billion words). The model contains 300-dimensional vectors for 3 million words and phrases. The phrases were obtained using a simple data-driven approach described in *Distributed Representations of Words and Phrases and their Compositionality* (Mikolov et al., 2013).

## Pre-trained Word2Vec model

In [1]:
import gensim
from gensim.models import Word2Vec, KeyedVectors

In [2]:
# The pre-trained GoogleNews vectors are not bundled with this notebook.
# Download GoogleNews-vectors-negative300.bin from
#   https://s3.amazonaws.com/dl4j-distribution/GoogleNews-vectors-negative300.bin.gz
# or (preferred)
#   https://drive.google.com/file/d/0B7XkCwpI5KDYNlNUTTlSS21pQmM/edit?resourcekey=0-wjGZdNAUop6WykTtMip30g
# then decompress it if needed and place the .bin file at MODEL_PATH.
MODEL_PATH = "Files/GoogleNews-vectors-negative300.bin"

# Maximum number of vectors to read from the file; None reads every entry.
# Set this to e.g. 500000 to load only the first 500k entries when memory is tight.
VOCAB_LIMIT = None

# binary=True because the file is in the binary word2vec format.
PreTrainedModel = KeyedVectors.load_word2vec_format(fname=MODEL_PATH, binary=True, limit=VOCAB_LIMIT)

In [3]:
# Show the raw embedding stored for the word 'cricket' (a NumPy array).
PreTrainedModel.get_vector('cricket')

array([-3.67187500e-01, -1.21582031e-01,  2.85156250e-01,  8.15429688e-02,
        3.19824219e-02, -3.19824219e-02,  1.34765625e-01, -2.73437500e-01,
        9.46044922e-03, -1.07421875e-01,  2.48046875e-01, -6.05468750e-01,
        5.02929688e-02,  2.98828125e-01,  9.57031250e-02,  1.39648438e-01,
       -5.41992188e-02,  2.91015625e-01,  2.85156250e-01,  1.51367188e-01,
       -2.89062500e-01, -3.46679688e-02,  1.81884766e-02, -3.92578125e-01,
        2.46093750e-01,  2.51953125e-01, -9.86328125e-02,  3.22265625e-01,
        4.49218750e-01, -1.36718750e-01, -2.34375000e-01,  4.12597656e-02,
       -2.15820312e-01,  1.69921875e-01,  2.56347656e-02,  1.50146484e-02,
       -3.75976562e-02,  6.95800781e-03,  4.00390625e-01,  2.09960938e-01,
        1.17675781e-01, -4.19921875e-02,  2.34375000e-01,  2.03125000e-01,
       -1.86523438e-01, -2.46093750e-01,  3.12500000e-01, -2.59765625e-01,
       -1.06933594e-01,  1.04003906e-01, -1.79687500e-01,  5.71289062e-02,
       -7.41577148e-03, -

In [5]:
# Every word maps to one flat vector; its shape confirms the embedding dimensionality.
PreTrainedModel.get_vector('man').shape

(300,)

In [6]:
# Ten nearest neighbours of 'man' in the embedding space, returned as (word, score) pairs.
PreTrainedModel.most_similar(positive=['man'], topn=10)

[('woman', 0.7664012908935547),
 ('boy', 0.6824870705604553),
 ('teenager', 0.6586930155754089),
 ('teenage_girl', 0.6147903203964233),
 ('girl', 0.5921714305877686),
 ('suspected_purse_snatcher', 0.5716364979743958),
 ('robber', 0.5585118532180786),
 ('Robbery_suspect', 0.5584409832954407),
 ('teen_ager', 0.5549197196960449),
 ('men', 0.5489763021469116)]

In [9]:
# Exploratory: list every attribute and method name available on the loaded vectors object.
dir(PreTrainedModel)

['__class__',
 '__contains__',
 '__delattr__',
 '__dict__',
 '__dir__',
 '__doc__',
 '__eq__',
 '__format__',
 '__ge__',
 '__getattribute__',
 '__getitem__',
 '__gt__',
 '__hash__',
 '__init__',
 '__init_subclass__',
 '__le__',
 '__len__',
 '__lt__',
 '__module__',
 '__ne__',
 '__new__',
 '__reduce__',
 '__reduce_ex__',
 '__repr__',
 '__setattr__',
 '__setitem__',
 '__sizeof__',
 '__str__',
 '__subclasshook__',
 '__weakref__',
 '_adapt_by_suffix',
 '_load_specials',
 '_log_evaluate_word_analogies',
 '_save_specials',
 '_smart_save',
 '_upconvert_old_d2vkv',
 '_upconvert_old_vocab',
 'add_lifecycle_event',
 'add_vector',
 'add_vectors',
 'allocate_vecattrs',
 'closer_than',
 'cosine_similarities',
 'distance',
 'distances',
 'doesnt_match',
 'evaluate_word_analogies',
 'evaluate_word_pairs',
 'expandos',
 'fill_norms',
 'get_index',
 'get_mean_vector',
 'get_normed_vectors',
 'get_vecattr',
 'get_vector',
 'has_index_for',
 'index2entity',
 'index2word',
 'index_to_key',
 'init_sims',
 

In [11]:
# Nearest neighbours of 'cricket'.
# NOTE(review): tokens such as 'Twenty##' presumably have digits masked as '#' in the
# pre-trained vocabulary — confirm against the model's documentation.
PreTrainedModel.most_similar(positive=['cricket'], topn=10)

[('cricketing', 0.8372225761413574),
 ('cricketers', 0.8165745139122009),
 ('Test_cricket', 0.8094819784164429),
 ('Twenty##_cricket', 0.8068487644195557),
 ('Twenty##', 0.7624265551567078),
 ('Cricket', 0.7541396617889404),
 ('cricketer', 0.7372579574584961),
 ('twenty##', 0.7316356897354126),
 ('T##_cricket', 0.7304614782333374),
 ('West_Indies_cricket', 0.6987985372543335)]

In [12]:
# Nearest neighbours of 'facebook'. The vocabulary is case-sensitive, so differently
# cased spellings of the same name come back as separate entries.
PreTrainedModel.most_similar(positive=['facebook'], topn=10)

[('Facebook', 0.7563532590866089),
 ('FaceBook', 0.7076998949050903),
 ('twitter', 0.6988552212715149),
 ('myspace', 0.6941817402839661),
 ('Twitter', 0.664244532585144),
 ('twitter_facebook', 0.6572229862213135),
 ('Facebook.com', 0.6529867649078369),
 ('myspace_facebook', 0.6370643973350525),
 ('facebook_twitter', 0.6367617845535278),
 ('linkedin', 0.6356593370437622)]

In [13]:
# Pairwise similarity score for two closely related words.
PreTrainedModel.similarity('man', 'woman')

0.76640123

In [14]:
# Pairwise similarity score for two unrelated words, for contrast with a related pair.
PreTrainedModel.similarity('man', 'Python')

0.03549298

In [15]:
# Ask the model which word is the odd one out among two programming languages and an animal.
odd_one_out_candidates = ['PHP', 'Java', 'monkey']
PreTrainedModel.doesnt_match(odd_one_out_candidates)

'monkey'

In [16]:
# Word analogy by vector arithmetic: king - man + woman.
king_vec = PreTrainedModel['king']
man_vec = PreTrainedModel['man']
woman_vec = PreTrainedModel['woman']
vec = king_vec - man_vec + woman_vec

# The query is a raw vector rather than word keys, so the source word 'king' can itself
# appear among the neighbours; the analogy answer is the next entry down.
PreTrainedModel.most_similar(positive=[vec])

[('king', 0.844939112663269),
 ('queen', 0.7300516366958618),
 ('monarch', 0.6454660296440125),
 ('princess', 0.6156251430511475),
 ('crown_prince', 0.5818676352500916),
 ('prince', 0.577711820602417),
 ('kings', 0.5613663792610168),
 ('sultan', 0.5376777052879333),
 ('Queen_Consort', 0.5344247817993164),
 ('queens', 0.5289887189865112)]

In [17]:
# Currency analogy by vector arithmetic: INR - India + England.
inr_vec = PreTrainedModel['INR']
india_vec = PreTrainedModel['India']
england_vec = PreTrainedModel['England']
vec2 = inr_vec - india_vec + england_vec

# The query is a raw vector rather than word keys, so the source words ('INR', 'England')
# can themselves appear among the neighbours.
PreTrainedModel.most_similar(positive=[vec2])

[('INR', 0.6442341208457947),
 ('GBP', 0.5040826201438904),
 ('£_##.###m', 0.4540838301181793),
 ('England', 0.44649261236190796),
 ('£', 0.43341004848480225),
 ('Â_£', 0.4307197630405426),
 ('stg###', 0.42992621660232544),
 ('£_#.##m', 0.4256129860877991),
 ('Pounds_Sterling', 0.42512619495391846),
 ('GBP##', 0.42464491724967957)]