# Week 4 Linguistic Regularities in Word Embeddings
## Getting Started

In [1]:
from gensim.models import KeyedVectors
# Location of the word2vec vector file, given relative to the notebook's working directory.
# NOTE(review): presumably the pre-trained Google News 300-d vectors (per the filename) — confirm.
filename="../GoogleNews-vectors-negative300.bin"
# binary=True asks gensim to parse the file as binary word2vec data rather than text.
# Every later cell reads the vectors through the global `mymodel`.
mymodel = KeyedVectors.load_word2vec_format(filename,  binary=True)


In [2]:
# Similarity score the loaded model assigns to the word pair 'man' / 'woman'.
mymodel.similarity('man', 'woman')

0.76640123

In [15]:
# Ranked (word, score) neighbours of 'man'; no result count is passed, so the model's default applies.
mymodel.most_similar(positive=['man'])

[('woman', 0.7664012312889099),
 ('boy', 0.6824870109558105),
 ('teenager', 0.6586930751800537),
 ('teenage_girl', 0.6147903203964233),
 ('girl', 0.5921714305877686),
 ('suspected_purse_snatcher', 0.571636438369751),
 ('robber', 0.5585119128227234),
 ('Robbery_suspect', 0.5584409236907959),
 ('teen_ager', 0.5549196004867554),
 ('men', 0.5489763021469116)]

In [4]:
# Look up the embedding vector stored for the word 'man'.
mymodel['man']

array([ 0.32617188,  0.13085938,  0.03466797, -0.08300781,  0.08984375,
       -0.04125977, -0.19824219,  0.00689697,  0.14355469,  0.0019455 ,
        0.02880859, -0.25      , -0.08398438, -0.15136719, -0.10205078,
        0.04077148, -0.09765625,  0.05932617,  0.02978516, -0.10058594,
       -0.13085938,  0.001297  ,  0.02612305, -0.27148438,  0.06396484,
       -0.19140625, -0.078125  ,  0.25976562,  0.375     , -0.04541016,
        0.16210938,  0.13671875, -0.06396484, -0.02062988, -0.09667969,
        0.25390625,  0.24804688, -0.12695312,  0.07177734,  0.3203125 ,
        0.03149414, -0.03857422,  0.21191406, -0.00811768,  0.22265625,
       -0.13476562, -0.07617188,  0.01049805, -0.05175781,  0.03808594,
       -0.13378906,  0.125     ,  0.0559082 , -0.18261719,  0.08154297,
       -0.08447266, -0.07763672, -0.04345703,  0.08105469, -0.01092529,
        0.17480469,  0.30664062, -0.04321289, -0.01416016,  0.09082031,
       -0.00927734, -0.03442383, -0.11523438,  0.12451172, -0.02

In [6]:
# Analogy query: 'London' is to 'England' as ? is to 'China'
# ('China' and 'London' are added, 'England' is subtracted).
mymodel.most_similar(positive=['China', 'London'], negative=['England'])

[('Beijing', 0.6737731695175171),
 ('Shanghai', 0.646628737449646),
 ('Beijng', 0.5856549739837646),
 ('Hong_Kong', 0.5709935426712036),
 ('Chinese', 0.5639771223068237),
 ('Guangdong', 0.5119545459747314),
 ('Shenzhen', 0.5102902054786682),
 ('Yanqi', 0.5076326727867126),
 ('Nanjing', 0.505686342716217),
 ('Guangzhou', 0.5043155550956726)]

In [5]:
import json
# Load the analogy test set. The decoded object is a dict keyed by relation name
# (e.g. 'gram3-comparative'), each value being a list of two-word pairs.
# The encoding is given explicitly: without it open() falls back to the
# platform's locale encoding, which can mis-decode a UTF-8 JSON file.
with open('relations.json', 'r', encoding='utf-8') as fp:
    testtuples = json.load(fp)
testtuples


{'gram3-comparative': [['bad', 'worse'],
  ['big', 'bigger'],
  ['bright', 'brighter'],
  ['cheap', 'cheaper'],
  ['cold', 'colder'],
  ['cool', 'cooler'],
  ['deep', 'deeper'],
  ['easy', 'easier'],
  ['fast', 'faster'],
  ['good', 'better'],
  ['great', 'greater'],
  ['hard', 'harder'],
  ['heavy', 'heavier'],
  ['high', 'higher'],
  ['hot', 'hotter'],
  ['large', 'larger'],
  ['long', 'longer'],
  ['loud', 'louder'],
  ['low', 'lower'],
  ['new', 'newer'],
  ['old', 'older'],
  ['quick', 'quicker'],
  ['safe', 'safer'],
  ['sharp', 'sharper'],
  ['short', 'shorter'],
  ['simple', 'simpler'],
  ['slow', 'slower'],
  ['small', 'smaller'],
  ['smart', 'smarter'],
  ['strong', 'stronger'],
  ['tall', 'taller'],
  ['tight', 'tighter'],
  ['tough', 'tougher'],
  ['warm', 'warmer'],
  ['weak', 'weaker'],
  ['wide', 'wider'],
  ['young', 'younger']],
 'gram8-plural': [['banana', 'bananas'],
  ['bird', 'birds'],
  ['bottle', 'bottles'],
  ['building', 'buildings'],
  ['car', 'cars'],
  ['cat

## 2.1 Predicting capital cities with vector analogies


In [24]:
def predict_capital(training_pair, countries):
    """Guess each country's capital by analogy with one known example pair.

    Args:
        training_pair: Sequence whose item 0 is the example capital and whose
            item 1 is the example country.
        countries: Iterable of country names to query.

    Returns:
        A list of ``[predicted_word, country]`` lists in input order, where
        ``predicted_word`` is the top-ranked word returned by
        ``mymodel.most_similar``. The example country itself is skipped.
    """
    example_city = training_pair[0]
    example_country = training_pair[1]
    predictions = []
    for candidate in countries:
        if candidate == example_country:
            # Never query the country the example pair came from.
            continue
        ranked = mymodel.most_similar(positive=[candidate, example_city],
                                      negative=[example_country])
        # ranked[0] is the best (word, score) entry; keep only the word.
        predictions.append([ranked[0][0], candidate])
    return predictions

In [17]:
# Entries of the 'capital-common-countries' relation; item 1 of each entry is
# taken as the country name.
countries_tuples = testtuples['capital-common-countries']
countries = [pair[1] for pair in countries_tuples]
countries

['Greece',
 'Iraq',
 'Thailand',
 'China',
 'Germany',
 'Switzerland',
 'Egypt',
 'Australia',
 'Vietnam',
 'Cuba',
 'Finland',
 'Pakistan',
 'Afghanistan',
 'England',
 'Spain',
 'Russia',
 'Norway',
 'Canada',
 'France',
 'Italy',
 'Sweden',
 'Iran',
 'Japan']

In [21]:
# Single example pair that defines the analogy.
training_tuple = ['London', 'England']
# Hold the example pair out so it is not also used for evaluation.
test_countries_tuples = [pair for pair in countries_tuples if not pair == training_tuple]
test_countries_tuples

[['Athens', 'Greece'],
 ['Baghdad', 'Iraq'],
 ['Bangkok', 'Thailand'],
 ['Beijing', 'China'],
 ['Berlin', 'Germany'],
 ['Bern', 'Switzerland'],
 ['Cairo', 'Egypt'],
 ['Canberra', 'Australia'],
 ['Hanoi', 'Vietnam'],
 ['Havana', 'Cuba'],
 ['Helsinki', 'Finland'],
 ['Islamabad', 'Pakistan'],
 ['Kabul', 'Afghanistan'],
 ['Madrid', 'Spain'],
 ['Moscow', 'Russia'],
 ['Oslo', 'Norway'],
 ['Ottawa', 'Canada'],
 ['Paris', 'France'],
 ['Rome', 'Italy'],
 ['Stockholm', 'Sweden'],
 ['Tehran', 'Iran'],
 ['Tokyo', 'Japan']]

In [22]:
# Country names (item 1 of each held-out pair) to query the model with.
test_countries = [pair[1] for pair in test_countries_tuples]

In [25]:
# Predict a capital for every held-out country from the single example pair.
predicted_capitals = predict_capital(training_pair=training_tuple,
                                     countries=test_countries)
predicted_capitals



[['Athens', 'Greece'],
 ['Baghdad', 'Iraq'],
 ['Bangkok', 'Thailand'],
 ['Beijing', 'China'],
 ['Berlin', 'Germany'],
 ['Zurich', 'Switzerland'],
 ['Cairo', 'Egypt'],
 ['Sydney', 'Australia'],
 ['Hanoi', 'Vietnam'],
 ['Havana', 'Cuba'],
 ['Helsinki', 'Finland'],
 ['Islamabad', 'Pakistan'],
 ['Kabul', 'Afghanistan'],
 ['Madrid', 'Spain'],
 ['Moscow', 'Russia'],
 ['Oslo', 'Norway'],
 ['Toronto', 'Canada'],
 ['Paris', 'France'],
 ['Milan', 'Italy'],
 ['Stockholm', 'Sweden'],
 ['Tehran', 'Iran'],
 ['Tokyo', 'Japan']]

In [29]:
# Score the predictions against the reference pairs.
# The counter is deliberately NOT called `sum`: binding that name at notebook
# level would replace the builtin sum() for every cell run afterwards.
num_correct = 0
for pred, ans in zip(predicted_capitals, test_countries_tuples):
    if pred == ans:
        num_correct += 1
    else:
        # Show each miss as: predicted pair, then expected pair.
        print(pred, ans)
# Accuracy = fraction of predictions that exactly match their reference pair.
acc = num_correct / len(predicted_capitals)
acc

['Zurich', 'Switzerland'] ['Bern', 'Switzerland']
['Sydney', 'Australia'] ['Canberra', 'Australia']
['Toronto', 'Canada'] ['Ottawa', 'Canada']
['Milan', 'Italy'] ['Rome', 'Italy']


0.8181818181818182

In [54]:
# Same analogy query for 'Italy', but with two example pairs at once:
# both example capitals ('London', 'Beijing') are added and both example
# countries ('England', 'China') are subtracted.
mymodel.most_similar(positive=['Italy', 'London', 'Beijing'], negative=['England', 'China'])


[('Rome', 0.5747864246368408),
 ('Paris', 0.5640692710876465),
 ('Milan', 0.5366122722625732),
 ('Budapest', 0.5282354354858398),
 ('Brussels', 0.5023858547210693),
 ('Turin', 0.49624890089035034),
 ('Turin_Italy', 0.4889766573905945),
 ('Bucharest', 0.47618451714515686),
 ('Bologna_Italy', 0.4703238010406494),
 ('Via_Veneto', 0.47025877237319946)]

In [58]:
def predict_capital_improved(training_pairs, countries, verbose=True):
    """Guess each country's capital by analogy with several known example pairs.

    Args:
        training_pairs: Iterable of pairs whose item 0 is an example capital
            and whose item 1 is the matching example country.
        countries: List of country names to query.
        verbose: When true (the default, matching the previous behaviour),
            print the example capitals, the example countries and the queried
            countries before predicting. Pass ``False`` to keep the function
            silent.

    Returns:
        A list of ``[predicted_word, country]`` lists in input order, where
        ``predicted_word`` is the top-ranked word returned by
        ``mymodel.most_similar``. Countries that appear among the example
        countries are skipped.
    """
    training_cities = [training_pair[0] for training_pair in training_pairs]
    training_countries = [training_pair[1] for training_pair in training_pairs]
    if verbose:
        # Diagnostic echo of the inputs; optional so that callers can use the
        # function without side effects on stdout.
        print(training_cities)
        print(training_countries)
        print(countries)
    # NOTE(review): every example capital is added and every example country is
    # subtracted in a single query, so the example offsets are combined rather
    # than explicitly averaged here — confirm this is the intended combination.
    return [[mymodel.most_similar(positive=[country] + training_cities, negative=training_countries)[0][0], country]
            for country in countries if country not in training_countries]

In [33]:
# Two example pairs that jointly define the analogy.
training_tuples = [['London', 'England'], ['Beijing', 'China']]
# Hold both example pairs out of the evaluation set.
test_countries_tuples = [pair for pair in countries_tuples if not (pair in training_tuples)]
test_countries_tuples

[['Athens', 'Greece'],
 ['Baghdad', 'Iraq'],
 ['Bangkok', 'Thailand'],
 ['Berlin', 'Germany'],
 ['Bern', 'Switzerland'],
 ['Cairo', 'Egypt'],
 ['Canberra', 'Australia'],
 ['Hanoi', 'Vietnam'],
 ['Havana', 'Cuba'],
 ['Helsinki', 'Finland'],
 ['Islamabad', 'Pakistan'],
 ['Kabul', 'Afghanistan'],
 ['Madrid', 'Spain'],
 ['Moscow', 'Russia'],
 ['Oslo', 'Norway'],
 ['Ottawa', 'Canada'],
 ['Paris', 'France'],
 ['Rome', 'Italy'],
 ['Stockholm', 'Sweden'],
 ['Tehran', 'Iran'],
 ['Tokyo', 'Japan']]

In [38]:
# Country names (item 1 of each pair) to query, skipping any pair that is one
# of the example pairs.
test_countries = [pair[1] for pair in test_countries_tuples if pair not in training_tuples]
test_countries

['Greece',
 'Iraq',
 'Thailand',
 'Germany',
 'Switzerland',
 'Egypt',
 'Australia',
 'Vietnam',
 'Cuba',
 'Finland',
 'Pakistan',
 'Afghanistan',
 'Spain',
 'Russia',
 'Norway',
 'Canada',
 'France',
 'Italy',
 'Sweden',
 'Iran',
 'Japan']

In [59]:
# Predict a capital for every held-out country from both example pairs.
predicted_capitals = predict_capital_improved(training_pairs=training_tuples,
                                              countries=test_countries)
predicted_capitals


['London', 'Beijing']
['England', 'China']
['Greece', 'Iraq', 'Thailand', 'Germany', 'Switzerland', 'Egypt', 'Australia', 'Vietnam', 'Cuba', 'Finland', 'Pakistan', 'Afghanistan', 'Spain', 'Russia', 'Norway', 'Canada', 'France', 'Italy', 'Sweden', 'Iran', 'Japan']


[['Athens', 'Greece'],
 ['Baghdad', 'Iraq'],
 ['Bangkok', 'Thailand'],
 ['Berlin', 'Germany'],
 ['Zurich', 'Switzerland'],
 ['Cairo', 'Egypt'],
 ['Sydney', 'Australia'],
 ['Hanoi', 'Vietnam'],
 ['Havana', 'Cuba'],
 ['Helsinki', 'Finland'],
 ['Islamabad', 'Pakistan'],
 ['Kabul', 'Afghanistan'],
 ['Madrid', 'Spain'],
 ['Moscow', 'Russia'],
 ['Stockholm', 'Norway'],
 ['Toronto', 'Canada'],
 ['Paris', 'France'],
 ['Rome', 'Italy'],
 ['Stockholm', 'Sweden'],
 ['Tehran', 'Iran'],
 ['Tokyo', 'Japan']]