In [1]:
from nltk.tokenize import RegexpTokenizer
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from gensim.models import Word2Vec
from gensim.models import KeyedVectors

import os
import collections
import itertools

In [2]:
def preprocess(text):
    
    lemmatizer = WordNetLemmatizer()
    tokenizer = RegexpTokenizer('\w+')
    
    # lower case and remove leading/trailing spaces
    text = text.lower().strip()
    
    tokens = []
    
    for token in text.split():
        # skip links, user handles and stopwords
        if token.startswith('https://') \
        or token.startswith('@') \
        or token in stopwords.words('english'):
            continue
            
        tokens.append(token)
    
    # tokenize the text
    tokens = tokenizer.tokenize(' '.join(tokens))

    # lemmatize
    tokens = [lemmatizer.lemmatize(token) for token in tokens]
    
    return tokens

In [3]:
def load_data():
    path = '../tweets/en/all.csv'
    
    tweets = []

    with open(path, 'r', newline='\r\n') as f:
        for i, line in enumerate(f.readlines()):
            n, tweet = line.split(',', maxsplit=1)
            tokens = preprocess(tweet)

            tweets.append(tokens)
            
    return tweets

In [4]:
# tweets = load_data()

In [5]:
model_path = '../models/en/'
model_name_template = 'model-sg={}-size={}-window={}-mincount={}-downsampling=no-{}'

In [6]:
architectures = [0, 1] # CBOW or skip-gram

# embedding output dimension
sizes = [50, 100, 200]

# it is mentioned on slides that for CBOW window is usually 5 and
# for skip-gram 10 so let's try some values around those numbers
#
# access using windows[sg]
windows = [[2, 5, 8], [5, 10, 15]]

min_counts = [5, 50, 100]


params = {
    'architectures' : architectures,
    'sizes' : sizes,
    'windows' : windows,
    'min_counts' : min_counts
}

In [7]:
def get_models(params, n=1, force_train=False):

    # boolean indicating whether to train or load from file
    # NOTE: this is a poor check
    train = not os.listdir(model_path) or force_train
    
    # list of tuples [(params, model), ...]
    models = []
    
    for sg in params['architectures']:
        for size in params['sizes']:
            for window in params['windows'][sg]:
                for min_count in params['min_counts']:
                        
                    param_tuple = (sg, size, window, min_count)

                    if train:
                        model = Word2Vec(sentences=tweets, size=size, window=window, 
                                         min_count=min_count, sg=sg)
                        model.save(model_path + \
                                   model_name_template.format(sg, size, window, min_count, n))
                    else:
                        model = Word2Vec.load(model_path + \
                                              model_name_template.format(sg, size, window, min_count, n))

                    models.append((param_tuple, model))
    return models

In [8]:
president_country = [
    ('russia', 'putin'),
    ('america', 'trump'),
    ('britain', 'may'),
    ('syria', 'assad'),
    ('germany', 'merkel'),
    ('france', 'macron'),
    ('japan', 'abe'),
    ('turkey', 'erdogan')
]

combinations_pres_country = [pair for pair in itertools.combinations(president_country, r=2)]

In [34]:
def evaluateModel(m, debug=0):
    score = 0
    count = 0
    for equation in combinations_pres_country:
        
        a = equation[0][0]
        b = equation[1][1]
        c = equation[0][1]
        d = equation[1][0]
        
        if a in m.wv.vocab and b in m.wv.vocab and c in m.wv.vocab:
            for i, x in enumerate(m.wv.most_similar(positive=[a, b], negative=[c])):
                if d == x[0]:
                    score += 1.0/(i + 1)
                    count += 1
                    if debug == 1:
                        print("Found %s for relation %s -> %s and %s -> ? in position %d"%(d, c, a, b, i + 1))
        
        if c in m.wv.vocab and d in m.wv.vocab and b in m.wv.vocab:
            for i, x in enumerate(m.wv.most_similar(positive=[c, d], negative=[b])):
                if a == x[0]:
                    score += 1.0/(i + 1)
                    count += 1
                    if debug == 1:
                        print("Found %s for relation %s -> %s and %s -> ? in position %d"%(a, b, d, c, i + 1))

        if a in m.wv.vocab and c in m.wv.vocab and d in m.wv.vocab:
            for i, x in enumerate(m.wv.most_similar(positive=[c, d], negative=[a])):
                if b == x[0]:
                    score += 1.0/(i + 1)
                    count += 1
                    if debug == 1:
                        print("Found %s for relation %s -> %s and %s -> ? in position %d"%(b, a, c, d, i + 1))
        
        if a in m.wv.vocab and b in m.wv.vocab and d in m.wv.vocab:
            for i, x in enumerate(m.wv.most_similar(positive=[a, b], negative=[d])):
                if c == x[0]:
                    score += 1.0/(i + 1)
                    count += 1
                    if debug == 1:
                        print("Found %s for relation %s -> %s and %s -> ? in position %d"%(c, d, b, a, i + 1))
                    
    return score, count

In [10]:
n_train = 4

modelss = []

for n in range(n_train):
    print('n={}...'.format(n), end=' ')
    models = get_models(params, n, force_train=False)
    modelss.append(models)
    print('...done!')

n=0... ...done!
n=1... ...done!
n=2... ...done!
n=3... ...done!


In [11]:
print(len(modelss[0]))

54


In [40]:
scores = {}

# get mean score of n_train models
for model_list in modelss:
    for param_tuple, model in model_list:
        score, count = evaluateModel(model)
        
        score_ = (1 / n_train) * score
        count_ = (1 / n_train) * count
        
        
        if param_tuple in scores.keys():
            old_score, old_count = scores[param_tuple]
            scores[param_tuple] = (old_score + score_, old_count + count_)
        else:
            scores[param_tuple] = (score_, count_)

In [41]:
scores_sorted = sorted(scores.items(), key=lambda x: x[1], reverse=True)

print('best embeddings:')
for a, b in scores_sorted[:5]:
    print(a, b)
    
print()

print('worst embeddings:')
for a, b in scores_sorted[-5:]:
    print(a, b)

best embeddings:
(1, 50, 15, 50) (40.30357142857143, 63.25)
(1, 50, 10, 50) (35.026984126984125, 62.75)
(1, 50, 15, 5) (28.337599206349207, 45.0)
(1, 100, 15, 50) (26.975297619047616, 49.75)
(1, 50, 10, 5) (26.80297619047619, 38.75)

worst embeddings:
(0, 50, 2, 5) (1.9229166666666666, 7.5)
(0, 200, 5, 100) (1.888690476190476, 6.75)
(0, 200, 2, 100) (1.8464285714285715, 6.5)
(0, 100, 2, 100) (1.6033730158730157, 6.25)
(0, 50, 2, 100) (0.6464285714285714, 3.75)


In [15]:
def paramstuple2model(params):
    sg, size, window, min_count = params
    
    if sg == 0:
        windows = [[window],[]]
    else:
        windows = [[], [window]]

    params_dict = dict(architectures=[sg], sizes=[size], windows=windows, min_counts=[min_count])
    
    return get_models(params_dict)


In [16]:
best_model = paramstuple2model(scores_sorted[0][0])[0][1]
s1, s2 = evaluateModel(best_model, 1)

s1 + s2 # total score

Found russia for relation trump -> america and putin -> ? in position 9
Found russia for relation may -> britain and putin -> ? in position 4
Found putin for relation britain -> may and russia -> ? in position 7
Found russia for relation assad -> syria and putin -> ? in position 1
Found assad for relation russia -> putin and syria -> ? in position 10
Found germany for relation putin -> russia and merkel -> ? in position 4
Found russia for relation merkel -> germany and putin -> ? in position 1
Found putin for relation germany -> merkel and russia -> ? in position 3
Found france for relation putin -> russia and macron -> ? in position 1
Found russia for relation macron -> france and putin -> ? in position 1
Found macron for relation russia -> putin and france -> ? in position 1
Found putin for relation france -> macron and russia -> ? in position 1
Found japan for relation putin -> russia and abe -> ? in position 1
Found russia for relation abe -> japan and putin -> ? in position 1
Foun

39.80714285714285

In [18]:
keywords = ['syria', 'damascus', 'assad', 'politics', 'putin', 'trump', 'russia', 'usa', 'skripal', 'uk']

In [19]:
# best model according to president-country metric
best_models = [paramstuple2model(params)[0][1] for params, score in scores_sorted[:3]]

for i, model in enumerate(best_models):
    
    xs = [[word for word, _ in model.wv.most_similar(keyword)] for keyword in keywords]

    for a in (list(zip(*xs))):
        print(a)
        
    print('----------------------------')

('west', 'capital', 'syrian', 'religion', 'russia', 'republican', 'russian', '99', 'bz', 'france')
('syriaairstrikes', 'rubble', 'punish', 'political', 'buddy', 'one', 'putin', 'mexico', 'swiss', 'britain')
('syriastrike', 'raw', 'syria', 'age', 'again', 'impeachment', 'kremlin', '8', 'salisbury', 'theresamay')
('punish', 'square', 'civilian', 'relevant', 'russian', 'potus', 'u', 'com', 'poisoning', 'accuses')
('assad', 'yesterday', 'weapon', 'idea', 'vlad', 'president', 'coming', 'free', 'produced', 'k')
('u', 'night', 'chemical', 'platform', 'puppet', 'mueller', 'syria', 'hot', 'motive', 'fr')
('syrian', '14', 'basically', 'character', 'vladimir', 'democrat', 'diplomat', 'spain', 'poisoned', 'skripal')
('chemicalweapons', 'precise', 'attack', 'mix', 'kremlin', 'even', 'prepared', 'worldwide', 'uk', 'boris')
('vow', 'explosion', 'regime', 'college', 'would', 'him', 'prepare', 'drama', 'evidence', 'salisbury')
('global', 'playlist', 'butcher', 'topic', 'west', 'gop', 'ending', 'sport',

In [20]:
# worst models according to president-country metric
worst_models = [paramstuple2model(params)[0][1] for params, score in scores_sorted[-3:]]

for i, model in enumerate(worst_models):
    
    xs = [[word for word, _ in model.wv.most_similar(keyword)] for keyword in keywords]

    for a in (list(zip(*xs))):
        print(a)
        
    print('----------------------------')

('coalition', 'wall', 'ability', 'liberal', 'russian', 'realdonaldtrump', 'israel', 'home', 'produced', 'britain')
('diplomacy', 'explosion', 'change', 'folk', 'russia', 'losing', 'russian', 'mexico', 'nerve', 'k')
('planned', 'city', 'isi', 'respect', 'monster', 'obama', 'hezbollah', 'free', 'agent', 'england')
('condemn', 'empty', 'iranian', 'leftist', 'relationship', 'j', 'turkey', 'australia', 'deadly', 'ally')
('following', 'capital', 'islam', 'business', 'donald', 'elected', 'nuke', 'car', 'poisoning', 'germany')
('tn', 'building', 'however', 'crazy', 'enemy', 'graham', 'north', 'canada', 'alleged', 'london')
('response', 'area', 'brutal', 'game', 'ally', 'idiot', 'korea', 'service', 'suspected', 'intelligence')
('dropping', 'yesterday', 'him', 'amazing', 'israel', 'potus', 'economic', 'africa', 'confirmed', 'push')
('allied', 'ghouta', 'continue', 'ur', 'iran', 'him', 'phone', 'driver', 'cw', 'british')
('empty', 'future', 'dictator', 'political', 'master', 'joe', 'saudi', 'eu',

In [21]:
eval_path = '/work/courses/unix/T/ELEC/E5550/data/eval/analogical_reasoning_questions-words.txt'

In [22]:
combinations = []

with open(eval_path, 'r') as f:
    for line in f.readlines():
        line = line.strip()
        
        if line.startswith(':'):
            continue
        else:
            a, b, c, d = line.split()
            combinations.append(((a, b), (c, d)))
            
print(len(combinations))

19544


In [23]:
def evaluateModel2(m, combinations):
    found = 0
    
    for equation in combinations:
        
        a = equation[0][0]
        b = equation[1][1]
        c = equation[0][1]
        d = equation[1][0]
        
        if a in m.wv.vocab and b in m.wv.vocab and c in m.wv.vocab:
            for i, x in enumerate(m.wv.most_similar(positive=[a, b], negative=[c])):
                if d == x[0]:
                    found += 1
        
        if c in m.wv.vocab and d in m.wv.vocab and b in m.wv.vocab:
            for i, x in enumerate(m.wv.most_similar(positive=[c, d], negative=[b])):
                if a == x[0]:
                    found += 1

        if a in m.wv.vocab and c in m.wv.vocab and d in m.wv.vocab:
            for i, x in enumerate(m.wv.most_similar(positive=[c, d], negative=[a])):
                if b == x[0]:
                    found += 1
                    
        if a in m.wv.vocab and b in m.wv.vocab and d in m.wv.vocab:
            for i, x in enumerate(m.wv.most_similar(positive=[a, b], negative=[d])):
                if c == x[0]:
                    found += 1
                    
    return found

In [28]:
scores_eval = {}

# get mean score of n_train models
for model_list in modelss[:1]:
    for param_tuple, model in model_list:
        score = evaluateModel2(model, combinations)
            
        scores[param_tuple] = score

In [25]:
scores_eval_sorted = sorted(scores.items(), key=lambda x: x[1], reverse=True)
print(scores_eval_sorted)

[((1, 100, 5, 5), 1008), ((1, 200, 5, 5), 976), ((1, 50, 5, 5), 952), ((1, 200, 10, 50), 946), ((1, 100, 5, 50), 938), ((1, 200, 5, 50), 900), ((0, 200, 2, 5), 874), ((0, 50, 2, 5), 834), ((1, 100, 10, 50), 830), ((1, 50, 5, 50), 782), ((0, 100, 2, 5), 774), ((1, 50, 10, 5), 774), ((1, 200, 15, 50), 768), ((1, 50, 10, 50), 748), ((0, 200, 5, 5), 746), ((1, 100, 15, 50), 740), ((0, 50, 8, 5), 734), ((0, 50, 2, 50), 722), ((0, 100, 5, 5), 722), ((0, 100, 8, 5), 712), ((1, 50, 15, 50), 712), ((0, 50, 5, 5), 694), ((1, 100, 10, 5), 672), ((0, 200, 8, 5), 652), ((1, 50, 15, 5), 644), ((0, 200, 2, 50), 640), ((0, 100, 2, 50), 636), ((0, 200, 5, 50), 616), ((1, 200, 10, 5), 612), ((1, 100, 5, 100), 604), ((1, 200, 15, 5), 600), ((1, 200, 10, 100), 594), ((0, 100, 5, 50), 566), ((1, 100, 10, 100), 544), ((1, 100, 15, 5), 544), ((0, 50, 5, 50), 542), ((1, 200, 5, 100), 542), ((0, 100, 8, 50), 528), ((1, 50, 5, 100), 526), ((0, 50, 8, 50), 504), ((1, 200, 15, 100), 502), ((0, 100, 2, 100), 488),

In [26]:
best_model = paramstuple2model(scores_eval_sorted[0][0])[0][1]

xs = [[word for word, _ in best_model.wv.most_similar(keyword)] for keyword in keywords]

for a in (list(zip(*xs))):
    print(a)

('syriaairstrikes', 'capital', 'asad', 'religion', 'vlad', 'duh', 'kremlin', 'austria', 'novichok', 'britain')
('provoke', 'yesterday', 'inhumane', 'gender', 'blackmail', 'drumpf', 'affect', 'bangladesh', 'skripals', 'waging')
('039', 'city', 'killer', 'race', 'vendetta', 'bigly', 'directly', 'argentina', 'bz', 'britian')
('punish', 'gathered', 'assertion', 'appreciate', 'buddy', 'clown', 'trigger', 'denmark', 'toxin', 'france')
('assadmustgo', 'triple', 'assads', 'identity', 'chump', 'deflection', 'russian', 'america', 'substance', 'truthnation')
('unitedstates', 'saa', 'punish', 'perspective', 'vladimir', 'thehill', 'reutersus', 'bride', 'salisbury', 'k')
('reutersus', 'jolted', 'insist', 'opinion', 'puppet', 'wh', 'grows', 'brazil', 'swiss', 'stopthewar')
('middleeast', 'decry', 'asaad', 'feeling', 'betray', 'jeff', 'careful', 'sauce', 'poisoned', 'unitedkingdom')
('fukus', 'syrianstrike', 'further', 'culture', 'spineless', 'reviewed', 'diplomat', 'malaysia', 'irrefutable', 'fra')
(

In [33]:
best_model.wv.most_similar('cat')

[('mouse', 0.8952155113220215),
 ('pursuit', 0.7874118685722351),
 ('hunted', 0.7774225473403931),
 ('navy', 0.7462985515594482),
 ('submarine', 0.7173159718513489),
 ('royal', 0.6956762075424194),
 ('hub', 0.6869996190071106),
 ('sub', 0.6750925779342651),
 ('download', 0.6705496311187744),
 ('webcam', 0.6640481948852539)]

In [27]:
worst_model = paramstuple2model(scores_eval_sorted[-1][0])[0][1]

xs = [[word for word, _ in worst_model.wv.most_similar(keyword)] for keyword in keywords]

for a in (list(zip(*xs))):
    print(a)

('syrian', 'area', 'ability', 'self', 'russian', 'realdonaldtrump', 'russian', 'africa', 'nerve', 'britain')
('response', 'capital', 'syrian', 'crazy', 'russia', 'losing', 'israel', 'free', 'spy', 'k')
('syriastrike', 'near', 'change', 'folk', 'option', 'bos', 'korea', 'mexico', 'agent', 'england')
('ally', 'explosion', 'capability', 'working', 'again', 'gowdy', 'option', 'canada', 'confirmed', 'germany')
('nato', 'casualty', 'civilian', 'piece', 'buddy', 'potus', 'north', 'australia', 'poisoning', 'british')
('coalition', 'building', 'rebel', 'religion', 'promise', 'distract', 'threat', 'share', 'claim', 'poisoning')
('condemn', 'hit', 'also', 'liberal', 'backing', 'congressional', 'saudi', 'product', 'alleged', 'push')
('hezbollah', 'sent', 'response', 'mind', 'master', 'mob', 'nuke', 'sport', 'suspected', 'ally')
('syriacrisis', 'equipment', 'iranian', 'low', 'enemy', 'democrat', 'turkey', 'london', 'independent', 'recent')
('capability', 'empty', 'so', 'reading', 'everything', 'par