In [1]:
import os
import sys

import numpy as np
import pandas as pd
import torch
from sentence_transformers import SentenceTransformer

In [2]:
# Report the CUDA device torch will use. get_device_name() raises when no
# CUDA device is present, so fall back to a plain label instead of crashing.
device_name = (
    torch.cuda.get_device_name(0)
    if torch.cuda.is_available()
    else 'CPU (no CUDA device available)'
)
print('Device : ', device_name)

Device :  GeForce GTX 1060 6GB


In [2]:
# Instantiate a SentenceTransformer from the identifier 'stsb-roberta-base-v2';
# later cells call roberta.encode() on "<context><place name>" sentences.
roberta = SentenceTransformer('stsb-roberta-base-v2')

In [3]:
# Instantiate a SentenceTransformer from the identifier 'stsb-mpnet-base-v2';
# later cells call mpnet.encode() on "<context><place name>" sentences.
mpnet = SentenceTransformer('stsb-mpnet-base-v2')

In [4]:
# Instantiate a SentenceTransformer from the identifier 'stsb-bert-base';
# later cells call bert.encode() on "<context><place name>" sentences.
bert = SentenceTransformer('stsb-bert-base')

In [5]:
# Each short tag (used in the saved file names) is paired with the sentence
# prefix it stands for; the two parallel lists are unpacked from those pairs.
short_ctxts, contexts = map(list, zip(
    ('fav', 'My favorite city is '),
    ('lives', 'He lives in '),
    ('moved', 'She moved to '),
))

In [23]:
# Load the world-cities table, drop every row that has a missing value, remove
# the Region column, and keep only cities above 100000 inhabitants with a
# fresh 0..n-1 index.
cities = (
    pd.read_csv(
        './csv/worldcitiespop.csv',
        header=0,
        dtype={'AccentCity': 'str', 'Region': 'object'},
    )
    .dropna()
    .drop(columns='Region')
    .loc[lambda frame: frame.Population > 100000]
    .reset_index(drop=True)
)

In [14]:
# Display the cities DataFrame as the cell output.
cities

Unnamed: 0,Country,City,AccentCity,Population,Latitude,Longitude
0,ae,abu dhabi,Abu Dhabi,603687.0,24.466667,54.366667
1,ae,dubai,Dubai,1137376.0,25.258172,55.304717
2,ae,sharjah,Sharjah,543942.0,25.357310,55.403304
3,af,baglan,Baglan,108481.0,36.130684,68.708286
4,af,gardez,Gardez,103732.0,33.597439,69.225922
...,...,...,...,...,...,...
3522,zw,gweru,Gweru,201879.0,-19.450000,29.816667
3523,zw,harare,Harare,2213701.0,-17.817778,31.044722
3524,zw,kadoma,Kadoma,100276.0,-18.350000,29.916667
3525,zw,kwekwe,Kwekwe,116332.0,-18.916667,29.816667


In [24]:
# Embed "<context><city name>" for every city with all three models and save
# one (n_cities, 768) float64 array per (model, context) pair under embd_files/.
os.makedirs('embd_files', exist_ok=True)  # open(..., 'wb') fails if the directory is missing

chunk_size = 256  # sentences handed to encode() per call; also the progress granularity
city_names = cities['AccentCity'].tolist()
n_cities = len(city_names)

for i, ctxt in enumerate(contexts):
    bert_arr = np.empty((n_cities, 768))
    roberta_arr = np.empty((n_cities, 768))
    mpnet_arr = np.empty((n_cities, 768))

    for start in range(0, n_cities, chunk_size):
        stop = min(start + chunk_size, n_cities)
        # encode() takes a list of sentences and returns one row per sentence,
        # so a whole chunk is embedded per call instead of one city at a time.
        sentences = [ctxt + name for name in city_names[start:stop]]

        bert_arr[start:stop, :] = bert.encode(sentences)
        roberta_arr[start:stop, :] = roberta.encode(sentences)
        mpnet_arr[start:stop, :] = mpnet.encode(sentences)

        # The 4 is np.round's `decimals` argument, giving 4-decimal progress.
        print("\r context {} : {}%".format(i, np.round(stop / n_cities * 100, 4)), end="")
        sys.stdout.flush()

    for prefix, arr in (('bert_', bert_arr), ('roberta_', roberta_arr), ('mpnet_', mpnet_arr)):
        with open('embd_files/' + prefix + short_ctxts[i] + '.npy', 'wb') as f:
            np.save(file=f, arr=arr)

 context 2 : 100.0%

In [8]:
# Read the country-capitals table without the ContinentName and CountryCode
# columns, then show the resulting frame as the cell output.
capitals = (
    pd.read_csv('./csv/country-capitals.csv')
    .drop(columns=['ContinentName', 'CountryCode'])
)
capitals

Unnamed: 0,CountryName,CapitalName,Latitude,Longitude
0,Somaliland,Hargeisa,9.550000,44.050000
1,South Georgia and South Sandwich Islands,King Edward Point,-54.283333,-36.500000
2,French Southern and Antarctic Lands,Port-aux-Français,-49.350000,70.216667
3,Palestine,Jerusalem,31.766667,35.233333
4,Aland Islands,Mariehamn,60.116667,19.900000
...,...,...,...,...
237,Zimbabwe,Harare,-17.816667,31.033333
238,Northern Cyprus,North Nicosia,35.183333,33.366667
239,Hong Kong,Hong Kong,22.302711,114.177216
240,British Indian Ocean Territory,Diego Garcia,-7.300000,72.400000


In [10]:
# Embed "<context><capital name>" for every capital with all three models and
# save one (n_capitals, 768) float64 array per (model, context) pair under
# embd_files/.
os.makedirs('embd_files', exist_ok=True)  # open(..., 'wb') fails if the directory is missing

chunk_size = 256  # sentences handed to encode() per call; also the progress granularity
capital_names = capitals['CapitalName'].tolist()
n_capitals = len(capital_names)

for i, ctxt in enumerate(contexts):
    bert_arr = np.empty((n_capitals, 768))
    roberta_arr = np.empty((n_capitals, 768))
    mpnet_arr = np.empty((n_capitals, 768))

    for start in range(0, n_capitals, chunk_size):
        stop = min(start + chunk_size, n_capitals)
        # encode() takes a list of sentences and returns one row per sentence,
        # so a whole chunk is embedded per call instead of one capital at a time.
        sentences = [ctxt + name for name in capital_names[start:stop]]

        bert_arr[start:stop, :] = bert.encode(sentences)
        roberta_arr[start:stop, :] = roberta.encode(sentences)
        mpnet_arr[start:stop, :] = mpnet.encode(sentences)

        # The 4 is np.round's `decimals` argument, giving 4-decimal progress.
        print("\r context {} : {}%".format(i, np.round(stop / n_capitals * 100, 4)), end="")
        sys.stdout.flush()

    for prefix, arr in (('bert_capitals_', bert_arr),
                        ('roberta_capitals_', roberta_arr),
                        ('mpnet_capitals_', mpnet_arr)):
        with open('embd_files/' + prefix + short_ctxts[i] + '.npy', 'wb') as f:
            np.save(file=f, arr=arr)

 context 2 : 100.0%

In [6]:
# Read the countries table from CSV and display it as the cell output.
countries = pd.read_csv('./csv/countries.csv')
countries

Unnamed: 0,Name,Code,Latitude,Longitude
0,Afghanistan,AF,33.0000,65.0
1,Åland Islands,AX,60.1500,20.0
2,Albania,AL,41.0000,20.0
3,Algeria,DZ,28.0000,3.0
4,American Samoa,AS,-14.3333,-170.0
...,...,...,...,...
244,Wallis and Futuna,WF,-13.3000,-176.2
245,Western Sahara,EH,24.5000,-13.0
246,Yemen,YE,15.0000,48.0
247,Zambia,ZM,-15.0000,30.0


In [7]:
# Embed "<context><country name>" for every country with all three models and
# save one (n_countries, 768) float64 array per (model, context) pair under
# embd_files/.
os.makedirs('embd_files', exist_ok=True)  # open(..., 'wb') fails if the directory is missing

chunk_size = 256  # sentences handed to encode() per call; also the progress granularity
country_names = countries['Name'].tolist()
n_countries = len(country_names)

for i, ctxt in enumerate(contexts):
    bert_arr = np.empty((n_countries, 768))
    roberta_arr = np.empty((n_countries, 768))
    mpnet_arr = np.empty((n_countries, 768))

    for start in range(0, n_countries, chunk_size):
        stop = min(start + chunk_size, n_countries)
        # encode() takes a list of sentences and returns one row per sentence,
        # so a whole chunk is embedded per call instead of one country at a time.
        sentences = [ctxt + name for name in country_names[start:stop]]

        bert_arr[start:stop, :] = bert.encode(sentences)
        roberta_arr[start:stop, :] = roberta.encode(sentences)
        mpnet_arr[start:stop, :] = mpnet.encode(sentences)

        # The 4 is np.round's `decimals` argument, giving 4-decimal progress.
        print("\r context {} : {}%".format(i, np.round(stop / n_countries * 100, 4)), end="")
        sys.stdout.flush()

    for prefix, arr in (('bert_countries_', bert_arr),
                        ('roberta_countries_', roberta_arr),
                        ('mpnet_countries_', mpnet_arr)):
        with open('embd_files/' + prefix + short_ctxts[i] + '.npy', 'wb') as f:
            np.save(file=f, arr=arr)

 context 2 : 100.0%