In [1]:
import requests
import pyphi
import pandas as pd
from os import listdir
from os.path import isfile, join
import csv

# Root directory of the test datasets; each language pair has its own
# sub-directory underneath it.
# NOTE(review): absolute, machine-specific path -- consider making it configurable.
target_dir = "/home/jupyter/notebooks/PoC/data-preparation/output/understanding_data/test_datasets/"

def predict(pair_left, pair_right):
    """Ask the local similarity service to score a pair of strings.

    Sends a GET request to the ``/predict`` endpoint on 127.0.0.1:8009 with
    ``pair_left`` and ``pair_right`` as query parameters.

    Args:
        pair_left: First string of the pair.
        pair_right: Second string of the pair.

    Returns:
        The response body parsed as a float, or 0 when the body contains the
        substring 'error'.

    Raises:
        ValueError: If the body carries no 'error' marker but is not a number.
    """
    # Hand the values to requests via `params` so each one is percent-encoded.
    # Gluing them into the URL by hand lets characters such as '&', '#', '+'
    # or '=' inside a name silently change the query that the service sees.
    response = requests.get(
        'http://127.0.0.1:8009/predict',
        params={'pair_left': pair_left, 'pair_right': pair_right},
    )
    if 'error' in response.text:
        # Best effort: a pair the service cannot score counts as "no similarity".
        return 0
    return float(response.text)

def test_datasets(language_pair):
    """Score every dataset file of one language pair and print its accuracy.

    Each regular file in ``target_dir/<language_pair>/`` is read as a
    headerless tab-separated table. Columns 1 and 2 of every row are sent to
    ``predict`` and the scores are stored in a new ``similarity`` column. The
    table, including that column, is then written back over the same file.

    Accuracy is the share of rows on the expected side of the 0.5 threshold:
    below it for files whose name starts with 'neg', at or above it for files
    whose name starts with 'pos'. Any other file is reported with accuracy 0.

    Args:
        language_pair: Name of the sub-directory of ``target_dir`` to process.
    """
    path = join(target_dir, language_pair)
    for file in listdir(path):
        file_path = join(path, file)
        if not isfile(file_path):
            continue

        print('processing file')
        df = pd.read_csv(file_path, sep='\t', header=None)

        # Build the whole column in one assignment. Creating an int column and
        # then writing floats into it row by row through .loc is slow and
        # triggers a dtype-upcast warning on newer pandas versions.
        df['similarity'] = [
            predict(pair_left, pair_right)
            for pair_left, pair_right in zip(df[1], df[2])
        ]

        # NOTE(review): this overwrites the input file, so every re-run appends
        # one more column to it. Consider writing to a separate output file.
        df.to_csv(file_path, sep='\t', quoting=csv.QUOTE_NONE, mode='w',
                  header=False, index=False)

        # NOTE(review): predict() returns 0 for pairs the service could not
        # score, so such rows count as correct negatives below.
        accuracy = 0
        if file.startswith('neg'):
            below_threshold = int((df['similarity'] < 0.5).sum())
            accuracy = below_threshold / len(df)
        elif file.startswith('pos'):
            # '>=' makes this the exact complement of the negative test, so a
            # score of exactly 0.5 is no longer wrong for both kinds of file.
            at_or_above_threshold = int((df['similarity'] >= 0.5).sum())
            accuracy = at_or_above_threshold / len(df)
        print('accuracy of file: {file} is {acc}'.format(file=file, acc=accuracy))

In [None]:
%%time
# Score every dataset file of the 'eng_arb' language pair; the files are
# rewritten in place with the similarity column added.
test_datasets('eng_arb')

In [2]:
# Score two Arabic spellings that differ only in their first letter (ا vs أ).
similarity = predict('احمد','أحمد')

In [3]:
similarity

0.9752248767763376

In [None]:
%%time
from multiprocessing.dummy import Pool as ThreadPool

# Process the three language pairs concurrently, one worker thread each.
# The `with` block shuts the pool down once map() has returned; without it
# the worker threads stay alive for the rest of the kernel session.
# test_datasets() has no return statement, so `results` is a list of None;
# the accuracies themselves are printed.
with ThreadPool(3) as pool:
    results = pool.map(test_datasets, ['eng_eng', 'eng_arb', 'arb_arb'])

In [2]:
# Load the 'pos_eng_arb_4x4' dataset as a headerless tab-separated table.
pos_eng_arb_file = target_dir + 'eng_arb/pos_eng_arb_4x4.tsv'
df = pd.read_csv(pos_eng_arb_file, header=None, sep='\t')

In [3]:
df

Unnamed: 0,0,1,2
0,1,MARAM NAOMI ANNABELLA ASALA,مرآم نَؤمي انّابيلّا أصاله
1,1,MARAM NAOMI ANNABELLA ASALA,مرآم نَؤمي انّابيلّا أصالة
2,1,MARAM NAOMI ANNABELLA ASALA,مرآم نَؤمي أنابيلا أصاله
3,1,MARAM NAOMI ANNABELLA ASALA,مرآم نَؤمي أنابيلا أصالة
4,1,MARAM NAOMI ANNABELLA ASALA,مرآم نَؤمي أنّابيله أصاله
5,1,MARAM NAOMI ANNABELLA ASALA,مرآم نَؤمي أنّابيله أصالة
6,1,MARAM NAOMI ANNABELLA ASALA,مرآم نعومي انّابيلّا أصاله
7,1,MARAM NAOMI ANNABELLA ASALA,مرآم نعومي انّابيلّا أصالة
8,1,MARAM NAOMI ANNABELLA ASALA,مرآم نعومي أنابيلا أصاله
9,1,MARAM NAOMI ANNABELLA ASALA,مرآم نعومي أنابيلا أصالة


In [5]:
%%time
# Score a random 7% sample of the loaded pairs.
# A fixed seed makes the sample reproducible across kernel restarts; shuffling
# the whole frame before sampling adds nothing, because sample() already draws
# rows at random.
# NOTE: this rebinds `df`, so re-running the cell samples from the previous
# sample. Reload the file first to start again from the full dataset.
df = df.sample(frac=0.07, random_state=42)
set_len = len(df)

# Start with a float column so storing float scores needs no dtype upcast.
df['similarity'] = 0.0
for i, (index, row) in enumerate(df.iterrows(), start=1):
    if i % 1000 == 0:
        print("processed {i} out of {t}".format(i=i, t=set_len))
    pair_left = row[1].lower()
    pair_right = row[2].lower()
    # NOTE(review): the arguments are passed in (right, left) order, the
    # reverse of test_datasets(). Confirm the service is symmetric or that
    # this order is intended.
    df.loc[index, 'similarity'] = predict(pair_right, pair_left)

CPU times: user 1.77 s, sys: 140 ms, total: 1.91 s
Wall time: 6.86 s


In [4]:
# Score every loaded pair. Scores are written row by row, so an interrupted
# run keeps what it has computed so far.
# Start with a float column so storing float scores needs no dtype upcast.
df['similarity'] = 0.0
for index, row in df.iterrows():
    pair_left = row[1].lower()
    pair_right = row[2].lower()
    # Compare the two sides of the pair. Passing pair_right for both arguments
    # scored each value against itself and never used pair_left.
    df.loc[index, 'similarity'] = predict(pair_left, pair_right)

KeyboardInterrupt: 

In [7]:
# Accuracy over the pairs that received a non-zero score: the share of them
# whose similarity is at least 0.5.
scored = df.loc[df['similarity'].gt(0)]
true_pos = df.loc[df['similarity'].ge(0.5)]
accuracy = len(true_pos) / len(scored)

In [8]:
accuracy

0.9608585858585859

In [21]:
# Show the pairs whose score fell below the 0.5 threshold.
df.loc[df['similarity'].lt(0.5)]

Unnamed: 0,0,1,2,similarity
9458,1,MICHAEL SOAD ISSAC EMMA,ميكائيل سُعاد إسحاق إيما,0.493831
9698,1,MIKHAEL SUAD ISSAC EMMA,ميخائيل سُعاد إسحاق إيما,0.493764
10050,1,MICHAIL SOAD ISAAK EMMA,ميخائيل سُعاد إسحاق إيما,0.493764
9650,1,MICHAEL SOAAD ISSAC EMMA,ميكائيل سُعاد إسحاق إيما,0.493831
9539,1,MICHAEL SUAAD ISAAK EMMA,ميخائيل سُعاد إسحاق ايمّا,0.484064
10129,1,MICHAIL SO'AD ISAAK EMMA,ميكائيل سُعاد إسحاق ايمّه,0.483813
9925,1,MIKHAEL SOAAD ISAAK EMMA,ميخائيل سُعاد اسحاق ايمّه,0.491376
10277,1,MICHAIL SOAAD ISSAC EMMA,ميخائيل سُعاد اسحاق ايمّه,0.491376
10599,1,MIKAEL SOAAD ISSAC EMMA,ميخائيل سُعاد اسحاق ايمّا,0.491411
9831,1,MIKHAEL SO'AD ISSAC EMMA,ميخائيل سُعاد اسحاق ايمّا,0.491411


In [None]:
from multiprocessing.dummy import Pool as ThreadPool

# NOTE(review): `my_function` and `my_array` are not defined in the visible
# cells, so this looks like a leftover template. Define them or delete the cell.
# The `with` block shuts the pool down once map() has returned.
with ThreadPool(4) as pool:
    results = pool.map(my_function, my_array)
