In [116]:
import sys
import json
import numpy as np
import pandas as pd

def normalize(arr):
    norm = np.linalg.norm(arr)
    if norm == 0.0:
        return arr
    return np.divide(arr, norm)

In [148]:
title_agi_vector_list = []
title_set = set()
title_agi_vector_file = open("title_agi_vector_json.tsv", "r", encoding = "cp1252")
for line in title_agi_vector_file:
    try:
        parsed_result = json.loads(line.strip())[0]
        title = parsed_result['query']
        vector = normalize(parsed_result['vector'])
        if title not in title_set:
            title_agi_vector_list.append((title, vector))
            title_set.add(parsed_result['query'])
    except:
        continue

title_agi_vector_file.close()

title_df = pd.DataFrame(title_agi_vector_list, columns = ['Title', 'TitleAgiVector'])
title_df["JoinKey"] = 1
title_df.count()

Title             405
TitleAgiVector    405
JoinKey           405
dtype: int64

In [149]:
snippet_agi_vector_list = []
snippet_set = set()
snippet_agi_vector_file = open("snippet_agi_vector_json.tsv", "r", encoding = 'cp1252')
for line in snippet_agi_vector_file:
    try:
        parsed_result = json.loads(line.strip())[0]
        snippet = parsed_result['query']
        vector = normalize(parsed_result['vector'])
        if snippet not in snippet_set:
            snippet_agi_vector_list.append((snippet, vector))
            snippet_set.add(snippet)
    except:
        continue

snippet_agi_vector_file.close()

snippet_df = pd.DataFrame(snippet_agi_vector_list, columns = ["Snippet", "SnippetAgiVector"])
snippet_df["JoinKey"] = 1
snippet_df.count()

Snippet             386
SnippetAgiVector    386
JoinKey             386
dtype: int64

In [150]:
title_title_df = pd.merge(title_df, title_df, on = ["JoinKey"])
title_title_df = title_title_df.drop("JoinKey", axis=1)
title_title_df.count()

Title_x             164025
TitleAgiVector_x    164025
Title_y             164025
TitleAgiVector_y    164025
dtype: int64

In [183]:
# axis=0 or axis="index", apply functions to each column
# axis=1 or axis="column", apply functions to each row
title_title_df['Similarity1'] = title_title_df.apply(lambda r: np.dot(r['TitleAgiVector_x'], r['TitleAgiVector_y']), axis="columns")
title_title_df.head()

Unnamed: 0,Title_x,TitleAgiVector_x,Title_y,TitleAgiVector_y,Similarity,Similarity1
30856,阿里浏览器DNS解析加速,"[-0.03440448356489144, -0.12447976191659889, -...",阿里浏览器DNS解析加速,"[-0.03440448356489144, -0.12447976191659889, -...",1.0,1.0
30894,阿里浏览器DNS解析加速,"[-0.03440448356489144, -0.12447976191659889, -...",淘宝海量数据库的设计和实现,"[-0.05795493055917471, -0.10417753143380089, -...",0.979902,0.979902
30973,阿里浏览器DNS解析加速,"[-0.03440448356489144, -0.12447976191659889, -...",刘佳_出生证明公证办理委托书,"[-0.05795493055917471, -0.10417753143380089, -...",0.979902,0.979902
30835,阿里浏览器DNS解析加速,"[-0.03440448356489144, -0.12447976191659889, -...",贴吧大数据存储解决方案,"[-0.05553168131325587, -0.09946641756622951, -...",0.979322,0.979322
30843,阿里浏览器DNS解析加速,"[-0.03440448356489144, -0.12447976191659889, -...",工作6周年5个月小记,"[-0.053652287028072544, -0.09610203085849282, ...",0.977716,0.977716


In [151]:
for i, row in title_title_df.iterrows():
    #print(np.dot(list(map,float(row['TitleAgiVector_x'])), list(map(float(row['TitleAgiVector_y'])))))
    title_title_df.loc[i,"Similarity"] = np.dot(row['TitleAgiVector_x'], row['TitleAgiVector_y'])
    if i%10000==0:
        print(i)
title_title_df.head()

0
10000
20000
30000
40000
50000
60000
70000
80000
90000
100000
110000
120000
130000
140000
150000
160000


Unnamed: 0,Title_x,TitleAgiVector_x,Title_y,TitleAgiVector_y,Similarity
0,ML Model Scorecard analysis,"[-0.006709893352180249, -0.0199083593462621, 0...",ML Model Scorecard analysis,"[-0.006709893352180249, -0.0199083593462621, 0...",1.0
1,ML Model Scorecard analysis,"[-0.006709893352180249, -0.0199083593462621, 0...",2015-12-17 Core Web Relevance FY16H2 review,"[-0.13764570256399866, 0.10410324663081968, -0...",0.131971
2,ML Model Scorecard analysis,"[-0.006709893352180249, -0.0199083593462621, 0...",Disitributed Deep Learning-Tie-Yan Liu,"[0.14862133226677363, -0.20638940356011046, 0....",0.045864
3,ML Model Scorecard analysis,"[-0.006709893352180249, -0.0199083593462621, 0...",BW_MM_Rank,"[-0.10559122449749322, -0.001136281203092085, ...",0.270482
4,ML Model Scorecard analysis,"[-0.006709893352180249, -0.0199083593462621, 0...",Dynamic Ranking Overview2,"[0.02797839889548373, -0.014213222132077522, -...",0.137874


In [172]:
title_measurement_file = codecs.open("title_measurement.tsv", "w", encoding="utf-8")

title_title_df= title_title_df.sort_values(by = ['Title_x','Similarity'], ascending = [False,False])
title_title_df_top10 = title_title_df.groupby('Title_x')["Title_x", "Title_y", "Similarity"].head(10)

title_set = set()
for index, row in title_title_df_top10.iterrows():
    if row['Title_x'] not in title_set:
        title_set.add(row['Title_x'])
        title_measurement_file.write("\n{0}\t{1}\t{2}\n".format(row['Title_x'], row['Title_y'], row['Similarity']))
    else:
        title_measurement_file.write("{0}\t{1}\t{2}\n".format(row['Title_x'], row['Title_y'], row['Similarity']))
    if index%1000 == 0:
        print(index)

print("{0} titles got 10 similar titles".format(len(title_set)))
title_measurement_file.close()

87000
54000
96000
114000
0
1000
81000
143000
95000
405 titles got 10 similar titles


In [175]:
title_recall_file = codecs.open("title_recall_file.tsv", "w", encoding = "utf-8")
title_set = set()
for index, row in title_title_df_top10.iterrows():
    if float(row['Similarity']) >= 0.5 and row['Title_x'] != row['Title_y']:
        if row['Title_x'] not in title_set:
            title_set.add(row['Title_x'])
            title_recall_file.write("\n{0}\t{1}\t{2}\n".format(row['Title_x'], row['Title_y'], row['Similarity']))
        else:
            title_recall_file.write("{0}\t{1}\t{2}\n".format(row['Title_x'], row['Title_y'], row['Similarity']))
    if index % 10000:
        print(index)

print("{0} titles got titles with similarity >= 0.5".format(len(title_set)))
title_recall_file.close()

30856
30894
30973
30835
30843
30840
30909
30838
30896
30908
49938
50209
49929
50008
49891
49870
49878
49875
49944
49922
159964
159693
159684
159763
159646
159625
159701
159633
159955
159630
22330
22338
22389
22468
22335
22404
22333
22391
22403
22636
37758
37789
37790
37949
37780
37851
37782
37787
38055
37723
43442
43395
43464
43393
43451
43463
43696
43398
43457
43390
49532
49468
49526
49538
49771
49525
49596
49470
49539
49473
115144
115145
115304
115135
115206
115142
115078
115136
115148
115381
24360
24429
24363
24358
24416
24428
24661
24355
24422
24414
46284
46363
46225
46233
46230
46299
46228
46286
46298
46531
156310
156319
156056
156039
156118
155980
156001
155988
156048
155985
146263
146321
146333
146566
146265
146334
146327
146268
146320
146391
23548
23606
23618
23851
23550
23619
23612
23553
23605
23676
50344
50345
50504
50335
50406
50342
50278
50336
50348
50581
47038
47096
47108
47341
47040
47109
47102
47043
47095
47166
158340
158072
158065
158136
158008
158066
158078
158311
1580

118146
118196
118012
118071
118185
117975
118067
118069
118068
118253
11368
11460
11675
11723
11575
11518
11727
11618
11738
11341
145348
145027
145225
145362
145360
145265
145235
145388
145202
145252
151032
151064
150840
151038
150669
150982
150673
150671
150963
150841
88508
88633
88489
88453
88344
88662
88631
88410
88581
88506
86072
86073
86074
85939
86190
86151
86005
85912
86132
86095
86478
86477
86479
86344
86556
86410
86317
86595
86457
86537
14616
14763
14589
14631
14811
14591
14593
14814
14819
14810
110026
109993
109992
110098
109791
110152
109994
109806
110096
109975
96222
96223
96256
96382
96000
96125
96225
96328
96358
96224
96628
96627
96661
96787
96630
96733
96405
96426
96530
96763
97034
97035
97090
96872
96846
96837
96831
97015
97048
96987
97440
97277
97206
97439
97422
97427
97298
97425
97426
97438
9338
9666
9563
9418
9461
9548
9460
9367
9589
9349
138852
138857
138790
138789
138552
138866
138525
138528
138633
138711
45066
45247
45078
45048
45295
44978
45040
45122
45297
45129


65112
65014
65091
64956
64938
8120
8150
8332
8311
8166
8232
8351
8405
8444
8175
141288
141233
141108
141201
141222
141202
141021
141247
141161
141242
66584
66705
66424
66788
66445
66731
66696
66514
66576
66446
150626
150511
150347
150540
150317
150404
150424
150520
150546
150324
31262
31425
31191
31424
31407
31412
31410
31411
31283
31508
119770
119714
119526
119784
119785
119493
119672
119641
119873
119821
121800
121688
121687
121683
121669
121746
121503
121515
121846
121807
124642
124637
124415
124472
124417
124436
124652
124498
124503
124672
122612
122392
122473
122617
122627
122364
122628
122447
122509
122411
128702
128687
128467
128584
128703
128704
128691
128692
128701
128486
33292
33512
33347
33527
33528
33373
33517
33587
33529
33546
21924
22033
22036
22172
22159
21952
21901
22069
22187
22088
66178
66069
66181
66317
66304
66046
66097
66214
66332
66152
124236
124247
124232
124013
124093
124035
123935
124012
123995
123984
15834
15812
15816
15908
16039
15963
16038
16027
16040
16174


In [154]:
title_snippet_df = pd.merge(title_df, snippet_df, on = ["JoinKey"])
title_snippet_df = title_snippet_df.drop("JoinKey", axis=1)
title_snippet_df.count()

Title               156330
TitleAgiVector      156330
Snippet             156330
SnippetAgiVector    156330
dtype: int64

In [155]:
for i, row in title_snippet_df.iterrows():
    title_snippet_df.loc[i, "Similarity"] = np.dot(row["TitleAgiVector"], row["SnippetAgiVector"])
    if i % 10000 == 0:
        print(i)
title_snippet_df.head()

0
10000
20000
30000
40000
50000
60000
70000
80000
90000
100000
110000
120000
130000
140000
150000


Unnamed: 0,Title,TitleAgiVector,Snippet,SnippetAgiVector,Similarity
0,ML Model Scorecard analysis,"[-0.006709893352180249, -0.0199083593462621, 0...",ML Model Scorecard Analysis8/8/2019Unexpected ...,"[0.025657233844473028, -0.03330579433486664, 0...",0.710113
1,ML Model Scorecard analysis,"[-0.006709893352180249, -0.0199083593462621, 0...","Microsoft ConfidentialSeptember 14, 2016Core W...","[0.028864041064186222, -0.09409187327270806, -...",0.074364
2,ML Model Scorecard analysis,"[-0.006709893352180249, -0.0199083593462621, 0...",Distributed Deep Learning: New Driving Force o...,"[-0.08437865813324187, -0.09784304842483896, -...",0.104313
3,ML Model Scorecard analysis,"[-0.006709893352180249, -0.0199083593462621, 0...",Blue-Whale Multimedia Rank Data-FlowEugene Jia...,"[-0.060043619586054045, 0.11120253242871189, 0...",-0.011583
4,ML Model Scorecard analysis,"[-0.006709893352180249, -0.0199083593462621, 0...",Introduction to dynamic ranking Xiao WuOutline...,"[-0.13616929380055495, 0.08240760143962789, -0...",0.168641


In [178]:
snippet_measurement_file = codecs.open("snippet_measurement.tsv", "w", encoding="utf-8")

title_snippet_df= title_snippet_df.sort_values(by = ['Title','Similarity'], ascending = [False,False])
title_snippet_df_top10 = title_snippet_df.groupby('Title')["Title", "Snippet", "Similarity"].head(10)

snippet_set = set()
for index, row in title_snippet_df_top10.iterrows():
    if row['Title'] not in snippet_set:
        snippet_set.add(row['Title'])
        snippet_measurement_file.write("\n{0}\t{1}\t{2}\n".format(row['Title'], row['Snippet'], row['Similarity']))
    else:
        snippet_measurement_file.write("{0}\t{1}\t{2}\n".format(row['Title'], row['Snippet'], row['Similarity']))
    if index%1000 == 0:
        print(index)

print("{0} titles have similarity scores with snippets".format(len(snippet_set)))
snippet_measurement_file.close()

29000
95000
0
405 titles have similarity scores with snippets


In [179]:
snippet_recall_file = codecs.open("snippet_recall.tsv", "w", encoding = "utf-8")

snippet_set = set()
for index, row in title_snippet_df_top10.iterrows():
    if float(row['Similarity']) >= 0.5 and float(row['Similarity']) < 1:
        if row['Title'] not in snippet_set:
            snippet_set.add(row['Title'])
            snippet_recall_file.write("\n{0}\t{1}\t{2}\n".format(row['Title'], row['Snippet'], row['Similarity']))
        else:
            snippet_recall_file.write("{0}\t{1}\t{2}\n".format(row['Title'], row['Snippet'], row['Similarity']))
    if index%1000 == 0:
        print(index)
    
print("{0} titles have similarity >= 0.5 with other documents".format(len(snippet_set)))
snippet_recall_file.close()

29000
95000
0
219 titles have similarity >= 0.5 with other documents
