In [116]:
import sys
import json
import numpy as np
import pandas as pd

def normalize(arr):
    """Scale a vector to unit Euclidean (L2) length.

    Parameters
    ----------
    arr : array_like
        Numeric vector; a plain Python list is accepted.

    Returns
    -------
    numpy.ndarray
        ``arr`` divided by its norm. An input whose norm is zero has no
        direction to preserve, so it is returned unscaled rather than
        dividing by zero.
    """
    vec = np.asarray(arr)
    norm = np.linalg.norm(vec)
    if norm == 0.0:
        # Return the array form so callers get an ndarray on every path,
        # even when a list was passed in.
        return vec
    return np.divide(vec, norm)

In [148]:
# Build title_df: one (title, normalized vector) row per distinct title.
title_agi_vector_list = []
title_set = set()
# `with` releases the file handle even if reading raises part-way through.
with open("title_agi_vector_json.tsv", "r", encoding="cp1252") as title_agi_vector_file:
    for line in title_agi_vector_file:
        try:
            # Each line is parsed as JSON; the first element of the result
            # supplies the 'query' text and its 'vector'.
            parsed_result = json.loads(line.strip())[0]
            title = parsed_result['query']
            vector = normalize(parsed_result['vector'])
            # Keep only the first occurrence of each title.
            if title not in title_set:
                title_agi_vector_list.append((title, vector))
                title_set.add(title)
        except (ValueError, KeyError, IndexError, TypeError):
            # Best-effort load: skip lines that are not valid JSON or that
            # lack the expected [0] / 'query' / 'vector' structure. Anything
            # else (e.g. KeyboardInterrupt) is allowed to propagate.
            continue

title_df = pd.DataFrame(title_agi_vector_list, columns=['Title', 'TitleAgiVector'])
# Constant key shared by every row; merging on it pairs each row with all rows.
title_df["JoinKey"] = 1
title_df.count()

Title             405
TitleAgiVector    405
JoinKey           405
dtype: int64

In [149]:
# Build snippet_df: one (snippet, normalized vector) row per distinct snippet.
snippet_agi_vector_list = []
snippet_set = set()
# `with` releases the file handle even if reading raises part-way through.
with open("snippet_agi_vector_json.tsv", "r", encoding='cp1252') as snippet_agi_vector_file:
    for line in snippet_agi_vector_file:
        try:
            # Each line is parsed as JSON; the first element of the result
            # supplies the 'query' text and its 'vector'.
            parsed_result = json.loads(line.strip())[0]
            snippet = parsed_result['query']
            vector = normalize(parsed_result['vector'])
            # Keep only the first occurrence of each snippet.
            if snippet not in snippet_set:
                snippet_agi_vector_list.append((snippet, vector))
                snippet_set.add(snippet)
        except (ValueError, KeyError, IndexError, TypeError):
            # Best-effort load: skip lines that are not valid JSON or that
            # lack the expected [0] / 'query' / 'vector' structure. Anything
            # else (e.g. KeyboardInterrupt) is allowed to propagate.
            continue

snippet_df = pd.DataFrame(snippet_agi_vector_list, columns=["Snippet", "SnippetAgiVector"])
# Constant key shared by every row; merging on it pairs each row with all rows.
snippet_df["JoinKey"] = 1
snippet_df.count()

Snippet             386
SnippetAgiVector    386
JoinKey             386
dtype: int64

In [150]:
# Pair every title row with every title row (itself included): JoinKey is the
# same constant on all rows, so merging on it yields the full cross product.
# The helper key is dropped once the pairing is done.
title_title_df = title_df.merge(title_df, on="JoinKey").drop(columns="JoinKey")
title_title_df.count()

Title_x             164025
TitleAgiVector_x    164025
Title_y             164025
TitleAgiVector_y    164025
dtype: int64

In [183]:
# DataFrame.apply axis semantics:
#   axis=0 / "index"   -> the function is called once per column
#   axis=1 / "columns" -> the function is called once per row
# Here each row holds one title pair; store the dot product of its two vectors.
title_title_df['Similarity1'] = title_title_df.apply(
    lambda pair: np.dot(pair['TitleAgiVector_x'], pair['TitleAgiVector_y']),
    axis=1,
)
title_title_df.head()

Unnamed: 0,Title_x,TitleAgiVector_x,Title_y,TitleAgiVector_y,Similarity,Similarity1
30856,阿里浏览器DNS解析加速,"[-0.03440448356489144, -0.12447976191659889, -...",阿里浏览器DNS解析加速,"[-0.03440448356489144, -0.12447976191659889, -...",1.0,1.0
30894,阿里浏览器DNS解析加速,"[-0.03440448356489144, -0.12447976191659889, -...",淘宝海量数据库的设计和实现,"[-0.05795493055917471, -0.10417753143380089, -...",0.979902,0.979902
30973,阿里浏览器DNS解析加速,"[-0.03440448356489144, -0.12447976191659889, -...",刘佳_出生证明公证办理委托书,"[-0.05795493055917471, -0.10417753143380089, -...",0.979902,0.979902
30835,阿里浏览器DNS解析加速,"[-0.03440448356489144, -0.12447976191659889, -...",贴吧大数据存储解决方案,"[-0.05553168131325587, -0.09946641756622951, -...",0.979322,0.979322
30843,阿里浏览器DNS解析加速,"[-0.03440448356489144, -0.12447976191659889, -...",工作6周年5个月小记,"[-0.053652287028072544, -0.09610203085849282, ...",0.977716,0.977716


In [None]:
# Dot product of the two vectors in every title pair, stored as 'Similarity'.
# The values are computed in one pass and assigned as a whole column, so the
# frame is never written to while it is being iterated (pandas warns against
# modifying a frame from inside iterrows()) and no per-row .loc write is paid.
title_title_df["Similarity"] = [
    np.dot(vec_x, vec_y)
    for vec_x, vec_y in zip(
        title_title_df["TitleAgiVector_x"], title_title_df["TitleAgiVector_y"]
    )
]
title_title_df.head()

In [172]:
# Write, for every title, its 10 highest-similarity title pairs to
# title_measurement.tsv (tab-separated: Title_x, Title_y, Similarity).

# Sorting by similarity (descending) within each title makes head(10) below
# pick the top-scoring rows of each group.
title_title_df = title_title_df.sort_values(by=['Title_x', 'Similarity'], ascending=[False, False])
# Columns are selected with a list; selecting several GroupBy columns with a
# bare tuple is rejected by pandas 2.x.
title_title_df_top10 = title_title_df.groupby('Title_x')[["Title_x", "Title_y", "Similarity"]].head(10)

title_set = set()
# Built-in open() needs no extra import, and `with` closes the file even if a
# write fails. newline="\n" writes line endings exactly as given on every platform.
with open("title_measurement.tsv", "w", encoding="utf-8", newline="\n") as title_measurement_file:
    for index, row in title_title_df_top10.iterrows():
        if row['Title_x'] not in title_set:
            # First row of a new title: emit a blank separator line first.
            title_set.add(row['Title_x'])
            title_measurement_file.write("\n")
        title_measurement_file.write("{0}\t{1}\t{2}\n".format(row['Title_x'], row['Title_y'], row['Similarity']))
        # `index` is the row label carried over from before the sort, so the
        # printed values are not in increasing order.
        if index % 1000 == 0:
            print(index)

print("{0} titles got 10 similar titles".format(len(title_set)))

87000
54000
96000
114000
0
1000
81000
143000
95000
405 titles got 10 similar titles


In [None]:
# Write the top-10 title pairs whose similarity is at least 0.5, excluding a
# title paired with itself, to title_recall_file.tsv
# (tab-separated: Title_x, Title_y, Similarity).
title_set = set()
# Built-in open() needs no extra import, and `with` closes the file even if a
# write fails. newline="\n" writes line endings exactly as given on every platform.
with open("title_recall_file.tsv", "w", encoding="utf-8", newline="\n") as title_recall_file:
    for index, row in title_title_df_top10.iterrows():
        if float(row['Similarity']) >= 0.5 and row['Title_x'] != row['Title_y']:
            if row['Title_x'] not in title_set:
                # First qualifying row of a title: emit a blank separator line first.
                title_set.add(row['Title_x'])
                title_recall_file.write("\n")
            title_recall_file.write("{0}\t{1}\t{2}\n".format(row['Title_x'], row['Title_y'], row['Similarity']))
        # Progress marker only on multiples of 10000; without the `== 0`
        # comparison the condition is true for almost every row.
        if index % 10000 == 0:
            print(index)

print("{0} titles got titles with similarity >= 0.5".format(len(title_set)))

In [154]:
# Pair every title row with every snippet row: JoinKey is the same constant on
# all rows of both frames, so merging on it yields the full cross product.
# The helper key is dropped once the pairing is done.
title_snippet_df = title_df.merge(snippet_df, on="JoinKey").drop(columns="JoinKey")
title_snippet_df.count()

Title               156330
TitleAgiVector      156330
Snippet             156330
SnippetAgiVector    156330
dtype: int64

In [None]:
# Dot product of the title vector and snippet vector in every pair, stored as
# 'Similarity'. The values are computed in one pass and assigned as a whole
# column, so the frame is never written to while it is being iterated (pandas
# warns against modifying a frame from inside iterrows()) and no per-row .loc
# write is paid.
title_snippet_df["Similarity"] = [
    np.dot(title_vec, snippet_vec)
    for title_vec, snippet_vec in zip(
        title_snippet_df["TitleAgiVector"], title_snippet_df["SnippetAgiVector"]
    )
]
title_snippet_df.head()

In [178]:
# Write, for every title, its 10 highest-similarity snippets to
# snippet_measurement.tsv (tab-separated: Title, Snippet, Similarity).

# Sorting by similarity (descending) within each title makes head(10) below
# pick the top-scoring rows of each group.
title_snippet_df = title_snippet_df.sort_values(by=['Title', 'Similarity'], ascending=[False, False])
# Columns are selected with a list; selecting several GroupBy columns with a
# bare tuple is rejected by pandas 2.x.
title_snippet_df_top10 = title_snippet_df.groupby('Title')[["Title", "Snippet", "Similarity"]].head(10)

# Despite its name, this set holds the titles already written.
snippet_set = set()
# Built-in open() needs no extra import, and `with` closes the file even if a
# write fails. newline="\n" writes line endings exactly as given on every platform.
with open("snippet_measurement.tsv", "w", encoding="utf-8", newline="\n") as snippet_measurement_file:
    for index, row in title_snippet_df_top10.iterrows():
        if row['Title'] not in snippet_set:
            # First row of a new title: emit a blank separator line first.
            snippet_set.add(row['Title'])
            snippet_measurement_file.write("\n")
        snippet_measurement_file.write("{0}\t{1}\t{2}\n".format(row['Title'], row['Snippet'], row['Similarity']))
        # `index` is the row label carried over from before the sort, so the
        # printed values are not in increasing order.
        if index % 1000 == 0:
            print(index)

print("{0} titles have similarity scores with snippets".format(len(snippet_set)))

29000
95000
0
405 titles have similarity scores with snippets


In [179]:
# Write the top-10 title/snippet pairs whose similarity lies in [0.5, 1) to
# snippet_recall.tsv (tab-separated: Title, Snippet, Similarity).

# Despite its name, this set holds the titles already written.
snippet_set = set()
# Built-in open() needs no extra import, and `with` closes the file even if a
# write fails. newline="\n" writes line endings exactly as given on every platform.
with open("snippet_recall.tsv", "w", encoding="utf-8", newline="\n") as snippet_recall_file:
    for index, row in title_snippet_df_top10.iterrows():
        similarity = float(row['Similarity'])
        if similarity >= 0.5 and similarity < 1:
            if row['Title'] not in snippet_set:
                # First qualifying row of a title: emit a blank separator line first.
                snippet_set.add(row['Title'])
                snippet_recall_file.write("\n")
            snippet_recall_file.write("{0}\t{1}\t{2}\n".format(row['Title'], row['Snippet'], row['Similarity']))
        # `index` is the row label of title_snippet_df_top10, not a running
        # counter, so the printed values need not be in increasing order.
        if index % 1000 == 0:
            print(index)

print("{0} titles have similarity >= 0.5 with other documents".format(len(snippet_set)))

29000
95000
0
219 titles have similarity >= 0.5 with other documents
