In [3]:
# load package
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.metrics.pairwise import cosine_similarity

In [4]:
# read article
df_all = pd.read_csv('../../input/stage1_unigram.csv', header = 0, names=["id", "article", "words"])
# df_all = pd.read_csv('../../input/stage2_unigram.csv', header = 0, names=["id", "article", "words"])
# df_all = df_all[0:5]
df_all

Unnamed: 0,id,article,words
0,1,梅雨季來臨，文旦黑點病易發生，請注意病徵，以及早加強防治措施。 5月已進入梅雨季節，近日連續...,"['梅雨季', '來臨', '文旦', '黑點病', '易_發生', '請', '注意', ..."
1,10,天氣多變溫差大，近山區及偏施氮肥田區稻熱病發病較為嚴重，籲請農友注意防治。花蓮區農改場、防檢...,"['天氣', '多變', '溫差', '山區', '偏施_氮肥', '田區', '稻熱病',..."
2,1000,新聞稿-稻熱病進入好發季節，防檢局籲請農友加強防範 行政院農業委員會動植物防疫檢疫局（以下簡...,"['新聞稿', '稻熱病', '進入', '好發_季節', '防檢局', '籲請_農友', ..."
3,1005,稻熱病進入好發季節，防檢局籲請農友加強防治 農委會防檢局表示，自3月起全國各地水稻生長陸續進...,"['稻熱病', '進入', '好發_季節', '防檢局', '籲請_農友_加強', '防治'..."
4,1007,乍暖還寒，防檢局籲請農友加強防治稻熱病 農委會防檢局表示，全國各地水稻生長陸續進入分蘗期，因...,"['乍暖還寒', '防檢局', '籲請_農友_加強', '防治', '稻熱病', '農委會'..."
...,...,...,...
555,986,苗栗區農業改良場發佈水稻白葉枯病警報 糧食作物病蟲害發生警報中華民國90年9月21日發佈第3...,"['苗栗_區_農業_改良場', '發佈', '水稻_白葉枯病', '警報', '糧食作物',..."
556,988,雨後適合稻熱病發生，請持續進行監測並指導農民防治 依據氣象預報，今年自五月中旬起，臺灣地區即...,"['雨_後', '適合', '稻熱病', '發生', '請', '持續', '進行', '監..."
557,992,新入侵果實蠅緊急撲滅模擬演習 新聞稿 新入侵植物害蟲緊急撲滅演習產官學總動員嚴防外來疫病蟲...,"['新', '入侵', '果實蠅', '緊急_撲滅', '模擬', '演習', '新聞稿',..."
558,997,梨木蝨危害，請農友注意防範 梨木蝨危害，請農友注意防範行政院農業委員會動植物防疫檢疫局(以下...,"['梨木蝨_危害', '請_農友', '注意_防範', '梨木蝨_危害', '請_農友', ..."


In [5]:
# read result file
result_all_df = pd.read_csv('../../output/results_similarity_jac.csv', header = 0)
result_all_df

Unnamed: 0,Test,Reference,Similarity,Jaccard_Similarity(%)
0,1,10,0.000000,0.058252
1,1,1000,0.000000,0.045267
2,1,1005,0.000000,0.047826
3,1,1007,0.000000,0.040541
4,1,1010,0.142857,0.043668
...,...,...,...,...
313035,998,984,0.000000,0.053488
313036,998,986,0.000000,0.027875
313037,998,988,0.000000,0.024390
313038,998,992,0.333333,0.100000


In [6]:
# compute tf
vectorizer = CountVectorizer() 
text_count_vector = vectorizer.fit_transform(df_all.words)
tf_vector = text_count_vector.toarray()
tf_vector

array([[0, 0, 0, ..., 0, 0, 0],
       [0, 0, 4, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       ...,
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0]], dtype=int64)

In [7]:
# compute tf-idf
tfidf_transfomer = TfidfTransformer()
docs_tfidf = tfidf_transfomer.fit_transform(text_count_vector)
tfidf_vector = docs_tfidf.toarray()
tfidf_vector

array([[0.        , 0.        , 0.        , ..., 0.        , 0.        ,
        0.        ],
       [0.        , 0.        , 0.13911019, ..., 0.        , 0.        ,
        0.        ],
       [0.        , 0.        , 0.        , ..., 0.        , 0.        ,
        0.        ],
       ...,
       [0.        , 0.        , 0.        , ..., 0.        , 0.        ,
        0.        ],
       [0.        , 0.        , 0.        , ..., 0.        , 0.        ,
        0.        ],
       [0.        , 0.        , 0.        , ..., 0.        , 0.        ,
        0.        ]])

In [8]:
tf_similarity = cosine_similarity(tf_vector, tf_vector)

# 删除對角線元素(自己對自己)
tf_similarity = tf_similarity[~np.eye(tf_similarity.shape[0],dtype=bool)].reshape(tf_similarity.shape[0],-1)
tf_similarity.flatten()

array([0.14483718, 0.09373729, 0.11238305, ..., 0.17766014, 0.15816274,
       0.4641453 ])

In [9]:
tfidf_similarity = cosine_similarity(tfidf_vector, tfidf_vector)

# 删除對角線元素(自己對自己)
tfidf_similarity = tfidf_similarity[~np.eye(tfidf_similarity.shape[0],dtype=bool)].reshape(tfidf_similarity.shape[0],-1)
tfidf_similarity.flatten()

array([0.0569182 , 0.02838194, 0.03228771, ..., 0.03317734, 0.09825458,
       0.25117637])

In [10]:
result_df = pd.DataFrame({"TF-CosineSimilarity":tf_similarity.flatten(),
                          "TFIDF-CosineSimilarity":tfidf_similarity.flatten()})
result_df

Unnamed: 0,TF-CosineSimilarity,TFIDF-CosineSimilarity
0,0.144837,0.056918
1,0.093737,0.028382
2,0.112383,0.032288
3,0.109976,0.030105
4,0.081238,0.019695
...,...,...
313035,0.224078,0.046496
313036,0.127763,0.023124
313037,0.177660,0.033177
313038,0.158163,0.098255


In [100]:
combine_result = pd.concat([result_all_df,result_df], axis=1)
combine_result

Unnamed: 0,Test,Reference,Similarity,Jaccard_Similarity(%),TF-CosineSimilarity,TFIDF-CosineSimilarity
0,1,10,0.000000,0.058252,0.144837,0.056918
1,1,1000,0.000000,0.045267,0.093737,0.028382
2,1,1005,0.000000,0.047826,0.112383,0.032288
3,1,1007,0.000000,0.040541,0.109976,0.030105
4,1,1010,0.142857,0.043668,0.081238,0.019695
...,...,...,...,...,...,...
313035,998,984,0.000000,0.053488,0.224078,0.046496
313036,998,986,0.000000,0.027875,0.127763,0.023124
313037,998,988,0.000000,0.024390,0.177660,0.033177
313038,998,992,0.333333,0.100000,0.158163,0.098255


In [102]:
combine_result.to_csv("../output/results_similarity_jac_cos_stage1_faster.csv", index= False)
# combine_result.to_csv("../output/results_simi_cosSimi_stage2_faster.csv", index= False)

In [85]:
# End of file