In [207]:
import os
import numpy as np
import pandas as pd
from itertools import permutations

def make_data(dataType, table, label_path='TrainLabel.csv'):
    """Build every ordered pair of rows in ``table`` and optionally label them.

    Parameters
    ----------
    dataType : str
        When equal to ``'train'`` a ``label`` column is added; any other value
        returns the unlabeled pairs.
    table : pandas.DataFrame
        Must provide the columns ``ID`` and ``focus_text``.
    label_path : str, optional
        CSV file with ``Test`` and ``Reference`` columns; each row names a pair
        to mark as ``'like'``. Only read when ``dataType == 'train'``.

    Returns
    -------
    pandas.DataFrame
        One row per ordered pair of two different rows of ``table`` with the
        columns ``id_text_a``, ``focus_text_a``, ``id_text_b`` and
        ``focus_text_b``, plus ``label`` (``'like'`` / ``'unlike'``) for
        training data.
    """
    # Pair up (ID, focus_text) records directly so the four output columns can
    # be built in a single DataFrame construction.
    documents = list(zip(table['ID'].tolist(), table['focus_text'].tolist()))
    pair_rows = [
        (id_a, text_a, id_b, text_b)
        for (id_a, text_a), (id_b, text_b) in permutations(documents, r=2)
    ]
    combs = pd.DataFrame(
        pair_rows,
        columns=['id_text_a', 'focus_text_a', 'id_text_b', 'focus_text_b'],
    )

    if dataType == 'train':
        # Set of (Test, Reference) id pairs that are labeled as similar.
        labels = pd.read_csv(label_path)
        pairs = set(zip(labels['Test'], labels['Reference']))

        # One pass over the id columns instead of a row-by-row iterrows/.at
        # update; the direction of the pair matters, as in the label file.
        combs['label'] = [
            'like' if pair in pairs else 'unlike'
            for pair in zip(combs['id_text_a'], combs['id_text_b'])
        ]
    return combs

# # do it when creating testing data !!
# table_test = table_all[table_all['ID'].isin(table_test['ID'])]
# data_test = make_data('test', table_test)


In [214]:
import time
import pandas as pd
import pickle
import math
from numpy import array
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import BernoulliNB
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_extraction.text import TfidfTransformer 
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.preprocessing import normalize, StandardScaler
from sklearn.metrics import confusion_matrix
from sklearn.metrics import classification_report
from sklearn.pipeline import make_pipeline
from sklearn.decomposition import TruncatedSVD
from sklearn.svm import LinearSVC
from sklearn.svm import SVC
from sklearn.ensemble import BaggingClassifier, RandomForestClassifier
from gensim.models.doc2vec import Doc2Vec, TaggedDocument

# Segmented tables for the three data splits, read from the working directory.
# NOTE(review): unpickling executes arbitrary code; only load files you trust.
table_train, table_test, table_private = (
    pd.read_pickle(path)
    for path in ("./table_train_seg.pkl", "./table_test_seg.pkl", "./table_private_seg.pkl")
)
kw_dict = pd.read_pickle('./kw_dict_norm_longest.pickle')
# Stack the splits row-wise; each split keeps its own row labels, so index
# values repeat across splits in the combined table.
table_all = pd.concat([table_train, table_test, table_private], axis=0)

In [215]:
# Preview of the combined table. The sum in the trailing note presumably lists
# the row counts of the three concatenated splits — TODO confirm.
table_all # 559+421+420=1400

Unnamed: 0,ID,text,seg_text,seg_text_pos
0,1301,香蕉所表示近日低溫細雨香蕉葉部病害易發生，防檢局籲請農友加強注意病徵，以及時進行防治\n一、...,"[香蕉, 所, 表示, 近日, 低溫, 細雨, 香蕉, 葉部, 病害, 易, 發生, ，, ...","[Na, D, VE, Nd, Na, Na, Na, Na, Na, VH, VJ, CO..."
1,924,二期作水稻已陸續完成插秧，請留意防治水稻水象鼻蟲\n二期作水稻已陸續插秧，請留意防治水稻水象...,"[二, 期, 作, 水稻, 已, 陸續, 完成, 插秧, ，, 請, 留意, 防治, 水稻水...","[Neu, Nf, VC, Na, D, D, VC, VA, COMMACATEGORY,..."
2,595,鹿野鄉局部地區水稻胡麻葉枯病發生嚴重，臺東場籲請該地區農民及時防治\n臺東區農業改良場（以下...,"[鹿野鄉, 局部, 地區, 水稻, 胡麻葉枯病, 發生, 嚴重, ，, 臺東場, 籲請, 該...","[Nc, Neqa, Nc, Na, Na, VJ, VH, COMMACATEGORY, ..."
3,438,臺南區農改場籲請農友注意水稻白葉枯病、稻瘟、臭腳銅與葉鞘腐敗病之發生與防治。臺南區農改場、防...,"[臺南區, 農改場, 籲請, 農友, 注意, 水稻, 白葉枯病, 、, 稻瘟, 、, 臭腳銅...","[Nc, Nc, VF, Na, VK, Na, Na, PAUSECATEGORY, Na..."
4,344,早晚溫差大濕度高，臺南場籲請農友加強水稻稻瘟與白葉枯病防治。\n近日台南地區早晚溫差大濕度高...,"[早晚, 溫差, 大, 濕度, 高, ，, 臺南場, 籲請, 農友, 加強, 水稻, 稻瘟,...","[D, Na, VH, Na, VH, COMMACATEGORY, Nc, VF, Na,..."
...,...,...,...,...
415,316,氣候炎熱小心紅蜘蛛，請農友注意番玉荷包荔枝紅蜘蛛防治。臺東區農改場、防檢局及田邊好幫手關心...,"[氣候, 炎熱, 小心, 紅蜘蛛, ，, 請, 農友, 注意, 番, 玉荷包荔枝, 紅蜘蛛,...","[Na, VH, VK, Na, COMMACATEGORY, VF, Na, VK, Nf..."
416,99,發布本(106)年第一期作水稻葉稻熱病發生預報。桃子園區農改場、防檢局及田邊好幫手關心您。\...,"[發布, 本, (106), 年, 第一, 期, 作, 水稻, 葉稻熱病, 發生, 預報, ...","[VC, Nes, Neu, Nf, Neu, Nf, VC, Na, Na, VJ, Na..."
417,1223,臺灣本土黑螞蟻數量增加，民眾應留意及適時防治。雲林防疫所、防檢局及田邊好幫手關心您。\n近年...,"[臺灣本土黑螞蟻, 數量, 增加, ，, 民眾, 應, 留意, 及, 適時, 防治, 。, ...","[Na, Na, VHC, COMMACATEGORY, Na, D, VK, Caa, D..."
418,1397,水稻陸續抽穗，籲請農友注意防治穗稻熱病、紋枯病及螟蟲。雲林縣政府、防檢局及田邊好幫手關心您。...,"[水稻, 陸續, 抽穗, ，, 籲請, 農友, 注意, 防治, 穗稻熱病, 、, 紋枯病, ...","[Na, D, VA, COMMACATEGORY, VF, Na, VK, VC, Na,..."


In [217]:
# Things to tune: d2v_vector_size, d2v_window_size, d2v_epochs, pos

def find_pos(consider):
    """Keep, per document, only the words whose tag is listed in ``consider``.

    Every row of the module-level ``table_all`` is visited; each word in
    ``seg_text`` is paired with the tag at the same position in
    ``seg_text_pos`` and retained (in order) when that tag is in ``consider``.
    The filtered word lists are stored in place as the ``focus_text`` column
    of ``table_all``. Nothing is returned.
    """
    table_all['focus_text'] = [
        [token for token, tag in zip(doc.seg_text, doc.seg_text_pos) if tag in consider]
        for _, doc in table_all.iterrows()
    ]

    
def d2v(vsize, wsize, ep):
    """Train a Doc2Vec model on the ``focus_text`` column of ``table_all``.

    Parameters
    ----------
    vsize : int
        Passed to ``Doc2Vec`` as ``vector_size``.
    wsize : int
        Passed to ``Doc2Vec`` as ``window``.
    ep : int
        Number of training epochs.

    Returns
    -------
    Doc2Vec
        The trained model. Each document is tagged with ``str(ID)``, so
        vectors are looked up by the stringified document id.
    """
    docs = [
        TaggedDocument(words, [str(doc_id)])
        for doc_id, words in zip(table_all['ID'], table_all['focus_text'])
    ]
    model = Doc2Vec(vector_size=vsize, window=wsize, epochs=ep,
                    min_count=1, workers=10, dm=0)
    model.build_vocab(docs)
    # Train for the epoch count configured on the model so that ``ep``
    # actually controls training length (a literal 80 here would override it).
    model.train(docs, total_examples=model.corpus_count, epochs=model.epochs)
    return model
    
def no_need_adjust(md):
    """Print the F1 score of the 'like' class on training pairs for thresholds 0.1-0.9.

    The training rows are selected from ``table_all`` by the ids stored in
    ``./table_train_seg.pkl``, expanded into labeled ordered pairs with
    ``make_data('train', ...)``, and scored with ``md.dv.similarity`` on the
    stringified ids. A pair is predicted 'like' when its similarity is
    strictly greater than the threshold.

    Parameters
    ----------
    md : trained model exposing ``dv.similarity(tag_a, tag_b)``.

    Returns
    -------
    None
        Results are printed, one line per threshold.
    """
    ############ permutations ############
    train_ids = pd.read_pickle("./table_train_seg.pkl")['ID']
    table_train = table_all[table_all['ID'].isin(train_ids)]
    data_train = make_data('train', table_train)
    categories = {"unlike": 0, "like": 1}
    data_train['label_bin'] = data_train['label'].map(categories)
    data_train['similarity'] = [
        md.dv.similarity(str(id_a), str(id_b))
        for id_a, id_b in zip(data_train['id_text_a'], data_train['id_text_b'])
    ]

    # Loop-invariant inputs are prepared once. The similarities are widened to
    # float64 so the comparison matches a plain Python-float comparison.
    y_true = data_train['label_bin']
    similarity = data_train['similarity'].astype('float64')

    for i in range(1, 10):
        threshold = float(i) / 10
        # Vectorized thresholding replaces a full row-by-row pass per threshold.
        y_pred = (similarity > threshold).astype(int)
        temp = classification_report(y_true, y_pred, target_names=['unlike', 'like'],
                                     output_dict=True)
        print ("threshold=", threshold, ", f1:", temp['like']['f1-score'])

        
        
# Candidate tags, in priority order; only the first ``n_pos`` are used below.
CONSIDER = ["Na", "Nc", "Nd", "Nv", "VH", "VJ"]
# Hyperparameter grids (currently one value each).
VSIZE = [400]
WSIZE = [10]
EP = [80]
# for n_pos in range(4, len(CONSIDER)+1):
n_pos = 2
find_pos(CONSIDER[:n_pos])
# Walk the grid in the same order as three nested loops (vector size outermost).
for vs, ws, ep in ((v, w, e) for v in VSIZE for w in WSIZE for e in EP):
    MD = d2v(vs, ws, ep)
    print ("------------------------------")
    print (CONSIDER[:n_pos])
    print ("d2v vector size:", vs, "\nd2v window size:", ws, "\nepochs:", ep)
    no_need_adjust(MD)
                
    


------------------------------
['Na', 'Nc']
d2v vector size: 400 
d2v window size: 10 
epochs: 80
threshold= 0.1 , f1: 0.008924962408446377
threshold= 0.2 , f1: 0.010080004093402678
threshold= 0.3 , f1: 0.015619717744939064
threshold= 0.4 , f1: 0.03579697486447892
threshold= 0.5 , f1: 0.09278831868924826
threshold= 0.6 , f1: 0.20274551214361136
threshold= 0.7 , f1: 0.30644002872875264
threshold= 0.8 , f1: 0.3007578779417631
threshold= 0.9 , f1: 0.18094667465548236


In [None]:
# Read the CSV without treating its first line as a header.
# NOTE(review): `df` is not referenced by any other visible cell and this cell
# has no execution count — confirm whether it is still needed.
df = pd.read_csv("three_in_one.csv", header=None)

In [230]:
# Things to tune: d2v_vector_size, d2v_window_size, d2v_epochs, pos

def find_pos(consider):
    """Store in ``table_all['focus_text']`` the words whose tag is in ``consider``.

    For each row of the module-level ``table_all`` the words in ``seg_text``
    are matched position by position with the tags in ``seg_text_pos``; a word
    is kept when its tag appears in ``consider``. ``table_all`` is modified in
    place and nothing is returned.

    NOTE(review): an identical ``find_pos`` is defined in an earlier cell of
    this notebook; this definition replaces it when the cell runs.
    """
    kept_per_doc = []
    for words, tags in zip(table_all['seg_text'], table_all['seg_text_pos']):
        kept_per_doc.append([w for w, t in zip(words, tags) if t in consider])
    table_all['focus_text'] = kept_per_doc

    
def d2v(vsize, wsize, ep):
    """Train a Doc2Vec model on the ``focus_text`` column of ``table_all``.

    Parameters
    ----------
    vsize : int
        Passed to ``Doc2Vec`` as ``vector_size``.
    wsize : int
        Passed to ``Doc2Vec`` as ``window``.
    ep : int
        Number of training epochs.

    Returns
    -------
    Doc2Vec
        The trained model. Each document is tagged with ``str(ID)``.

    The number of tagged documents is printed before training.
    """
    docs = [
        TaggedDocument(words, [str(doc_id)])
        for doc_id, words in zip(table_all['ID'], table_all['focus_text'])
    ]
    print ("len(docs):", len(docs))
    # Honour the arguments instead of fixed literals so that the values the
    # caller passes (and prints) are the ones actually used for training.
    model = Doc2Vec(vector_size=vsize, window=wsize, epochs=ep,
                    min_count=1, workers=10, dm=0)
    model.build_vocab(docs)
    model.train(docs, total_examples=model.corpus_count, epochs=model.epochs)
    return model
    
def no_need_adjust(md, threshold=0.75):
    """Score every ordered pair of private-set documents and threshold the result.

    The private rows are selected from ``table_all`` by the ids stored in
    ``./table_private_seg.pkl`` and expanded into unlabeled ordered pairs with
    ``make_data('private', ...)``.

    Parameters
    ----------
    md : trained model exposing ``dv.similarity(tag_a, tag_b)``.
    threshold : float, optional
        A pair is predicted as similar (``pred == 1``) when its similarity is
        strictly greater than this value. Defaults to 0.75.

    Returns
    -------
    pandas.DataFrame
        The pair table with two extra columns: ``similarity`` and ``pred``
        (1 or 0).
    """
    ############ permutations ############
    private_ids = pd.read_pickle("./table_private_seg.pkl")['ID']
    table_private = table_all[table_all['ID'].isin(private_ids)]
    data_private = make_data('private', table_private)

    data_private['similarity'] = [
        md.dv.similarity(str(id_a), str(id_b))
        for id_a, id_b in zip(data_private['id_text_a'], data_private['id_text_b'])
    ]

    # Vectorized thresholding; similarities are widened to float64 so the
    # comparison matches a plain Python-float comparison.
    similarity = data_private['similarity'].astype('float64')
    data_private['pred'] = (similarity > threshold).astype(int)
    return data_private
    


# Candidate tags, in priority order; the loop below uses the first ``n_pos``.
CONSIDER = ["Na", "Nc", "Nd", "Nv", "VH", "VJ"]
# Hyperparameter grids (currently one value each).
VSIZE = [400]
WSIZE = [10]
EP = [80]
data_private = pd.DataFrame()
# for n_pos in range(4, len(CONSIDER)+1):
for n_pos in range(2, 3):
    find_pos(CONSIDER[:n_pos])
    # Same visiting order as three nested loops (vector size outermost);
    # ``data_private`` ends up holding the result of the last combination.
    for vs, ws, ep in ((v, w, e) for v in VSIZE for w in WSIZE for e in EP):
        MD = d2v(vs, ws, ep)
        print ("------------------------------")
        print (CONSIDER[:n_pos])
        print ("d2v vector size:", vs, "\nd2v window size:", ws, "\nepochs:", ep)
        data_private = no_need_adjust(MD)
                
    


len(docs): 1400
------------------------------
['Na', 'Nc']
d2v vector size: 400 
d2v window size: 10 
epochs: 80


In [232]:
# Show only the pairs predicted as similar.
data_private[data_private['pred'] == 1]

Unnamed: 0,id_text_a,focus_text_a,id_text_b,focus_text_b,similarity,pred
222,708,"[小黃薊馬, 密度, 台南, 農改場, 果實, 品質, 台南區, 水果, 檬果, 麻豆文旦,...",639,"[柑桔, 類, 果實, 幼果期, 台南, 農改場, 警訊, 薊馬, 果實, 品質, 柑桔, ...",0.797454,1
474,883,"[寒流, 作物, 寒害, 措施, 中央, 氣象局, 預報, 大陸, 冷氣團, 地, 氣溫, ...",884,"[低溫, 農作物, 措施, 中央, 氣象局, 低溫, 特報, 地區, 攝氏, 低溫, 花蓮,...",0.873841,1
704,883,"[寒流, 作物, 寒害, 措施, 中央, 氣象局, 預報, 大陸, 冷氣團, 地, 氣溫, ...",843,"[氣溫, 農改場, 作物, 措施, 寒害, 中央, 氣象局, 預報, 台灣, 地區, 寒流,...",0.857060,1
1070,837,"[水稻, 秧苗期, 水, 象鼻蟲, 行政院, 農業, 委員會, 花蓮區, 農業, 改良場, ...",920,"[水稻, 水稻水象鼻蟲, 水稻, 水稻水象鼻蟲, 行政院, 農業, 委員會, 桃子, 園區,...",0.873588,1
1222,837,"[水稻, 秧苗期, 水, 象鼻蟲, 行政院, 農業, 委員會, 花蓮區, 農業, 改良場, ...",995,"[水稻, 水稻水象鼻蟲, 水稻, 水稻水象鼻蟲, 行政院, 農業, 委員會, 動植物, 檢疫...",0.851986,1
...,...,...,...,...,...,...
175711,1171,"[蕉區, 葉部, 病害, 情報, 蕉農, 社員, 中株期, 蕉株, 葉部, 病害, 工作, ...",1192,"[蕉區, 葉部, 病害, 情報, 蕉農, 地, 蕉園, 套袋期, 蕉株, 葉部, 病害, 效...",0.821660,1
175737,1171,"[蕉區, 葉部, 病害, 情報, 蕉農, 社員, 中株期, 蕉株, 葉部, 病害, 工作, ...",1189,"[蕉區, 葉部, 病害, 情報, 地, 氣溫, 蕉葉, 露水, 黑星病, 措施, 蕉農, 葉...",0.815978,1
175779,1171,"[蕉區, 葉部, 病害, 情報, 蕉農, 社員, 中株期, 蕉株, 葉部, 病害, 工作, ...",1174,"[蕉區, 葉部, 病害, 情報, 蕉農, 省, 蕉區, 蕉株, 期, 蕉株, 新葉, 蕉株,...",0.851686,1
175822,1171,"[蕉區, 葉部, 病害, 情報, 蕉農, 社員, 中株期, 蕉株, 葉部, 病害, 工作, ...",1210,"[蕉農, 香蕉, 葉部, 病害, 果房, 黑星病, 工作, 香蕉, 產量, 品質, 薔蜜, ...",0.760651,1


In [233]:
# One [id_a, id_b] entry (ids as strings) for every pair predicted as similar.
submission_list = [
    [str(row.id_text_a), str(row.id_text_b)]
    for _, row in data_private.iterrows()
    if row.pred == 1
]

In [237]:
# Write the predicted pairs as a two-column CSV without the row index.
submission = pd.DataFrame(submission_list, columns=['Test', 'Reference'])
submission.to_csv('submission_d2v.csv', index=False)

In [191]:
# Just changed dm --> results improved!
# pos: ["Na", "Nd", "Nv", "VH"]
# d2v: vector_size=400, window=10, min_count=1, workers=10, dm=0, epochs=80
# threshold = 0.7


# y_true = data_train['label_bin']
# y_pred = []
# for _, row in data_train.iterrows():
#     if row.similarity > threshold:
#         y_pred.append(1)
#     else:
#         y_pred.append(0)

# NOTE(review): y_true, y_pred and threshold are not assigned at module level
# in any visible cell (the code that built them is commented out above), so
# this cell presumably relied on variables left over in the kernel — confirm
# before re-running.
temp = classification_report(y_true,y_pred,target_names=['unlike', 'like'], output_dict=True)
# Report the F1 score of the 'like' class, then the raw confusion matrix.
print ("threshold=", threshold, ", f1:", temp['like']['f1-score'])
print (confusion_matrix(y_true, y_pred))

    

threshold= 0.7 , f1: 0.3040127557799628
[[308731   1810]
 [   809    572]]


In [186]:
# # Train
# model = doc2vec.Doc2Vec(docs, vector_size=300, window=300, min_count=3, workers=10, dm=1, epochs=20)

# # Save
# model.save('doc2vec.model')

# # Load
# model = doc2vec.Doc2Vec.load('doc2vec.model')
# print(model.dv[1].shape)

0.13449564134495642

In [137]:
labels = pd.read_csv('TrainLabel.csv')
test_candi = labels['Test'].unique()

# Query document id whose nearest neighbours are inspected below.
# NOTE(review): `model` is not assigned at module level in the visible cells
# (the training cells store their result in `MD`) — confirm where it comes from.
test = 1004
similar_doc = model.dv.most_similar(str(test))
# print(similar_doc)
# Print the query text first, then each (tag, similarity) hit with its text.
print(str(table_all.loc[table_all['ID'] == test, 'text']))
for i in similar_doc:
    print(i)
    print(table_all.loc[table_all['ID'] == int(i[0]), 'text'])


420    全民一起來監測紅火蟻\n農委會防檢局表示，自3月份起氣溫逐漸回暖，正是紅火蟻開始活躍的季節，...
Name: text, dtype: object
('175', 0.6140390634536743)
413    宜蘭防疫所表示，管仔蟲及黑蟲密度增加，請農友加強防治措施，避免損失。\n管仔蟲及黑蟲密度已達...
Name: text, dtype: object
('185', 0.6038188338279724)
382    黑蟲密度增加，請農友加強防治措施，避免損失。宜蘭縣動植物防疫所、防檢局及田邊好幫手關心你。\...
Name: text, dtype: object
('969', 0.5945085287094116)
46    本局發佈「全國滅鼠週」農地野鼠防除成果新聞稿\n動植物防疫檢疫局於本（九十）年三月三十日至四...
Name: text, dtype: object
('1205', 0.5778266191482544)
383    請香蕉產銷業者確實做好開花蕉株果房疏果、整梳及套袋工作，以提高蕉果合格率\n一、蕉株生育期間...
Name: text, dtype: object
('50', 0.5733256340026855)
16    苗栗地區發布水稻螟蟲、葉尾蟲及飛蝨類蟲害防治警報，籲請農友注意防範。苗栗區農改場、防檢局及田...
Name: text, dtype: object
('956', 0.5725455284118652)
74    蘇鐵白輪盾介殼蟲防治宣導摺頁\n蘇鐵白輪盾介殼蟲防治宣導摺頁內容已定稿，詳如附件，請卓參。俟...
Name: text, dtype: object
('1313', 0.5698922276496887)
144    蕉株抽穗期間，務必加強宣導蕉農適時做好花薊馬防治作業，俾確保生產外觀優良品質香蕉，香蕉研究所...
Name: text, dtype: object
('178', 0.5668834447860718)
545    中度颱風「蘇迪勒」侵襲後，籲請農友儘速做好復耕復育管理措施。臺中區農改場、防檢局及田邊好幫手...
Name: text, dtype: object
('1213', 0.5668527483940125)
41

In [139]:
# Spot check: similarity between the document vectors tagged '1004' and '185'.
model.dv.similarity('1004', "185")

0.60381883

In [65]:
# Inspect the first tagged document.
# NOTE(review): in the visible cells `docs` only exists as a local inside
# d2v(); this cell presumably relied on a module-level `docs` from an earlier
# session — confirm.
docs[0]

TaggedDocument(words=['香蕉', '近日', '低溫', '細雨', '香蕉', '葉部', '病害', '易', '農友', '病徵', '近期', '細雨', '溫度', '低', '露水量', '多', '香蕉', '病菌', '感染', '漫延', '葉片', '嚴重', '發病', '枯萎', '健葉數', '冬', '春', '蕉果', '成熟期', '長', '日後', '香蕉', '果串', '飽熟', '品質', '抽穗', '時機', '積極', '香蕉', '葉片', '工作', '蕉株', '正常', '發育', '產量', '品質', '藥劑', '嚴重', '受害', '病葉', '病源', '有效性', '終花期', '果串', '適當', '套袋', '套袋', '凹槽', '雨水', '套袋', '套袋', '破裂', '病原菌', '虞'], tags=['1301'])

In [69]:
# Scratch check: strip("_") removes the leading and trailing underscores.
'_1301_'.strip("_")

'1301'

In [67]:
# Document count recorded on the model.
# NOTE(review): the displayed value (980) is smaller than the 1400 rows of
# table_all, so this output presumably comes from an earlier run — confirm.
model.corpus_count

980