In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt 

from sentence_transformers import SentenceTransformer
from transformers import AutoTokenizer, AutoModel

import jieba
from gensim import corpora,models,similarities
from collections import defaultdict
import warnings

In [2]:
# Root folder of each dataset split. Every other path in this notebook is built
# by plain string concatenation, so the trailing "/" is required.
test_path = "/Users/jhihchingyeh/Final Project/Dataset/stage3/"
train_path = "/Users/jhihchingyeh/Final Project/Dataset/stage1/"

# Sub-folder (relative to the split root) holding the numbered .txt articles.
test_txtpath = "dataPrivateComplete/"
train_txtpath = "dataTrainComplete/"

# Place names that are matched as plain substrings of the article text.
# The position in this list is used in the per-place CSV file names.
place = [
    "臺北", "新北", "桃園", "臺中", "臺南",
    "高雄市", "新竹", "苗栗", "彰化", "南投",
    "雲林", "嘉義", "屏東", "宜蘭", "花蓮",
    "臺東", "澎湖", "金門", "連江",
]

In [3]:
# Read Data
def read_text(path, txtpath, max_id=1402, encoding=None):
    """Load the numbered article files and the keyword synonym tables.

    Parameters
    ----------
    path : str
        Dataset root. It is concatenated directly with the other path
        fragments, so it must end with a path separator.
    txtpath : str
        Sub-folder of ``path`` (also ending with a separator) that contains
        the articles, named ``0.txt``, ``1.txt``, ...
    max_id : int, optional
        Exclusive upper bound of the numeric file names that are probed.
        Defaults to 1402, the value that used to be hard-coded.
    encoding : str or None, optional
        Passed to ``open``. ``None`` keeps the platform default, which is
        what the function has always used.

    Returns
    -------
    df : pandas.DataFrame
        Columns ``ID`` (the numeric file name) and ``text`` (the full file
        content), one row per file that exists, in ascending ID order.
    keyword : pandas.DataFrame
        Row-wise concatenation of the chem, crop and pest keyword sheets,
        each read without a header row.

    Raises
    ------
    OSError, UnicodeDecodeError
        Any read failure other than a missing file is propagated instead of
        being silently skipped.

    Notes
    -----
    ``TrainLabel.csv`` is no longer read here: its content was never
    returned or used by this function.
    """
    # 1. txt
    # Collect the rows first and build the frame once; growing a DataFrame
    # one ``.loc`` row at a time is slow.
    records = []
    for i in range(max_id):
        txt_file = path + txtpath + str(i) + ".txt"
        try:
            # ``with`` guarantees the handle is closed (the previous
            # ``f.close`` was missing its parentheses and never ran).
            with open(txt_file, 'r', encoding=encoding) as f:
                text = f.read()
        except FileNotFoundError:
            # IDs are not contiguous; a missing number is expected.
            continue
        records.append({"ID": i, "text": text})
    df = pd.DataFrame(records, columns=["ID", "text"])

    # 2. Keyword.xlsx
    # Each sheet row holds the spellings of one keyword; rows are ragged, so
    # shorter rows are padded with NaN by pandas.
    keyword_files = ["02chem.list.xlsx", "02crop.list.xlsx", "02pest.list.xlsx"]
    frames = [
        pd.read_excel(path + "Keywords/" + name, header=None, index_col=False)
        for name in keyword_files
    ]
    keyword = pd.concat(frames, axis=0)

    return df, keyword

In [4]:
# Normalise the text and collapse every keyword synonym onto one spelling
def replace_keyword(df, keyword):
    """Rewrite ``df["text"]`` so each keyword appears under a single spelling.

    Parameters
    ----------
    df : pandas.DataFrame
        Must have a ``text`` column of strings. The column is overwritten
        in place and the same frame is returned.
    keyword : pandas.DataFrame
        One row per keyword; the non-null cells of a row are its synonyms.

    Returns
    -------
    df : pandas.DataFrame
        The input frame with the rewritten ``text`` column.
    sort_keyword_list : list[list[str]]
        One list of synonyms per keyword row, each sorted longest first,
        and the groups themselves ordered by the length of their first
        (longest) entry, longest first. Rows with no non-null cell are
        skipped.

    Notes
    -----
    The canonical spelling is the *longest* synonym of a group
    (``group[0]``); every other synonym found in a text is replaced by it.
    All occurrences of "台" are replaced by "臺" before keyword matching.
    """
    # Make and sort keyword lists
    sort_keyword_list = []
    for _, row in keyword.iterrows():
        # str() so numeric cells can be measured with len() and used in
        # str.replace; string cells are unchanged by it.
        synonyms = [str(cell) for cell in row if not pd.isnull(cell)]
        if not synonyms:
            # An all-empty row has no canonical spelling to index below.
            continue
        synonyms.sort(key=len, reverse=True)
        sort_keyword_list.append(synonyms)
    sort_keyword_list.sort(key=lambda group: len(group[0]), reverse=True)

    # Replace
    # Work on plain strings and assign the whole column once; item-wise
    # chained assignment (df["text"][i] = ...) is not guaranteed to write
    # back into the frame.
    rewritten = []
    for text in df["text"].tolist():
        # Replace 台 into 臺
        text = text.replace("台", "臺")
        # Order matters: groups and synonyms are applied exactly in the
        # sorted order built above.
        for group in sort_keyword_list:
            canonical = group[0]
            for synonym in group[1:]:
                text = text.replace(synonym, canonical)
        rewritten.append(text)
    df["text"] = rewritten
    return df, sort_keyword_list

In [5]:
# Split df dataframe to smaller one(keyword_df) by Keyword
def keyword_df(df, sort_keyword_list, path):
    """Write one CSV per keyword group with the articles that mention it.

    Parameters
    ----------
    df : pandas.DataFrame
        Articles; must have a ``text`` column.
    sort_keyword_list : list[list[str]]
        Keyword groups; only the first entry of each group is searched for.
    path : str
        Dataset root ending with a separator. Files are written to
        ``<path>keyword_df/keyword_<i>.csv`` where ``i`` is the position of
        the group in ``sort_keyword_list``. The folder is created if needed.

    Returns
    -------
    None
        The function is used for its files only. A CSV is written for every
        group, even when no article matches (header only).
    """
    import os

    out_dir = path + "keyword_df/"
    os.makedirs(out_dir, exist_ok=True)

    # Missing texts are treated as empty so they simply never match.
    texts = df["text"].fillna("").astype(str)
    for i, group in enumerate(sort_keyword_list):
        # Literal substring test (regex=False), same as the ``in`` operator.
        # A boolean mask replaces row-by-row DataFrame.append, which was
        # removed in pandas 2.0.
        mask = texts.str.contains(group[0], regex=False).astype(bool)
        # Save each dataframe
        df[mask].to_csv(out_dir + "keyword_" + str(i) + ".csv", index=False)

In [6]:
# Split keyword_df dataframe to smaller one(place_keyword_df) by place
def place_keyword_df(sort_keyword_list, path, places=None):
    """Split every per-keyword CSV further by the place names it mentions.

    Parameters
    ----------
    sort_keyword_list : list
        Only its length is used: one input file
        ``<path>keyword_df/keyword_<i>.csv`` is read per entry.
    path : str
        Dataset root ending with a separator.
    places : list[str] or None, optional
        Place names to match as literal substrings. ``None`` (the default)
        uses the built-in list of 19 names.

    Returns
    -------
    None
        For every keyword index ``i`` and place index ``j`` the matching
        rows are written to
        ``<path>place_keyword_df/key<i>_place<j>.csv`` (header only when
        nothing matches). The output folder is created if needed.
    """
    import os

    if places is None:
        places = ["臺北", "新北", "桃園", "臺中", "臺南", "高雄市",
                  "新竹", "苗栗", "彰化", "南投", "雲林", "嘉義", "屏東",
                  "宜蘭", "花蓮", "臺東", "澎湖", "金門", "連江"]

    in_dir = path + "keyword_df/"
    out_dir = path + "place_keyword_df/"
    os.makedirs(out_dir, exist_ok=True)

    for i in range(len(sort_keyword_list)):
        each_keyword_df = pd.read_csv(in_dir + "keyword_" + str(i) + ".csv")
        # An empty text is read back from CSV as NaN; treat it as "" so it
        # never matches instead of raising.
        texts = each_keyword_df["text"].fillna("").astype(str)
        for j, name in enumerate(places):
            # Boolean mask instead of DataFrame.append (removed in pandas 2.0).
            mask = texts.str.contains(name, regex=False).astype(bool)
            # Save each dataframe
            each_keyword_df[mask].to_csv(
                out_dir + "key" + str(i) + "_place" + str(j) + ".csv", index=False
            )

In [7]:
# Prediction
def prediction(submission, m, smaller_df, all_list, dictionary, prob):
    """Append every sufficiently similar article pair to ``submission``.

    Parameters
    ----------
    submission : pandas.DataFrame
        Frame with ``Test`` and ``Reference`` columns; rows are written with
        ``submission.loc[m, ...]`` and the same object is returned.
    m : int
        Row label at which the next pair is written.
    smaller_df : pandas.DataFrame
        Articles with ``ID`` and ``jieba`` (token list) columns.
    all_list : list[list[str]]
        Token lists of all articles, in the same order as ``smaller_df``.
    dictionary : gensim.corpora.Dictionary
        Dictionary built from ``all_list``.
    prob : float
        A pair is kept when its TF-IDF similarity is strictly between
        ``prob`` and 1.

    Returns
    -------
    submission : pandas.DataFrame
    m : int
        Row label following the last pair written.

    Notes
    -----
    Pairs are directional: each article is used as ``Test`` against every
    other article as ``Reference``, and a pair is skipped when both IDs have
    the same string form. The corpus, TF-IDF model and similarity index
    depend only on ``all_list``/``dictionary``, so they are built once
    instead of once per article.
    """
    # Nothing to compare; also avoids building an index over an empty corpus,
    # which the per-article loop never did.
    if len(smaller_df) == 0:
        return submission, m

    # Sparse Matrix (loop-invariant)
    corpus = [dictionary.doc2bow(tokens) for tokens in all_list]
    tfidf = models.TfidfModel(corpus)
    featureNUM = len(dictionary.token2id.keys())
    index = similarities.SparseMatrixSimilarity(tfidf[corpus], num_features=featureNUM)

    ids = smaller_df["ID"].tolist()
    token_lists = smaller_df["jieba"].tolist()

    # Calculate the similarity
    for k in range(len(ids)):
        new_vec = dictionary.doc2bow(token_lists[k])
        sim = index[tfidf[new_vec]]
        # Similarity Probability
        for s in range(len(sim)):
            if prob < sim[s] < 1 and str(ids[k]) != str(ids[s]):
                submission.loc[m, "Test"] = ids[k]
                submission.loc[m, "Reference"] = ids[s]
                m += 1
    return submission, m

In [8]:
# jieba Dictionary
def jieba_dict(smaller_df):
    """Tokenise every article with jieba and build a gensim dictionary.

    Parameters
    ----------
    smaller_df : pandas.DataFrame
        Must have a ``text`` column. A ``jieba`` column holding one token
        list per row is added (or overwritten) in place.

    Returns
    -------
    smaller_df : pandas.DataFrame
        The same frame, now with the ``jieba`` column.
    all_list : list[list[str]]
        The token lists, in row order (the same list objects stored in the
        ``jieba`` column).
    dictionary : gensim.corpora.Dictionary
        Dictionary built from ``all_list``.

    Notes
    -----
    This function no longer calls ``warnings.filterwarnings('ignore')``.
    That call silenced warnings for the whole process, and was only needed
    to hide the chained-assignment warning this function itself produced.
    The token column is now assigned in one step, so no warning is raised.
    """
    all_list = [list(jieba.cut(text)) for text in smaller_df["text"].tolist()]

    # An explicit object Series keeps each cell a Python list, whatever the
    # lengths of the individual token lists.
    smaller_df["jieba"] = pd.Series(all_list, index=smaller_df.index, dtype=object)

    # Build the Dictionary
    dictionary = corpora.Dictionary(all_list)
    return smaller_df, all_list, dictionary

In [9]:
# Model Prediction by place and keyword
def term_model_pred(path, sort_keyword_list, place, prob):
    """Run the similarity search inside every (keyword, place) bucket.

    Reads ``<path>submission_example.csv`` as the frame to fill, then, for
    each keyword index ``a`` and place index ``b``, loads
    ``<path>place_keyword_df/key<a>_place<b>.csv``, tokenises it with
    ``jieba_dict`` and lets ``prediction`` write the pairs whose similarity
    exceeds ``prob``.

    Returns the filled frame and the row label following the last pair
    written. Rows that the example file already contained at labels >= that
    value are left untouched.
    """
    bucket_dir = path + "place_keyword_df/"
    submission = pd.read_csv(path + "submission_example.csv")
    next_row = 0

    # Same visiting order as two nested index loops: all places of keyword 0,
    # then all places of keyword 1, ...
    buckets = (
        (key_idx, place_idx)
        for key_idx in range(len(sort_keyword_list))
        for place_idx in range(len(place))
    )
    for key_idx, place_idx in buckets:
        bucket_file = "key" + str(key_idx) + "_place" + str(place_idx) + ".csv"
        bucket_df = pd.read_csv(bucket_dir + bucket_file)
        # - jieba
        bucket_df, token_lists, vocab = jieba_dict(bucket_df)
        # - Prediction
        submission, next_row = prediction(
            submission, next_row, bucket_df, token_lists, vocab, prob
        )

    return submission, next_row

In [10]:
# Model Prediction by similarity
def sim_model_pred(path, df, prob, submission, m):
    """Run the similarity search over all articles of ``df`` at once.

    ``df`` is tokenised by ``jieba_dict`` (which adds a ``jieba`` column to
    it) and ``prediction`` then writes every pair whose similarity exceeds
    ``prob`` into ``submission`` starting at row label ``m``.

    ``path`` is accepted but not used. Only the filled ``submission`` is
    returned; the updated row counter is discarded.
    """
    # - jieba
    tokenised_df, token_lists, vocab = jieba_dict(df)

    # - Prediction
    filled_submission, _ = prediction(
        submission, m, tokenised_df, token_lists, vocab, prob
    )
    return filled_submission

In [11]:
# Load the test-split articles (ID, text) and the merged keyword tables.
df, keyword = read_text(test_path, test_txtpath)

In [12]:
# Rewrite df["text"] (台 -> 臺, synonyms -> one spelling per keyword) and keep
# the sorted keyword groups for the bucketing steps below.
df, sort_keyword_list = replace_keyword(df, keyword)

In [13]:
# Write one CSV per keyword group to <test_path>keyword_df/.
keyword_df(df, sort_keyword_list, test_path)

In [14]:
# Split each keyword CSV by place name into <test_path>place_keyword_df/.
place_keyword_df(sort_keyword_list, test_path)

In [15]:
# First pass: compare only articles that share a keyword and a place, keeping
# pairs with similarity above 0.35. m is the next free row label in submission.
submission, m = term_model_pred(test_path, sort_keyword_list, place, 0.35) 

Building prefix dict from the default dictionary ...
Loading model from cache /var/folders/2s/tlvgbrd16jx_vcmf7z8m3sbm0000gn/T/jieba.cache
Loading model cost 0.425 seconds.
Prefix dict has been built successfully.


In [16]:
# Second pass: compare all articles with each other, keeping pairs with
# similarity above 0.7; new rows are written starting at row label m.
submission = sim_model_pred(test_path, df, 0.7, submission, m) 

In [17]:
# Keep one copy of each identical row; the same pair may have been written
# more than once (e.g. by several buckets or by both passes).
submission_no_duplicates = submission.drop_duplicates()

In [18]:
# Renumber the remaining rows 0..n-1, discarding the old index.
submission_no_duplicates = submission_no_duplicates.reset_index(drop=True)

In [19]:
# Display the de-duplicated (Test, Reference) pairs.
submission_no_duplicates

Unnamed: 0,Test,Reference
0,232,253
1,253,232
2,320,322
3,320,328
4,320,379
...,...,...
675,1348,1291
676,1348,1294
677,1348,1356
678,1356,1335


In [20]:
# Save the final pairs to <test_path>result.csv without the index column.
submission_no_duplicates.to_csv(test_path+"result.csv", index=False)

## Train: re-run the pipeline on the training data

In [21]:
# Load the training-split articles and keyword tables.
# NOTE: this rebinds df and keyword, replacing the test-split data loaded earlier.
df, keyword = read_text(train_path, train_txtpath)

In [22]:
# Normalise the training texts. sort_keyword_list is rebound here and is later
# read as a global by train_acc.
df, sort_keyword_list = replace_keyword(df, keyword)

In [23]:
# Write one CSV per keyword group to <train_path>keyword_df/.
keyword_df(df, sort_keyword_list, train_path)

In [24]:
# Split each keyword CSV by place name into <train_path>place_keyword_df/.
place_keyword_df(sort_keyword_list, train_path)

In [25]:
def train_acc(submission, probs=(0.6, 0.7, 0.8, 0.9)):
    """Print the pair-level F1 score of ``term_model_pred`` per threshold.

    Parameters
    ----------
    submission : any
        Ignored. It is kept only so existing calls keep working; the
        predictions are recomputed for every threshold.
    probs : iterable of float, optional
        Similarity thresholds to evaluate. The default lists the values the
        loop was meant to visit; repeatedly adding 0.1 to a float drifted to
        0.7999..., 0.8999... and ran one extra pass at 0.9999....

    Returns
    -------
    list[tuple[float, float]]
        ``(prob, F1)`` for every threshold, in evaluation order. The same
        values are printed.

    Notes
    -----
    Reads the module-level ``train_path``, ``sort_keyword_list`` and
    ``place``. Labels come from ``<train_path>TrainLabel.csv``.

    Pairs are treated as unordered and unique: (a, b) and (b, a) are the
    same pair, and a pair predicted several times counts once. With P the
    predicted pairs and L the labelled pairs:
    ``tp = |P & L|``, ``fp = |P - L|``, ``fn = |L - P|``.
    The previous bookkeeping only counted a false positive when the Test ID
    had no label at all, and added one false negative per non-matching
    label row for every prediction, so it was not an F1 score. Zero
    denominators now give a score of 0.0 instead of ``ZeroDivisionError``.
    """
    # Label
    label = pd.read_csv(train_path + "TrainLabel.csv")
    expected = _unordered_pairs(label)

    # Prediction
    # Choose the best prob
    results = []
    for prob in probs:
        # All rows of the returned frame are scored, including any rows
        # that were already present in submission_example.csv.
        predicted_df, _ = term_model_pred(train_path, sort_keyword_list, place, prob)
        predicted = _unordered_pairs(predicted_df)

        # F1-score
        F1 = _pair_f1(predicted, expected)

        print("prob: ", prob)
        print("F1: ", F1)
        results.append((prob, F1))
    return results


def _unordered_pairs(frame):
    """Return the Test/Reference rows of ``frame`` as a set of (low, high) int tuples."""
    pairs = set()
    for test_id, ref_id in zip(frame["Test"].tolist(), frame["Reference"].tolist()):
        # Rows with a missing ID cannot name a pair.
        if pd.isnull(test_id) or pd.isnull(ref_id):
            continue
        first, second = int(test_id), int(ref_id)
        pairs.add((first, second) if first <= second else (second, first))
    return pairs


def _pair_f1(predicted, expected):
    """Return the F1 score of the ``predicted`` pair set against ``expected`` (0.0 if undefined)."""
    tp = len(predicted & expected)
    fp = len(predicted - expected)
    fn = len(expected - predicted)
    precision = tp / (tp + fp) if (tp + fp) else 0.0
    recall = tp / (tp + fn) if (tp + fn) else 0.0
    if precision + recall == 0:
        return 0.0
    return 2 * precision * recall / (precision + recall)