In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt 

from sentence_transformers import SentenceTransformer
from transformers import AutoTokenizer, AutoModel

import jieba
from gensim import corpora,models,similarities
from collections import defaultdict

In [2]:
# Root folders of the stage-1 and stage-2 datasets.
# NOTE(review): these are absolute, machine-specific paths; adjust before running elsewhere.
train_path = "/Users/jhihchingyeh/Final Project/Dataset/stage1/"
test_path = "/Users/jhihchingyeh/Final Project/Dataset/stage2/"

# Names of the sub-folders holding the per-document .txt files; read_text appends
# one of these to a root path above, so the trailing slash is required.
train_txtpath = "dataTrainComplete/"
test_txtpath = "dataPublicComplete/"

In [3]:
# Read Data
def read_text(path, txtpath, max_id=1402, encoding="utf-8"):
    """Load the numbered text documents and the keyword synonym tables.

    Parameters
    ----------
    path : str
        Dataset root folder; must end with a path separator because file
        names are built by plain string concatenation.
    txtpath : str
        Sub-folder (relative to ``path``, ending with a separator) that holds
        the ``<ID>.txt`` files.
    max_id : int, default 1402
        Exclusive upper bound of the document IDs that are probed
        (``0.txt`` .. ``<max_id - 1>.txt``).
    encoding : str, default "utf-8"
        Text encoding used to decode the documents.

    Returns
    -------
    df : pandas.DataFrame
        One row per document that exists on disk, with columns ``ID`` and
        ``text``, in ascending ID order.
    keyword : pandas.DataFrame
        The chem/crop/pest keyword sheets stacked vertically; each row is
        one group of synonyms, read without a header row.

    Notes
    -----
    IDs without a matching file are skipped on purpose. Any other I/O or
    decoding problem is raised instead of being silently ignored.
    ``TrainLabel.csv`` is not loaded here because the labels are not part
    of the returned values.
    """
    # 1. Text files: collect the records first and build the frame once,
    #    instead of growing a DataFrame row by row.
    records = []
    for doc_id in range(max_id):
        txt_path = path + txtpath + str(doc_id) + ".txt"
        try:
            # The context manager guarantees the handle is closed.
            with open(txt_path, "r", encoding=encoding) as f:
                records.append({"ID": doc_id, "text": f.read()})
        except FileNotFoundError:
            # Document IDs are not contiguous; a missing file is expected.
            continue
    df = pd.DataFrame(records, columns=["ID", "text"])

    # 2. Keyword sheets: one synonym group per row, no header row.
    keyword_files = ("02chem.list.xlsx", "02crop.list.xlsx", "02pest.list.xlsx")
    frames = [
        pd.read_excel(path + "Keywords/" + name, header=None, index_col=False)
        for name in keyword_files
    ]
    keyword = pd.concat(frames, axis=0)

    return df, keyword

In [4]:
# Replace every keyword synonym with the longest synonym of its group
def replace_keyword(df, keyword):
    """Normalise keyword synonyms in ``df["text"]`` to one canonical form.

    Each row of ``keyword`` is treated as one group of synonyms (null cells
    are ignored). Within a group the synonyms are ordered by descending
    length; the first (longest) one is the canonical form and every other
    synonym found in a document is replaced by it. Groups are processed in
    descending order of their canonical form's length.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain a ``text`` column of strings. The column is overwritten
        on this same object.
    keyword : pandas.DataFrame
        One synonym group per row.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object, with the ``text`` column rewritten.

    Notes
    -----
    NOTE(review): earlier comments described the canonical form as the
    *shortest* synonym, but the sort order selects the longest one; the
    existing behaviour is kept here. Because replacements are plain substring
    substitutions, a synonym that is itself contained in its canonical form
    will also match inside text that already holds the canonical form.
    """
    # Build the synonym groups: longest synonym first inside each group.
    groups = []
    for _, row in keyword.iterrows():
        # str() lets numeric cells take part; empty strings are dropped
        # because replacing "" would insert text between every character.
        synonyms = [str(cell) for cell in row if not pd.isnull(cell)]
        synonyms = [s for s in synonyms if s]
        if not synonyms:
            # A row with no usable cell has no canonical form; skip it.
            continue
        synonyms.sort(key=len, reverse=True)
        groups.append(synonyms)
    groups.sort(key=lambda group: len(group[0]), reverse=True)

    def _normalise(text):
        # Apply every group's replacements to one document, in group order.
        for canonical, *aliases in groups:
            for alias in aliases:
                text = text.replace(alias, canonical)
        return text

    # Assign the whole column at once: avoids chained indexing
    # (df["text"][i] = ...), which is unreliable and a no-op under
    # pandas copy-on-write.
    df["text"] = [_normalise(text) for text in df["text"]]
    return df

In [5]:
# jieba Dictionary
def jieba_dict(df):
    """Tokenise every document with jieba and build a gensim dictionary.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain a ``text`` column. A ``jieba`` column holding each
        document's token list is added to (or overwritten on) this object.

    Returns
    -------
    df : pandas.DataFrame
        The same object, now with the ``jieba`` column.
    all_list : list of list of str
        The token lists of all documents, in row order.
    dictionary : gensim.corpora.Dictionary
        Dictionary built from ``all_list``.
    """
    # jieba.cut returns a generator; materialise it so the tokens can be
    # stored and iterated more than once.
    tokenized = [list(jieba.cut(text)) for text in df["text"]]

    # Build the column in one assignment. An explicit object Series keeps
    # each token list as a single cell value, which cell-by-cell .loc
    # assignment of a list does not reliably do.
    df["jieba"] = pd.Series(tokenized, index=df.index, dtype=object)

    all_list = df["jieba"].values.tolist()

    # Build the Dictionary
    dictionary = corpora.Dictionary(all_list)
    return df, all_list, dictionary

In [6]:
# Prediction
def prediction(df, all_list, dictionary, threshold=0.6):
    """Pair up documents whose TF-IDF similarity exceeds a threshold.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain the columns ``ID`` and ``jieba`` (token list per row),
        in the same row order as ``all_list``.
    all_list : list of list of str
        Token lists of all documents; used as the corpus for the TF-IDF
        model and the similarity index.
    dictionary : gensim.corpora.Dictionary
        Dictionary covering the tokens of ``all_list``.
    threshold : float, default 0.6
        A pair is reported when ``threshold < similarity < 1``.

    Returns
    -------
    pandas.DataFrame
        The frame read from ``submission_example.csv`` under the module-level
        ``train_path``, with the ``Test`` / ``Reference`` columns filled, one
        row per reported pair, starting at row label 0.
        NOTE(review): any template rows beyond the last written pair are
        left as they were read from the file - confirm that is intended.
    """
    # Submission.csv template
    submission = pd.read_csv(train_path+"submission_example.csv")

    # The corpus, TF-IDF model and similarity index do not depend on the
    # query document, so build them once instead of once per document.
    corpus = [dictionary.doc2bow(tokens) for tokens in all_list]
    tfidf = models.TfidfModel(corpus)
    num_features = len(dictionary.token2id)
    index = similarities.SparseMatrixSimilarity(tfidf[corpus], num_features=num_features)

    ids = df["ID"].tolist()
    token_lists = df["jieba"].tolist()

    row = 0
    for k, tokens in enumerate(token_lists):
        # Similarity of document k against every document in the corpus.
        sim = index[tfidf[dictionary.doc2bow(tokens)]]
        for s, score in enumerate(sim):
            # Scores of exactly 1 are excluded by the strict upper bound;
            # the ID comparison excludes pairing a document with itself.
            if threshold < score < 1 and str(ids[k]) != str(ids[s]):
                submission.loc[row, "Test"] = ids[k]
                submission.loc[row, "Reference"] = ids[s]
                row += 1
    return submission

In [7]:
# Load the stage-2 documents (ID + text) and the stacked keyword synonym tables.
df, keyword = read_text(test_path, test_txtpath)

In [8]:
# Normalise keyword synonyms inside df["text"]; note that df is rebound to the returned frame.
df = replace_keyword(df, keyword)

In [9]:
# Tokenise each document with jieba and build the gensim dictionary over all token lists.
df, all_list, dictionary = jieba_dict(df)

Building prefix dict from the default dictionary ...
Loading model from cache /var/folders/2s/tlvgbrd16jx_vcmf7z8m3sbm0000gn/T/jieba.cache
Loading model cost 0.416 seconds.
Prefix dict has been built successfully.


In [10]:
# Compute pairwise TF-IDF similarities and collect the (Test, Reference) ID pairs.
submission = prediction(df, all_list, dictionary)

In [11]:
# Display the resulting pairs table.
submission

Unnamed: 0,Test,Reference,Unnamed: 2,Unnamed: 3,Unnamed: 4,Unnamed: 5,Unnamed: 6
0,13.0,410.0,,,,,
1,24.0,31.0,,,,,
2,24.0,50.0,,,,,
3,24.0,489.0,,,,,
4,24.0,499.0,,,,,
...,...,...,...,...,...,...,...
523,1349.0,1318.0,,,,,
524,1353.0,1344.0,,,,,
525,1398.0,331.0,,,,,
526,1398.0,427.0,,,,,


In [12]:
# Write the pairs table to result.csv under test_path, without the index column.
submission.to_csv(test_path+"result.csv", index=False)