In [1]:
import warnings
warnings.simplefilter('ignore')
import json
import pandas as pd
import gc
from tqdm.notebook import tqdm
from ltp import LTP
import re
import jieba
import jieba.analyse
jieba.analyse.set_stop_words("./data/hit_stopwords.txt")

In [2]:
# Annotation JSON files, one per data split.
train_text_path = './data/annotations/labeled.json'
test_text_path = './data/annotations/test_a.json'
unlabeled_text_path = './data/annotations/unlabeled.json'
# Output prefix for intermediate pickles. load_text_data concatenates it directly
# with the file name, so it has to end with a path separator.
save_path = './temp_data/'

In [3]:
# Concatenate title / OCR / ASR text into model-ready text columns
def load_text_data(file_path, file_type, save_path):
    """Load an annotation JSON file, build text columns and pickle the result.

    The JSON must load into a DataFrame that has ``id``, ``title``, ``asr`` and
    ``ocr`` columns (plus ``category_id`` for ``'train'``), where every ``ocr``
    value is a list of dicts carrying a ``text`` key.

    Columns produced:
      * ``asr_ocr_text`` - ASR text followed by OCR text, each limited to its
        first 64 and last 64 characters. Texts shorter than 128 characters are
        kept as-is (the tail is taken only from what follows the head, so short
        texts are no longer duplicated).
      * ``all_text`` - title + ASR + OCR text, untruncated.

    Args:
        file_path: Path of the annotation JSON file.
        file_type: ``'train'``, ``'test'`` or ``'unlabeled'``. Selects the
            columns that are kept and names the output file.
        save_path: Prefix that is string-concatenated with
            ``file_type + '_text.pkl'``; a directory therefore needs a
            trailing separator. Missing directories are created.

    Returns:
        None. The frame is written to disk and its row count is printed.

    Raises:
        ValueError: If ``file_type`` is not one of the supported values.
    """
    import os  # local import so the cell works without editing the import cell

    base_columns = ['id', 'title', 'asr_ocr_text', 'all_text']
    columns_by_type = {
        'train': base_columns + ['category_id'],
        'test': base_columns,
        'unlabeled': base_columns,
    }
    # Fail fast: previously an unknown type silently pickled the whole frame.
    if file_type not in columns_by_type:
        raise ValueError(
            "file_type must be one of %s, got %r" % (sorted(columns_by_type), file_type)
        )

    with open(file_path, 'r', encoding='utf-8') as f:
        df_text = pd.DataFrame(json.load(f))

    # Flatten each sample's OCR segments into one comma-separated string.
    df_text['ocr_text'] = [
        ', '.join(item['text'] for item in ocr) for ocr in tqdm(df_text['ocr'])
    ]

    def head_tail(series, width=64):
        # Take the tail from the remainder after the head so that the two
        # slices never overlap.
        head = series.str[:width].astype(str)
        tail = series.str[width:].str[-width:].astype(str)
        return head + tail

    df_text['asr_ocr_text'] = head_tail(df_text['asr']) + head_tail(df_text['ocr_text'])
    df_text['all_text'] = (
        df_text['title'].astype(str)
        + df_text['asr'].astype(str)
        + df_text['ocr_text'].astype(str)
    )
    df_text = df_text[columns_by_type[file_type]]

    out_file = save_path + file_type + '_text.pkl'
    out_dir = os.path.dirname(out_file)
    if out_dir:
        os.makedirs(out_dir, exist_ok=True)
    df_text.to_pickle(out_file)
    print(len(df_text))
    gc.collect()

In [4]:
# Build and pickle the text columns for the labeled and test splits.
for split_name, split_path in (('train', train_text_path), ('test', test_text_path)):
    load_text_data(split_path, file_type=split_name, save_path=save_path)
# load_text_data(unlabeled_text_path, file_type='unlabeled', save_path=save_path)

  0%|          | 0/100000 [00:00<?, ?it/s]

100000


  0%|          | 0/25000 [00:00<?, ?it/s]

25000


In [5]:
def stopwords_list(filepath):
    """Read a UTF-8 stop-word file into a list, one entry per line.

    Each line is whitespace-stripped. A newline and a single space are appended
    as two extra entries so that they are filtered out as well.

    Args:
        filepath: Path of the stop-word text file.

    Returns:
        list[str]: The stripped lines in file order, followed by ``"\\n"`` and ``" "``.
    """
    # The context manager closes the handle; the previous version leaked it.
    with open(filepath, 'r', encoding='utf-8') as stopword_file:
        stopwords = [line.strip() for line in stopword_file]
    stopwords.extend(["\n", " "])
    return stopwords

def get_clean_text(text_list, stopwords):
    """Strip every text down to CJK characters and a few full-width punctuation marks.

    Each text is stripped of surrounding whitespace, every character outside
    the regex whitelist is removed, and then every remaining character that
    appears in ``stopwords`` is dropped. Filtering is per character, so only
    single-character stop-word entries can ever match here.

    Args:
        text_list: Iterable of strings to clean.
        stopwords: Collection of stop-word strings.

    Returns:
        list[str]: One cleaned string per input text, in input order.
    """
    # Built once up front: set lookup replaces a scan of the whole stop-word
    # list for every single character.
    stopword_set = set(stopwords)
    disallowed = re.compile("[^\u4e00-\u9fa5。？．，！：]")
    clean_text_list = []
    for text in tqdm(text_list):
        kept = disallowed.sub("", text.strip())
        clean_text_list.append(''.join(ch for ch in kept if ch not in stopword_set))
    return clean_text_list

In [6]:
# Reload the frames written by load_text_data. The paths are built from save_path
# so this cell always reads from the location that step wrote to.
df_train = pd.read_pickle(save_path + 'train_text.pkl')
df_test = pd.read_pickle(save_path + 'test_text.pkl')
train_text = df_train['all_text'].tolist()
test_text = df_test['all_text'].tolist()
len(train_text), len(test_text)

(100000, 25000)

In [7]:
# Load the stop-word list once; it is reused for both splits.
stopwords = stopwords_list('./data/hit_stopwords.txt')

# Character-level cleaning of the concatenated text, one split at a time.
train_clean_text = get_clean_text(train_text, stopwords)
df_train['clean_text'] = train_clean_text

test_clean_text = get_clean_text(test_text, stopwords)
df_test['clean_text'] = test_clean_text

  0%|          | 0/100000 [00:00<?, ?it/s]

  0%|          | 0/25000 [00:00<?, ?it/s]

In [8]:
# Load the LTP model from the local ./ltp-model-small directory. The resulting
# module-level `ltp` object is the one get_clean_word_list calls `.seg` on.
ltp = LTP(path='./ltp-model-small')
def get_clean_word_list(text_list):
    """Segment each text with LTP and drop the tokens found in the stop-word list.

    Depends on two module-level names that must exist before the call:
    ``ltp`` (the loaded model) and ``stopwords`` (the stop-word collection).

    Args:
        text_list: Iterable of strings; each one is segmented on its own.

    Returns:
        list[list]: For every input text, the tokens that are not stop words.
    """
    # Built once up front: set lookup replaces a scan of the whole stop-word
    # list for every token.
    stopword_set = set(stopwords)
    clean_word_lists = []
    for text in tqdm(text_list):
        # A one-element batch goes in; [0][0] picks the first item of the first
        # returned element - presumably the token list of that single text.
        words = ltp.seg([text])[0][0]
        clean_word_lists.append([word for word in words if word not in stopword_set])
    return clean_word_lists

file ./ltp-model-small\config.json not found
file ./ltp-model-small\config.json not found


In [9]:
# Word-level segmentation of the cleaned text; each split's column is attached
# as soon as its segmentation finishes.
train_clean_word_list = get_clean_word_list(train_clean_text)
df_train['clean_word'] = train_clean_word_list

test_clean_word_list = get_clean_word_list(test_clean_text)
df_test['clean_word'] = test_clean_word_list

  0%|          | 0/100000 [00:00<?, ?it/s]

  0%|          | 0/25000 [00:00<?, ?it/s]

In [10]:
def get_topK_word(text_list, topK):
    """Run jieba's TextRank keyword extraction over every text.

    Args:
        text_list: Iterable of texts to extract keywords from.
        topK: Forwarded to ``jieba.analyse.textrank`` as ``topK``.

    Returns:
        list: One ``textrank`` result per input text, in input order.
    """
    # Part-of-speech tags passed to textrank as ``allowPOS``.
    pos_filter = ('n','nz','v','vd','vn','l','a','d')
    return [
        jieba.analyse.textrank(document, topK=topK, allowPOS=pos_filter)
        for document in tqdm(text_list)
    ]

In [11]:
# Top-20 TextRank keywords, extracted from the raw concatenated text of each split.
train_key_word = get_topK_word(train_text, topK=20)
df_train['keywords'] = train_key_word

test_key_word = get_topK_word(test_text, topK=20)
df_test['keywords'] = test_key_word

  0%|          | 0/100000 [00:00<?, ?it/s]

Building prefix dict from the default dictionary ...
Loading model from cache C:\Users\B302\AppData\Local\Temp\jieba.cache
Loading model cost 0.620 seconds.
Prefix dict has been built successfully.


  0%|          | 0/25000 [00:00<?, ?it/s]

In [12]:
# Show the columns present on each frame at this point.
for frame in (df_train, df_test):
    print(frame.columns)

Index(['id', 'title', 'asr_ocr_text', 'all_text', 'category_id', 'clean_text',
       'clean_word', 'keywords'],
      dtype='object')
Index(['id', 'title', 'asr_ocr_text', 'all_text', 'clean_text', 'clean_word',
       'keywords'],
      dtype='object')


In [13]:
# Checkpoint both enriched frames to disk.
for frame, pickle_path in (
    (df_train, "./temp_data/new_train_text.pkl"),
    (df_test, "./temp_data/new_test_text.pkl"),
):
    frame.to_pickle(pickle_path)

In [14]:
# Reload the checkpointed frames (train first, then test).
df_train, df_test = map(
    pd.read_pickle,
    ("./temp_data/new_train_text.pkl", "./temp_data/new_test_text.pkl"),
)

In [15]:
# Row counts of the reloaded frames, one per line.
print(len(df_train), len(df_test), sep='\n')

100000
25000


In [19]:
def concat_keyword(x):
    """Join an iterable of keyword strings into one space-separated string."""
    separator = ' '
    return separator.join(x)


def get_title_top20(df):
    """Add ``keyword_sentence`` and ``title_top20`` columns to ``df`` in place.

    ``keyword_sentence`` is the space-joined ``keywords`` entry of each row, and
    ``title_top20`` is the title, the literal ``'[CLS]'`` and that keyword
    sentence concatenated.

    Args:
        df: DataFrame with ``title`` and ``keywords`` columns; it is mutated.

    Returns:
        The same DataFrame object that was passed in.
    """
    keyword_sentences = df['keywords'].apply(concat_keyword)
    df['keyword_sentence'] = keyword_sentences
    titles = df['title'].astype(str)
    df['title_top20'] = titles + '[CLS]' + keyword_sentences.astype(str)
    return df

In [20]:
# Attach the keyword sentence and the title+keywords text to both frames.
df_train, df_test = map(get_title_top20, (df_train, df_test))

In [22]:
# Overwrite the checkpoint files with the frames that now carry the extra columns.
for frame, pickle_path in (
    (df_train, "./temp_data/new_train_text.pkl"),
    (df_test, "./temp_data/new_test_text.pkl"),
):
    frame.to_pickle(pickle_path)

In [23]:
# Display the training frame's columns as the cell output.
df_train.columns

Index(['id', 'title', 'asr_ocr_text', 'all_text', 'category_id', 'clean_text',
       'clean_word', 'keywords', 'keyword_sentence', 'title_top20'],
      dtype='object')