In [1]:
import pandas as pd

# Load the training CSV, reading every column as a string (dtype=str).
dataset = pd.read_csv('../data/task2_trainset.csv', dtype=str)
# Preview the first rows of the loaded frame.
dataset.head()

Unnamed: 0,Id,Title,Abstract,Authors,Categories,Created Date,Task 2
0,D00001,A Brain-Inspired Trust Management Model to Ass...,Rapid popularity of Internet of Things (IoT) a...,Mahmud/Kaiser/Rahman/Rahman/Shabut/Al-Mamun/Hu...,cs.CR/cs.AI/q-bio.NC,2018-01-11,THEORETICAL
1,D00002,On Efficient Computation of Shortest Dubins Pa...,"In this paper, we address the problem of compu...",Sadeghi/Smith,cs.SY/cs.RO/math.OC,2016-09-21,THEORETICAL
2,D00003,Data-driven Upsampling of Point Clouds,High quality upsampling of sparse 3D point clo...,Zhang/Jiang/Yang/Yamakawa/Shimada/Kara,cs.CV,2018-07-07,ENGINEERING
3,D00004,Accessibility or Usability of InteractSE? A He...,Internet is the main source of information now...,Aqle/Khowaja/Al-Thani,cs.HC,2018-08-29,EMPIRICAL
4,D00005,Spatio-Temporal Facial Expression Recognition ...,Automated Facial Expression Recognition (FER) ...,Hasani/Mahoor,cs.CV,2017-03-20,ENGINEERING


In [2]:
# Remove the four metadata columns that are not used downstream.
# A single non-mutating drop (instead of four in-place ones) rebinds `dataset`
# without modifying the frame object that was loaded and displayed earlier.
dataset = dataset.drop(columns=['Title', 'Categories', 'Created Date', 'Authors'])

In [3]:
# Preview the frame again to check which columns are left.
dataset.head()

Unnamed: 0,Id,Abstract,Task 2
0,D00001,Rapid popularity of Internet of Things (IoT) a...,THEORETICAL
1,D00002,"In this paper, we address the problem of compu...",THEORETICAL
2,D00003,High quality upsampling of sparse 3D point clo...,ENGINEERING
3,D00004,Internet is the main source of information now...,EMPIRICAL
4,D00005,Automated Facial Expression Recognition (FER) ...,ENGINEERING


In [4]:
from sklearn.utils import shuffle
from sklearn.model_selection import train_test_split

# Hold out 10% of the rows as a validation set; the fixed random_state makes
# the split repeatable.
trainset, validset = train_test_split(dataset, test_size=0.1, random_state=42)

# Write both splits to CSV (without the index column) so they can be loaded by path.
trainset.to_csv('trainset.csv', index=False)
validset.to_csv('validset.csv', index=False)

In [5]:
# Build the test-set CSV, dropping the same four metadata columns that were
# removed from the training data.
# NOTE: this rebinds `dataset` to the test data, so re-running the train/valid
# split cell after this one would split the test set instead of the training set.
dataset = pd.read_csv('../data/task2_public_testset.csv', dtype=str).drop(
    columns=['Title', 'Categories', 'Created Date', 'Authors']
)
dataset.to_csv('testset.csv', index=False)

In [6]:
from transformers import BertTokenizer, BertForMaskedLM

# Name of the pretrained checkpoint handed to BertTokenizer.from_pretrained.
PRETRAINED_MODEL_NAME = 'bert-base-uncased'

# Presumably the number of target labels — TODO confirm against the task.
# NOTE(review): the name is spelled "LABLES" (sic); renaming it would require
# updating every later use, which is not visible from here.
NUM_LABLES = 4

In [7]:
# Load the tokenizer that belongs to the pretrained checkpoint.
tokenizer = BertTokenizer.from_pretrained(PRETRAINED_MODEL_NAME)

# Report how many entries the tokenizer vocabulary has
# (the printed label is Chinese for "vocabulary size").
vocab = tokenizer.vocab
print("字典大小：", len(vocab))

字典大小： 30522


In [8]:
# Look up the id of one raw, un-tokenized string.
# NOTE(review): the recorded output is 100. With an uncased checkpoint the
# capitalized "The" is presumably not in the vocabulary, so this looks like the
# unknown-token id — confirm (e.g. compare with tokenizer.unk_token_id).
tokenizer.convert_tokens_to_ids("The")

100

In [9]:
from tqdm import tqdm_notebook as tqdm
from multiprocessing import Pool
        
def sentence_to_indices(sentence, tokenizer):
    """Tokenize one sentence and map every token to its vocabulary id.

    Args:
        sentence (str): Text to encode.
        tokenizer: Object exposing ``tokenize(str)`` and
            ``convert_tokens_to_ids(token)``.

    Returns:
        list: One id per token, in token order.
    """
    token_ids = []
    for token in tokenizer.tokenize(sentence):
        # Tokens are converted one at a time, so each entry is whatever the
        # tokenizer returns for a single token.
        token_ids.append(tokenizer.convert_tokens_to_ids(token))
    return token_ids
    
def get_dataset(data_path, tokenizer, n_workers=4):
    """Read a CSV and preprocess its rows in parallel worker processes.

    The rows are cut into ``n_workers`` contiguous slices of equal size (the
    last slice also takes the remainder), each slice is handed to
    ``preprocess_samples`` in a process pool, and the per-slice outputs are
    concatenated back in slice order.

    Args:
        data_path (str): Path of the CSV file to load (all columns read as str).
        tokenizer: Tokenizer forwarded to ``preprocess_samples``.
        n_workers (int): Number of slices and of pool processes.

    Returns:
        list: Concatenation of the lists returned by the workers.
    """
    frame = pd.read_csv(data_path, dtype=str)

    pending = []
    with Pool(processes=n_workers) as pool:
        n_rows = len(frame)
        chunk_size = n_rows // n_workers
        for worker_idx in range(n_workers):
            start = chunk_size * worker_idx
            # The final worker absorbs the rows left over by integer division.
            if worker_idx == n_workers - 1:
                stop = n_rows
            else:
                stop = chunk_size * (worker_idx + 1)
            job = pool.apply_async(preprocess_samples, args=(frame[start:stop], tokenizer))
            pending.append(job)

        # Stop accepting work and wait until every submitted slice is done.
        pool.close()
        pool.join()

    merged = []
    for job in pending:
        merged.extend(job.get())
    return merged

def preprocess_samples(dataset, tokenizer):
    """Worker entry point: preprocess every row of a DataFrame slice.

    Args:
        dataset (pandas.DataFrame): Rows to process; iterated with ``iterrows``.
        tokenizer: Tokenizer forwarded to ``preprocess_sample``.

    Returns:
        list: One ``preprocess_sample`` result per row, in row order.
    """
    rows = tqdm(dataset.iterrows(), total=len(dataset))
    # iterrows yields (index, row) pairs; only the row itself is needed.
    return [preprocess_sample(row, tokenizer) for _, row in rows]

def preprocess_sample(data, tokenizer):
    """Encode one row's abstract as a single flat list of token ids.

    The ``'Abstract'`` field is split on the ``'$$$'`` separator, every piece
    is encoded with ``sentence_to_indices``, and the results are concatenated
    in order.

    Args:
        data: Mapping-like row (e.g. a pandas Series) whose ``'Abstract'``
            entry is a str.
        tokenizer: Tokenizer forwarded to ``sentence_to_indices``.

    Returns:
        list: Token ids of all pieces, in order.
    """
    # A nested comprehension flattens in linear time, whereas
    # ``sum(list_of_lists, [])`` re-copies the accumulated list at every step.
    return [
        index
        for sent in data['Abstract'].split('$$$')
        for index in sentence_to_indices(sent, tokenizer)
    ]

In [10]:
# Encode each of the three CSV files written earlier. Every call reads its
# file and converts the abstracts to token ids using 4 worker processes.
print('[INFO] Start processing trainset...')
train = get_dataset('trainset.csv', tokenizer, n_workers=4)
print('[INFO] Start processing validset...')
valid = get_dataset('validset.csv', tokenizer, n_workers=4)
print('[INFO] Start processing testset...')
test = get_dataset('testset.csv', tokenizer, n_workers=4)

[INFO] Start processing trainset...




[INFO] Start processing validset...




[INFO] Start processing testset...






## TF-IDF

In [11]:
from sklearn.feature_extraction.text import TfidfVectorizer
    
def identity_tokenizer(text):
    """Return ``text`` untouched.

    Passed as the ``tokenizer`` callable of ``TfidfVectorizer`` so that
    documents which are already token sequences are used as they are.
    """
    return text

def handle_tfidf(datas):
    """Fit a TF-IDF model on already-tokenized documents.

    Args:
        datas: Iterable of documents, each one a sequence of tokens. The
            tokens are used as given (no re-tokenizing, no lowercasing).

    Returns:
        tuple: ``(vectorizer, matrix)`` — the fitted ``TfidfVectorizer`` and
        the result of ``fit_transform`` on ``datas``.
    """
    vectorizer = TfidfVectorizer(lowercase=False, tokenizer=identity_tokenizer)
    weight_matrix = vectorizer.fit_transform(datas)
    return vectorizer, weight_matrix

def get_tfidf(datas):
    """Compute a TF-IDF weight for every token occurrence in ``datas``.

    A TF-IDF model is fitted on ``datas`` itself, the resulting matrix is
    converted to a dense array, and the weights are read back token by token
    with ``data_fit_tfidf``.

    Args:
        datas: Sequence of documents, each one a sequence of tokens.

    Returns:
        list of list: For each document, the weights of its tokens in order.
    """
    fitted_vectorizer, weight_matrix = handle_tfidf(datas)
    # Densifying allocates a full documents-by-vocabulary array.
    dense_weights = weight_matrix.toarray()
    token_columns = fitted_vectorizer.vocabulary_
    return data_fit_tfidf(datas, dense_weights, token_columns)

def data_fit_tfidf(datas, tfidf, vocab):
    """Look up the TF-IDF weight of every token occurrence in every document.

    Args:
        datas: Sequence of documents, each an iterable of tokens.
        tfidf: 2-D weight matrix indexed ``[document, vocabulary column]``.
            A dense array is accepted, and so is a matrix whose rows expose
            ``toarray()`` (e.g. a SciPy sparse matrix), so callers are not
            forced to densify the whole matrix first.
        vocab: Mapping from token to its column in ``tfidf``.

    Returns:
        list of list: For each document, the weights of its tokens in token
        order.

    Raises:
        KeyError: If a token is missing from ``vocab``.
    """
    tfidfs = []
    for idx, data in enumerate(datas):
        # Fetch the document's row once instead of doing a 2-D lookup per token.
        row = tfidf[idx]
        if hasattr(row, 'toarray'):
            # Sparse row: densify only this single row for fast element access.
            row = row.toarray().ravel()
        tfidfs.append([row[vocab[word]] for word in data])
    return tfidfs

In [12]:
# Fit one TF-IDF model over all three splits, then cut the per-document results
# apart again. The cut points come from the actual split sizes, so they stay
# correct if the dataset size or the validation ratio changes.
tfidf = get_tfidf(train + valid + test)
n_train, n_valid = len(train), len(valid)
train_tfidf = tfidf[:n_train]
valid_tfidf = tfidf[n_train:n_train + n_valid]
test_tfidf = tfidf[n_train + n_valid:]
assert len(test_tfidf) == len(test), "TF-IDF rows do not line up with the splits"