# Import

In [1]:
import os
import re
import json
import time
import datetime
import random
import glob
import importlib

import numpy as np
import pandas as pd
import nltk
from tqdm import tqdm
import matplotlib.pyplot as plt
import seaborn as sns

random.seed(123)
np.random.seed(456)

# Hyper-parameters

In [2]:
MAX_LENGTH = 64 # max no. words for each sentence.
OVERLAP = 20 # if a sentence exceeds MAX_LENGTH, we split it to multiple sentences with overlapping

MAX_SAMPLE = None # set a small number for experimentation, set None for production.

# Load data

In [3]:
train_path = './input/train.csv'
paper_train_folder = './input/train'

paper_sample_folder='./input/test'
sample_sub = pd.read_csv('./input/sample_submission.csv')

train = pd.read_csv(train_path)
train = train[:MAX_SAMPLE]
print(f'No. raw training rows: {len(train)}')

No. raw training rows: 19661


Group by publication, training labels should have the same form as expected output.

In [4]:
train = train.groupby('Id').agg({
    'pub_title': 'first',
    'dataset_title': '|'.join,
    'dataset_label': '|'.join,
    'cleaned_label': '|'.join
}).reset_index()

print(f'No. grouped training rows: {len(train)}')

No. grouped training rows: 14316


In [5]:
def read_papers(paper_folder,data_df):
    papers = {}
    for paper_id in data_df['Id'].unique():
        with open(f'{paper_folder}/{paper_id}.json', 'r') as f:
            paper = json.load(f)
            papers[paper_id] = paper
    return papers

# Transform data to NER format

In [6]:

def clean_training_text(txt):
    """
    similar to the default clean_text function but without lowercasing.
    """
    return re.sub('[^A-Za-z0-9]+', ' ', str(txt)).strip()

def shorten_sentences(sentences):
    short_sentences = []
    for sentence in sentences:
        words = sentence.split()
        if len(words) > MAX_LENGTH:
            for p in range(0, len(words), MAX_LENGTH - OVERLAP):
                short_sentences.append(' '.join(words[p:p+MAX_LENGTH]))
        else:
            short_sentences.append(sentence)
    return short_sentences

def find_sublist(big_list, small_list):
    all_positions = []
    for i in range(len(big_list) - len(small_list) + 1):
        if small_list == big_list[i:i+len(small_list)]:
            all_positions.append(i)
    
    return all_positions

def tag_sentence(sentence, labels): # requirement: both sentence and labels are already cleaned
    sentence_words = sentence.split()  
    if labels is not None and any(re.findall(f'\\b{label}\\b', sentence)
                                  for label in labels): # positive sample
        nes = ['O'] * len(sentence_words)
        for label in labels:
            label_words = label.split()

            all_pos = find_sublist(sentence_words, label_words)
            for pos in all_pos:
                nes[pos] = 'B'
                for i in range(pos+1, pos+len(label_words)):
                    nes[i] = 'I'

        return True, list(zip(sentence_words, nes))
        
    else: # negative sample
        nes = ['O'] * len(sentence_words)
        return False, list(zip(sentence_words, nes))

In [7]:
#返回ner_data
cnt_pos, cnt_neg = 0, 0 # number of sentences that contain/not contain labels
ner_data = []
papers_train=read_papers(paper_train_folder,train)
pbar = tqdm(total=len(train))
for i, id, dataset_label in train[['Id', 'dataset_label']].itertuples():
    # paper
    paper = papers_train[id]
    
    # labels
    labels = dataset_label.split('|')
    labels = [clean_training_text(label) for label in labels]
    
    # sentences
    sentences = set([clean_training_text(sentence) for section in paper 
                 for sentence in section['text'].split('.') 
                ])
    sentences = shorten_sentences(sentences) # make sentences short
    sentences = [sentence for sentence in sentences if len(sentence) > 10] # only accept sentences with length > 10 chars
    
    # positive sample
    for sentence in sentences:
        is_positive, tags = tag_sentence(sentence, labels)
        if is_positive:
            cnt_pos += 1
            ner_data.append(tags)
        elif any(word in sentence.lower() for word in ['data', 'study']): 
            
            #ner_data.append(tags)
            cnt_neg += 1
    
    # process bar
    pbar.update(1)
    pbar.set_description(f"Training data size: {cnt_pos} positives + {cnt_neg} negatives")

# shuffling
random.shuffle(ner_data)

Training data size: 47202 positives + 514263 negatives: 100%|███████████████████| 14316/14316 [01:38<00:00, 163.48it/s]

In [8]:
words_list,nes_list=[],[]
for row in ner_data:
    words, nes = list(zip(*row))
    words_list.append(list(words))
    nes_list.append(list(nes))


In [9]:
#crf标注
import sklearn_crfsuite
from sklearn_crfsuite import CRF   # CRF的具体实现太过复杂，这里我们借助一个外部的库


def word2features(sent, i,pos_sent):
    """抽取单个字的特征"""
    word = sent[i]
    postag = pos_sent[i][1]

    features = {
        'bias': 1.0,
        'word.lower()': word.lower(),
        #提取单词后缀，比如ly结尾可能代表副词
        "word[-3:]": word[-3:],
        'word[-2:]': word[-2:],
        'word.isupper()': word.isupper(),
        'word.istitle()': word.istitle(),
        'word.isdigit()': word.isdigit(),
        'postag': postag,
        'postag[:2]': postag[:2],
    }
    if i > 0:
        #提取前一个单词特征
        word1 = sent[i-1]
        postag1 = pos_sent[i-1][1]
        features.update({
            '-1:word.lower()': word1.lower(),
            '-1:word.istitle()': word1.istitle(),
            '-1:word.isupper()': word1.isupper(),
            '-1:postag': postag1,
            '-1:postag[:2]': postag1[:2],
        })
    else:
        features['BOS'] = True

    if i < len(sent)-1:
        #提取后一个单词特征
        word1 = sent[i+1]
        postag1 = pos_sent[i+1][1]
        features.update({
            '+1:word.lower()': word1.lower(),
            '+1:word.istitle()': word1.istitle(),
            '+1:word.isupper()': word1.isupper(),
            '+1:postag': postag1,
            '+1:postag[:2]': postag1[:2],
        })
    else:
        features['EOS'] = True
    return features

def sent2features(sent):
    """抽取序列特征"""
    pos_sent=nltk.tag.pos_tag(sent)
    return [word2features(sent, i,pos_sent) for i in range(len(sent))]



In [10]:
%%time
X_train = [sent2features(s) for s in words_list]
y_train = nes_list


Training data size: 47202 positives + 514263 negatives: 100%|███████████████████| 14316/14316 [01:50<00:00, 163.48it/s]

Wall time: 1min 52s


In [11]:
%%time
crf = sklearn_crfsuite.CRF(
    algorithm='lbfgs',
    c1=0.1,
    c2=0.1,
    max_iterations=100,
    all_possible_transitions=True
)
crf.fit(X_train, y_train)

Wall time: 1min 7s


In [12]:

papers_sample=read_papers(paper_sample_folder,sample_sub)
sentences_sample={}

In [25]:
datasets_all=[]
def clean_text(txt):
    return re.sub('[^A-Za-z0-9]+', ' ', str(txt).lower()).strip()
for id in sample_sub['Id'].unique():
    # paper
    paper = papers_sample[id]
    
    # sentences
    sentences_sample = set([clean_training_text(sentence) for section in paper 
                 for sentence in section['text'].split('.') ])
    
    sentences_sample = shorten_sentences(sentences_sample) # make sentences short
    sentences_sample = [sentence for sentence in sentences_sample if len(sentence) > 10] 
    sentences_sample=[sentence.split(" ") for sentence in sentences_sample]
    features_sample=[sent2features(s) for s in sentences_sample]
    
    #预测并提取预测结果
    y_pred = crf.predict(features_sample)
    datasets=[]
    for i  in range(len(y_pred)):
        if "B" in y_pred[i]:
            dataset=""
            for j in range(len(y_pred[i])):
                if y_pred[i][j]=="B":
                    if dataset!="":
                        dataset=dataset.rstrip()
                        dataset=clean_text(dataset)
                        if dataset  not in datasets:
                            datasets.append(dataset)
                        dataset=""
                    dataset+=sentences_sample[i][j]+" "
                elif y_pred[i][j]=="I":
                    dataset+=sentences_sample[i][j]+" "
            if dataset!="":
                dataset=dataset.rstrip()
                dataset=clean_text(dataset)
                if dataset  not in datasets:
                    datasets.append(dataset)
    
    datasets="|".join(datasets)
    datasets_all.append(datasets)
    print(datasets)



alzheimer s disease neuroimaging initiative adni
trends in international mathematics and science study|education major field of study|nces|common core of data|trends in international mathematics
coastal change hazards|nc sea level rise risk management study|slosh safir|slosh meows|noaa storm surge inundation|las dataset data management in arcgis|slosh model
rural urban continuum codes


In [26]:
#string matchinf  return literal_preds 
all_labels = set()

for label_1, label_2, label_3 in train[['dataset_title', 'dataset_label', 'cleaned_label']].itertuples(index=False):
    all_labels.add(str(label_1).lower())
    all_labels.add(str(label_2).lower())
    all_labels.add(str(label_3).lower())
    
print(f'No. different labels: {len(all_labels)}')


def totally_clean_text(txt):
    txt = clean_text(txt)
    txt = re.sub(' +', ' ', txt)
    return txt
literal_preds = []

for paper_id in sample_sub['Id']:
    paper = papers_sample[paper_id]
    text_1 = '. '.join(section['text'] for section in paper).lower()
    text_2 = totally_clean_text(text_1)
    
    labels = set()
    for label in all_labels:
        if label in text_1 or label in text_2:
            labels.add(clean_text(label))
    
    literal_preds.append('|'.join(labels))

No. different labels: 638


In [27]:
for i in range(len(datasets_all)):
    if literal_preds[i]!="":
        if datasets_all[i]=="":
            datasets_all[i]=literal_preds[i]
        else:
            datasets_all[i]+="|"+literal_preds[i]
    datasets_all[i]=datasets_all[i].split("|")
    datasets_all[i]=list(set(datasets_all[i]))
    datasets_all[i]="|".join(datasets_all[i])

In [28]:
my_submission=pd.DataFrame({"Id":sample_sub['Id'],"PredictionString":datasets_all})

In [29]:
my_submission.to_csv('submission.csv', index=False)

In [30]:
datasets_all

['alzheimer s disease neuroimaging initiative adni|adni',
 'common core of data|nces|trends in international mathematics and science study|trends in international mathematics|education major field of study',
 'nc sea level rise risk management study|noaa storm surge inundation|slosh meows|slosh model|las dataset data management in arcgis|slosh safir|coastal change hazards|sea lake and overland surges from hurricanes',
 'rural urban continuum codes']