In [2]:
import numpy as np
import pandas as pd
pd.set_option('display.max_columns', None)
import matplotlib.pyplot as plt
# import seaborn as sns
import pickle
from time import time
import gc
from tqdm import tqdm, tqdm_notebook

%matplotlib inline

# Display the value of every top-level expression in a cell, not only the last one
from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = "all"

# Default figure size (width, height) for all plots
from pylab import rcParams
rcParams['figure.figsize'] = 14, 6

# Suppress every warning for the rest of the session
import warnings
warnings.filterwarnings("ignore")

# Chinese font support for matplotlib
import matplotlib
# NOTE(review): pyplot is already imported and `%matplotlib inline` is enabled earlier
# in this cell, so this backend switch may be ineffective or conflict with inline
# rendering — confirm it is still needed.
matplotlib.use('qt4agg')
# Set the default font
matplotlib.rcParams['font.sans-serif'] = ['SimHei']
matplotlib.rcParams['font.family'] = 'sans-serif'
# Fix the minus sign '-' being rendered as a box
matplotlib.rcParams['axes.unicode_minus'] = False

In [3]:
# Load the training and test sets from CSV files under ./data
train = pd.read_csv('./data/train.csv')
test = pd.read_csv('./data/test.csv')

# Show the (rows, columns) of each frame as cell output
train.shape
test.shape

(1306122, 3)

(56370, 2)

In [4]:
# Add a `target` column filled with the placeholder -1 to `test` (mutates `test` in place)
test['target'] = -1
# Stack train and test row-wise into one frame.
# NOTE(review): no ignore_index is passed, so each frame keeps its own index labels and
# labels may repeat in `df` — confirm downstream code does not rely on unique labels.
df = pd.concat([train, test], axis=0)

df.shape

(1362492, 3)

In [5]:
import gensim
import nltk
import re

In [6]:
# English stop-word list used by sent2words to drop common function words.
# NOTE(review): several entries contain apostrophes (e.g. "aren't"); confirm that the
# tokens they are compared against still carry apostrophes at that point.
stopwords = ["a","about","above","after","again","against","all","am","an","and","any","are","aren't","as","at","be","because","been","before","being","below","between","both","but","by","can't","cannot","could","couldn't","did","didn't","do","does","doesn't","doing","don't","down","during","each","few","for","from","further","had","hadn't","has","hasn't","have","haven't","having","he","he'd","he'll","he's","her","here","here's","hers","herself","him","himself","his","how","how's","i","i'd","i'll","i'm","i've","if","in","into","is","isn't","it","it's","its","itself","let's","me","more","most","mustn't","my","myself","no","nor","not","of","off","on","once","only","or","other","ought","our","ours","ourselves","out","over","own","same","shan't","she","she'd","she'll","she's","should","shouldn't","so","some","such","than","that","that's","the","their","theirs","them","themselves","then","there","there's","these","they","they'd","they'll","they're","they've","this","those","through","to","too","under","until","up","very","was","wasn't","we","we'd","we'll","we're","we've","were","weren't","what","what's","when","when's","where","where's","which","while","who","who's","whom","why","why's","with","won't","would","wouldn't","you","you'd","you'll","you're","you've","your","yours","yourself","yourselves"]
# Number of stop words in the list
len(stopwords)

174

In [7]:
# Drop stop words, empty tokens and number-like tokens.
# Compiled once at definition time instead of once per token.
_NUMBER_PATTERN = re.compile(r'[1-9]\d*\.\d*|0\.\d*[1-9]|[1-9]\d*')


def sent2words(sentence):
    """Return the normalised tokens of `sentence` with noise tokens removed.

    The sentence is first passed through `wordNormal`; from its result a token
    is kept only if it is non-empty, is not in the module-level `stopwords`
    collection, and contains no substring matching the number pattern.

    Parameters
    ----------
    sentence : str
        Raw text to tokenise and filter.

    Returns
    -------
    list of str
        The surviving tokens, in their original order (duplicates preserved).

    Notes
    -----
    The number pattern needs a non-zero digit to match, so a token made only
    of zeros (e.g. "0") is not treated as a number and is kept.
    """
    # A set gives constant-time membership tests; it is rebuilt on every call so
    # later edits to `stopwords` are still honoured.
    stopword_set = set(stopwords)
    return [
        w for w in wordNormal(sentence)
        if w != '' and w not in stopword_set and _NUMBER_PATTERN.search(w) is None
    ]

In [8]:
# Map a POS tag to the matching WordNet part-of-speech constant
def get_wordnet_pos(tag):
    """Translate a POS tag into a WordNet part-of-speech constant.

    The tag is matched on its first letter: 'J' -> ADJ, 'V' -> VERB,
    'N' -> NOUN, 'R' -> ADV. Any other tag yields None.
    """
    # Attribute names are resolved lazily so `nltk.corpus.wordnet` is only
    # touched when a prefix actually matches.
    prefix_to_attr = (
        ('J', 'ADJ'),
        ('V', 'VERB'),
        ('N', 'NOUN'),
        ('R', 'ADV'),
    )
    for prefix, attr_name in prefix_to_attr:
        if tag.startswith(prefix):
            return getattr(nltk.corpus.wordnet, attr_name)
    return None

def wordNormal(sentence):
    """Strip punctuation from `sentence`, tokenise it and lemmatise each token.

    Steps:
      1. Delete runs of the punctuation characters listed in the pattern below.
      2. Replace every '-' with a space so hyphenated words split apart.
      3. Tokenise with `nltk.word_tokenize` and tag with `nltk.pos_tag`.
      4. Lemmatise each token with `WordNetLemmatizer`, using the part of speech
         from `get_wordnet_pos` and falling back to NOUN when it returns None.

    Parameters
    ----------
    sentence : str
        Raw input text.

    Returns
    -------
    list of str
        One lemma per token, in token order.
    """
    # Raw string: the previous non-raw literal relied on invalid escape sequences
    # (\. \! \/ \°), which Python reports as deprecated. The compiled pattern is
    # equivalent.
    sentence = re.sub(r"[+\.\!\/_,$%^*(+\"\'\°]+|[+——():~]+", '', sentence)
    sentence = sentence.replace('-', ' ')
    tagged_tokens = nltk.pos_tag(nltk.word_tokenize(sentence))

    lemmatizer = nltk.stem.WordNetLemmatizer()
    return [
        lemmatizer.lemmatize(word, pos=get_wordnet_pos(tag) or nltk.corpus.wordnet.NOUN)
        for word, tag in tagged_tokens
    ]

In [9]:
# Smoke-test the tokenising/filtering pipeline on a sample abstract; the returned token list is shown as cell output
sent2words('in this paper, we study the temperature sensitivity of an electrical tracing-assisted dual-microring optical sensor, which consists of a sensing ring to detect the refractive index change on its surface and a tracing ring to trace the resonance wavelength shift of the sensing ring by the thermo-optic effect with a heating electrode on it. the wavelength shift measurement is therefore changed to electrical power variation measurement. thanks to the real-time compensation effect of the tracing ring, the temperature dependence of the sensor is found to be intrinsically low. the resonance wavelength temperature sensitivity difference between the two rings is measured to be as low as 10.1 pm/°c, showing that the temperature dependence of the sensor in terms of wavelength per degree is reduced by ∼6 times compared to that of a single ring sensor. the temperature sensitivity of the sensor in terms of electrical power per degree is measured to be -0.077 mw/°c. by using tracing ring with enhanced tuning efficiency, this value can be further decreased to -0.0057 w/°c. the experimental results agree well with the expectation. this type of sensors with low temperature dependence has great potential to be deployed in various practical point-of-care diagnostic applications.')

['paper',
 'study',
 'temperature',
 'sensitivity',
 'electrical',
 'tracing',
 'assist',
 'dual',
 'microring',
 'optical',
 'sensor',
 'consist',
 'sense',
 'ring',
 'detect',
 'refractive',
 'index',
 'change',
 'surface',
 'trace',
 'ring',
 'trace',
 'resonance',
 'wavelength',
 'shift',
 'sense',
 'ring',
 'thermo',
 'optic',
 'effect',
 'heating',
 'electrode',
 'wavelength',
 'shift',
 'measurement',
 'therefore',
 'change',
 'electrical',
 'power',
 'variation',
 'measurement',
 'thanks',
 'real',
 'time',
 'compensation',
 'effect',
 'trace',
 'ring',
 'temperature',
 'dependence',
 'sensor',
 'find',
 'intrinsically',
 'low',
 'resonance',
 'wavelength',
 'temperature',
 'sensitivity',
 'difference',
 'two',
 'ring',
 'measure',
 'low',
 'pmc',
 'show',
 'temperature',
 'dependence',
 'sensor',
 'term',
 'wavelength',
 'per',
 'degree',
 'reduce',
 'time',
 'compare',
 'single',
 'ring',
 'sensor',
 'temperature',
 'sensitivity',
 'sensor',
 'term',
 'electrical',
 'power',
