### 这部分对训练集和测试集做相同的预处理。
大概可以理解为特征工程

In [1]:
import pandas as pd
filepath = 'C:/Users\Administrator\Desktop/'

In [2]:
train_data = pd.read_csv(filepath+'train.csv')
test_data = pd.read_csv(filepath+'test.csv')

In [7]:
data_combine = pd.concat((train_data,test_data),axis=0)

In [10]:
data_combine.describe()

Unnamed: 0,author,id,text
count,19579,27971,27971
unique,3,27971,27971
top,EAP,id20584,"When age fell upon the world, and wonder went ..."
freq,7900,1,1


训练集也没有缺失值

In [12]:
import nltk

In [14]:
data_combine['word_split'] = data_combine.text.apply(lambda x:nltk.word_tokenize(x))

In [20]:
%time
data_combine['text_len'] = data_combine.word_split.apply(lambda x:len(x))

Wall time: 0 ns


In [27]:
# 添加第一个特征：句子长度
data_combine = data_combine.drop('word_len',axis=1)

In [37]:
# 添加第二个特征：标点符号个数
symbol = [',','.','?',':',"'"]
def get_symbol_num(x):
    s = 0
    for i in x:
        if i in symbol:
            s+=1
    return s
data_combine['symbol_num'] = data_combine.word_split.apply(lambda x:get_symbol_num(x))

In [38]:
# 添加第3-7个特征：各个标点符号的个数
def get_each_symbol_num(x,i):
    s = 0
    for j in x:
        if j==i:
            s+=1
    return s
for i in symbol:
    data_combine['symbol_num_'+i] = data_combine.word_split.apply(lambda x:get_each_symbol_num(x,i))

In [44]:
data_combine = data_combine.drop('symbol_num_"',axis=1)

In [50]:
from nltk import WordNetLemmatizer

In [58]:
# 添加第8特征：stopword词个数
%time
from nltk.corpus import stopwords
stopword = stopwords.words()
def get_num(x,y):
    s = 0
    for i in x:
        if i.lower() in y or WordNetLemmatizer().lemmatize(i.lower()) in y:
            s+=1
    return s
data_combine['stopword_num'] = data_combine.word_split.apply(lambda x:get_num(x,stopword))

Wall time: 0 ns


In [62]:
# 添加第9个特征：单词平均长度
def avg(x):
    s = 0
    l = 0
    for i in x:
        if i not in symbol:
            s+=len(i)
            l+=1
    return s/l
data_combine['word_mean_length'] = data_combine.word_split.apply(lambda x:avg(x))
%time

Wall time: 0 ns


In [67]:
# 添加第10个特征：字母个数
def char_sum(x):
    s = 0
    for i in x:
        if i not in symbol:
            s+=len(i)
    return s
data_combine['char_num'] = data_combine.word_split.apply(lambda x:char_sum(x))

In [69]:
# 添加第11个特征：不重复出现的单词个数
def unique(x):
    a = []
    for i in x:
        if i not in symbol:
            a.append(i)
    return(len(set(a)))
data_combine['unique_word'] = data_combine.word_split.apply(lambda x:unique(x))

接下来要处理挖掘出来的那些有区分性的单词，目前有下面这么三种想法
* 直接把其中有代表性的单词取出来做一个one-hot型的变量
* 把这些单词做一次SVD后放进去  
http://scikit-learn.org/stable/modules/generated/sklearn.decomposition.TruncatedSVD.html
* 先用一个简单模型针对这些单词特征做一次训练，把结果作为最终模型的特征  
我还在想用哪一种方法，或许在使用集成学习的时候我们可以这样不同的操作数据集。  

我先做tfidf  
这里要用一个sklearn里面的tf-idf模型，这个模型的好处在于它可以自动帮你把ngram也做了，不用自己再额外做ngram。
具体可以参考http://scikit-learn.org/stable/modules/generated/sklearn.feature_extraction.text.TfidfVectorizer.html

In [73]:
from sklearn.feature_extraction.text import TfidfVectorizer

In [108]:
tfidf = TfidfVectorizer(ngram_range=(1,4),analyzer='word',stop_words='english',lowercase=True)

In [109]:
tfidf.fit(data_combine.text.values.tolist())

TfidfVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(1, 4), norm='l2', preprocessor=None, smooth_idf=True,
        stop_words='english', strip_accents=None, sublinear_tf=False,
        token_pattern='(?u)\\b\\w\\w+\\b', tokenizer=None, use_idf=True,
        vocabulary=None)

In [111]:
from sklearn.decomposition import TruncatedSVD

In [112]:
svd = TruncatedSVD(n_components=25,algorithm='arpack')

In [142]:
svd_matrix = svd.fit_transform(tfidf.transform(data_combine.text.values.tolist()))

In [159]:
svd_feature = pd.DataFrame(svd_matrix)

In [161]:
svd_feature.columns = ['svd_word_1to4gram_'+str(i)for i in range(25)]

In [173]:
data_combine = data_combine.reset_index()

In [175]:
data_combine = pd.concat([data_combine,svd_feature],axis=1)

In [177]:
# 接下来做一个字母层次的tfidf的特征
tfidf_char = TfidfVectorizer(analyzer='char',lowercase=True,stop_words='english',ngram_range=(1,5))
tfidf_char.fit(data_combine.text.values.tolist())
svd_char = TruncatedSVD(n_components=25,algorithm='randomized')
svd_char_matrix = svd.fit_transform(tfidf_char.transform(data_combine.text.values.tolist()))
svd_char_feature = pd.DataFrame(svd_char_matrix)

In [180]:
svd_char_feature.columns = ['svd_char_1to5gram_'+str(i) for i in range(25)]

In [183]:
data_combine = pd.concat([data_combine,svd_char_feature],axis=1)

In [187]:
data_combine = data_combine.drop('index',axis=1)

关于pandas保存为csv可以查看官方文档：http://pandas.pydata.org/pandas-docs/stable/generated/pandas.DataFrame.to_csv.html

In [189]:
data_combine.to_csv(filepath+'Dec_3.csv',index=False)

In [190]:
data_combine

Unnamed: 0,author,id,text,word_split,text_len,symbol_num,"symbol_num_,",symbol_num_.,symbol_num_?,symbol_num_:,...,svd_char_1to5gram_15,svd_char_1to5gram_16,svd_char_1to5gram_17,svd_char_1to5gram_18,svd_char_1to5gram_19,svd_char_1to5gram_20,svd_char_1to5gram_21,svd_char_1to5gram_22,svd_char_1to5gram_23,svd_char_1to5gram_24
0,EAP,id26305,"This process, however, afforded me no means of...","[This, process, ,, however, ,, afforded, me, n...",48,5,4,1,0,0,...,-0.008009,0.013331,-0.050801,0.006386,0.017673,0.035414,-0.008974,-0.045741,-0.002864,0.001790
1,HPL,id17569,It never once occurred to me that the fumbling...,"[It, never, once, occurred, to, me, that, the,...",15,1,0,1,0,0,...,0.004292,0.020517,-0.021551,-0.043966,-0.062726,0.080991,-0.026666,-0.058964,0.028587,-0.001092
2,EAP,id11008,"In his left hand was a gold snuff box, from wh...","[In, his, left, hand, was, a, gold, snuff, box...",41,5,4,1,0,0,...,-0.012883,-0.048039,0.006840,-0.007042,0.000872,0.003402,-0.011036,0.034528,-0.027592,-0.022302
3,MWS,id27763,How lovely is spring As we looked from Windsor...,"[How, lovely, is, spring, As, we, looked, from...",38,4,3,1,0,0,...,-0.017996,-0.024845,0.009133,0.029233,0.016250,-0.040987,-0.008578,0.044852,-0.006246,0.016567
4,HPL,id12958,"Finding nothing else, not even gold, the Super...","[Finding, nothing, else, ,, not, even, gold, ,...",31,3,2,1,0,0,...,0.012136,-0.002706,-0.013356,0.020759,0.024944,0.050988,-0.059477,-0.005122,0.062051,-0.009140
5,MWS,id22965,"A youth passed in solitude, my best years spen...","[A, youth, passed, in, solitude, ,, my, best, ...",90,6,4,1,0,1,...,0.006400,0.000287,-0.025934,0.004998,0.012725,-0.004631,-0.041827,-0.027229,0.023744,0.062683
6,EAP,id09674,"The astronomer, perhaps, at this point, took r...","[The, astronomer, ,, perhaps, ,, at, this, poi...",26,4,3,1,0,0,...,-0.005245,-0.001049,-0.024920,-0.009983,0.000037,-0.014420,0.050893,0.031562,0.002735,-0.044457
7,EAP,id13515,The surcingle hung in ribands from my body.,"[The, surcingle, hung, in, ribands, from, my, ...",9,1,0,1,0,0,...,-0.020562,0.009405,0.045517,0.067985,-0.023899,-0.022398,-0.025774,0.021835,0.005835,-0.057799
8,EAP,id19322,I knew that you could not say to yourself 'ste...,"[I, knew, that, you, could, not, say, to, your...",98,9,7,1,0,0,...,0.014211,0.079369,0.004396,0.023093,0.016395,0.015513,-0.069362,0.023756,0.026296,0.011625
9,MWS,id00912,I confess that neither the structure of langua...,"[I, confess, that, neither, the, structure, of...",26,3,2,1,0,0,...,0.052917,0.044159,0.003808,0.013754,0.005612,0.004226,0.014261,0.043140,-0.085298,0.042077
