### 这部分对训练集和测试集做相同的预处理。
大概可以理解为特征工程

In [2]:
import pandas as pd
filepath = 'C:/Users\Administrator\Desktop/'

In [2]:
train_data = pd.read_csv(filepath+'train.csv')
test_data = pd.read_csv(filepath+'test.csv')

In [3]:
data_combine = pd.concat((train_data,test_data),axis=0)

In [4]:
data_combine.describe()

Unnamed: 0,author,id,text
count,19579,27971,27971
unique,3,27971,27971
top,EAP,id06223,A tub had caught all ha ha When I had made an ...
freq,7900,1,1


训练集也没有缺失值

In [5]:
import nltk

In [6]:
data_combine['word_split'] = data_combine.text.apply(lambda x:nltk.word_tokenize(x))

In [7]:
# 添加第一个特征：句子长度
%time
data_combine['text_len'] = data_combine.word_split.apply(lambda x:len(x))

Wall time: 0 ns


In [9]:
# 添加第二个特征：标点符号个数
symbol = [',','.','?',':',"'"]
def get_symbol_num(x):
    s = 0
    for i in x:
        if i in symbol:
            s+=1
    return s
data_combine['symbol_num'] = data_combine.word_split.apply(lambda x:get_symbol_num(x))

In [10]:
# 添加第3-7个特征：各个标点符号的个数
def get_each_symbol_num(x,i):
    s = 0
    for j in x:
        if j==i:
            s+=1
    return s
for i in symbol:
    data_combine['symbol_num_'+i] = data_combine.word_split.apply(lambda x:get_each_symbol_num(x,i))

In [12]:
from nltk import WordNetLemmatizer

In [13]:
# 添加第8特征：stopword词个数
%time
from nltk.corpus import stopwords
stopword = stopwords.words()
def get_num(x,y):
    s = 0
    for i in x:
        if i.lower() in y or WordNetLemmatizer().lemmatize(i.lower()) in y:
            s+=1
    return s
data_combine['stopword_num'] = data_combine.word_split.apply(lambda x:get_num(x,stopword))

Wall time: 0 ns


In [14]:
# 添加第9个特征：单词平均长度
def avg(x):
    s = 0
    l = 0
    for i in x:
        if i not in symbol:
            s+=len(i)
            l+=1
    return s/l
data_combine['word_mean_length'] = data_combine.word_split.apply(lambda x:avg(x))
%time

Wall time: 0 ns


In [15]:
# 添加第10个特征：字母个数
def char_sum(x):
    s = 0
    for i in x:
        if i not in symbol:
            s+=len(i)
    return s
data_combine['char_num'] = data_combine.word_split.apply(lambda x:char_sum(x))

In [16]:
# 添加第11个特征：不重复出现的单词个数
def unique(x):
    a = []
    for i in x:
        if i not in symbol:
            a.append(i)
    return(len(set(a)))
data_combine['unique_word'] = data_combine.word_split.apply(lambda x:unique(x))

接下来要处理挖掘出来的那些有区分性的单词，目前有下面这么三种想法
* 直接把其中有代表性的单词取出来做一个one-hot型的变量
* 把这些单词做一次SVD后放进去  
http://scikit-learn.org/stable/modules/generated/sklearn.decomposition.TruncatedSVD.html
* 先用一个简单模型针对这些单词特征做一次训练，把结果作为最终模型的特征  
我还在想用哪一种方法，或许在使用集成学习的时候我们可以这样不同的操作数据集。  

我先做tfidf  
这里要用一个sklearn里面的tf-idf模型，这个模型的好处在于它可以自动帮你把ngram也做了，不用自己再额外做ngram。
具体可以参考http://scikit-learn.org/stable/modules/generated/sklearn.feature_extraction.text.TfidfVectorizer.html

In [17]:
from sklearn.feature_extraction.text import TfidfVectorizer

In [18]:
tfidf = TfidfVectorizer(ngram_range=(1,4),analyzer='word',stop_words='english',lowercase=True)

In [19]:
tfidf.fit(data_combine.text.values.tolist())

TfidfVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(1, 4), norm='l2', preprocessor=None, smooth_idf=True,
        stop_words='english', strip_accents=None, sublinear_tf=False,
        token_pattern='(?u)\\b\\w\\w+\\b', tokenizer=None, use_idf=True,
        vocabulary=None)

In [20]:
from sklearn.decomposition import TruncatedSVD

In [21]:
svd = TruncatedSVD(n_components=25,algorithm='arpack')

In [22]:
svd_matrix = svd.fit_transform(tfidf.transform(data_combine.text.values.tolist()))

In [23]:
svd_feature = pd.DataFrame(svd_matrix)

In [24]:
svd_feature.columns = ['svd_word_1to4gram_'+str(i)for i in range(25)]

In [25]:
data_combine = data_combine.reset_index()

In [26]:
data_combine = pd.concat([data_combine,svd_feature],axis=1)

In [27]:
# 接下来做一个字母层次的tfidf的特征
tfidf_char = TfidfVectorizer(analyzer='char',lowercase=True,stop_words='english',ngram_range=(1,5))
tfidf_char.fit(data_combine.text.values.tolist())
svd_char = TruncatedSVD(n_components=25,algorithm='randomized')
svd_char_matrix = svd.fit_transform(tfidf_char.transform(data_combine.text.values.tolist()))
svd_char_feature = pd.DataFrame(svd_char_matrix)

In [28]:
svd_char_feature.columns = ['svd_char_1to5gram_'+str(i) for i in range(25)]

In [29]:
data_combine = pd.concat([data_combine,svd_char_feature],axis=1)

In [30]:
data_combine = data_combine.drop('index',axis=1)

关于pandas保存为csv可以查看官方文档：http://pandas.pydata.org/pandas-docs/stable/generated/pandas.DataFrame.to_csv.html

In [31]:
data_combine.to_csv(filepath+'Dec_3.csv',index=False,encoding='utf-8')

In [33]:
writer = pd.ExcelWriter('D:/大学/大三上/数据挖掘/作业/数据.xlsx')

In [34]:
data_combine.to_excel(writer,sheet_name='sheet1')

In [35]:
writer.save()

In [32]:
data_combine

Unnamed: 0,author,id,text,word_split,text_len,symbol_num,"symbol_num_,",symbol_num_.,symbol_num_?,symbol_num_:,...,svd_char_1to5gram_15,svd_char_1to5gram_16,svd_char_1to5gram_17,svd_char_1to5gram_18,svd_char_1to5gram_19,svd_char_1to5gram_20,svd_char_1to5gram_21,svd_char_1to5gram_22,svd_char_1to5gram_23,svd_char_1to5gram_24
0,EAP,id26305,"This process, however, afforded me no means of...","[This, process, ,, however, ,, afforded, me, n...",48,5,4,1,0,0,...,-0.008009,0.013331,-0.050801,0.006386,0.017673,0.035414,-0.008974,-0.045741,-0.002864,0.001790
1,HPL,id17569,It never once occurred to me that the fumbling...,"[It, never, once, occurred, to, me, that, the,...",15,1,0,1,0,0,...,0.004292,0.020517,-0.021551,-0.043966,-0.062726,0.080991,-0.026666,-0.058964,0.028587,-0.001092
2,EAP,id11008,"In his left hand was a gold snuff box, from wh...","[In, his, left, hand, was, a, gold, snuff, box...",41,5,4,1,0,0,...,-0.012883,-0.048039,0.006840,-0.007042,0.000872,0.003402,-0.011036,0.034528,-0.027592,-0.022302
3,MWS,id27763,How lovely is spring As we looked from Windsor...,"[How, lovely, is, spring, As, we, looked, from...",38,4,3,1,0,0,...,-0.017996,-0.024845,0.009133,0.029233,0.016250,-0.040987,-0.008578,0.044852,-0.006246,0.016567
4,HPL,id12958,"Finding nothing else, not even gold, the Super...","[Finding, nothing, else, ,, not, even, gold, ,...",31,3,2,1,0,0,...,0.012136,-0.002706,-0.013356,0.020759,0.024944,0.050988,-0.059477,-0.005122,0.062051,-0.009140
5,MWS,id22965,"A youth passed in solitude, my best years spen...","[A, youth, passed, in, solitude, ,, my, best, ...",90,6,4,1,0,1,...,0.006400,0.000287,-0.025934,0.004998,0.012725,-0.004631,-0.041827,-0.027229,0.023744,0.062683
6,EAP,id09674,"The astronomer, perhaps, at this point, took r...","[The, astronomer, ,, perhaps, ,, at, this, poi...",26,4,3,1,0,0,...,-0.005245,-0.001049,-0.024920,-0.009983,0.000037,-0.014420,0.050893,0.031562,0.002735,-0.044457
7,EAP,id13515,The surcingle hung in ribands from my body.,"[The, surcingle, hung, in, ribands, from, my, ...",9,1,0,1,0,0,...,-0.020562,0.009405,0.045517,0.067985,-0.023899,-0.022398,-0.025774,0.021835,0.005835,-0.057799
8,EAP,id19322,I knew that you could not say to yourself 'ste...,"[I, knew, that, you, could, not, say, to, your...",98,9,7,1,0,0,...,0.014211,0.079369,0.004396,0.023093,0.016395,0.015513,-0.069362,0.023756,0.026296,0.011625
9,MWS,id00912,I confess that neither the structure of langua...,"[I, confess, that, neither, the, structure, of...",26,3,2,1,0,0,...,0.052917,0.044159,0.003808,0.013754,0.005612,0.004226,0.014261,0.043140,-0.085298,0.042077


In [3]:
data_combine = pd.read_excel('D:/大学/大三上/数据挖掘/作业/数据.xlsx')

In [17]:
# 目前有61个特征，但还有很多特征没有做
# 先把情感分析的统计特征加进去
# 首先是句子的整体情感
from afinn import Afinn
af = Afinn()

In [18]:
data_combine['sentence_sentiment'] = data_combine.text.apply(lambda x:af.score(x))

In [88]:
# 然后是句子中国积极词汇与消极词汇出现的数量
def pos_word(x):
    a = af.scores(x)
    s = 0
    for i in a:
        if i>0:
            s+=1
    return s
def neg_word(x):
    a =af.scores(x)
    s = 0
    for i in a:
        if i<0:
            s+=1
    return s
data_combine['pos_word_num'] = data_combine.text.apply(lambda x:pos_word(x))
data_combine['neg_word_num'] = data_combine.text.apply(lambda x:neg_word(x))

In [91]:
# 最消极的程度和最积极的程度
def pos_word_max(x):
    a = af.scores(x)
    a.append(0)
    return max(a)
def neg_word_min(x):
    a =af.scores(x)
    a.append(0)
    return min(a)
data_combine['pos_word_max'] = data_combine.text.apply(lambda x:pos_word_max(x))
data_combine['neg_word_min'] = data_combine.text.apply(lambda x:neg_word_min(x))

In [23]:
# 然后做一下topic model
# 就这么个东西李德茏要做一个礼拜，我真的很佩服他的效率
# 用sklearn帮我们集成好的lda模型，具体可以参考其官方文档
# 使用lda前要做一个词频分析
from sklearn.decomposition import LatentDirichletAllocation
from sklearn.feature_extraction.text import CountVectorizer

In [36]:
count = CountVectorizer(stop_words='english',lowercase=True)

In [37]:
count.fit(data_combine.word_split)

CountVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(1, 1), preprocessor=None, stop_words='english',
        strip_accents=None, token_pattern='(?u)\\b\\w\\w+\\b',
        tokenizer=None, vocabulary=None)

In [38]:
len(count.get_feature_names())

27981

In [40]:
count_matrix = count.transform(data_combine.word_split)

In [48]:
# 直接打印出来要memory out
lda = LatentDirichletAllocation(n_topics=10,learning_method='online')

In [52]:
lda_matrix = lda.fit_transform(count_matrix)

In [65]:
lda_feature = pd.DataFrame(lda_matrix)

In [69]:
# 分类效果还行还可以，大部分的样本概率最大的topic概率都有七八十的样子

In [93]:
lda_feature.columns = ['lda_topic_'+str(i) for i in range(10)]

In [96]:
data_combine = pd.concat([data_combine,lda_feature],axis=1)

In [98]:
writer = pd.ExcelWriter('D:/大学/大三上/数据挖掘/作业/数据.xlsx')
data_combine.to_excel(writer,sheet_name='sheet1')
writer.save()

#### 现在还缺点内容
* 李德茏做的分词性挖掘出来的结果怎么放到特征里面去
* 我一开始做的整个文档的结果怎么放进去
* 还有一些自然语言处理的方法没用过，比如说句法分析，关系分析，实体识别，这个还要再做做
* 考虑要不要用一些词向量、深度学习的方法试试，比如玩玩LSTM

In [20]:
data_combine = pd.read_excel('D:/大学/大三上/数据挖掘/作业/数据.xlsx')

In [23]:
# 跑了第一个模型，效果很不好
# 所以现在打算用朴素贝叶斯做预处理
# 这里的基本思路是：用一个朴素贝叶斯模型，训练集是tfidf矩阵，target是各个类的概率
from sklearn.naive_bayes import MultinomialNB

In [24]:
def run_by(train_x,train_y,test_x,test_y,test_x2):
    model = MultinomialNB()
    model.fit(train_x,train_y)
    pred_test_y = model.predict_proba(test_x)
    preb_test_y2 = model.predict_proba(test_x2)
    return pred_test_y,preb_test_y2,model

In [26]:
from sklearn.feature_extraction.text import TfidfVectorizer
tfidf = TfidfVectorizer(ngram_range=(1,4),analyzer='word',stop_words='english',lowercase=True)

In [27]:
train_data = data_combine[:-8392]

In [29]:
test_data = data_combine[-8392:]

In [47]:
full_idf = tfidf.fit_transform(data_combine.text.values.tolist())
train_idf = tfidf.transform(train_data.text.values.tolist())
test_idf = tfidf.transform(test_data.text.values.tolist())

In [33]:
import numpy as np

In [34]:
cv_score = []
pred_full_test = 0
pred_train = np.zeros([train_data.shape[0],3])

In [35]:
from sklearn.model_selection import KFold

In [37]:
kf = KFold(n_splits=5,shuffle=True,random_state=2017)

In [38]:
train_x = train_data.drop('author',axis=1)

In [41]:
train_y = train_data.author

In [43]:
from sklearn import metrics

In [48]:
for dev_index,val_index in kf.split(train_x):
    dev_x,val_x = train_idf[dev_index],train_idf[val_index]
    dev_y,val_y = train_y[dev_index],train_y[val_index]
    pred_val_y,pred_test_y,model = run_by(dev_x,dev_y,val_x,val_y,test_idf)
    pred_full_test+=pred_test_y
    pred_train[val_index,:] = pred_val_y
    cv_score.append(metrics.log_loss(val_y,pred_val_y))
print(cv_score)
pred_full_test/=5

[0.88109724661754707, 0.87723870483987865, 0.88246036021359808, 0.87634263587307826, 0.87639386339749181]


In [52]:
train_data['by_tfidf_word_eap'] = pred_train[:,0]
train_data['by_tfidf_word_hpl'] = pred_train[:,1]
train_data['by_tfidf_word_mws'] = pred_train[:,2]
test_data['by_tfidf_word_eap'] = pred_full_test[:,0]
test_data['by_tfidf_word_hpl'] = pred_full_test[:,1]
test_data['by_tfidf_word_mws'] = pred_full_test[:,2]

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  if __name__ == '__main__':
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  from ipykernel import kernelapp as app
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  app.launch_new_instance()
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: ht

In [53]:
# 同样，做好word层面后做一个char层面

In [54]:
from sklearn.feature_extraction.text import TfidfVectorizer
tfidf = TfidfVectorizer(ngram_range=(1,4),analyzer='char',stop_words='english',lowercase=True)

In [55]:
full_idf = tfidf.fit_transform(data_combine.text.values.tolist())
train_idf = tfidf.transform(train_data.text.values.tolist())
test_idf = tfidf.transform(test_data.text.values.tolist())

In [57]:
cv_score = []
pred_full_test = 0
pred_train = np.zeros([train_data.shape[0],3])
kf = KFold(n_splits=5,shuffle=True,random_state=2017)
for dev_index,val_index in kf.split(train_x):
    dev_x,val_x = train_idf[dev_index],train_idf[val_index]
    dev_y,val_y = train_y[dev_index],train_y[val_index]
    pred_val_y,pred_test_y,model = run_by(dev_x,dev_y,val_x,val_y,test_idf)
    pred_full_test+=pred_test_y
    pred_train[val_index,:] = pred_val_y
    cv_score.append(metrics.log_loss(val_y,pred_val_y))
print(cv_score)
pred_full_test/=5

[0.70211920696551455, 0.68191341833830266, 0.69790023142233037, 0.69562676626576303, 0.69512648035234725]


In [58]:
train_data['by_tfidf_char_eap'] = pred_train[:,0]
train_data['by_tfidf_char_hpl'] = pred_train[:,1]
train_data['by_tfidf_char_mws'] = pred_train[:,2]
test_data['by_tfidf_char_eap'] = pred_full_test[:,0]
test_data['by_tfidf_char_hpl'] = pred_full_test[:,1]
test_data['by_tfidf_char_mws'] = pred_full_test[:,2]

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  if __name__ == '__main__':
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  from ipykernel import kernelapp as app
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  app.launch_new_instance()
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: ht

In [59]:
train_data

Unnamed: 0,author,id,text,word_split,text_len,symbol_num,"symbol_num_,",symbol_num_.,symbol_num_?,symbol_num_:,...,lda_topic_6,lda_topic_7,lda_topic_8,lda_topic_9,by_tfidf_word_eap,by_tfidf_word_hpl,by_tfidf_word_mws,by_tfidf_char_eap,by_tfidf_char_hpl,by_tfidf_char_mws
0,EAP,id26305,"This process, however, afforded me no means of...","['This', 'process', ',', 'however', ',', 'affo...",48,5,4,1,0,0,...,0.947058,0.005882,0.005883,0.005882,0.590171,0.194790,0.215038,0.911745,0.018073,0.070182
1,HPL,id17569,It never once occurred to me that the fumbling...,"['It', 'never', 'once', 'occurred', 'to', 'me'...",15,1,0,1,0,0,...,0.819999,0.020000,0.020000,0.020000,0.508032,0.254644,0.237324,0.611434,0.205934,0.182633
2,EAP,id11008,"In his left hand was a gold snuff box, from wh...","['In', 'his', 'left', 'hand', 'was', 'a', 'gol...",41,5,4,1,0,0,...,0.005000,0.005000,0.005000,0.005000,0.566282,0.230919,0.202799,0.885806,0.087116,0.027078
3,MWS,id27763,How lovely is spring As we looked from Windsor...,"['How', 'lovely', 'is', 'spring', 'As', 'we', ...",38,4,3,1,0,0,...,0.105052,0.052380,0.004762,0.004762,0.302697,0.198534,0.498768,0.124630,0.081325,0.794045
4,HPL,id12958,"Finding nothing else, not even gold, the Super...","['Finding', 'nothing', 'else', ',', 'not', 'ev...",31,3,2,1,0,0,...,0.007143,0.007143,0.078570,0.007143,0.469153,0.274944,0.255904,0.758629,0.113109,0.128261
5,MWS,id22965,"A youth passed in solitude, my best years spen...","['A', 'youth', 'passed', 'in', 'solitude', ','...",90,6,4,1,0,1,...,0.056093,0.058702,0.002564,0.002564,0.415773,0.189599,0.394628,0.679247,0.022946,0.297807
6,EAP,id09674,"The astronomer, perhaps, at this point, took r...","['The', 'astronomer', ',', 'perhaps', ',', 'at...",26,4,3,1,0,0,...,0.179654,0.008333,0.008336,0.008333,0.497170,0.254863,0.247968,0.909058,0.062145,0.028797
7,EAP,id13515,The surcingle hung in ribands from my body.,"['The', 'surcingle', 'hung', 'in', 'ribands', ...",9,1,0,1,0,0,...,0.820000,0.020000,0.020000,0.020000,0.505804,0.249742,0.244455,0.645768,0.215366,0.138866
8,EAP,id19322,I knew that you could not say to yourself 'ste...,"['I', 'knew', 'that', 'you', 'could', 'not', '...",98,9,7,1,0,0,...,0.917226,0.002778,0.032773,0.002778,0.516912,0.259549,0.223539,0.803449,0.056310,0.140242
9,MWS,id00912,I confess that neither the structure of langua...,"['I', 'confess', 'that', 'neither', 'the', 'st...",26,3,2,1,0,0,...,0.009091,0.009091,0.009091,0.009091,0.396228,0.253260,0.350513,0.601809,0.094986,0.303206


In [60]:
# 对countvector也做一个这个操作

In [62]:
from sklearn.feature_extraction.text import CountVectorizer
count  = CountVectorizer(analyzer='word',lowercase=True,ngram_range=(1,3))

In [64]:
count.fit(data_combine.text.values.tolist())

CountVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(1, 3), preprocessor=None, stop_words=None,
        strip_accents=None, token_pattern='(?u)\\b\\w\\w+\\b',
        tokenizer=None, vocabulary=None)

In [65]:
train_count = count.transform(train_data.text.values.tolist())
test_count = count.transform(test_data.text.values.tolist())

In [66]:
cv_score = []
pred_full_test = 0
pred_train = np.zeros([train_data.shape[0],3])
kf = KFold(n_splits=5,shuffle=True,random_state=2017)
for dev_index,val_index in kf.split(train_x):
    dev_x,val_x = train_count[dev_index],train_count[val_index]
    dev_y,val_y = train_y[dev_index],train_y[val_index]
    pred_val_y,pred_test_y,model = run_by(dev_x,dev_y,val_x,val_y,test_count)
    pred_full_test+=pred_test_y
    pred_train[val_index,:] = pred_val_y
    cv_score.append(metrics.log_loss(val_y,pred_val_y))
print(cv_score)
pred_full_test/=5

[0.67738403706157613, 0.78175612930143956, 0.75400063435281883, 0.78682637971473457, 0.75586724930106453]


In [67]:
train_data['by_count_word_eap'] = pred_train[:,0]
train_data['by_count_word_hpl'] = pred_train[:,1]
train_data['by_count_word_mws'] = pred_train[:,2]
test_data['by_count_word_eap'] = pred_full_test[:,0]
test_data['by_count_word_hpl'] = pred_full_test[:,1]
test_data['by_count_word_mws'] = pred_full_test[:,2]

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  if __name__ == '__main__':
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  from ipykernel import kernelapp as app
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  app.launch_new_instance()
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: ht

In [68]:
from sklearn.feature_extraction.text import CountVectorizer
count  = CountVectorizer(analyzer='char',lowercase=True,ngram_range=(1,3))
count.fit(data_combine.text.values.tolist())
train_count = count.transform(train_data.text.values.tolist())
test_count = count.transform(test_data.text.values.tolist())

In [70]:
cv_score = []
pred_full_test = 0
pred_train = np.zeros([train_data.shape[0],3])
kf = KFold(n_splits=5,shuffle=True,random_state=2017)
for dev_index,val_index in kf.split(train_x):
    dev_x,val_x = train_count[dev_index],train_count[val_index]
    dev_y,val_y = train_y[dev_index],train_y[val_index]
    pred_val_y,pred_test_y,model = run_by(dev_x,dev_y,val_x,val_y,test_count)
    pred_full_test+=pred_test_y
    pred_train[val_index,:] = pred_val_y
    cv_score.append(metrics.log_loss(val_y,pred_val_y))
print(cv_score)
pred_full_test/=5
train_data['by_count_char_eap'] = pred_train[:,0]
train_data['by_count_char_hpl'] = pred_train[:,1]
train_data['by_count_char_mws'] = pred_train[:,2]
test_data['by_count_char_eap'] = pred_full_test[:,0]
test_data['by_count_char_hpl'] = pred_full_test[:,1]
test_data['by_count_char_mws'] = pred_full_test[:,2]

[1.8422550366914596, 1.7678322954591918, 1.8539994021281196, 1.9187541928282406, 1.8531154296526455]


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
A value is tryin

In [73]:
data_combine = pd.concat([train_data,test_data],axis=0)

In [74]:
data_combine

Unnamed: 0,author,id,text,word_split,text_len,symbol_num,"symbol_num_,",symbol_num_.,symbol_num_?,symbol_num_:,...,by_tfidf_word_mws,by_tfidf_char_eap,by_tfidf_char_hpl,by_tfidf_char_mws,by_count_word_eap,by_count_word_hpl,by_count_word_mws,by_count_char_eap,by_count_char_hpl,by_count_char_mws
0,EAP,id26305,"This process, however, afforded me no means of...","['This', 'process', ',', 'however', ',', 'affo...",48,5,4,1,0,0,...,0.215038,0.911745,0.018073,0.070182,1.000000e+00,3.754268e-16,4.529622e-13,9.999832e-01,4.532824e-14,1.676420e-05
1,HPL,id17569,It never once occurred to me that the fumbling...,"['It', 'never', 'once', 'occurred', 'to', 'me'...",15,1,0,1,0,0,...,0.237324,0.611434,0.205934,0.182633,9.984766e-01,3.186515e-04,1.204780e-03,5.273739e-01,3.949376e-01,7.768854e-02
2,EAP,id11008,"In his left hand was a gold snuff box, from wh...","['In', 'his', 'left', 'hand', 'was', 'a', 'gol...",41,5,4,1,0,0,...,0.202799,0.885806,0.087116,0.027078,9.999997e-01,2.771981e-07,2.471828e-14,1.000000e+00,4.353494e-08,8.418281e-18
3,MWS,id27763,How lovely is spring As we looked from Windsor...,"['How', 'lovely', 'is', 'spring', 'As', 'we', ...",38,4,3,1,0,0,...,0.498768,0.124630,0.081325,0.794045,2.289424e-11,4.796938e-12,1.000000e+00,1.176794e-15,5.973569e-07,9.999994e-01
4,HPL,id12958,"Finding nothing else, not even gold, the Super...","['Finding', 'nothing', 'else', ',', 'not', 'ev...",31,3,2,1,0,0,...,0.255904,0.758629,0.113109,0.128261,9.738588e-01,2.593364e-02,2.075770e-04,9.903712e-01,9.611206e-03,1.763350e-05
5,MWS,id22965,"A youth passed in solitude, my best years spen...","['A', 'youth', 'passed', 'in', 'solitude', ','...",90,6,4,1,0,1,...,0.394628,0.679247,0.022946,0.297807,9.212830e-07,1.563852e-22,9.999991e-01,7.474922e-12,3.449225e-28,1.000000e+00
6,EAP,id09674,"The astronomer, perhaps, at this point, took r...","['The', 'astronomer', ',', 'perhaps', ',', 'at...",26,4,3,1,0,0,...,0.247968,0.909058,0.062145,0.028797,9.999977e-01,2.120055e-06,1.378031e-07,9.999979e-01,2.113986e-06,4.268864e-13
7,EAP,id13515,The surcingle hung in ribands from my body.,"['The', 'surcingle', 'hung', 'in', 'ribands', ...",9,1,0,1,0,0,...,0.244455,0.645768,0.215366,0.138866,9.852025e-01,9.132334e-03,5.665146e-03,1.600190e-02,9.839343e-01,6.382331e-05
8,EAP,id19322,I knew that you could not say to yourself 'ste...,"['I', 'knew', 'that', 'you', 'could', 'not', '...",98,9,7,1,0,0,...,0.223539,0.803449,0.056310,0.140242,1.000000e+00,2.746565e-22,2.735421e-18,1.000000e+00,3.893061e-15,6.243710e-13
9,MWS,id00912,I confess that neither the structure of langua...,"['I', 'confess', 'that', 'neither', 'the', 'st...",26,3,2,1,0,0,...,0.350513,0.601809,0.094986,0.303206,5.609843e-04,2.505201e-03,9.969338e-01,9.865695e-01,2.045979e-05,1.341000e-02


In [75]:
writer = pd.ExcelWriter('D:/大学/大三上/数据挖掘/作业/数据.xlsx')
data_combine.to_excel(writer,sheet_name='sheet1')
writer.save()

In [1]:
import pandas as pd

In [2]:
data_combine = pd.read_excel('D:/大学/大三上/数据挖掘/作业/数据.xlsx')

In [149]:
data_item = pd.read_excel('D:/大学/大三上/数据挖掘/作业/word_feature.xlsx')

In [150]:
data_item = data_item.drop(['level_0','id','text','word_split','word_tag'],axis=1)

In [132]:
data_item['author'] = data_combine.author

In [151]:
author = {'EAP':0,
         'HPL':1,
         'MWS':2}

In [152]:
train_data = data_item[:-8392]

In [153]:
test_data = data_item[-8392:]

In [154]:
train_y = train_data['author'].map(author)

In [155]:
train_x = train_data.drop('author',axis=1)

In [156]:
test_y = test_data.drop('author',axis=1)

In [157]:
# 今天要做的是继续用基本模型利用其中几个特征产生预测值，就继续用朴素贝叶斯吧
# 先把李德茏整理的word_feature 做了一个
# 然后做一些统计上的特征

In [158]:
from sklearn.naive_bayes import MultinomialNB

In [159]:
def run_mnb(train_x,train_y,test_x,test_y,test_x2):
    model = MultinomialNB()
    model.fit(train_x,train_y)
    pred1 = model.predict_proba(test_x)
    pred2 = model.predict_proba(test_x2)
    return pred1,pred2,model

In [160]:
from sklearn.model_selection import KFold
import numpy as np
from sklearn import metrics

In [161]:
kf = KFold(n_splits=5,random_state=2017,shuffle=True)
cv = []
pre = 0
pre_train = np.zeros([train_x.shape[0],3])


In [162]:
train_x = train_x.values
train_y = train_y.values
test_y = test_y.values

In [163]:
train_x

array([[2, 1, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       ..., 
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 1, 0, ..., 0, 0, 0]], dtype=int64)

In [164]:
for dev_index,val_index in kf.split(train_x):
    dev_x,val_x = train_x[dev_index],train_x[val_index]
    dev_y,val_y = train_y[dev_index],train_y[val_index]
    pred_val_y,pred_test_y,model = run_mnb(dev_x,dev_y,val_x,val_y,test_y)
    pre+=pred_test_y
    pre_train[val_index,:] = pred_val_y
    cv.append(metrics.log_loss(val_y,pred_val_y))
print(cv)
pre/=5

[0.91266222616299886, 0.91607020144480078, 0.92785307702260389, 0.92310486184106622, 0.93206179622522256]


In [165]:
train_data = data_combine[:-8392]
test_data = data_combine[-8392:]

In [166]:
train_data['eap_word_feature'] = pre_train[:,0]
train_data['hpl_word_feature'] = pre_train[:,1]
train_data['mws_word_feature'] = pre_train[:,2]

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  if __name__ == '__main__':
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  from ipykernel import kernelapp as app
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  app.launch_new_instance()


In [167]:
test_data['eap_word_feature'] = pre[:,0]
test_data['hpl_word_feature'] = pre[:,1]
test_data['mws_word_feature'] = pre[:,2]

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  if __name__ == '__main__':
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  from ipykernel import kernelapp as app
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  app.launch_new_instance()


In [169]:
data_combine = pd.concat([train_data,test_data],axis=0)

In [170]:
data_combine

Unnamed: 0,author,id,text,word_split,text_len,symbol_num,"symbol_num_,",symbol_num_.,symbol_num_?,symbol_num_:,...,by_count_word_mws,by_count_char_eap,by_count_char_hpl,by_count_char_mws,eap_word_feature,hpl_word_feature,mws_word_feature,eap_statistic_feature,hpl_statistic_feature,mws_statistic_feature
0,EAP,id26305,"This process, however, afforded me no means of...","['This', 'process', ',', 'however', ',', 'affo...",48,5,4,1,0,0,...,4.529622e-13,9.999832e-01,4.532824e-14,1.676420e-05,0.765137,0.124716,0.110147,0.495548,0.220043,0.284409
1,HPL,id17569,It never once occurred to me that the fumbling...,"['It', 'never', 'once', 'occurred', 'to', 'me'...",15,1,0,1,0,0,...,1.204780e-03,5.273739e-01,3.949376e-01,7.768854e-02,0.344268,0.234484,0.421248,0.322568,0.387917,0.289516
2,EAP,id11008,"In his left hand was a gold snuff box, from wh...","['In', 'his', 'left', 'hand', 'was', 'a', 'gol...",41,5,4,1,0,0,...,2.471828e-14,1.000000e+00,4.353494e-08,8.418281e-18,0.508214,0.319893,0.171893,0.403640,0.094638,0.501722
3,MWS,id27763,How lovely is spring As we looked from Windsor...,"['How', 'lovely', 'is', 'spring', 'As', 'we', ...",38,4,3,1,0,0,...,1.000000e+00,1.176794e-15,5.973569e-07,9.999994e-01,0.001874,0.000444,0.997682,0.191998,0.093241,0.714761
4,HPL,id12958,"Finding nothing else, not even gold, the Super...","['Finding', 'nothing', 'else', ',', 'not', 'ev...",31,3,2,1,0,0,...,2.075770e-04,9.903712e-01,9.611206e-03,1.763350e-05,0.436685,0.284354,0.278961,0.207112,0.435558,0.357330
5,MWS,id22965,"A youth passed in solitude, my best years spen...","['A', 'youth', 'passed', 'in', 'solitude', ','...",90,6,4,1,0,1,...,9.999991e-01,7.474922e-12,3.449225e-28,1.000000e+00,0.353448,0.035371,0.611181,0.065293,0.054618,0.880089
6,EAP,id09674,"The astronomer, perhaps, at this point, took r...","['The', 'astronomer', ',', 'perhaps', ',', 'at...",26,4,3,1,0,0,...,1.378031e-07,9.999979e-01,2.113986e-06,4.268864e-13,0.460176,0.306677,0.233147,0.663913,0.172192,0.163895
7,EAP,id13515,The surcingle hung in ribands from my body.,"['The', 'surcingle', 'hung', 'in', 'ribands', ...",9,1,0,1,0,0,...,5.665146e-03,1.600190e-02,9.839343e-01,6.382331e-05,0.374976,0.207388,0.417636,0.450599,0.318549,0.230852
8,EAP,id19322,I knew that you could not say to yourself 'ste...,"['I', 'knew', 'that', 'you', 'could', 'not', '...",98,9,7,1,0,0,...,2.735421e-18,1.000000e+00,3.893061e-15,6.243710e-13,0.700760,0.261526,0.037715,0.441772,0.484418,0.073810
9,MWS,id00912,I confess that neither the structure of langua...,"['I', 'confess', 'that', 'neither', 'the', 'st...",26,3,2,1,0,0,...,9.969338e-01,9.865695e-01,2.045979e-05,1.341000e-02,0.501707,0.309911,0.188381,0.498428,0.253317,0.248255


In [171]:
writer = pd.ExcelWriter('D:/大学/大三上/数据挖掘/作业/数据.xlsx')
data_combine.to_excel(writer,sheet_name='sheet1')
writer.save()