### 这部分对训练集和测试集做相同的预处理。
大概可以理解为特征工程

In [1]:
import pandas as pd
filepath = 'C:/Users\Administrator\Desktop/'

In [2]:
train_data = pd.read_csv(filepath+'train.csv')
test_data = pd.read_csv(filepath+'test.csv')

In [3]:
data_combine = pd.concat((train_data,test_data),axis=0)

In [4]:
data_combine.describe()

Unnamed: 0,author,id,text
count,19579,27971,27971
unique,3,27971,27971
top,EAP,id06223,A tub had caught all ha ha When I had made an ...
freq,7900,1,1


训练集也没有缺失值

In [5]:
import nltk

In [6]:
data_combine['word_split'] = data_combine.text.apply(lambda x:nltk.word_tokenize(x))

In [7]:
# 添加第一个特征：句子长度
%time
data_combine['text_len'] = data_combine.word_split.apply(lambda x:len(x))

Wall time: 0 ns


In [9]:
# 添加第二个特征：标点符号个数
symbol = [',','.','?',':',"'"]
def get_symbol_num(x):
    s = 0
    for i in x:
        if i in symbol:
            s+=1
    return s
data_combine['symbol_num'] = data_combine.word_split.apply(lambda x:get_symbol_num(x))

In [10]:
# 添加第3-7个特征：各个标点符号的个数
def get_each_symbol_num(x,i):
    s = 0
    for j in x:
        if j==i:
            s+=1
    return s
for i in symbol:
    data_combine['symbol_num_'+i] = data_combine.word_split.apply(lambda x:get_each_symbol_num(x,i))

In [12]:
from nltk import WordNetLemmatizer

In [13]:
# 添加第8特征：stopword词个数
%time
from nltk.corpus import stopwords
stopword = stopwords.words()
def get_num(x,y):
    s = 0
    for i in x:
        if i.lower() in y or WordNetLemmatizer().lemmatize(i.lower()) in y:
            s+=1
    return s
data_combine['stopword_num'] = data_combine.word_split.apply(lambda x:get_num(x,stopword))

Wall time: 0 ns


In [14]:
# 添加第9个特征：单词平均长度
def avg(x):
    s = 0
    l = 0
    for i in x:
        if i not in symbol:
            s+=len(i)
            l+=1
    return s/l
data_combine['word_mean_length'] = data_combine.word_split.apply(lambda x:avg(x))
%time

Wall time: 0 ns


In [15]:
# 添加第10个特征：字母个数
def char_sum(x):
    s = 0
    for i in x:
        if i not in symbol:
            s+=len(i)
    return s
data_combine['char_num'] = data_combine.word_split.apply(lambda x:char_sum(x))

In [16]:
# 添加第11个特征：不重复出现的单词个数
def unique(x):
    a = []
    for i in x:
        if i not in symbol:
            a.append(i)
    return(len(set(a)))
data_combine['unique_word'] = data_combine.word_split.apply(lambda x:unique(x))

接下来要处理挖掘出来的那些有区分性的单词，目前有下面这么三种想法
* 直接把其中有代表性的单词取出来做一个one-hot型的变量
* 把这些单词做一次SVD后放进去  
http://scikit-learn.org/stable/modules/generated/sklearn.decomposition.TruncatedSVD.html
* 先用一个简单模型针对这些单词特征做一次训练，把结果作为最终模型的特征  
我还在想用哪一种方法，或许在使用集成学习的时候我们可以这样不同的操作数据集。  

我先做tfidf  
这里要用一个sklearn里面的tf-idf模型，这个模型的好处在于它可以自动帮你把ngram也做了，不用自己再额外做ngram。
具体可以参考http://scikit-learn.org/stable/modules/generated/sklearn.feature_extraction.text.TfidfVectorizer.html

In [17]:
from sklearn.feature_extraction.text import TfidfVectorizer

In [18]:
tfidf = TfidfVectorizer(ngram_range=(1,4),analyzer='word',stop_words='english',lowercase=True)

In [19]:
tfidf.fit(data_combine.text.values.tolist())

TfidfVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(1, 4), norm='l2', preprocessor=None, smooth_idf=True,
        stop_words='english', strip_accents=None, sublinear_tf=False,
        token_pattern='(?u)\\b\\w\\w+\\b', tokenizer=None, use_idf=True,
        vocabulary=None)

In [20]:
from sklearn.decomposition import TruncatedSVD

In [21]:
svd = TruncatedSVD(n_components=25,algorithm='arpack')

In [22]:
svd_matrix = svd.fit_transform(tfidf.transform(data_combine.text.values.tolist()))

In [23]:
svd_feature = pd.DataFrame(svd_matrix)

In [24]:
svd_feature.columns = ['svd_word_1to4gram_'+str(i)for i in range(25)]

In [25]:
data_combine = data_combine.reset_index()

In [26]:
data_combine = pd.concat([data_combine,svd_feature],axis=1)

In [27]:
# 接下来做一个字母层次的tfidf的特征
tfidf_char = TfidfVectorizer(analyzer='char',lowercase=True,stop_words='english',ngram_range=(1,5))
tfidf_char.fit(data_combine.text.values.tolist())
svd_char = TruncatedSVD(n_components=25,algorithm='randomized')
svd_char_matrix = svd.fit_transform(tfidf_char.transform(data_combine.text.values.tolist()))
svd_char_feature = pd.DataFrame(svd_char_matrix)

In [28]:
svd_char_feature.columns = ['svd_char_1to5gram_'+str(i) for i in range(25)]

In [29]:
data_combine = pd.concat([data_combine,svd_char_feature],axis=1)

In [30]:
data_combine = data_combine.drop('index',axis=1)

关于pandas保存为csv可以查看官方文档：http://pandas.pydata.org/pandas-docs/stable/generated/pandas.DataFrame.to_csv.html

In [31]:
data_combine.to_csv(filepath+'Dec_3.csv',index=False,encoding='utf-8')

In [33]:
writer = pd.ExcelWriter('D:/大学/大三上/数据挖掘/作业/数据.xlsx')

In [34]:
data_combine.to_excel(writer,sheet_name='sheet1')

In [35]:
writer.save()

In [3]:
data_combine = pd.read_excel('D:/大学/大三上/数据挖掘/作业/数据.xlsx')

In [17]:
# 目前有61个特征，但还有很多特征没有做
# 先把情感分析的统计特征加进去
# 首先是句子的整体情感
from afinn import Afinn
af = Afinn()

In [18]:
data_combine['sentence_sentiment'] = data_combine.text.apply(lambda x:af.score(x))

In [88]:
# 然后是句子中国积极词汇与消极词汇出现的数量
def pos_word(x):
    a = af.scores(x)
    s = 0
    for i in a:
        if i>0:
            s+=1
    return s
def neg_word(x):
    a =af.scores(x)
    s = 0
    for i in a:
        if i<0:
            s+=1
    return s
data_combine['pos_word_num'] = data_combine.text.apply(lambda x:pos_word(x))
data_combine['neg_word_num'] = data_combine.text.apply(lambda x:neg_word(x))

In [91]:
# 最消极的程度和最积极的程度
def pos_word_max(x):
    a = af.scores(x)
    a.append(0)
    return max(a)
def neg_word_min(x):
    a =af.scores(x)
    a.append(0)
    return min(a)
data_combine['pos_word_max'] = data_combine.text.apply(lambda x:pos_word_max(x))
data_combine['neg_word_min'] = data_combine.text.apply(lambda x:neg_word_min(x))

In [23]:
# 然后做一下topic model
# 就这么个东西李德茏要做一个礼拜，我真的很佩服他的效率
# 用sklearn帮我们集成好的lda模型，具体可以参考其官方文档
# 使用lda前要做一个词频分析
from sklearn.decomposition import LatentDirichletAllocation
from sklearn.feature_extraction.text import CountVectorizer

In [36]:
count = CountVectorizer(stop_words='english',lowercase=True)

In [37]:
count.fit(data_combine.word_split)

CountVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(1, 1), preprocessor=None, stop_words='english',
        strip_accents=None, token_pattern='(?u)\\b\\w\\w+\\b',
        tokenizer=None, vocabulary=None)

In [38]:
len(count.get_feature_names())

27981

In [40]:
count_matrix = count.transform(data_combine.word_split)

In [48]:
# 直接打印出来要memory out
lda = LatentDirichletAllocation(n_topics=10,learning_method='online')

In [52]:
lda_matrix = lda.fit_transform(count_matrix)

In [65]:
lda_feature = pd.DataFrame(lda_matrix)

In [69]:
# 分类效果还行还可以，大部分的样本概率最大的topic概率都有七八十的样子

In [93]:
lda_feature.columns = ['lda_topic_'+str(i) for i in range(10)]

In [96]:
data_combine = pd.concat([data_combine,lda_feature],axis=1)

In [98]:
writer = pd.ExcelWriter('D:/大学/大三上/数据挖掘/作业/数据.xlsx')
data_combine.to_excel(writer,sheet_name='sheet1')
writer.save()

#### 现在还缺点内容
* 李德茏做的分词性挖掘出来的结果怎么放到特征里面去
* 我一开始做的整个文档的结果怎么放进去
* 还有一些自然语言处理的方法没用过，比如说句法分析，关系分析，实体识别，这个还要再做做
* 考虑要不要用一些词向量、深度学习的方法试试，比如玩玩LSTM

In [20]:
data_combine = pd.read_excel('D:/大学/大三上/数据挖掘/作业/数据.xlsx')

In [23]:
# 跑了第一个模型，效果很不好
# 所以现在打算用朴素贝叶斯做预处理
# 这里的基本思路是：用一个朴素贝叶斯模型，训练集是tfidf矩阵，target是各个类的概率
from sklearn.naive_bayes import MultinomialNB

In [24]:
def run_by(train_x,train_y,test_x,test_y,test_x2):
    model = MultinomialNB()
    model.fit(train_x,train_y)
    pred_test_y = model.predict_proba(test_x)
    preb_test_y2 = model.predict_proba(test_x2)
    return pred_test_y,preb_test_y2,model

In [26]:
from sklearn.feature_extraction.text import TfidfVectorizer
tfidf = TfidfVectorizer(ngram_range=(1,4),analyzer='word',stop_words='english',lowercase=True)

In [27]:
train_data = data_combine[:-8392]

In [29]:
test_data = data_combine[-8392:]

In [47]:
full_idf = tfidf.fit_transform(data_combine.text.values.tolist())
train_idf = tfidf.transform(train_data.text.values.tolist())
test_idf = tfidf.transform(test_data.text.values.tolist())

In [33]:
import numpy as np

In [34]:
cv_score = []
pred_full_test = 0
pred_train = np.zeros([train_data.shape[0],3])

In [35]:
from sklearn.model_selection import KFold

In [37]:
kf = KFold(n_splits=5,shuffle=True,random_state=2017)

In [38]:
train_x = train_data.drop('author',axis=1)

In [41]:
train_y = train_data.author

In [43]:
from sklearn import metrics

In [48]:
for dev_index,val_index in kf.split(train_x):
    dev_x,val_x = train_idf[dev_index],train_idf[val_index]
    dev_y,val_y = train_y[dev_index],train_y[val_index]
    pred_val_y,pred_test_y,model = run_by(dev_x,dev_y,val_x,val_y,test_idf)
    pred_full_test+=pred_test_y
    pred_train[val_index,:] = pred_val_y
    cv_score.append(metrics.log_loss(val_y,pred_val_y))
print(cv_score)
pred_full_test/=5

[0.88109724661754707, 0.87723870483987865, 0.88246036021359808, 0.87634263587307826, 0.87639386339749181]


In [52]:
train_data['by_tfidf_word_eap'] = pred_train[:,0]
train_data['by_tfidf_word_hpl'] = pred_train[:,1]
train_data['by_tfidf_word_mws'] = pred_train[:,2]
test_data['by_tfidf_word_eap'] = pred_full_test[:,0]
test_data['by_tfidf_word_hpl'] = pred_full_test[:,1]
test_data['by_tfidf_word_mws'] = pred_full_test[:,2]

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  if __name__ == '__main__':
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  from ipykernel import kernelapp as app
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  app.launch_new_instance()
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: ht

In [53]:
# 同样，做好word层面后做一个char层面

In [54]:
from sklearn.feature_extraction.text import TfidfVectorizer
tfidf = TfidfVectorizer(ngram_range=(1,4),analyzer='char',stop_words='english',lowercase=True)

In [55]:
full_idf = tfidf.fit_transform(data_combine.text.values.tolist())
train_idf = tfidf.transform(train_data.text.values.tolist())
test_idf = tfidf.transform(test_data.text.values.tolist())

In [57]:
cv_score = []
pred_full_test = 0
pred_train = np.zeros([train_data.shape[0],3])
kf = KFold(n_splits=5,shuffle=True,random_state=2017)
for dev_index,val_index in kf.split(train_x):
    dev_x,val_x = train_idf[dev_index],train_idf[val_index]
    dev_y,val_y = train_y[dev_index],train_y[val_index]
    pred_val_y,pred_test_y,model = run_by(dev_x,dev_y,val_x,val_y,test_idf)
    pred_full_test+=pred_test_y
    pred_train[val_index,:] = pred_val_y
    cv_score.append(metrics.log_loss(val_y,pred_val_y))
print(cv_score)
pred_full_test/=5

[0.70211920696551455, 0.68191341833830266, 0.69790023142233037, 0.69562676626576303, 0.69512648035234725]


In [58]:
train_data['by_tfidf_char_eap'] = pred_train[:,0]
train_data['by_tfidf_char_hpl'] = pred_train[:,1]
train_data['by_tfidf_char_mws'] = pred_train[:,2]
test_data['by_tfidf_char_eap'] = pred_full_test[:,0]
test_data['by_tfidf_char_hpl'] = pred_full_test[:,1]
test_data['by_tfidf_char_mws'] = pred_full_test[:,2]

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  if __name__ == '__main__':
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  from ipykernel import kernelapp as app
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  app.launch_new_instance()
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: ht

In [60]:
# 对countvector也做一个这个操作

In [62]:
from sklearn.feature_extraction.text import CountVectorizer
count  = CountVectorizer(analyzer='word',lowercase=True,ngram_range=(1,3))

In [64]:
count.fit(data_combine.text.values.tolist())

CountVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(1, 3), preprocessor=None, stop_words=None,
        strip_accents=None, token_pattern='(?u)\\b\\w\\w+\\b',
        tokenizer=None, vocabulary=None)

In [65]:
train_count = count.transform(train_data.text.values.tolist())
test_count = count.transform(test_data.text.values.tolist())

In [66]:
cv_score = []
pred_full_test = 0
pred_train = np.zeros([train_data.shape[0],3])
kf = KFold(n_splits=5,shuffle=True,random_state=2017)
for dev_index,val_index in kf.split(train_x):
    dev_x,val_x = train_count[dev_index],train_count[val_index]
    dev_y,val_y = train_y[dev_index],train_y[val_index]
    pred_val_y,pred_test_y,model = run_by(dev_x,dev_y,val_x,val_y,test_count)
    pred_full_test+=pred_test_y
    pred_train[val_index,:] = pred_val_y
    cv_score.append(metrics.log_loss(val_y,pred_val_y))
print(cv_score)
pred_full_test/=5

[0.67738403706157613, 0.78175612930143956, 0.75400063435281883, 0.78682637971473457, 0.75586724930106453]


In [67]:
train_data['by_count_word_eap'] = pred_train[:,0]
train_data['by_count_word_hpl'] = pred_train[:,1]
train_data['by_count_word_mws'] = pred_train[:,2]
test_data['by_count_word_eap'] = pred_full_test[:,0]
test_data['by_count_word_hpl'] = pred_full_test[:,1]
test_data['by_count_word_mws'] = pred_full_test[:,2]

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  if __name__ == '__main__':
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  from ipykernel import kernelapp as app
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  app.launch_new_instance()
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: ht

In [68]:
from sklearn.feature_extraction.text import CountVectorizer
count  = CountVectorizer(analyzer='char',lowercase=True,ngram_range=(1,3))
count.fit(data_combine.text.values.tolist())
train_count = count.transform(train_data.text.values.tolist())
test_count = count.transform(test_data.text.values.tolist())

In [70]:
cv_score = []
pred_full_test = 0
pred_train = np.zeros([train_data.shape[0],3])
kf = KFold(n_splits=5,shuffle=True,random_state=2017)
for dev_index,val_index in kf.split(train_x):
    dev_x,val_x = train_count[dev_index],train_count[val_index]
    dev_y,val_y = train_y[dev_index],train_y[val_index]
    pred_val_y,pred_test_y,model = run_by(dev_x,dev_y,val_x,val_y,test_count)
    pred_full_test+=pred_test_y
    pred_train[val_index,:] = pred_val_y
    cv_score.append(metrics.log_loss(val_y,pred_val_y))
print(cv_score)
pred_full_test/=5
train_data['by_count_char_eap'] = pred_train[:,0]
train_data['by_count_char_hpl'] = pred_train[:,1]
train_data['by_count_char_mws'] = pred_train[:,2]
test_data['by_count_char_eap'] = pred_full_test[:,0]
test_data['by_count_char_hpl'] = pred_full_test[:,1]
test_data['by_count_char_mws'] = pred_full_test[:,2]

[1.8422550366914596, 1.7678322954591918, 1.8539994021281196, 1.9187541928282406, 1.8531154296526455]


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
A value is tryin

In [73]:
data_combine = pd.concat([train_data,test_data],axis=0)

In [75]:
writer = pd.ExcelWriter('D:/大学/大三上/数据挖掘/作业/数据.xlsx')
data_combine.to_excel(writer,sheet_name='sheet1')
writer.save()

In [1]:
import pandas as pd

In [2]:
data_combine = pd.read_excel('D:/大学/大三上/数据挖掘/作业/数据.xlsx')

In [149]:
data_item = pd.read_excel('D:/大学/大三上/数据挖掘/作业/word_feature.xlsx')

In [17]:
data_combine = data_combine.drop(['id','text','word_split'],axis=1)

In [8]:
data_item['author'] = data_combine.author

In [62]:
author = {'EAP':0,
         'HPL':1,
         'MWS':2}

In [63]:
train_data = data_item[:-8392]

In [64]:
test_data = data_item[-8392:]

In [65]:
train_y = train_data['author'].map(author)

In [66]:
train_x = train_data.drop('author',axis=1)

In [67]:
test_y = test_data.drop('author',axis=1)

In [68]:
# 今天要做的是继续用基本模型利用其中几个特征产生预测值，就继续用朴素贝叶斯吧
# 先把李德茏整理的word_feature 做了一个
# 然后做一些统计上的特征

In [69]:
from sklearn.neural_network import MLPClassifier
from sklearn.naive_bayes import MultinomialNB

In [76]:
def run_mnb(train_x,train_y,test_x,test_y,test_x2):
    model = MLPClassifier()
    model.fit(train_x,train_y)
    pred1 = model.predict_proba(test_x)
    pred2 = model.predict_proba(test_x2)
    return pred1,pred2,model

In [77]:
from sklearn.model_selection import KFold
import numpy as np
from sklearn import metrics

In [78]:
kf = KFold(n_splits=5,random_state=2017,shuffle=True)
cv = []
pre = 0
pre_train = np.zeros([train_x.shape[0],3])


In [79]:
train_x = train_x.values
train_y = train_y.values
test_y = test_y.values

AttributeError: 'numpy.ndarray' object has no attribute 'values'

In [80]:
train_x

array([[ 0.00588238,  0.00588235,  0.00588251, ...,  0.00588239,
         0.00588254,  0.00588235],
       [ 0.02000086,  0.02      ,  0.02      , ...,  0.02      ,
         0.02      ,  0.02      ],
       [ 0.05500065,  0.00500005,  0.85261894, ...,  0.005     ,
         0.005     ,  0.00500009],
       ..., 
       [ 0.01111111,  0.12222208,  0.01111111, ...,  0.01111266,
         0.14125091,  0.32540966],
       [ 0.01428984,  0.01428571,  0.01428571, ...,  0.30000272,
         0.01428604,  0.29999637],
       [ 0.0125    ,  0.0125    ,  0.15555345, ...,  0.0125    ,
         0.24071495,  0.26250048]])

In [81]:
for dev_index,val_index in kf.split(train_x):
    dev_x,val_x = train_x[dev_index],train_x[val_index]
    dev_y,val_y = train_y[dev_index],train_y[val_index]
    pred_val_y,pred_test_y,model = run_mnb(dev_x,dev_y,val_x,val_y,test_y)
    pre+=pred_test_y
    pre_train[val_index,:] = pred_val_y
    cv.append(metrics.log_loss(val_y,pred_val_y))
print(cv)
pre/=5

[1.0551373918173772, 1.0466514151678172, 1.0487534362862112, 1.0464802013172874, 1.0445034981521393]


In [52]:
train_data = data_combine[:-8392]
test_data = data_combine[-8392:]

In [53]:
train_data['eap_svd_word_feature'] = pre_train[:,0]
train_data['hpl_svd_word_feature'] = pre_train[:,1]
train_data['mws_svd_word_feature'] = pre_train[:,2]

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  if __name__ == '__main__':
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  from ipykernel import kernelapp as app
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  app.launch_new_instance()


In [54]:
test_data['eap_svd_word_feature'] = pre[:,0]
test_data['hpl_svd_word_feature'] = pre[:,1]
test_data['mws_svd_word_feature'] = pre[:,2]

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  if __name__ == '__main__':
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  from ipykernel import kernelapp as app
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  app.launch_new_instance()


In [55]:
data_combine = pd.concat([train_data,test_data],axis=0)

In [None]:
writer = pd.ExcelWriter('D:/大学/大三上/数据挖掘/作业/数据.xlsx')
data_combine.to_excel(writer,sheet_name='sheet1')
writer.save()

In [7]:
import pandas as pd
data_combine = pd.read_excel('D:/大学/大三上/数据挖掘/作业/final_data.xlsx')

In [3]:
rnntext = pd.read_csv('C:/Users\Administrator/Desktop/rnntext_pred.csv')

In [6]:
data_combine = data_combine.drop(['fasttext_'+str(i) for i in range(3)],axis=1)

In [7]:
data_combine = data_combine.drop(['cnntext_'+str(i) for i in range(3)],axis=1)

In [8]:
for i in range(3):
    data_combine['fasttext_'+str(i)] = fasttext[str(i)].values

In [3]:
for i in range(3):
    data_combine['cnntext_'+str(i)] = cnntext[str(i)].values

In [4]:
for i in range(3):
    data_combine['rnntext_'+str(i)] = rnntext[str(i)].values

In [57]:
data_item = pd.DataFrame(data_combine.author)

In [6]:
data_combine.lda_topic_0

Index(['author', 'id', 'text', 'word_split', 'text_len', 'symbol_num',
       'symbol_num_,', 'symbol_num_.', 'symbol_num_?', 'symbol_num_:',
       ...
       'by_count_char_mws', 'eap_word_feature', 'hpl_word_feature',
       'mws_word_feature', 'eap_statistic_feature', 'hpl_statistic_feature',
       'mws_statistic_feature', 'eap_embed_feature', 'hpl_embed_feature',
       'mws_embed_feature'],
      dtype='object', length=101)

In [60]:
for i in range(10):
    data_item['lda_topic'+str(i)] = data_combine['lda_topic_'+str(i)].values

In [2]:
data_embed = pd.read_excel('D:/大学/大三上/数据挖掘/作业/数据embed.xlsx')

In [4]:
data_item = data_embed

In [30]:
from sklearn import svm
svc = svm.SVC(probability=True)

In [1]:
import pandas as pd

In [14]:
a = pd.DataFrame([1,2,3])

In [16]:
a.columns(['haha'],axis=1)

TypeError: 'RangeIndex' object is not callable

In [7]:
b =a.values

In [3]:
test_ = pd.read_excel('D:/大学\大三上\数据挖掘\作业/test4cc.xlsx')

In [1]:
import pandas as pd

In [2]:
train_ = pd.read_excel('D:/大学\大三上\数据挖掘\作业/train4cc.xlsx')

In [4]:
len(train_)

19579

In [8]:
a = list(data_combine.columns)

In [10]:
b = []
for i in a:
    if 'eap' not in i and 'hpl' not in i and 'mws' not in i:
        b.append(i)

In [11]:
b.remove('author')

In [12]:
for i in range(3):
    b.remove('fasttext_'+str(i))
    b.remove('cnntext_'+str(i))
    b.remove('rnntext_'+str(i))

In [14]:
for i in range(len(b)-1):
    data_combine[b[i]+'&'+b[i+1]] = data_combine[b[i]]*data_combine[b[i+1]]

In [None]:
writer = pd.ExcelWriter('D:/大学/大三上/数据挖掘/作业/final_data2.xlsx')
data_combine.to_excel(writer,sheet_name='sheet1')
writer.save()