# 机器学习方法
## 特征二：共享关键词指数

In [159]:
import pandas as pd
import numpy as np
import gensim
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
import jieba
import jieba.analyse
import matplotlib.pyplot as plt
from nltk.corpus import wordnet as wn
from nltk.stem import WordNetLemmatizer 

### 读取数据和之前计算的出的文本相似度

In [13]:
stop_words = set(stopwords.words('english'))
train_data=pd.read_csv('train.csv')
test_data=pd.read_csv('test.csv')
train_similarity=pd.read_csv('train_set_similarity.csv')
test_similarity=pd.read_csv('test_set_similarity.csv')

In [38]:
train_sim_list=[]
for i,row in train_similarity.iterrows():
    sim_str=row['similarity']
    if(sim_str=='0'):
        sim_float=0.0
    else:
        sim_float=float(sim_str[2:-2])
    train_sim_list.append(sim_float)

test_sim_list=[]
for i,row in test_similarity.iterrows():
    sim_str=row['similarity']
    if(sim_str=='0'):
        sim_float=0.0
    else:
        sim_float=float(sim_str[2:-2])
    test_sim_list.append(sim_float)

### 在对分词结果进行清洗的时候，利用wordnet进行词干还原

In [167]:
#传入的参数sentence是句子经过tokenize后的token组成的列表
wtlem = WordNetLemmatizer()
def clean_sentence(sentence):
    sentence = filter(lambda x: len(x) > 1, sentence)
    word_list=[]
    for word in sentence:
        word=word.lower()
        word=wtlem.lemmatize(word,'v')   #动词词干还原
        if word not in stop_words:    #过滤没有意义停用词
                if word != '\t':
                    word_list.append(word)
    result=" ".join(word_list)
    return result


### 进行关键词提取，question中提取5个关键词，sentence中提取30个关键词

In [168]:
#处理训练集
train_question_key_word_list=[]
train_sentence_key_word_list=[]

train_question_len_list=[]
train_sentence_len_list=[]

KEY_WORD_NUM_QUESTION=5
KEY_WORD_NUM_SENTENCE=30
for i,row in train_data.iterrows():
    question_tokenized=word_tokenize(row['question'])
    sentence_tokenized=word_tokenize(row['sentence'])
    
    train_question_len_list.append(len(question_tokenized))
    train_sentence_len_list.append(len(sentence_tokenized))
    
    question_cleaned=clean_sentence(question_tokenized)
    sentence_cleaned=clean_sentence(sentence_tokenized)

    
    question_key_word=jieba.analyse.extract_tags(question_cleaned,topK=KEY_WORD_NUM_QUESTION)
    sentence_key_word=jieba.analyse.extract_tags(sentence_cleaned,topK=KEY_WORD_NUM_SENTENCE)
    
    train_question_key_word_list.append(question_key_word)
    train_sentence_key_word_list.append(sentence_key_word)

In [169]:
#处理测试集
test_question_key_word_list=[]
test_sentence_key_word_list=[]

for i,row in test_data.iterrows():
    question_tokenized=word_tokenize(row['question'])
    sentence_tokenized=word_tokenize(row['sentence'])
    
    question_cleaned=clean_sentence(question_tokenized)
    sentence_cleaned=clean_sentence(sentence_tokenized)

    
    question_key_word=jieba.analyse.extract_tags(question_cleaned,topK=KEY_WORD_NUM_QUESTION)
    sentence_key_word=jieba.analyse.extract_tags(sentence_cleaned,topK=KEY_WORD_NUM_SENTENCE)
    
    test_question_key_word_list.append(question_key_word)
    test_sentence_key_word_list.append(sentence_key_word)

In [28]:
train_labels=list(train_data.label)
test_labels=list(test_data.label)

In [80]:
for i in range(5):
    print("-"*8)
    print("label:{}".format(train_labels[i]))
    print("smaple {} question key word:".format(i))
    print(train_question_key_word_list[i])
    print("smaple {} sentence key word:".format(i))
    print(train_sentence_key_word_list[i])


--------
label:0
smaple 0 question key word:
['third', 'digimon', 'series']
smaple 0 sentence key word:
['seasons', 'digimon', 'unlike', 'two', 'followed', 'tamers', 'takes', 'darker', 'realistic', 'approach', 'story', 'featuring', 'reincarnate', 'deaths', 'complex', 'character', 'development', 'original', 'japanese']
--------
label:0
smaple 1 question key word:
['famous', 'palace', 'located']
smaple 1 sentence key word:
['london', 'westminster', 'greenwich', 'contains', 'four', 'world', 'heritage', 'sites', 'tower', 'kew', 'gardens', 'site', 'comprising', 'palace', 'abbey', 'st', 'margaret', 'church', 'historic', 'settlement']
--------
label:0
smaple 2 question key word:
['starred', 'true', 'love']
smaple 2 sentence key word:
['show', 'starred', 'ted', 'danson', 'dr', 'john', 'becker', 'doctor', 'operated', 'small', 'practice', 'constantly', 'annoyed', 'patients', 'co', 'workers', 'friends', 'practically', 'everything', 'everybody']
--------
label:0
smaple 3 question key word:
['open'

### 计算“共享关键词指数”，利用wordnet考虑近义词

In [175]:
train_common_index=[]
for i in range(len(train_question_key_word_list)):
    question_key_words=train_question_key_word_list[i]
    sentence_key_words=train_sentence_key_word_list[i]
    common_list=[word for word in question_key_words if word in sentence_key_words]
    
    #考虑有没有近义词
    if(len(common_list)==0):
        for word in question_key_words:
            for ss in wn.synsets(word):
                for sentence_word in sentence_key_words:
                    if sentence_word in ss.lemma_names():
                        common_list.append(word)
        if(len(common_list)>KEY_WORD_NUM_QUESTION):
            common_list=common_list[:KEY_WORD_NUM_QUESTION]
    
    common_index=float(len(common_list)/KEY_WORD_NUM_QUESTION)
    train_common_index.append(common_index)
    
test_common_index=[]
for i in range(len(test_question_key_word_list)):
    question_key_words=test_question_key_word_list[i]
    sentence_key_words=test_sentence_key_word_list[i]
    common_list=[word for word in question_key_words if word in sentence_key_words]
    
    #考虑有没有近义词
    if(len(common_list)==0):
        for word in question_key_words:
            for ss in wn.synsets(word):
                for sentence_word in sentence_key_words:
                    if sentence_word in ss.lemma_names():
                        common_list.append(word)
        if(len(common_list)>KEY_WORD_NUM_QUESTION):
            common_list=common_list[:KEY_WORD_NUM_QUESTION]
            
    common_index=float(len(common_list)/KEY_WORD_NUM_QUESTION)
    test_common_index.append(common_index)

In [176]:
print(len(train_common_index))
print(len(train_sim_list))

8000
8000


### 准备数据

In [177]:
train_dic={'common_index':train_common_index,'similarity':train_sim_list}
test_dic={'common_index':test_common_index,'similarity':test_sim_list}
X=pd.DataFrame(train_dic)
y_label=np.array(train_labels)
X_test=pd.DataFrame(test_dic)

In [84]:
from sklearn.tree import DecisionTreeClassifier  #决策树
from sklearn.linear_model import LogisticRegression #logistic回归
from sklearn.svm import SVC, LinearSVC   #SVC
from sklearn.ensemble import RandomForestClassifier  #随机森林
from sklearn.naive_bayes import GaussianNB   #朴素贝叶斯

### 模型训练并输出预测结果

### Logistic回归

In [178]:
LR = LogisticRegression()
##训练
LR.fit(X,y_label)
##预测
LR_prediction = LR.predict(X_test)
output1 = pd.DataFrame({'label': LR_prediction})

pred=np.array(output1.label)
ground_truth=np.array(test_labels)
res = (ground_truth == pred)
acc = res.sum()/len(res)
print("logistic regression准确率：{}".format(acc))

logistic regression准确率：0.74625




### 决策树

In [179]:
decision_tree = DecisionTreeClassifier()
decision_tree.fit(X, y_label)
DT_prediction = decision_tree.predict(X_test)
output2 = pd.DataFrame({ 'label': DT_prediction})

pred=np.array(output2.label)
ground_truth=np.array(test_labels)
res = (ground_truth == pred)
acc = res.sum()/len(res)
print("决策树准确率：{}".format(acc))

决策树准确率：0.654375


### SVC

In [180]:
svc = SVC()
svc.fit(X, y_label)
SVC_prediction = svc.predict(X_test)
output3 = pd.DataFrame({'label':SVC_prediction})

pred=np.array(output3.label)
ground_truth=np.array(test_labels)
res = (ground_truth == pred)
acc = res.sum()/len(res)
print("SVC准确率：{}".format(acc))



SVC准确率：0.75


### 随机森林

In [181]:
RFCmodel = RandomForestClassifier(n_estimators=20, max_depth=2, random_state=1)
RFCmodel.fit(X, y_label)
predictions = RFCmodel.predict(X_test)
output4 = pd.DataFrame({'label': predictions})

pred=np.array(output4.label)
ground_truth=np.array(test_labels)
res = (ground_truth == pred)
acc = res.sum()/len(res)
print("RFC准确率：{}".format(acc))

RFC准确率：0.738125


### 朴素贝叶斯

In [182]:
clf = GaussianNB()
clf.fit(X, y_label)
predictions = clf.predict(X_test)
output5 = pd.DataFrame({'label': predictions})

pred=np.array(output5.label)
ground_truth=np.array(test_labels)
res = (ground_truth == pred)
acc = res.sum()/len(res)
print("朴素贝叶斯准确率：{}".format(acc))

朴素贝叶斯准确率：0.733125


### 输出最好的结果到prediction.csv中

In [183]:
myfile=open('prediction.csv','w',encoding='utf-8')
for i,row in output3.iterrows():
    print(row['label'],file=myfile)
#output4.to_csv('C:/Users/xx/Desktop/SVC_submission.csv', index=False)