# 机器学习与社会科学应用

# 第四章 自然语言处理入门

# 第四节 文本相似度

<font face="宋体" >郭峰    
    教授、博士生导师  
上海财经大学公共经济与管理学院  
上海财经大学数实融合与智能治理实验室    
    邮箱：guofengsfi@163.com</font> 

<font face="宋体" >本节目录  
4.1.导入数据  
4.2.将关键词处理成自定义词典  
4.3.分词处理  
4.4.建立语料库  
4.5.文本相似度计算</font> 

## 4.1. 导入数据

In [None]:
import pandas as pd
import numpy as np
import datetime
starttime = datetime.datetime.now()

path = "D:/python/机器学习与社会科学应用/演示数据/04自然语言处理入门/tfidf相似度计算/"
cssci = pd.read_csv(path + "cssci_clean_test.csv", encoding='utf-8')
print(cssci.shape)

# Build one text field per paper by joining title, keywords and abstract with
# ";". Missing keywords are replaced by ";" first, so a missing keyword alone
# does not turn the concatenated text into a missing value.
cssci['keyword'] = cssci['keyword'].fillna(";")
title_and_keywords = cssci['title'] + ";" + cssci['keyword']
cssci['content'] = title_and_keywords + ";" + cssci['abstract']

# Keep only the papers whose combined text is longer than 100 characters.
is_long_enough = cssci['content'].str.len() > 100
cssci = cssci[is_long_enough]
print("标题+关键词+摘要少于100字的样本删除后数量:" + str(len(cssci)))

# Persist the filtered sample; the later cells read this file.
cssci.to_csv(path + 'cssci_clean_short.csv', encoding='utf8', index=False)
print(cssci.shape)
print(cssci['year'].min())

# Report the elapsed time (the seconds component of the timedelta).
endtime = datetime.datetime.now()
print((endtime - starttime).seconds)
cssci.head()

## 4.2. 将关键词处理成自定义词典

In [None]:
# 根据关键词为分词准备自定义词典
import jieba
import pandas as pd
import numpy as np
import datetime
starttime = datetime.datetime.now()

path =  "D:/python/机器学习与社会科学应用/演示数据//04自然语言处理入门/tfidf相似度计算/"

cssci = pd.read_csv(path+"cssci_clean_short.csv",encoding='utf-8')
# cssci = cssci[0:10000]  # uncomment to experiment on a subset
print(cssci.shape)

# Drop papers whose keyword field looks unusual: keep rows with fewer than 6
# keywords (kwnum) and a keyword string that is 2 to 29 characters long.
cssci = cssci[cssci['kwnum'] < 6]
keyword_length = cssci['keyword'].str.len()
cssci = cssci[(keyword_length > 1) & (keyword_length < 30)]

keyword = cssci[['keyword']]
print(keyword[0:20])
print("包含正常关键词的论文数量："+str(len(keyword)))

# One row per keyword: split each keyword string on ";" and stack the pieces
# into a single long Series.
keyword = keyword['keyword'].str.split(';', expand=True).stack()

# Round-trip through CSV to get a one-column DataFrame. The code below expects
# the single column to come back under the header '0' and renames it. The file
# is read inside a context manager so the handle is closed afterwards.
keyword.to_csv(path+'keyword.csv',encoding='utf8',index=False)
with open(path+"keyword.csv",encoding='utf-8') as f2:
    keyword = pd.read_csv(f2,header=0,sep=',')
keyword = keyword.rename(columns={'0':'keyword'})

# Remove missing values.
keyword = keyword.dropna()
print(keyword[0:20])

print("关键词累计总数量："+str(len(keyword)))

# Keep keywords that are 2 to 6 characters long.
keyword = keyword[keyword['keyword'].str.len()>1]
keyword = keyword[keyword['keyword'].str.len()<7]

print("剔除过长过短关键词后数量："+str(len(keyword)))

# Count how many times each keyword occurs. to_frame keeps the group key as an
# index named 'keyword', so the CSV written below has a 'keyword' column.
group1 = keyword.groupby(['keyword'])
keyword_count = group1['keyword'].count().to_frame('keyword_count')
keyword_count.to_csv(path+'keyword_count.csv',encoding='utf8')

with open(path+"keyword_count.csv",encoding='utf-8') as f:
    keyword_count = pd.read_csv(f,header=0,sep=',')
# Attach the count to every keyword row; the join key is stated explicitly.
keyword = pd.merge(keyword,keyword_count,how='left',on='keyword')

# Keep a single row per distinct keyword.
keyword = keyword.drop_duplicates(subset=['keyword'],keep='first')
print("删除重复后的关键词个数：",len(keyword))

# keyword_count.csv is overwritten with the de-duplicated keyword/count table.
keyword.to_csv(path+'keyword_count.csv',encoding='utf8',index=False)

# Keywords that occur only once are dropped; the rest are written one per
# line, without header, to keyword.txt.
keyword = keyword[keyword['keyword_count']>1]
keyword = keyword[['keyword']]
print("剔除仅出现1次的关键词后数量："+str(len(keyword)))
keyword.to_csv(path+'keyword.txt',encoding='utf8',index=False,header=False)

endtime = datetime.datetime.now()
print((endtime - starttime).seconds)

## 4.3. 分词处理

In [None]:
# 重新生成关键词词典
# 自定义词典格式：词 词频 词性（可省略）
from collections import Counter

path = "D:/python/机器学习与社会科学应用/演示数据/04自然语言处理入门/tfidf相似度计算/"

# keyword.txt holds one keyword per line. Read it inside a context manager so
# the file handle is closed as soon as the text has been loaded.
with open(path+"keyword.txt", encoding='utf8') as keyword_file:
    keywords = keyword_file.read()

# Blank lines are skipped so that an empty input file does not produce a
# dictionary entry whose word is the empty string.
keywords = [line for line in keywords.strip().split('\n') if line]
keywords = dict(Counter(keywords))
# NOTE(review): keyword.txt is written after drop_duplicates in section 4.2,
# so every count here is presumably 1 - confirm whether the real keyword
# frequencies were meant to be used instead.

# Write the user dictionary in the "word frequency" layout, one entry per line.
with open(path+'keywords.txt','w',encoding='utf8') as f:
    f.writelines(f"{key} {value}\n" for key, value in keywords.items())

In [None]:
# 分词，全部运行要一段时间
import jieba
import jieba.posseg as pseg
import pandas as pd
import re
import numpy as np
import datetime
starttime = datetime.datetime.now()


path = "D:/python/机器学习与社会科学应用/演示数据/04自然语言处理入门/tfidf相似度计算/"
cssci = pd.read_csv(path+"cssci_clean_short.csv",encoding='utf-8')
# cssci=cssci[0:100]  # uncomment to experiment on a subset

# Load the custom keyword dictionary into jieba.
jieba.load_userdict(path+"keywords.txt")

# Load the stop words (one per line, surrounding whitespace removed) into a
# dict that maps each word to itself; it is used for membership tests.
# The context manager closes the file even if reading fails part-way.
# NOTE(review): no encoding is passed, so the platform default is used -
# confirm that it matches the encoding of stopword.txt.
with open(path+'stopword.txt', 'r') as fstop:
    stopwords = {word: word for word in (line.strip() for line in fstop)}

# Characters removed before segmentation: ASCII digits, whitespace and a set of
# ASCII / full-width punctuation marks. Written as a raw string so that the
# backslash escapes reach the regex engine unchanged, and compiled once instead
# of once per row.
# NOTE(review): inside the first character class, ":-【" is parsed as a RANGE
# from ":" (U+003A) to "【" (U+3010). That range contains every ASCII letter,
# so Latin-script text (e.g. "GDP") is deleted as well. The pattern is kept
# as-is to preserve the existing output - confirm whether this is intended.
_NOISE_PATTERN = re.compile(
    r"[0-9\s+\.\!\/_,$%^*()?;；:-【】+\"\']+|[+——！，;:。？、~@#￥%……&*（）]+"
)


def word_cut(x):
    """Segment one paper's text with jieba and drop stop words.

    Args:
        x: a row-like object; ``x['content']`` must be the text to segment.

    Returns:
        A string in which every kept token is followed by one space (so a
        non-empty result ends with a trailing space).

    Relies on the module-level ``stopwords`` container and on ``jieba``.
    """
    line = x['content'].strip()
    cleaned = _NOISE_PATTERN.sub("", line)
    kept = [word for word in jieba.cut(cleaned) if word not in stopwords]
    return ''.join(word + ' ' for word in kept)
# Segment every paper; word_cut receives one row at a time.
cssci['cut_out'] = cssci.apply(word_cut, axis=1)

# Show the first paper's title next to its segmented text as a quick check.
print(cssci.loc[0, 'title'])
print(cssci.loc[0, 'cut_out'])

# Discard rows whose segmented text is 2 characters or shorter (empty or
# otherwise abnormal results of the segmentation).
cssci["cutlength"] = cssci['cut_out'].str.len()
has_text = cssci['cutlength'] > 2
cssci = cssci[has_text]

# Save the full table, and separately the segmented text alone (with index).
cut_out = cssci[['cut_out']]
cssci.to_csv(path + 'cssci_title_cut.csv', encoding='utf8', index=False)
cut_out.to_csv(path + 'cut_out.csv', encoding='utf8')
print(cssci.shape)

endtime = datetime.datetime.now()
print((endtime - starttime).seconds)

## 4.4. 建立语料库

In [None]:
from gensim import corpora,models,similarities
from collections import  defaultdict
import pandas as pd
import re
import numpy as np
import datetime
starttime = datetime.datetime.now()


def get_dict(cutwords, min_count=5):
    """Build a gensim dictionary and bag-of-words corpus from segmented texts.

    Args:
        cutwords: iterable of strings; each string is one document whose
            tokens are separated by whitespace.
        min_count: a token is kept only when its total number of occurrences
            across all documents is strictly greater than this value.
            Defaults to 5, the cut-off that was previously hard-coded.

    Returns:
        A ``(dictionary, corpus)`` tuple: the ``corpora.Dictionary`` built
        from the filtered tokens, and a list holding one ``doc2bow`` result
        per input document, in input order.
    """
    texts = [cutword.split() for cutword in cutwords]

    # Corpus-wide occurrence count of every token.
    frequency = defaultdict(int)
    for text in texts:
        for token in text:
            frequency[token] += 1

    # Rare tokens are removed before the dictionary is built, so they never
    # receive an id.
    texts = [
        [token for token in text if frequency[token] > min_count]
        for text in texts
    ]
    dictionary = corpora.Dictionary(texts)
    corpus = [dictionary.doc2bow(text) for text in texts]
    return dictionary, corpus


# Load the papers whose text has already been segmented.
path = "D:/python/机器学习与社会科学应用/演示数据/04自然语言处理入门/tfidf相似度计算/"
segmented_file = path + "cssci_title_cut.csv"
cssci = pd.read_csv(segmented_file, encoding='utf-8')
# cssci=cssci[0:100]  # uncomment to experiment on a subset
print("cssci样本量：", len(cssci))

# Build the dictionary and bag-of-words corpus, then fit TF-IDF on the corpus.
cutwords = cssci['cut_out']
dictionary, corpus = get_dict(cutwords)
tfidf = models.TfidfModel(corpus)

print("dictionary样本量：", len(dictionary))

# Persist the fitted model and the dictionary for the similarity step.
model_file = path + "model.tfidf"
dictionary_file = path + 'dictionary_tfidf.dict'
tfidf.save(model_file)
dictionary.save(dictionary_file)


endtime = datetime.datetime.now()
print((endtime - starttime).seconds)


## 4.5. 文本相似度计算

In [None]:
# 这里的相似度是计算某个文章与上年所有top 5%论文的相似度，求其最大值；
from gensim import corpora,models,similarities
from collections import  defaultdict
import pandas as pd
import re
import numpy as np
import datetime
starttime = datetime.datetime.now()


def tfidf_sim(text1, text2, dictionary, model=None):
    """Find the document in ``text1`` that is most similar to ``text2``.

    Args:
        text1: non-empty iterable of strings, each one reference document
            whose tokens are separated by whitespace.
        text2: the query document, a whitespace-separated token string.
        dictionary: gensim dictionary used to turn tokens into bag-of-words.
        model: TF-IDF model applied to both the reference documents and the
            query. When ``None`` the module-level ``tfidf`` object is used,
            which is what this function always did before the parameter
            existed.

    Returns:
        A ``(position, similarity)`` tuple: the 0-based position of the best
        match in the iteration order of ``text1`` and its similarity to
        ``text2``. When several documents share the highest similarity, the
        earliest one is returned.

    Note:
        The similarity index over ``text1`` is rebuilt on every call. Code
        that scores many queries against the same ``text1`` should build the
        index once and query it directly.
    """
    if model is None:
        model = tfidf  # module-level model set up by the calling script

    # Reference side: bag-of-words -> TF-IDF -> similarity index.
    reference_bows = [dictionary.doc2bow(doc.split()) for doc in text1]
    index = similarities.SparseMatrixSimilarity(
        model[reference_bows], num_features=len(dictionary)
    )

    # Query side: one similarity value per reference document.
    query_bow = dictionary.doc2bow(text2.split())
    sims = index[model[query_bow]]

    # argmax returns the first maximum, matching a stable descending sort.
    best = int(np.argmax(sims))
    return best, sims[best]

    
# Load the papers whose text has already been segmented. The context manager
# closes the file handle once pandas has read it.
path = "D:/python/机器学习与社会科学应用/演示数据/04自然语言处理入门/tfidf相似度计算/"
with open(path+"cssci_title_cut.csv", encoding='utf-8') as f:
    cssci = pd.read_csv(f,header=0, sep=',')
print("样本量：", len(cssci))

# Load the fitted TF-IDF model and the dictionary it was built with.
tfidf = models.TfidfModel.load(path+"model.tfidf")
dictionary = corpora.Dictionary.load(path+'dictionary_tfidf.dict')

cssci['sim'] = ""
cssci['nearest_title'] = ""

# Papers from 2001 are kept as they are (empty 'sim' / 'nearest_title'); the
# loop below starts comparing from 2002 against the preceding year.
cssci_2001 = cssci[cssci.year==2001]
yearly_frames = [cssci_2001]

for year in range(2002,2018):
    # Reference set: papers of the previous year whose citation count is at or
    # above that year's 95th percentile. reset_index returns a new frame with
    # a fresh 0..n-1 index, so the column assignments below do not write into
    # a slice of cssci.
    cssci_lastyear = cssci[cssci['year']==year-1]
    cp95 = cssci_lastyear['cited'].quantile(0.95)
    cssci_highcited = cssci_lastyear[cssci_lastyear['cited']>=cp95].reset_index(drop=True)
    cssci_nextyear = cssci[cssci['year']==year].reset_index(drop=True)
    # cssci_nextyear=cssci_nextyear[0:10]  # uncomment to experiment on a subset

    if cssci_nextyear.empty:
        continue
    if cssci_highcited.empty:
        raise ValueError(
            f"no highly cited papers found for {year-1}; "
            f"cannot compute similarities for {year}"
        )

    # Build the similarity index over the reference papers once per year
    # rather than once per paper.
    highcited_bows = [dictionary.doc2bow(doc.split()) for doc in cssci_highcited['cut_out']]
    index = similarities.SparseMatrixSimilarity(
        tfidf[highcited_bows], num_features=len(dictionary)
    )
    highcited_titles = cssci_highcited['title'].tolist()

    # For every paper of this year, record the highest similarity to the
    # reference set and the title of the paper that achieves it (the earliest
    # one when several share the maximum).
    best_sims = []
    nearest_titles = []
    for doc in cssci_nextyear['cut_out']:
        sims = index[tfidf[dictionary.doc2bow(doc.split())]]
        j = int(np.argmax(sims))
        best_sims.append(float(sims[j]))
        nearest_titles.append(highcited_titles[j])
    cssci_nextyear['sim'] = best_sims
    cssci_nextyear['nearest_title'] = nearest_titles

    print(cssci_nextyear['sim'][0:10])
    print(cssci_nextyear.title[0:10],cssci_nextyear.nearest_title[0:10])
    yearly_frames.append(cssci_nextyear)

# DataFrame.append was removed in pandas 2.0; concatenate all years at once.
cssci_new = pd.concat(yearly_frames)
cssci_new.to_csv(path+'cssci_sim_tfidf.csv',encoding='utf8')

# Also save a version restricted to a fixed list of columns.
cssci_new_short = cssci_new[['tlength','mag_name','mag_city_code','aunum','author_first','aufw','cited','download','fund01','fundn','page_num','year_period','year','month','kwnum','ablength','page_beg','sim']]
cssci_new_short.to_csv(path+'cssci_sim_tfidf_short.csv',encoding='utf8',index=False)
endtime = datetime.datetime.now()
print((endtime - starttime).seconds)

In [None]:
# 本节结束