In [1]:
import pandas as pd
import numpy as np
import jieba

In [2]:

# load data and data prepocessing
news = pd.read_csv('sqlResult.csv', encoding = 'gb18030')

In [3]:
print('news shape: ', news.shape)

news shape:  (89611, 7)


In [4]:
news.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 89611 entries, 0 to 89610
Data columns (total 7 columns):
id         89611 non-null int64
author     79396 non-null object
source     89609 non-null object
content    87054 non-null object
feature    89611 non-null object
title      89577 non-null object
url        87144 non-null object
dtypes: int64(1), object(6)
memory usage: 4.8+ MB


In [5]:
# check nan
print('check nan in content or not: ', True in news.content.isna().values)
news = news.dropna(subset=['content'])
print('check nan in content or not: ', True in news.content.isna().values)

check nan in content or not:  True
check nan in content or not:  False


In [6]:
# load stop words
with open('chinese_stopwords.txt', 'r', encoding='utf-8') as file:
    stopwords = [i[:-1] for i in file.readlines()]

In [7]:
# Word Segmentation
def split_text(text):
    text1 = text.replace(' ', '').replace('\n', '')
    text2 = jieba.cut(text1)
    result = ' '.join([w for w in text2 if w not in stopwords])
    return result

In [9]:
# create corpus
corpus = list(map(split_text, [str(i) for i in news.content]))
print(len(corpus))

87054


In [10]:
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer

In [12]:
# calculate corpus tfidf
countvectorizer = CountVectorizer(encoding='gb18030', min_df=0.015)
tfidftransformer = TfidfTransformer()
countvector = countvectorizer.fit_transform(corpus)
tfidf = tfidftransformer.fit_transform(countvector)

In [13]:
# get news label
label = list(map(lambda source: 1 if '新华社' in str(source) else 0, news.source))
label

[0,
 0,
 0,
 1,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 1,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 1,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 1,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 1,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,


In [14]:
from sklearn.model_selection import train_test_split
# train test dataset split
X_train, X_test, Y_train, Y_test = train_test_split(tfidf.toarray(), label, test_size = 0.3, random_state = 233)

from sklearn.naive_bayes import MultinomialNB
clf = MultinomialNB()
clf.fit(X_train, Y_train)
y_predict = clf.predict(X_test)

In [15]:
from sklearn.metrics import accuracy_score, precision_score, recall_score
print('Predict accuracy: ', accuracy_score(y_predict, Y_test))
print('Predict precision: ', precision_score(y_predict, Y_test))
print('Predict recall: ', recall_score(y_predict, Y_test))

Predict accuracy:  0.8814948118084006
Predict precision:  0.9040463895708118
Predict recall:  0.9626808491458962


In [16]:
# use model to do style predict
prediction = clf.predict(tfidf.toarray())
labels = np.array(label)
compare_news_index = pd.DataFrame({'prediction': prediction, 'labels': labels})
plagiarism_index = compare_news_index[(compare_news_index['prediction'] == 1) & (compare_news_index['labels']==0)]

In [17]:
xinhuashe_index = compare_news_index[compare_news_index.labels == 1].index
print('可能抄袭的新闻条数：', len(plagiarism_index))

可能抄袭的新闻条数： 2796


In [18]:
from sklearn.preprocessing import Normalizer
from sklearn.cluster import KMeans

normalizer = Normalizer()
scaled_array = normalizer.fit_transform(tfidf.toarray())

kmeans = KMeans(n_clusters=25)
k_labels = kmeans.fit_predict(scaled_array)
print(k_labels.shape)


(87054,)


In [19]:
id_class = {index: class_ for index, class_ in enumerate(k_labels)}
from collections import defaultdict
class_id = defaultdict(set)
for index, class_ in id_class.items():
    if index in xinhuashe_index.tolist():
        class_id[class_].add(index)

In [20]:
from sklearn.metrics.pairwise import cosine_similarity
def find_similar_text(cpindex, top=10):
    dist_dict = {i: cosine_similarity(tfidf[cpindex], tfidf[i]) for i in class_id[id_class[cpindex]]}
    sorted_dist_dict = sorted(dist_dict.items(), key=lambda x:x[1][0], reverse=True)
    return sorted_dist_dict[:top]


In [21]:
cp_index = 3352
cp_list = find_similar_text(cp_index)
cp_list

[(3134, array([[0.96849134]])),
 (63511, array([[0.94643198]])),
 (29441, array([[0.94283416]])),
 (3218, array([[0.87621892]])),
 (29615, array([[0.86936328]])),
 (29888, array([[0.86215862]])),
 (64046, array([[0.85278235]])),
 (29777, array([[0.84875422]])),
 (63974, array([[0.73415212]])),
 (63975, array([[0.73415212]]))]

In [22]:
print(news.iloc[cp_index].content)

　　中国5月份56座城市新建商品住宅价格环比上涨，4月份为58座上涨。5月份15个一线和热点二线城市房地产市场基本稳定，5月份房地产调控政策效果继续显现。
　　统计局：15个一线和热点二线城市房价同比涨幅全部回落
　　国家统计局城市司高级统计师刘建伟解读5月份房价数据
　　5月份一二线城市房价平均涨幅继续回落
　　国家统计局今日发布了2017年5月份70个大中城市住宅销售价格统计数据。对此，国家统计局城市司高级统计师刘建伟进行了解读。
　　一、15个一线和热点二线城市新建商品住宅价格同比涨幅全部回落、9个城市环比下降或持平
　　5月份，因地制宜、因城施策的房地产调控政策效果继续显现，15个一线和热点二线城市房地产市场基本稳定。从同比看，15个城市新建商品住宅价格涨幅均比上月回落，回落幅度在0.5至6.4个百分点之间。从环比看，9个城市新建商品住宅价格下降或持平；5个城市涨幅在0.5%以内。
　　二、70个大中城市中一二线城市房价同比涨幅持续回落
　　5月份，70个城市中新建商品住宅和二手住宅价格同比涨幅比上月回落的城市分别有29和18个。其中，一二线城市同比涨幅回落尤其明显。据测算，一线城市新建商品住宅和二手住宅价格同比涨幅均连续8个月回落，5月份比4月份分别回落2.2和1.7个百分点；二线城市新建商品住宅和二手住宅价格同比涨幅分别连续6个月和4个月回落，5月份比4月份分别回落0.8和0.5个百分点。
　　三、70个大中城市中房价环比下降及涨幅回落城市个数均有所增加
　　5月份，70个城市中新建商品住宅价格环比下降的城市有9个，比上月增加1个；涨幅回落的城市有26个，比上月增加3个。二手住宅价格环比下降的城市有7个，比上月增加2个；涨幅回落的城市有30个，比上月增加8个。



In [23]:
print(news.iloc[cp_list[0][0]].content)

　　国家统计局19日发布数据，5月份，15个一线和热点二线城市新建商品住宅价格同比涨幅全部回落，其中9个城市环比下降或持平。这9个价格环比下降或持平的城市为：北京、上海、南京、杭州、合肥、福州、郑州、深圳、成都。
　　“5月份，因地制宜、因城施策的房地产调控政策效果继续显现，15个一线和热点二线城市房地产市场基本稳定。”国家统计局城市司高级统计师刘建伟说，从同比看，15个城市新建商品住宅价格涨幅均比上月回落，回落幅度在0.5至6.4个百分点之间。从环比看，9个城市新建商品住宅价格下降或持平；5个城市涨幅在0.5%以内。
　　国家统计局当天还发布了5月份70个大中城市住宅销售价格统计数据。刘建伟介绍，5月份，70个大中城市中新建商品住宅和二手住宅价格同比涨幅比上月回落的城市分别有29和18个。其中，一二线城市同比涨幅回落尤其明显。据测算，一线城市新建商品住宅和二手住宅价格同比涨幅均连续8个月回落，5月份比4月份分别回落2.2和1.7个百分点；二线城市新建商品住宅和二手住宅价格同比涨幅分别连续6个月和4个月回落，5月份比4月份分别回落0.8和0.5个百分点。
　　此外，70个大中城市中房价环比下降及涨幅回落城市个数均有所增加。统计显示，5月份，70个大中城市中新建商品住宅价格环比下降的城市有9个，比上月增加1个；涨幅回落的城市有26个，比上月增加3个。二手住宅价格环比下降的城市有7个，比上月增加2个；涨幅回落的城市有30个，比上月增加8个。

