In [1]:
## code comes from https://blog.csdn.net/john_bh/article/details/79268850
## 2019-10-31

In [1]:
#coding= utf-8
import pandas as pd
import random
import fasttext
import jieba
from sklearn.model_selection import train_test_split

In [5]:
# Category name -> integer id; the id is what gets appended to "__label__"
# when the training lines are generated.
cate_dic = dict(technology=1, car=2, entertainment=3, military=4, sports=5)

### 1.加载数据

In [6]:
def loadData():
    """Read the five news CSV files and return the selected article texts.

    Every file is read with pandas and rows containing any missing value are
    dropped. From the ``content`` column of each cleaned frame a fixed window
    is kept: positions 1000-20999 for technology and car, and the first 20000
    positions for entertainment, military and sports.

    Returns:
        tuple: ``(technology, car, entertainment, military, sports)``, each a
        list of ``content`` values.
    """
    # (CSV path, window of cleaned rows to keep), in return order.
    sources = (
        ("./data/technology_news.csv", slice(1000, 21000)),
        ("./data/car_news.csv", slice(1000, 21000)),
        ("./data/entertainment_news.csv", slice(None, 20000)),
        ("./data/military_news.csv", slice(None, 20000)),
        ("./data/sports_news.csv", slice(None, 20000)),
    )

    # First pass: load and clean every file before any slicing happens.
    frames = [pd.read_csv(path, encoding="utf-8").dropna() for path, _ in sources]

    # Second pass: pull the requested window out of each content column.
    technology, car, entertainment, military, sports = (
        frame.content.values.tolist()[window]
        for frame, (_, window) in zip(frames, sources)
    )

    return technology, car, entertainment, military, sports

### 2.建立Stopwords

In [8]:
def getStopWords(datapath):
    """Load a stop-word list from a UTF-8 text file with one entry per line.

    Args:
        datapath: Path of the stop-word file.

    Returns:
        The values of the single ``stopword`` column, as returned by
        pandas' ``.values``.
    """
    return pd.read_csv(
        datapath,
        sep="\t",
        names=["stopword"],
        index_col=False,
        quoting=3,  # 3 == csv.QUOTE_NONE: quote characters are ordinary text
        encoding="utf-8",
    )["stopword"].values

### 3.文本的处理

In [21]:
def preprocess_text(content_line,sentences,category,stopwords):
    """Tokenise texts and append them to ``sentences`` as labelled lines.

    Each text is segmented with ``jieba.lcut``; tokens of length 1 or less and
    tokens found in ``stopwords`` are discarded. The remaining tokens are
    joined with spaces and appended to ``sentences`` as
    ``"__label__<category> , <tokens>"``.

    Args:
        content_line: Iterable of raw text strings.
        sentences: List that receives the formatted lines (mutated in place).
        category: Label value; converted with ``str`` and prefixed with
            ``"__label__"``.
        stopwords: Iterable of stop words; ``None`` is treated as empty.

    Returns:
        None. Results are delivered through ``sentences``.
    """
    # Build the lookup once: a set gives constant-time membership tests,
    # instead of scanning the whole stop-word array for every token.
    stopword_set = set(stopwords) if stopwords is not None else set()
    label = "__label__" + str(category)

    for line in content_line:
        try:
            # Keep tokens longer than one character that are not stop words.
            tokens = [
                tok for tok in jieba.lcut(line)
                if len(tok) > 1 and tok not in stopword_set
            ]
        except Exception:
            # Best effort: show the text that could not be processed and
            # carry on with the rest of the corpus.
            print(line)
            continue
        # NOTE(review): the " , " separator is kept so the generated file is
        # unchanged; it presumably puts a standalone "," token in every
        # sample -- confirm whether that is intended.
        sentences.append(label + " , " + " ".join(tokens))

### 4.将处理好的写入到文件中

In [24]:
def writeData(sentences,fileName):
    """Write each sentence on its own line to a UTF-8 text file.

    Args:
        sentences: Iterable of strings; a newline is appended to each.
        fileName: Destination path. An existing file is overwritten.

    Returns:
        None.
    """
    print("writing data to fasttext format...")
    # The context manager closes the file, so buffered data is flushed
    # before any later cell reads it back.
    with open(fileName, 'w', encoding='utf-8') as writer:
        writer.writelines(sentence + "\n" for sentence in sentences)
    print("Writing Done!")

### 5.数据处理

In [26]:
def preprocessData(stopwords,saveDataFile):
    """Build the shuffled, labelled corpus and write it to ``saveDataFile``.

    Loads the five category corpora with ``loadData``, converts each one to
    labelled lines via ``preprocess_text`` (label ids come from ``cate_dic``),
    shuffles the combined list and hands it to ``writeData``.

    Args:
        stopwords: Stop words forwarded to ``preprocess_text``.
        saveDataFile: Output path forwarded to ``writeData``.
    """
    category_names = ("technology", "car", "entertainment", "military", "sports")
    corpora = loadData()  # returned in the same order as category_names

    # Tokenise, drop stop words and label every document.
    sentences = []
    for name, documents in zip(category_names, corpora):
        preprocess_text(documents, sentences, cate_dic[name], stopwords)

    # Shuffle so that samples of one category are not clustered together.
    random.shuffle(sentences)
    writeData(sentences, saveDataFile)

### 6.主函数

In [27]:
# Input stop-word list and output training file.
stopwordsFile = "./data/stopwords_NLP.txt"
saveDataFile = "./data/SavedData_Fasttext.txt"

# Load the stop words, then generate and write the labelled corpus.
stopwords = getStopWords(stopwordsFile)
preprocessData(stopwords, saveDataFile)

writing data to fasttext format...
Writing Done!


### 使用 fastText 做监督学习

In [38]:
# API notes for moving from the unofficial fasttext module to the official one:
# https://fasttext.cc/blog/2019/06/25/blog-post.html#2-you-were-using-the-unofficial-fasttext-module
# Train a supervised classifier on the generated file. `wordNgrams` is the
# documented keyword of the official module (`word_ngrams` is the legacy
# spelling from the unofficial module).
classifier  = fasttext.train_supervised(saveDataFile, lr=0.1, dim=100, epoch=5, wordNgrams=2, loss='softmax')
classifier.save_model("./data/fasttextmodel_file.bin")

In [39]:
# Evaluate the trained classifier. The original line called `model.test`,
# but no `model` is defined in this notebook; the trained object is `classifier`.
# Note: saveDataFile is the same file the classifier was trained on, so this
# is a training-set score, not a held-out estimate.
result           = classifier.test(saveDataFile)
classifier.get_labels()

['__label__5', '__label__1', '__label__3', '__label__4', '__label__2']

In [53]:
# Predict one pre-segmented sample. `k` is the number of candidate labels
# requested; `threshold` keeps only labels whose probability exceeds it.
# Label ids follow cate_dic:
#   {'technology': 1, 'car': 2, 'entertainment': 3, 'military': 4, 'sports': 5}
text_string = ['北京 新能源 汽车 摇号 困难 交通 越来 拥挤 很多人 上班 都是 步行 骑车 共享 单车']

# The input is a list holding a single string. Two labels are requested, but
# only one clears the 0.1 threshold in the recorded output.
classifier.predict(text_string, k=2, threshold=0.1)

([['__label__2']], array([[1.00000203]]))

In [46]:
# Per-label precision / recall / F1 on the file that was also used for
# training, taking the top 2 predictions for every sample.
# NOTE(review): each line carries a single label, so asking for k=2 presumably
# explains the low precision next to ~0.99 recall -- confirm with k=1.
classifier.test_label(saveDataFile, k=2, threshold=0)

{'__label__1': {'f1score': 0.6054021151944,
  'precision': 0.43513070108279556,
  'recall': 0.9946},
 '__label__2': {'f1score': 0.6211984779388271,
  'precision': 0.4521735453315291,
  'recall': 0.9920215233324056},
 '__label__3': {'f1score': 0.630971941354904,
  'precision': 0.46122043606799706,
  'recall': 0.99845},
 '__label__4': {'f1score': 0.7385487729246035,
  'precision': 0.5884512212339404,
  'recall': 0.9914367269267365},
 '__label__5': {'f1score': 0.7370246683518146,
  'precision': 0.5840229246469195,
  'recall': 0.99865}}

In [None]:
## Real-world prediction
lable_to_cate={1:'technology',2:'car',3:'entertainment',4:'military',5:'sports'}
texts=['中新网 日电 2018 预赛 亚洲区 强赛 中国队 韩国队 较量 比赛 上半场 分钟 主场 作战 中国队 率先 打破 场上 僵局 利用 角球 机会 大宝 前点 攻门 得手 中国队 领先']

# Top-1 label per text. predict() returns a (labels, probabilities) pair, and
# label strings carry the "__label__" prefix (see the recorded predict()
# output earlier in the notebook), so the prefix is removed before the
# integer lookup. The original cell printed `lables` before assigning it.
top1_labels, top1_probs = classifier.predict(texts)
lables = top1_labels
print(lables)
print(lable_to_cate[int(lables[0][0].replace("__label__", ""))])

# Label + probability. predict_proba() belongs to the legacy unofficial
# module; the same (label, probability) pairs are built from predict().
lables = [
    [(label, float(prob)) for label, prob in zip(row_labels, row_probs)]
    for row_labels, row_probs in zip(top1_labels, top1_probs)
]
print(lables)

# Top-k labels.
topk_labels, topk_probs = classifier.predict(texts, k=3)
lables = topk_labels
print(lables)

# Top-k labels + probabilities.
lables = [
    [(label, float(prob)) for label, prob in zip(row_labels, row_probs)]
    for row_labels, row_probs in zip(topk_labels, topk_probs)
]
print(lables)