# 載入一些套件與基本宣告

In [None]:
%pylab inline
import time
import re
import matplotlib.pyplot as plt
import pandas as pd
import string
import codecs
import os
import jieba
from sklearn.feature_extraction.text import CountVectorizer
from wordcloud import WordCloud
from sklearn import naive_bayes as bayes
from sklearn.model_selection import train_test_split
#簡轉繁
from hanziconv import HanziConv

# Raw text of every ham / spam e-mail, one string per message.
ham_text = []
spam_text = []
# Placeholders for the ham / spam data sets.
ham_data = []
spam_data = []
# Current working directory; the data folder is looked up beneath it.
SaveDirectory = os.getcwd()
# File names of the ham ("normal") and spam messages.
# os.path.join builds the path with the platform's separator instead of a
# hard-coded Windows backslash.
listham = os.listdir(os.path.join(SaveDirectory, 'data', 'normal'))
listspam = os.listdir(os.path.join(SaveDirectory, 'data', 'spam'))
# Size of each data set.

print('spam total：',len(listspam))
print('ham total：',len(listham))


Populating the interactive namespace from numpy and matplotlib
spam total： 7775
ham total： 7063


# 將ham文本資料轉成繁體並存成DataFrame

In [None]:
# Matches every character that is not a CJK ideograph in U+4E00..U+9FA5.
# Compiled once here instead of once per line.
non_chinese = re.compile('[^\u4e00-\u9fa5]')
# Start from an empty list so re-running this cell does not append every
# mail a second time.
ham_text = []
for fileName in listham:
    # Same directory that os.listdir() was given, built portably.
    with open(os.path.join(SaveDirectory, 'data', 'normal', fileName), 'r', encoding='gbk') as f:
        # Chinese-only fragments of this mail, one per non-empty line.
        text = []
        for line in f:
            # Drop non-Chinese characters, then discard lines that end up empty.
            text.extend(non_chinese.sub("", line).strip().split())
    # Join the fragments and convert Simplified to Traditional Chinese.
    ham_text.append(HanziConv.toTraditional(" ".join(text)))
ham_data = pd.DataFrame(ham_text, columns=['text'])  # one row per mail
ham_data['label'] = '0'  # class label: 0 = ham, 1 = spam
ham_data.head()


# 將spam文本資料轉成繁體並存成DataFrame

In [None]:
# Matches every character that is not a CJK ideograph in U+4E00..U+9FA5.
# Compiled once here instead of once per line.
non_chinese = re.compile('[^\u4e00-\u9fa5]')
# Start from an empty list so re-running this cell does not append every
# mail a second time.
spam_text = []
for fileName in listspam:
    # Same directory that os.listdir() was given, built portably.
    with open(os.path.join(SaveDirectory, 'data', 'spam', fileName), 'r', encoding='gbk') as f:
        # Chinese-only fragments of this mail, one per non-empty line.
        text = []
        for line in f:
            # Drop non-Chinese characters, then discard lines that end up empty.
            text.extend(non_chinese.sub("", line).strip().split())
    # Join the fragments and convert Simplified to Traditional Chinese.
    spam_text.append(HanziConv.toTraditional(" ".join(text)))
spam_data = pd.DataFrame(spam_text, columns=['text'])  # one row per mail
spam_data['label'] = '1'  # class label: 0 = ham, 1 = spam
spam_data.head()

# 將ham與spam組合並打亂

In [None]:
# Stack the spam rows on top of the ham rows with a fresh 0..n-1 index.
all_data = pd.concat([spam_data, ham_data], axis=0, ignore_index=True)
# sample(frac=1) returns every row in random order; the index is then
# rebuilt so it no longer reflects the pre-shuffle positions.
all_data = all_data.sample(frac=1).reset_index(drop=True)
print('data shape：', all_data.shape)
print('spams in rows：', len(all_data[all_data['label'] == "1"]))
print('hams in rows：', len(all_data[all_data['label'] == "0"]))
all_data.head(6)

# 載入停用詞

In [None]:
# Load the stop-word list (tokens filtered out during segmentation).
# - os.path.join receives each path component separately, so no backslash
#   escape or platform-specific separator is needed.
# - the with-block closes the file.
# - splitlines() copes with both "\r\n" and "\n" line endings.
# - a set gives constant-time `in` checks in the segmentation loops.
with open(os.path.join(SaveDirectory, 'data', 'stopwords_tr.txt'), 'r', encoding='utf-8') as stopword_file:
    stopwords = set(stopword_file.read().splitlines())

# 將文本利用Jieba斷詞、過濾停用詞

In [None]:
processed_texts = []
for text in all_data["text"]:
    # Segment the text with jieba and keep only tokens that are purely
    # alphabetic (str.isalpha) and not in the stop-word collection.
    # `and` is the logical operator; it also skips the stop-word lookup
    # for tokens that already failed isalpha().
    words = [
        seg for seg in jieba.cut(text)
        if seg.isalpha() and seg not in stopwords
    ]
    processed_texts.append(" ".join(words))
# Replace the raw text with the space-separated, filtered tokens.
all_data["text"] = processed_texts
all_data.head(6)

# 3. 計算HAM和SPAM 的 TF-IDF差異 DIFF
## 值越大的代表他在SPAM的可能性更大
「size_table」: 要選多少個重要的「詞」出來，等於決定特徵向量的維度數。Default:我設成200。 「ignore」: 字少於幾個以下就不要算

In [None]:
import re
def generate_key_list(all_data, size_table=200,ignore=2):
    """Select the words whose TF-IDF weight is most skewed towards spam.

    For every word of at least ``ignore`` characters the function computes
    the average number of occurrences per spam document and per genuine
    document, multiplies both by the word's inverse document frequency and
    ranks the words by ``spam_IDF - genuine_IDF`` in descending order.

    Parameters
    ----------
    all_data : pandas.DataFrame
        Must provide a ``text`` column (tokens separated by single spaces)
        and a ``label`` column holding the strings ``'1'`` (spam) or
        ``'0'`` (genuine).
    size_table : int, default 200
        Number of top-ranked words to keep.
    ignore : int, default 2
        Words shorter than this many characters are skipped.

    Returns
    -------
    dict
        Maps each selected word to its rank (0 = most spam-specific).
        Empty when no word passes the length filter.

    Notes
    -----
    The IDF is ``log10(number_of_documents / document_frequency)``.  The
    number of distinct words must not be used as the numerator: it is
    unrelated to the document count and can make the IDF negative.
    """
    # Function-scope import keeps this cell self-contained.
    from collections import Counter

    spam_counts = Counter()      # occurrences of each word in spam documents
    genuine_counts = Counter()   # occurrences of each word in other documents
    doc_freq = Counter()         # number of documents containing each word

    for text, label in zip(all_data['text'], all_data['label']):
        tokens = [tok for tok in text.split(" ") if len(tok) >= ignore]
        if label == '1':
            spam_counts.update(tokens)
        else:
            genuine_counts.update(tokens)
        # Count each word once per document; dict.fromkeys de-duplicates
        # while keeping first-seen order, so the table rows stay in a
        # deterministic order.
        doc_freq.update(list(dict.fromkeys(tokens)))

    keywords = list(doc_freq)
    if not keywords:
        return {}

    # Building every column from the same key list guarantees the rows
    # line up (a missing Counter key simply reads as 0).
    word_df = pd.DataFrame({
        'keyword': keywords,
        'genuine': [genuine_counts[word] for word in keywords],
        'spam': [spam_counts[word] for word in keywords],
        'IDF': [doc_freq[word] for word in keywords],
    })
    n_docs = all_data.shape[0]
    n_genuine = all_data[all_data['label'] == '0'].shape[0]
    n_spam = all_data[all_data['label'] == '1'].shape[0]

    # Per-class term frequency: occurrences per document of that class.
    word_df['genuine'] = word_df['genuine'].astype('float') / n_genuine
    word_df['spam'] = word_df['spam'].astype('float') / n_spam
    word_df['IDF'] = np.log10(n_docs / word_df['IDF'].astype('float'))
    word_df['genuine_IDF'] = word_df['genuine'] * word_df['IDF']
    word_df['spam_IDF'] = word_df['spam'] * word_df['IDF']
    word_df['diff'] = word_df['spam_IDF'] - word_df['genuine_IDF']

    ranked = word_df.sort_values('diff', ascending=False)
    return {
        word.strip(): rank
        for rank, word in enumerate(ranked.head(size_table)['keyword'])
    }
# Build the keyword table from the labelled corpus in all_data.
size_table = 200        # number of feature dimensions used to classify spam
word_len_ignored = 2    # words shorter than this are skipped
keyword_dict = generate_key_list(
    all_data,
    size_table=size_table,
    ignore=word_len_ignored,
)

In [None]:
# Print every selected keyword with its rank; rank 0 is the word whose
# spam-minus-genuine TF-IDF difference is largest.
for key in keyword_dict:
    print(f'{key}:{keyword_dict[key]}')

# 將原本斷詞好的數據轉用TFIDF所過濾出來的詞轉換成稀疏矩陣

In [None]:
def convert_Content(text, keyword_dict):
    """Encode one segmented text as a binary keyword-presence vector.

    Parameters
    ----------
    text : str
        Tokens separated by single spaces.
    keyword_dict : dict
        Maps a keyword to its position in the feature vector.

    Returns
    -------
    numpy.ndarray
        Integer vector of length ``len(keyword_dict)``; element ``i`` is 1
        when the keyword mapped to ``i`` occurs in ``text``, else 0.
    """
    res = np.zeros(len(keyword_dict), dtype=np.int_)
    for find in text.split(" "):
        try:
            res[keyword_dict[find]] = 1
        except (KeyError, IndexError):
            # KeyError: the token is not one of the selected keywords.
            # IndexError: the stored position lies outside the vector;
            # skipped, as callers have always relied on.
            # Anything else (e.g. KeyboardInterrupt) now propagates.
            continue
    return res
def raw2feature(all_data,keyword_dict):
    """Turn the labelled corpus into a feature matrix and a label vector.

    Each row of the matrix is the keyword-presence vector that
    ``convert_Content`` produces for the corresponding ``text`` entry; the
    label vector holds 1 where ``label == '1'`` and 0 elsewhere.

    Returns a two-element list ``[features, labels]``.
    """
    labels = np.int_(all_data.label=='1')
    features = np.zeros((all_data.shape[0], len(keyword_dict)))
    for row, text in enumerate(all_data['text']):
        features[row, :] = convert_Content(text, keyword_dict)
    return [features, labels]
     
# [feature matrix, label vector] for the whole corpus.
all_data_matrix = raw2feature(all_data=all_data, keyword_dict=keyword_dict)


# 利用留出法(hold-out，85%訓練／15%測試)將數據切成訓練集與測試集

In [None]:
# Training data and test data: a random 85 % / 15 % split of the feature
# matrix and label vector held in all_data_matrix.
train_set, test_set, trainlabel, testlabel = train_test_split(
    *all_data_matrix,
    test_size=0.15,
)
train_set

In [None]:
# Sizes of both splits, then the number of positive (label 1) rows in each.
print(f"train:{len(train_set)}")
print(f"test:{len(test_set)}")
for split_labels in (trainlabel, testlabel):
    print(sum(split_labels))

# 下面是傳統監督式NB.RF的方法

In [None]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.naive_bayes import GaussianNB
def learn(train,trainlabel):
    """Fit a Gaussian naive Bayes model and a random forest on ``train``.

    Parameters
    ----------
    train : array-like
        Feature matrix to fit on.
    trainlabel : array-like
        Labels matching the rows of ``train``.

    Returns
    -------
    tuple
        ``(model_NB, model_RF)``, both fitted.

    The training accuracy of each model is printed as a side effect.
    """
    # The models must be fitted on the `train` argument; reading the
    # notebook-level `train_set` here would silently ignore what the
    # caller passed in.
    model_NB = GaussianNB(priors=None)
    model_NB.fit(train, trainlabel)
    Y_hat_NB = model_NB.predict(train)

    model_RF = RandomForestClassifier(n_estimators=200, max_depth=None,
                                      min_samples_split=2, random_state=0)
    model_RF.fit(train, trainlabel)
    Y_hat_RF = model_RF.predict(train)

    n=np.size(trainlabel)
    print('Training Accuarcy NBclassifier : {:.2f}％'.format(sum(np.int_(Y_hat_NB==trainlabel))*100./n))
    print('Training Accuarcy RF: {:.2f}％'.format(sum(np.int_(Y_hat_RF==trainlabel))*100./n))
    return model_NB,model_RF
# Fit the naive Bayes and random forest models on the training split.
model_NB, model_RF = learn(train=train_set, trainlabel=trainlabel)

In [None]:
def test_func(test,testlabel,model):
    """Print the accuracy (in percent) of ``model`` on a labelled set.

    ``model.predict(test)`` is compared element-wise with ``testlabel``;
    the share of matches is printed together with ``model.__module__``.
    Nothing is returned.
    """
    predictions = model.predict(test)
    total = np.size(testlabel)
    correct = sum(np.int_(predictions == testlabel))
    accuracy = correct * 100. / total
    print(f'Testing Accuarcy: {accuracy:.2f}％ ({model.__module__})')
# Score the naive Bayes model, then the random forest, on the test split.
for model in (model_NB, model_RF):
    test_func(test_set, testlabel, model)


In [None]:
import matplotlib.pyplot as plt

# Hard-coded measurements: x holds the "Train data" values, y and y1 the
# accuracy plotted for the NB and the RF classifier respectively.
x = [10386, 11128, 11870, 12612, 13354]
y = [0.8949, 0.8986, 0.8962, 0.9106, 0.9329]
y1 = [0.9391, 0.9496, 0.9626, 0.9668, 0.9636]

plt.xlim(10000, 14000)
plt.ylim(0.8, 0.99)
# The legend entries are attached to the curves via their labels.
plt.plot(x, y, marker='.', mec='b', mfc='w', label='NB classifier')
plt.plot(x, y1, marker='.', mec='b', mfc='w', label='RF classifier')
plt.legend()
# plt.xticks(x, names, rotation=45)
plt.margins(0)
plt.subplots_adjust(bottom=0.15)
plt.xlabel(u"Train data")  # x-axis label
plt.ylabel("Accuracy")  # y-axis label
plt.title("Performance of the two approaches")  # figure title
plt.savefig("classifier ")
plt.show()

# 下面是測試PU

In [None]:
import pandas as pd #數據處理
import numpy as np  #隨機取數
import matplotlib.pyplot as plt   #繪圖
#把繪圖套入ipython中的魔法函數%
%matplotlib inline  
# Default figure size and font size for the plots that follow.
plt.rcParams.update({
    'figure.figsize': (7, 7),
    'font.size': 14,
})

## 載入上面數據集

In [None]:
# pandas copies of the training split: labels as a Series, features as a
# DataFrame, both with a default 0..n-1 index.
trainlabel_ = pd.Series(trainlabel.tolist())
train_set_ = pd.DataFrame(train_set.tolist())
# Number of positive labels in the training split.
sum(trainlabel_)


## 隱藏9成的數據集

In [None]:
# Keep an untouched copy of the true labels for later comparison.
trainlabel_orig = trainlabel.copy()

# Number of positive labels to hide.
hidden_size = 5936
# Draw that many positives at random (without replacement) and relabel
# them 0, i.e. turn them into unlabeled examples.
positive_index = trainlabel_[trainlabel_ == 1].index
to_hide = np.random.choice(positive_index, size=hidden_size, replace=False)
trainlabel_.loc[to_hide] = 0

## 使用基本的監督式隨機森林法來計算概率

In [None]:
# 使用一般的隨機森林算法
from sklearn.ensemble import RandomForestClassifier
# Standard supervised baseline: 1000 trees, fitted on every CPU core,
# trained on the labels that remain after hiding.
rf = RandomForestClassifier(n_estimators=1000, n_jobs=-1)
rf.fit(train_set, trainlabel_)

# Scores given by this approach, one row per training example.
results = pd.DataFrame(
    {
        'truth': trainlabel_orig,    # true label
        'label': trainlabel_,        # label after hiding (0 = unlabeled)
        # random-forest probability of class 1
        'output_std': rf.predict_proba(train_set)[:, 1],
    },
    columns=['truth', 'label', 'output_std'],
)
# print(results)

## PU-Bagging
### 1. 將所有正實例與一組以放回抽樣(bootstrap)取得的未標記實例結合，建立訓練集
### 2. 將正實例視為正類、未標記實例視為負類來訓練分類器
### 3. 將分類器應用於本輪隨機樣本中未包含的任何未標記數據點，稱為OOB(out of bag)，並記錄其分數
### 重複上面三個步驟，最後每個點的OOB分數為各輪所得分數的平均值

#### 原本是決策樹但效果不好改隨機森林

In [None]:
X = train_set_
y = trainlabel_
# Number of bagging rounds.
# from sklearn.tree import DecisionTreeClassifier
nestimators = 1000
# estimator = DecisionTreeClassifier()
from sklearn.ensemble import RandomForestClassifier
# Base learner refitted in every round: 1000 trees on every CPU core.
estimator = RandomForestClassifier(
    n_estimators = 1000,
    n_jobs = -1
)
# Indices of the positive and of the unlabeled examples.
iP = y[y > 0].index
iU = y[y <= 0].index

# For every data point, how many times it has been out of bag (OOB).
num_oob = pd.DataFrame(np.zeros(shape = y.shape), index = y.index)

# For every data point, the running sum of its OOB scores.
sum_oob = pd.DataFrame(np.zeros(shape = y.shape), index = y.index)

# Loop-invariant pieces, computed once instead of once per round.
X_positive = X[y > 0]
y_positive = y[y > 0]
unlabeled_ids = set(iU)

for _ in range(nestimators):
    # Bootstrap sample of unlabeled points, as many as there are positives.
    ib = np.random.choice(iU, replace=True, size = len(iP))

    # Unlabeled points that were not drawn this round.
    i_oob = list(unlabeled_ids - set(ib))

    # Training data: all positives plus the bootstrap sample.
    # pd.concat is used because DataFrame.append / Series.append no longer
    # exist in pandas 2.x.
    Xb = pd.concat([X_positive, X.loc[ib]])
    yb = pd.concat([y_positive, y.loc[ib]])
    estimator.fit(Xb, yb)

    # Accumulate this round's OOB scores and OOB counts.
    sum_oob.loc[i_oob, 0] += estimator.predict_proba(X.loc[i_oob])[:,1]
    num_oob.loc[i_oob, 0] += 1

# Average OOB score per point (NaN for points that were never out of bag).
results['output_bag'] = sum_oob / num_oob
print(results)

In [None]:
# Inspect the shape of the rows selected by `ib`, the bootstrap sample left
# over from the last round of the PU-bagging loop.
X.loc[ib].shape

In [None]:
# Cut-offs: how many of the top-rated unlabeled points are inspected.
ts = range(3936, hidden_size, 100)
# Only the standard and the PU-bagging scores exist in `results`; the
# remaining lists are kept as empty placeholders for the variants
# (output_skb / output_stp / output_all) that are not computed here.
y_skb, y_stp, y_all = [], [], []

# Sort the unlabeled rows once per score column; repeating the same sort
# for every cut-off would give the same order each time.
unlabeled_rows = results[results.label == 0]
truth_by_std = unlabeled_rows.sort_values('output_std', ascending = False).truth
truth_by_bag = unlabeled_rows.sort_values('output_bag', ascending = False).truth

# Share of hidden positives among the t best-scored unlabeled points.
y_std = [truth_by_std.head(t).mean() for t in ts]
y_bag = [truth_by_bag.head(t).mean() for t in ts]

# 提取RN跟P再訓練一個新分類器

In [None]:
# Unlabeled rows ordered from the highest to the lowest standard-RF score.
positiveX = results[results.label == 0].sort_values(
            'output_std', ascending = False
        ).index
from sklearn.ensemble import RandomForestClassifier
purf = RandomForestClassifier(
    n_estimators = 1000,  # 1000 trees
    n_jobs = -1           # use every CPU core
)
# Relabel the `hidden_size` best-scored unlabeled rows as positive on a
# copy, so trainlabel_ itself is never modified (and cannot be left in a
# half-edited state if fit() raises).
purf_labels = trainlabel_.copy()
purf_labels.loc[positiveX[:hidden_size]] = 1
purf.fit(train_set, purf_labels)

In [None]:
# Rows still labelled positive, and unlabeled rows ordered from the lowest
# to the highest standard-RF score.
positiveXpos = results[results.label == 1].index
positiveXneg = results[results.label == 0].sort_values('output_std', ascending = True).index
# Labels of all remaining positives and of the 653 lowest-scored unlabeled
# rows (NOTE(review): 653 is a hard-coded count - confirm how it was chosen).
positiveXpos = trainlabel_.loc[positiveXpos[:]]
positiveXneg = trainlabel_.loc[positiveXneg[:653]]
# Concatenate the two label sets.  Adding the Series with `+` would align
# them on their (disjoint) indexes and yield only NaN values; concat keeps
# the real labels.  sort_index() puts the rows in ascending index order.
positiveXlabel = pd.concat([positiveXpos, positiveXneg]).sort_index()

from sklearn.ensemble import RandomForestClassifier
purfposneg = RandomForestClassifier(
    n_estimators = 1000,  # 1000 trees
    n_jobs = -1           # use every CPU core
)
# Train only on the selected rows and their labels.
purfposneg.fit(train_set[positiveXlabel.index], positiveXlabel)

# --------------------------------------------------

In [None]:
# Unlabeled rows ordered from the highest to the lowest PU-bagging score.
positiveXpu = results[results.label == 0].sort_values('output_bag', ascending = False).index
from sklearn.ensemble import RandomForestClassifier
pubaggingrf = RandomForestClassifier(
    n_estimators = 1000,  # 1000 trees
    n_jobs = -1           # use every CPU core
)
# Relabel the `hidden_size` best-scored unlabeled rows as positive on a
# copy, so trainlabel_ itself is never modified (and cannot be left in a
# half-edited state if fit() raises).
pubagging_labels = trainlabel_.copy()
pubagging_labels.loc[positiveXpu[:hidden_size]] = 1
pubaggingrf.fit(train_set, pubagging_labels)

In [None]:
# Performance plot: share of hidden positives among the top-rated
# unlabeled points, for the standard classifier and for PU bagging.
plt.rcParams.update({'font.size': 16, 'figure.figsize': (15, 8)})

plt.plot(ts, y_std, lw = 5)
plt.plot(ts, y_bag, lw = 5)

# Show the y ticks as whole percentages.
vals = plt.gca().get_yticks()
percent_labels = []
for v in vals:
    percent_labels.append('%.0f%%' % (v*100))
plt.yticks(vals, percent_labels)

plt.xlabel('Number of unlabeled data points chosen from the top rated')
plt.ylabel('Percent of chosen that are secretly positive')
plt.legend(['Standard classifier', 'PU bagging'])
ylim = plt.gca().get_ylim()
plt.title('Performance of the three approaches and of their average')
plt.grid()
plt.show()

# 下面是實驗

In [None]:
def test_func(test,testlabel,model):
    """Report how often ``model`` predicts the given labels correctly.

    Predictions for ``test`` are compared element-wise with ``testlabel``
    and the percentage of matches is printed, followed by
    ``model.__module__`` in parentheses.  The function returns nothing.
    """
    Y_hat = model.predict(test)
    matches = np.int_(Y_hat == testlabel)
    percent_correct = sum(matches) * 100. / np.size(testlabel)
    print(f'Testing Accuarcy: {percent_correct:.2f}％ ({model.__module__})')
# Score every classifier on the held-out test split.
for model in (
    rf,            # standard supervised baseline
    purf,          # improved version after the PU step
    pubaggingrf,
    purfposneg,
):
    test_func(test_set, testlabel, model)

In [None]:
# Cut-offs: how many of the top-rated unlabeled points are inspected.
ts = range(100, hidden_size, 200)
# Sort the unlabeled rows once; repeating the same sort for every cut-off
# would give the same order each time.
truth_by_std = results[results.label == 0].sort_values(
    'output_std', ascending = False
).truth
# Share of hidden positives among the t best-scored unlabeled points.
y_std = [truth_by_std.head(t).mean() for t in ts]

# 以下是將測試集導入

In [None]:
# Containers for the external test set.
test_text = []
testspam_text = []
testham_text = []
test_data = []
# File names of the test messages; os.path.join builds the path with the
# platform's separator instead of a hard-coded Windows backslash.
listtest = os.listdir(os.path.join(SaveDirectory, 'data', 'test'))

In [None]:
# Matches every character that is not a CJK ideograph in U+4E00..U+9FA5.
# Compiled once here instead of once per line.
_NON_CHINESE = re.compile('[^\u4e00-\u9fa5]')


def _read_mail_text(path):
    """Return the Chinese-only text of one GBK mail file, in Traditional script."""
    fragments = []
    with open(path, 'r', encoding='gbk') as f:
        for line in f:
            # Drop non-Chinese characters, then discard lines left empty.
            fragments.extend(_NON_CHINESE.sub("", line).strip().split())
    return HanziConv.toTraditional(" ".join(fragments))


# Start from empty lists so re-running this cell does not duplicate mails.
testspam_text = []
testham_text = []
for fileName in listtest:
    # File names are parsed as integers: numbers above 1000 are treated as
    # spam, everything else as ham.
    is_spam = int(fileName) > 1000
    # Same directory that os.listdir() was given, built portably.
    mail_text = _read_mail_text(os.path.join(SaveDirectory, 'data', 'test', fileName))
    if is_spam:
        testspam_text.append(mail_text)
    else:
        testham_text.append(mail_text)

testspam_data = pd.DataFrame(testspam_text, columns=['text'])  # one row per mail
testham_data = pd.DataFrame(testham_text, columns=['text'])
testspam_data['label'] = '1'  # class label: 0 = ham, 1 = spam
testham_data['label'] = '0'
test_data = pd.concat([testspam_data, testham_data])
# Shuffle the rows and rebuild the index.
test_data = test_data.sample(frac=1).reset_index(drop=True)
test_data.head()


In [None]:
processed_texts = []
for text in test_data["text"]:
    # Segment the text with jieba and keep only tokens that are purely
    # alphabetic (str.isalpha) and not in the stop-word collection.
    # `and` is the logical operator; it also skips the stop-word lookup
    # for tokens that already failed isalpha().
    words = [
        seg for seg in jieba.cut(text)
        if seg.isalpha() and seg not in stopwords
    ]
    processed_texts.append(" ".join(words))
# Replace the raw text with the space-separated, filtered tokens.
test_data["text"] = processed_texts
test_data.head(6)

In [None]:
# Encode the external test set with the keyword table built earlier, then
# split the [features, labels] pair into its two parts.
test_data_matrix = raw2feature(test_data, keyword_dict)
test_testset, test_testlabel = test_data_matrix

In [None]:
def test_func(test,testlabel,model):
    """Print the test accuracy of ``model`` as a percentage.

    The predictions of ``model`` on ``test`` are matched element-wise
    against ``testlabel``; the hit rate and ``model.__module__`` are
    printed.  There is no return value.
    """
    hits = np.int_(model.predict(test) == testlabel)
    sample_count = np.size(testlabel)
    rate = sum(hits) * 100. / sample_count
    print(f'Testing Accuarcy: {rate:.2f}％ ({model.__module__})')
# Score every classifier on the external test set.
for model in (rf, model_RF, pubaggingrf, purfposneg):
    test_func(test_testset, test_testlabel, model)