# Naive Bayes Classifier 

## 1) Traning set 불러오기

In [63]:
# training set, test set 만들기
import pandas as pd

data_set = pd.read_csv('./data/model/training_set.csv').dropna()
data_set = data_set[(data_set.lable==1.0) | (data_set.lable==-1.0)]

In [65]:
data_set.head()

Unnamed: 0,date,ngrams in doc,lable
1,2010-12-09,"재정/NNG;문제/NNG;지정학/NNG;위험/NNG,금리/NNG;시장/NNG;예상/...",1.0
3,2015-12-30,"경제/NNG;성장률/NNG;전망/NNG;하향/NNG,성장/NNG;하방/NNG;위험/...",-1.0
6,2009-11-04,"수출업체/NNG;네고/NNG;하락/NNG,시중/NNG;유동성/NNG;축소/NNG,유...",-1.0
7,2009-08-20,"불안정/NNG;안전자산/NNG;선호/NNG,채권시장/NNG;금리/NNG;고점/NNG...",1.0
8,2017-06-05,"fed/NNG;금리/NNG;인상/NNG;가능성/NNG;높/VA,글로벌/NNG;통화정...",1.0


## 2) Traning

In [32]:
# traning : traning set 으로 토큰별 빈도수 계산
from collections import defaultdict

def set_ngram_list(data_set) :
    # 토큰별로 문서내 빈도수 카운팅
    wordfreq = defaultdict(lambda : [0, 0])
    for i, row in data_set.iterrows():
        words = row['ngrams in doc'].replace('@@@',',').replace('.',',').split(',')

        # 문자열 길이가 1인것은 제외
        if len(words[0].split('/')[0]) == 1 :
            continue

        for word in words :
            if row['lable'] == 1 :
                wordfreq[word][1] += 1
            elif row['lable'] == -1 :
                wordfreq[word][0] += 1

        if i % 10000 == 0 :
            print("{} / {}".format(i, data_set.shape[0]))
            #break

    pd.DataFrame.from_dict(wordfreq, orient='index').to_csv('./data/model/ngram_list.csv')
    
set_ngram_list(data_set)

In [34]:
df = pd.read_csv('./data/model/ngram_list.csv')

In [43]:
df.shape

(139694, 3)

In [51]:
import math

# 예측 및 모델 검증
def predict(test_set, dic_df, sum_1, sum_0, intensity) :
    dic_df["polarity score"] = dic_df["w|pos"]/dic_df["w|neg"]
    dic_df["intensity"] = [x if x > 1 else 1/x for x in dic_df['polarity score']]
    dic_df = dic_df.loc[dic_df.intensity>intensity]

    dic = dic_df.to_dict('index')
    
    prob_pos_tot = math.log(sum_1/(sum_1+sum_0))
    prob_neg_tot = math.log(sum_0/(sum_1+sum_0))
    
    TP = FN = FP = TN = cnt = 0
    predict=[]

    for i, row in test_set.iterrows():
        ngrams = row['ngrams in doc'].split(',')
        lable = row['lable']
        prob_predict = prob_pos = prob_neg = 0

        tmp_token = ""
        tmp_pos = 0
        tmp_neg = 0
        try :
            for token in ngrams :
                if token == '' :
                    continue 
                
                if token in dic.keys() :
                    prob_pos += dic[token]['log(w|pos)']
                    prob_neg += dic[token]['log(w|neg)']
                    
            prob_pos = math.exp(prob_pos_tot + prob_pos)
            prob_neg = math.exp(prob_neg_tot + prob_neg)
        except Exception as e :
            print("===============")
                        
            print(str(e))
            print("prob_pos_tot : {:10.4f}".format(prob_pos_tot))
            print("prob_pos : {:10.4f}".format(prob_pos))
            print("prob_neg_tot : {:10.4f}".format(prob_neg_tot))
            print("prob_neg : {:10.4f}".format(prob_neg))
            print("sum_1 : {:10.4f}".format(sum_1))
            print("sum_0 : {:10.4f}".format(sum_0))
            print(ngrams)


        if prob_pos+prob_neg == 0 :
            prob_predict = 0
        else :
            prob_predict = prob_pos/(prob_pos+prob_neg)

        if prob_predict > 0.5 and lable == '1' :
            predict.append(1.0)
            TP += 1
        elif prob_predict > 0.5 and lable == '-1' :
            predict.append(1.0)
            FP += 1
        elif prob_predict < 0.5 and lable == '1' :
            predict.append(-1.0)
            FN += 1
        elif prob_predict < 0.5 and lable == '-1' :
            predict.append(-1.0)
            TN += 1
            
    print('TP={} / TN={} / FP={} / FN={}'.format(TP, TN, FP, FN))
    #ACC = (TP + TN)  / (전체 데이타 수 = P + N)
    ACC = (TP + TN) / (TP + TN + FP + FN) * 100
    # ERR = (FN+FP) / (전체 데이타수 = P+N)
    ERR = (FN+FP) / (TP + TN + FP + FN) * 100
    # SN = (TP) / P
    SN = (TP) / (TP + FN) * 100
    # PREC = TP / (TP+FP)
    PREC = TP / (TP+FP) * 100
    print("ACC={:7.3f}% / ERR = {:7.3f}% / SN = {:7.3f}% / PREC = {:7.3f}%".format(ACC, ERR, SN, PREC))

In [52]:
#dictionary 파일 저장
import pandas as pd
import numpy as np
from sklearn.utils import shuffle


def traning(data_set) :
    intensity = 1.3
    no_bagging = 1
    k = 0.5
    
    df = pd.read_csv('./data/model/ngram_list.csv', index_col="Unnamed: 0")
    df["freq"] =0
    df["w|pos"] =0
    df["w|neg"] =0

    for i in range(0,no_bagging) :
        print("========== bagging : " + str(i+1))
        data_set = shuffle(data_set)
        training_set = data_set[:int(len(data_set)*0.9)]
        test_set = data_set[int(len(data_set)*0.9):]
        
        sum_1 = df['1'].sum()#training_set.loc[training_set['lable']>0].shape[0]
        sum_0 = df['0'].sum()#training_set.loc[training_set['lable']<0].shape[0]

        df["w|pos"] += (df['1']+k)/(sum_1+k*2)
        df["w|neg"] += (df['0']+k)/(sum_0+k*2)
        df["log(w|pos)"] = np.log(df["w|pos"])
        df["log(w|neg)"] = np.log(df["w|neg"])
        
        predict(test_set, df, sum_1, sum_0, intensity)
 
    df["freq"] = df['0']+df['1'] # 전체 토큰별 빈도수 계산
    #df = df.loc[df.freq > 15] # 빈도가 15 초과인것만 필터링  
    df["w|pos"] = df["w|pos"]/no_bagging
    df["w|neg"] = df["w|neg"]/no_bagging        
    #df["log(w|pos)"] = np.log(df["w|pos"])
    #df["log(w|neg)"] = np.log(df["w|neg"])
    df["polarity score"] = df["w|pos"]/df["w|neg"]
    df["intensity"] = [x if x > 1 else 1/x for x in df['polarity score']]
    df["lable"] = [1 if x > 1 else -1 for x in df['polarity score']]
    df.loc[df.intensity>intensity].to_csv('./data/model/bok_dictionary.csv')
    
traning(data_set)

TP=6838 / TN=8730 / FP=2670 / FN=4129
ACC= 69.603% / ERR =  30.397% / SN =  62.351% / PREC =  71.918%


In [54]:
import pandas as pd
ff = pd.read_csv('./data/model/bok_dictionary.csv', index_col="Unnamed: 0")
tmp = ff[ff['lable']==1].sort_values('log(w|pos)', ascending=False )
tmp[tmp.index.str.contains(";")].head(30)

Unnamed: 0,0,1,freq,w|pos,w|neg,log(w|pos),log(w|neg),polarity score,intensity,label
금리/NNG;인상/NNG,10656,13276,23932,0.000628,0.000481,-7.372978,-7.639898,1.305936,1.305936,1
콜/NNG;금리/NNG;인상/NNG,771,1400,2171,6.6e-05,3.5e-05,-9.622144,-10.265487,1.90283,1.90283,1
물가/NNG;상승/NNG,1064,1323,2387,6.3e-05,4.8e-05,-9.678694,-9.943563,1.30326,1.30326,1
인플레이션/NNG;압력/NNG,853,1077,1930,5.1e-05,3.9e-05,-9.88433,-10.164478,1.323325,1.323325,1
ecb/NNG;금리/NNG;인상/NNG,659,1066,1725,5e-05,3e-05,-9.894592,-10.422341,1.695114,1.695114,1
지정학/NNG;위험/NNG,689,929,1618,4.4e-05,3.1e-05,-10.032082,-10.377857,1.413084,1.413084,1
인플레이션/NNG;억제/NNG,559,800,1359,3.8e-05,2.5e-05,-10.181493,-10.58678,1.499733,1.499733,1
물가/NNG;상승/NNG;압력/NNG,562,736,1298,3.5e-05,2.5e-05,-10.26482,-10.581432,1.37247,1.37247,1
서브프라임/NNG;부실/NNG,430,717,1147,3.4e-05,1.9e-05,-10.290956,-10.848876,1.747035,1.747035,1
수출/NNG;호조/NNG,440,695,1135,3.3e-05,2e-05,-10.322098,-10.825913,1.655023,1.655023,1


In [62]:
tmp[tmp.index=='경제/NNG;성장/NNG']

Unnamed: 0,0,1,freq,w|pos,w|neg,log(w|pos),log(w|neg),polarity score,intensity,label


In [57]:
tmp = ff[ff['lable']==-1].sort_values('log(w|neg)', ascending=False )
tmp[tmp.index.str.contains(";")].head(30)

Unnamed: 0,0,1,freq,w|pos,w|neg,log(w|pos),log(w|neg),polarity score,intensity,label
금리/NNG;인하/NNG,10227,5881,16108,0.000278,0.000462,-8.187162,-7.680988,0.602797,1.658932,-1
투자/NNG;플러스/NNG,3744,1737,5481,8.2e-05,0.000169,-9.406526,-8.68578,0.486389,2.055968,-1
경기/NNG;부양/NNG,3132,2179,5311,0.000103,0.000141,-9.179878,-8.864237,0.729321,1.371139,-1
금리/NNG;내리/VV,2692,1727,4419,8.2e-05,0.000121,-9.412298,-9.015598,0.672535,1.486911,-1
경기/NNG;침체/NNG,1892,1356,3248,6.4e-05,8.5e-05,-9.654066,-9.368169,0.75134,1.330955,-1
금리/NNG;낮/VV,1380,974,2354,4.6e-05,6.2e-05,-9.984805,-9.683622,0.739943,1.351456,-1
금융시장/NNG;불안/NNG,1046,735,1781,3.5e-05,4.7e-05,-10.266179,-9.960617,0.736709,1.357387,-1
금융시장/NNG;안정/NNG,1020,703,1723,3.3e-05,4.6e-05,-10.310661,-9.985775,0.72261,1.383873,-1
인하/NNG;효과/NNG,1015,413,1428,2e-05,4.6e-05,-10.842072,-9.990687,0.426824,2.342889,-1
ecb/NNG;금리/NNG;인하/NNG,913,553,1466,2.6e-05,4.1e-05,-10.550467,-10.09654,0.635129,1.574484,-1


## 3) prediction & 검증

In [102]:
def r_squared(y_true, y_hat):
    ssr = 0
    sst = 0
    e = np.subtract(y_true, y_hat)
    y_mean = np.mean(y_true)
    for item in e:
        ssr += item**2
    for item in y_true:
        sst += (item - y_mean)**2
    r2 = 1 - ssr / sst
    return r2

In [103]:
#r_squared(lable_ls, predict)