In [33]:
from konlpy.tag import Twitter
from sklearn.model_selection import train_test_split
from sklearn.cross_validation import StratifiedKFold, ShuffleSplit, cross_val_score
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.pipeline import Pipeline
from sklearn.metrics import classification_report, confusion_matrix
from collections import Counter

### False negative, False positive 샘플 검사

In [2]:
def cv_input(frozen_dir='./private/files/seodam_together_notags0326.csv', unfrozen_dir='./private/files/unfrozen_mixed0402.csv', row_limit=3211):
    """Load the frozen and unfrozen corpora and assemble classifier inputs.

    Parameters
    ----------
    frozen_dir : str
        Path of the CSV holding the frozen posts. Its ``text`` and ``freeze``
        columns are read; an ``Unnamed: 0`` column is dropped.
    unfrozen_dir : str
        Path of the CSV holding the unfrozen posts. Its ``text2`` column is
        read; an ``Unnamed: 0`` column is dropped.
    row_limit : int
        Maximum number of unfrozen rows to keep (taken from the top).

    Returns
    -------
    tuple
        ``(seodam_x, seodam_y, weight0)``: the texts (unfrozen first, then
        frozen), the labels (0 for unfrozen, 1 for frozen) and the sample
        weights (1 for unfrozen rows, the ``freeze`` value for frozen rows).
        All three arrays have the same length.
    """
    df_frozen = pd.read_csv(frozen_dir).drop(['Unnamed: 0'], axis=1)
    df_unfrozen = pd.read_csv(unfrozen_dir).drop(['Unnamed: 0'], axis=1)[:row_limit]

    unfrozen = np.array(df_unfrozen['text2'])
    frozen = np.array(df_frozen['text'])

    # Size labels and weights from the rows actually loaded. Using row_limit
    # for both classes misaligns x, y and w whenever the frozen file does not
    # contain exactly row_limit rows or the unfrozen file is shorter than it.
    n_unfrozen = len(unfrozen)
    n_frozen = len(frozen)

    seodam_x = np.append(unfrozen, frozen)
    seodam_y = np.append(np.zeros(n_unfrozen, dtype=int), np.ones(n_frozen, dtype=int))
    weight0 = np.append(np.ones(n_unfrozen, dtype=int), np.array(df_frozen['freeze']))
    return (seodam_x, seodam_y, weight0)

def tokenize_filtered(doc):
    """POS-tag ``doc`` with the Twitter tagger and drop uninformative tokens.

    The text is tagged with normalization and stemming enabled. Tokens tagged
    ``Josa``, ``Punctuation``, ``Determiner`` or ``URL`` are discarded.

    Parameters
    ----------
    doc : str
        Text to tokenize.

    Returns
    -------
    list
        One ``'surface/POS'`` string per retained token, in tagger order.
    """
    # The function is passed to CountVectorizer as its tokenizer, so it runs
    # once per document; build the tagger on first use and reuse it afterwards
    # instead of constructing a new one on every call.
    tagger = getattr(tokenize_filtered, '_tagger', None)
    if tagger is None:
        tagger = Twitter()
        tokenize_filtered._tagger = tagger

    excluded_tags = ('Josa', 'Punctuation', 'Determiner', 'URL')
    return ['/'.join(t)
            for t in tagger.pos(doc, norm=True, stem=True)
            if t[1] not in excluded_tags]

In [3]:
class FalseSamples(object):
    """Bundle a labelled, weighted corpus with the settings used to split it."""

    def __init__(self, X, y, w, tokenize=tokenize_filtered, weight=False, stop_words=None, random_state=0):
        """Store the data arrays and the configuration values unchanged.

        X, y and w are the samples, labels and per-sample weights; the
        remaining arguments are kept as attributes of the same name.
        """
        # Data.
        self.X, self.y, self.w = X, y, w
        # Configuration.
        self.tokenize = tokenize
        self.weight = weight
        self.stop_words = stop_words
        self.random_state = random_state

    def simple_split(self):
        """Hold out 10% of the data using the stored random_state.

        Returns ``(X_train, X_test, y_train, y_test, w_train)``; the weights
        of the held-out samples are not returned.
        """
        parts = train_test_split(
            self.X, self.y, self.w,
            test_size=0.1,
            random_state=self.random_state,
        )
        # parts is [X_train, X_test, y_train, y_test, w_train, w_test].
        return tuple(parts[:5])

In [4]:
# Build the labelled corpus and hold out 10% of it for evaluation.
# The split is seeded so that the misclassified samples inspected in the
# following cells are the same on every Restart & Run All.
X, y, weight = cv_input()
X_train0, X_test0, y_train, y_test, w_train, w_test = train_test_split(X, y, weight, test_size=0.1, random_state=0)

# Bag-of-words features; the vocabulary is learned from the training split only.
vect = CountVectorizer(tokenizer=tokenize_filtered)
X_train = vect.fit_transform(X_train0)
X_test = vect.transform(X_test0)

# Multinomial naive Bayes fitted with per-sample weights.
clf = MultinomialNB()
clf.fit(X_train, y_train, sample_weight=w_train)
result = clf.predict(X_test)

# NOTE: these two names are swapped relative to the usual convention with 1 as
# the positive class. mask_fn selects actual 0 / predicted 1 and mask_fp
# selects actual 1 / predicted 0. The names are kept because later cells
# index with them.
mask_fn = np.logical_and(y_test == 0, result == 1)
mask_fp = np.logical_and(y_test == 1, result == 0)

# Rows of the confusion matrix are true labels, columns are predictions, so
# row 1 holds the class-1 samples: recall = report[1, 1] / sum(report[1, :]).
report = confusion_matrix(y_test, result)
recall_rate = float(report[1, 1]) / (report[1, 0] + report[1, 1])

In [10]:
# Wrap the held-out raw texts in a one-column frame so that the boolean
# masks computed above can be used to pick out misclassified rows.
df_X = pd.DataFrame({'text': X_test0})

실제 1(냉동)인데 0(정상)으로 예측한 샘플 (False Negative — 아래 셀의 변수명은 `mask_fp`)

In [13]:
# .ix is deprecated and removed from current pandas; .loc accepts the same
# boolean mask and selects the same rows.
df_X.loc[mask_fp]

Unnamed: 0,text
21,결혼을 누구랑 하느냐는 팔자인듯 내가 사주랑 손금 관심이 있는데..(전에여기서 손금...
34,내가 결혼하기전에 꼬옥! 달성할 목표 세다리 !?양다리는 두 번정도 해봤음 삼다리만...
50,너희들의 인생치킨집(신촌) 알려줘 여친님께서 치킨이 고프다고하셔데이트때 맛난데로 가...
57,경제로볼걸ㅅㅂ 경영문제낸ㅅㄲ죽이고싶다
60,[대외교류처] 업무보조 근로장학생 모집 대외교류처(발전홍보팀)에서 업무보조학생 O...
87,경상도는 나라를 팔아도 새누리전라도는 같이 팔아도 국민의당
103,이거 비대위장 맞냐? *많이 먹어 비대해진 위장이다
107,근데 자연대 학점 2점 초반대 맞은 사람이 기술고시 변리사 한다고 해서 잘될거라고는...
110,일본의 13살 vs 20살 오른쪽이 20살 ㄷㄷ
112,동성애 다괜찬은데 모텔에서 샤워기뚜껑빼고 그걸로 관장안햇으면 좋겟다 다음으로 들어간...


실제 0(정상)인데 1(냉동)으로 예측한 샘플 (False Positive — 아래 셀의 변수명은 `mask_fn`)

In [None]:
# .ix is deprecated and removed from current pandas; .loc accepts the same
# boolean mask and selects the same rows.
df_X.loc[mask_fn]

### 클래스별 중요한 단어 찾기

In [21]:
# Indices of features whose log-probability in the first row of
# clf.feature_log_prob_ is above -7.
important_0 = [
    idx
    for idx, log_prob in enumerate(clf.feature_log_prob_[0])
    if log_prob > -7
]

In [22]:
# Indices of features whose log-probability in the second row of
# clf.feature_log_prob_ is above -7.
important_1 = [
    idx
    for idx, log_prob in enumerate(clf.feature_log_prob_[1])
    if log_prob > -7
]

In [23]:
# (token, feature index) pairs ordered by feature index, so that voca[i] can
# be looked up by position in the cells below.
voca = sorted(
    vect.vocabulary_.items(),
    key=lambda entry: entry[1],
)

0(정상)인 글에 등장할 확률이 높은 단어

In [24]:
# Vocabulary entries that pass the threshold for important_0 but not for
# important_1.
xx = [voca[i] for i in important_0 if i not in important_1]
pprint(xx)

[(4/Number, 226),
 (5/Number, 255),
 (a/Alpha, 369),
 (and/Alpha, 439),
 (of/Alpha, 1520),
 (our/Alpha, 1557),
 (that/Alpha, 2062),
 (the/Alpha, 2063),
 (to/Alpha, 2096),
 (we/Alpha, 2220),
 (’/Foreign, 2324),
 (ㅎㅎ/KoreanParticle, 2517),
 (ㅠㅠ/KoreanParticle, 2547),
 (공부/Noun, 3636),
 (교수/Noun, 3831),
 (면접/Noun, 7136),
 (시간/Noun, 10375),
 (제/Noun, 14194),
 (준비/Noun, 14591),
 (지원/Noun, 14839),
 (친구/Noun, 15734),
 (학기/Noun, 17026)]


1(냉동)인 글에 등장할 확률이 높은 단어

In [26]:
# Vocabulary entries that pass the threshold for important_1 but not for
# important_0.
yy = [voca[i] for i in important_1 if i not in important_0]
pprint(yy)

[(가지/Noun, 2708),
 (게/Noun, 3238),
 (경우/Noun, 3348),
 (그리고/Conjunction, 4195),
 (나가다/Verb, 4762),
 (남양주/Noun, 4932),
 (남자/Noun, 4936),
 (내다/Verb, 4994),
 (니/Noun, 5377),
 (대통령/Noun, 5763),
 (대한/Noun, 5787),
 (댓글/Noun, 5806),
 (돈/Noun, 5979),
 (돼다/Verb, 6098),
 (되어다/Verb, 6108),
 (따다/Verb, 6305),
 (또/Noun, 6383),
 (못/Noun, 7317),
 (박근혜/Noun, 7762),
 (병신/Noun, 8277),
 (보이다/Verb, 8337),
 (분들/Suffix, 8600),
 (뽑다/Verb, 9016),
 (사실/Noun, 9129),
 (사회/Noun, 9206),
 (새끼/Noun, 9369),
 (소리/Noun, 9871),
 (수준/Noun, 10120),
 (시키다/Verb, 10464),
 (쓸다/Verb, 10785),
 (애/Noun, 11152),
 (얘기/Noun, 11313),
 (여기/Noun, 11634),
 (여성/Noun, 11667),
 (여자/Noun, 11686),
 (올리다/Verb, 12118),
 (와/Noun, 12136),
 (외교관/Noun, 12210),
 (위/Noun, 12526),
 (이사회/Noun, 12980),
 (이유/Noun, 13035),
 (자기/Noun, 13460),
 (잘못/Noun, 13624),
 (존나/Noun, 14371),
 (주다/Verb, 14498),
 (지다/Verb, 14775),
 (직원/Noun, 14900),
 (취업/Noun, 15662),
 (학생/Noun, 17044),
 (한국/Noun, 17081),
 (함/Noun, 17172)]


### 냉동 단어들 vs 비 냉동 단어들 비교

In [43]:
def tokenize_filtered(doc):
    """POS-tag ``doc`` with the Twitter tagger and drop uninformative tokens.

    NOTE: this redefines the ``tokenize_filtered`` from the earlier cell. The
    only difference in filtering is that tokens tagged ``Alpha`` are discarded
    here as well, in addition to ``Josa``, ``Punctuation``, ``Determiner`` and
    ``URL``. Any cell run after this one uses this version.

    Parameters
    ----------
    doc : str
        Text to tokenize.

    Returns
    -------
    list
        One ``'surface/POS'`` string per retained token, in tagger order.
    """
    # Build the tagger on first use and reuse it afterwards instead of
    # constructing a new one on every call.
    tagger = getattr(tokenize_filtered, '_tagger', None)
    if tagger is None:
        tagger = Twitter()
        tokenize_filtered._tagger = tagger

    excluded_tags = ('Josa', 'Punctuation', 'Determiner', 'URL', 'Alpha')
    return ['/'.join(t)
            for t in tagger.pos(doc, norm=True, stem=True)
            if t[1] not in excluded_tags]

In [44]:
# Load both corpora in full, discarding the 'Unnamed: 0' column.
# NOTE(review): the frozen file here is dated 0322 while cv_input defaults to
# a 0326 file -- confirm which snapshot is intended.
df_frozen = (
    pd.read_csv('./private/files/seodam_together_notags0322.csv')
    .drop(['Unnamed: 0'], axis=1)
)
df_unfrozen = (
    pd.read_csv('./private/files/unfrozen_mixed0402.csv')
    .drop(['Unnamed: 0'], axis=1)
)

In [45]:
# Concatenate each corpus into a single space-separated text.
# Decode only when the joined text is a byte string: calling .decode()
# unconditionally fails when the values are already unicode text.
frozen_str = ' '.join(df_frozen.text.values)
if isinstance(frozen_str, bytes):
    frozen_str = frozen_str.decode('utf-8')

unfrozen_str = ' '.join(df_unfrozen.text2.values)
if isinstance(unfrozen_str, bytes):
    unfrozen_str = unfrozen_str.decode('utf-8')

In [46]:
# Tokenize the frozen text first, then the unfrozen text.
frozen_words, unfrozen_words = map(tokenize_filtered, (frozen_str, unfrozen_str))

In [47]:
# Occurrence count of every token in each corpus.
frozen_count, unfrozen_count = Counter(frozen_words), Counter(unfrozen_words)

In [48]:
# For every token in a corpus, its count there minus its count in the other
# corpus. A Counter returns 0 for a token it has not seen (without inserting
# it), so no membership test is needed. The previous test scanned the full
# token list for each key, which is quadratic in corpus size.
diff_frozen = {k: v - unfrozen_count[k] for k, v in frozen_count.items()}

diff_unfrozen = {k: v - frozen_count[k] for k, v in unfrozen_count.items()}

In [49]:
# Rank tokens by count difference, largest surplus first.
frozen_sort = sorted(
    diff_frozen.items(),
    key=lambda pair: pair[1],
    reverse=True,
)
unfrozen_sort = sorted(
    diff_unfrozen.items(),
    key=lambda pair: pair[1],
    reverse=True,
)

워드클라우드 생성

In [53]:
# Take the 30 top-ranked tokens of each corpus and strip every '/<word chars>'
# run (the POS suffix) from the token, keeping the score alongside it.
frozen_lst = [
    (re.sub(r'/\w+', '', token), score)
    for token, score in frozen_sort[:30]
]

unfrozen_lst = [
    (re.sub(r'/\w+', '', token), score)
    for token, score in unfrozen_sort[:30]
]

In [80]:
# NOTE(review): this import sits mid-notebook; the other imports are grouped
# in the first cell.
import pytagcloud

# Aliases of the (word, score) lists built in the previous cell.
tag_frozen = frozen_lst
tag_unfrozen = unfrozen_lst

# Build tag descriptions for both corpora; the two calls use different maxsize
# values (150 vs 100).
taglist = pytagcloud.make_tags(tag_frozen, maxsize=150)
taglist2 = pytagcloud.make_tags(tag_unfrozen, maxsize=100)

# Render the frozen-corpus cloud to 'wordcloud_f.jpg' in the working directory.
# NOTE(review): presumably requires a font registered with pytagcloud under the
# name 'Korean' -- confirm in the environment.
pytagcloud.create_tag_image(taglist, 'wordcloud_f.jpg', fontname='Korean', size=(500, 320), layout=4)

In [76]:
# Render the unfrozen-corpus cloud to 'wordcloud_u.jpg' with the same font,
# size and layout arguments as the frozen-corpus image.
pytagcloud.create_tag_image(taglist2, 'wordcloud_u.jpg', fontname='Korean', size=(500, 320), layout=4)

### Stop words 파일 생성

In [36]:
# Concatenate each corpus into a single space-separated text.
# Decode only when the joined text is a byte string: calling .decode()
# unconditionally fails when the values are already unicode text.
frozen = np.array(df_frozen['text'])
frozen_str = ' '.join(frozen)
if isinstance(frozen_str, bytes):
    frozen_str = frozen_str.decode('utf-8')

unfrozen = np.array(df_unfrozen['text2'])
unfrozen_str = ' '.join(unfrozen)
if isinstance(unfrozen_str, bytes):
    unfrozen_str = unfrozen_str.decode('utf-8')

In [38]:
# NOTE(review): tokenize_basic is not defined in the visible part of this
# notebook -- confirm where it comes from before running on a fresh kernel.
words = tokenize_basic(frozen_str)
# Occurrence count of each token returned for the frozen text.
counter = Counter(words)

In [39]:
# The 100 most frequent tokens, most frequent first, without their counts.
stop_words = [entry[0] for entry in counter.most_common(100)]

In [71]:
# Write the stop words as one comma-separated, UTF-8 encoded line.
# The file is opened in binary mode because the payload is already encoded
# bytes; writing bytes to a text-mode handle only works on Python 2.
# The handle gets its own name so it no longer overwrites the `words` token
# list built earlier.
with open('stopwords.txt', 'wb') as stopwords_file:
    stopwords_file.write(','.join(stop_words).encode('utf-8'))