In [7]:
from konlpy.tag import Twitter
from sklearn.model_selection import train_test_split
from sklearn.cross_validation import StratifiedKFold, ShuffleSplit, cross_val_score
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.pipeline import Pipeline
from sklearn.metrics import classification_report, confusion_matrix



In [2]:
# Load both corpora, then discard the 'Unnamed: 0' column that each CSV carries
# (presumably an index column written out when the files were saved).
df_frozen = pd.read_csv('./private/files/seodam_together_notags0322.csv')
df_frozen = df_frozen.drop(['Unnamed: 0'], axis=1)
df_unfrozen = pd.read_csv('./private/files/unfrozen_mixed0402.csv')
df_unfrozen = df_unfrozen.drop(['Unnamed: 0'], axis=1)

### False negative, False positive 샘플 검사

In [1]:
def cv_input(frozen_dir='./private/files/seodam_together_notags0326.csv', unfrozen_dir='./private/files/unfrozen_mixed0402.csv', row_limit=3211):
    """Build the text, label and sample-weight arrays for the frozen/unfrozen classifier.

    Parameters
    ----------
    frozen_dir : str
        CSV of frozen posts; must contain the columns 'Unnamed: 0', 'text' and 'freeze'.
    unfrozen_dir : str
        CSV of unfrozen posts; must contain the columns 'Unnamed: 0' and 'text2'.
    row_limit : int
        Maximum number of unfrozen rows to keep (the first `row_limit` rows are used).

    Returns
    -------
    tuple
        (seodam_x, seodam_y, weight0), all of the same length:
        unfrozen texts followed by frozen texts, labels 0 (unfrozen) / 1 (frozen),
        and weights of 1 for unfrozen rows / the 'freeze' column for frozen rows.
    """
    df_frozen = pd.read_csv(frozen_dir).drop(['Unnamed: 0'], axis=1)
    df_unfrozen = pd.read_csv(unfrozen_dir).drop(['Unnamed: 0'], axis=1)[:row_limit]

    unfrozen = np.array(df_unfrozen['text2'])
    frozen = np.array(df_frozen['text'])

    # Size labels and weights from the rows actually loaded. Using row_limit for
    # both classes only lines up when each file happens to supply exactly
    # row_limit rows; otherwise X, y and the weights end up with different lengths.
    n_unfrozen = len(unfrozen)
    n_frozen = len(frozen)

    weight0 = np.append(np.ones(n_unfrozen, dtype=int), np.array(df_frozen['freeze']))
    seodam_x = np.append(unfrozen, frozen)
    seodam_y = np.append(np.zeros(n_unfrozen, dtype=int), np.ones(n_frozen, dtype=int))
    return (seodam_x, seodam_y, weight0)

def tokenize_filtered(doc):
    """Tokenize `doc` into 'word/Tag' strings, skipping uninformative tags.

    The text is tagged with a fresh Twitter tagger (norm=True, stem=True) and
    every (word, tag) pair whose tag is Josa, Punctuation, Determiner or URL
    is left out of the result.
    """
    skipped_tags = ('Josa', 'Punctuation', 'Determiner', 'URL')
    tagged_pairs = Twitter().pos(doc, norm=True, stem=True)
    return ['/'.join(pair) for pair in tagged_pairs if pair[1] not in skipped_tags]

In [14]:
class FalseSamples(object):
    """Holds a labelled, weighted text data set plus vectorizer settings.

    Only `simple_split` is defined in this cell; `tokenize`, `weight` and
    `stop_words` are stored but not used by it.
    """

    def __init__(self, X, y, w, tokenize=tokenize_filtered, weight=False, stop_words=None, random_state=0):
        # Data: texts, labels and per-sample weights.
        self.X, self.y, self.w = X, y, w
        # Settings kept for later use.
        self.random_state = random_state
        self.stop_words = stop_words
        self.weight = weight
        self.tokenize = tokenize

    def simple_split(self):
        """Split X, y and w 90/10 using the stored random_state.

        Returns the 5-tuple (X_train, X_test, y_train, y_test, w_train);
        the test-set weights are not part of the return value.
        """
        parts = train_test_split(
            self.X, self.y, self.w,
            test_size=0.1,
            random_state=self.random_state,
        )
        # train_test_split yields [X_tr, X_te, y_tr, y_te, w_tr, w_te]; drop the last.
        return tuple(parts[:5])

In [40]:
X, y, weight = cv_input()
# Fixed seed so the split, and every figure derived from it below, is the same
# on each run instead of changing with every execution of the cell.
X_train0, X_test0, y_train, y_test, w_train, w_test = train_test_split(X, y, weight, test_size=0.1, random_state=0)

# Bag-of-words counts over the filtered POS tokens; the vocabulary is learned
# from the training texts only and then applied to the test texts.
vect = CountVectorizer(tokenizer=tokenize_filtered)
X_train = vect.fit_transform(X_train0)
X_test = vect.transform(X_test0)

clf = MultinomialNB()
clf.fit(X_train, y_train, sample_weight=w_train)
result = clf.predict(X_test)

# NOTE: with class 1 (frozen) as the positive class -- the convention that
# recall_rate below follows -- these names are the wrong way round:
# mask_fn selects true 0 / predicted 1 and mask_fp selects true 1 / predicted 0.
# The names are kept because later cells index with them.
# The masks line up with the raw texts in X_test0 (X_test is the count matrix).
mask_fn = np.logical_and(y_test == 0, result == 1)
mask_fp = np.logical_and(y_test == 1, result == 0)

# Rows of the confusion matrix are true labels, columns are predictions, so
# this is the recall of class 1: hits / (misses + hits).
report = confusion_matrix(y_test, result)
recall_rate = float(report[1, 1]) / (report[1, 0] + report[1, 1])

In [31]:
# Use the raw test texts (X_test0): by this point X_test holds the vectorized
# count matrix, not the documents this single 'text' column is meant to show.
df_X = pd.DataFrame(X_test0, columns=['text'])

실제 1(냉동)인데 0(정상)으로 예측

In [None]:
# Boolean-mask row selection; .loc replaces the deprecated .ix indexer.
df_X.loc[mask_fp]

실제 0(정상)인데 1(냉동)으로 예측

In [None]:
# Boolean-mask row selection; .loc replaces the deprecated .ix indexer.
df_X.loc[mask_fn]

In [68]:
# Indices of features whose log probability in row 0 of feature_log_prob_ exceeds -6.
important_0 = [
    idx
    for idx, log_prob in enumerate(clf.feature_log_prob_[0])
    if log_prob > -6
]

In [80]:
# Indices of features whose log probability in row 1 of feature_log_prob_ exceeds -6.
important_1 = [
    idx
    for idx, log_prob in enumerate(clf.feature_log_prob_[1])
    if log_prob > -6
]

In [74]:
# (term, column index) pairs ordered by column index, so voca[i] describes feature i.
voca = list(vect.vocabulary_.items())
voca.sort(key=lambda term_and_index: term_and_index[1])

0(정상)인 글에 등장할 확률이 높은 단어

In [77]:
# Vocabulary entries for the feature indices collected in important_0.
xx = [voca[idx] for idx in important_0]
pprint(xx)

1(냉동)인 글에 등장할 확률이 높은 단어

In [None]:
# Vocabulary entries for the feature indices collected in important_1.
yy = [voca[idx] for idx in important_1]
pprint(yy)

### 냉동 단어들 vs 비 냉동 단어들 비교

In [3]:
def tokenize_basic(doc):
    """Tokenize `doc` into 'word/Tag' strings, keeping every tag.

    Tags the text with a fresh Twitter tagger (norm=True, stem=True) and joins
    each (word, tag) pair with a slash.
    """
    tokens = []
    for pair in Twitter().pos(doc, norm=True, stem=True):
        tokens.append('/'.join(pair))
    return tokens

def tokenize_noun(doc):
    """Return whatever a fresh Twitter tagger's `nouns` method yields for `doc`."""
    return Twitter().nouns(doc)

def tokenize_filtered(doc):
    """Tokenize `doc` into 'word/Tag' strings, skipping uninformative tags.

    The text is tagged with a fresh Twitter tagger (norm=True, stem=True) and
    every (word, tag) pair whose tag is Josa, Punctuation, Determiner or URL
    is left out of the result.
    """
    # NOTE(review): this notebook defines a function with this same name in an
    # earlier cell as well; whichever cell ran last wins.
    skipped_tags = ('Josa', 'Punctuation', 'Determiner', 'URL')
    tagged_pairs = Twitter().pos(doc, norm=True, stem=True)
    return ['/'.join(pair) for pair in tagged_pairs if pair[1] not in skipped_tags]

In [4]:
# Concatenate every post of a class into one space-separated document.
# Only byte strings are decoded: calling .decode unconditionally fails when the
# joined text is already unicode (Python 3 str, or Python 2 unicode holding
# non-ASCII characters).
frozen_str = ' '.join(df_frozen.text.values)
if isinstance(frozen_str, bytes):
    frozen_str = frozen_str.decode('utf-8')
unfrozen_str = ' '.join(df_unfrozen.text2.values)
if isinstance(unfrozen_str, bytes):
    unfrozen_str = unfrozen_str.decode('utf-8')

In [5]:
# Tokenize each class's concatenated document with the filtered tokenizer.
frozen_words, unfrozen_words = [
    tokenize_filtered(text) for text in (frozen_str, unfrozen_str)
]

In [6]:
# Token frequencies per class.
frozen_count, unfrozen_count = Counter(frozen_words), Counter(unfrozen_words)

In [7]:
# Per-token surplus: occurrences in one class minus occurrences in the other.
# A token the other class never produced counts as 0 there, which matches the
# original "not present -> keep the raw count" branch. Looking the token up in
# the count mapping replaces a linear scan of the full token list per key.
diff_frozen = {
    token: count - unfrozen_count.get(token, 0)
    for token, count in frozen_count.items()
}

diff_unfrozen = {
    token: count - frozen_count.get(token, 0)
    for token, count in unfrozen_count.items()
}

In [8]:
# (token, surplus) pairs, largest surplus first.
frozen_sort = list(diff_frozen.items())
frozen_sort.sort(key=lambda pair: pair[1], reverse=True)
unfrozen_sort = list(diff_unfrozen.items())
unfrozen_sort.sort(key=lambda pair: pair[1], reverse=True)

워드클라우드 생성

In [21]:
# Top 20 tokens per class as (word, surplus) pairs with the POS tag removed.
# Tokens have the form 'word/Tag'; splitting once from the right drops only the
# trailing tag, whereas the previous unanchored pattern r'/\w+' would also eat
# any earlier '/xyz' run inside the word itself.
frozen_lst = [(token.rsplit('/', 1)[0], surplus) for token, surplus in frozen_sort[:20]]

unfrozen_lst = [(token.rsplit('/', 1)[0], surplus) for token, surplus in unfrozen_sort[:20]]

In [28]:
# Render one word cloud image per class from the (word, surplus) pair lists.
# pytagcloud is imported in this cell rather than alongside the other imports.
import pytagcloud

# Aliases for the pair lists built in the preceding cell.
tag_frozen = frozen_lst
tag_unfrozen = unfrozen_lst

# Build the tag lists; the two classes use different maxsize values (150 vs 100).
taglist = pytagcloud.make_tags(tag_frozen, maxsize=150)
taglist2 = pytagcloud.make_tags(tag_unfrozen, maxsize=100)

# Write the images to relative paths, with different canvas sizes per class.
# NOTE(review): fontname='Korean' presumably has to be a font registered with
# the local pytagcloud installation -- confirm before running elsewhere.
pytagcloud.create_tag_image(taglist, 'wordcloud_f.jpg', fontname='Korean', size=(700, 400))
pytagcloud.create_tag_image(taglist2, 'wordcloud_u.jpg', fontname='Korean', size=(900, 600))

### Stop words 파일 생성

In [36]:
# Raw text arrays per class and their space-joined concatenations.
# Only byte strings are decoded: calling .decode unconditionally fails when the
# joined text is already unicode (Python 3 str, or Python 2 unicode holding
# non-ASCII characters).
frozen = np.array(df_frozen['text'])
frozen_str = ' '.join(frozen)
if isinstance(frozen_str, bytes):
    frozen_str = frozen_str.decode('utf-8')
unfrozen = np.array(df_unfrozen['text2'])
unfrozen_str = ' '.join(unfrozen)
if isinstance(unfrozen_str, bytes):
    unfrozen_str = unfrozen_str.decode('utf-8')

In [38]:
# Frequency of every 'word/Tag' token in the frozen corpus (no tag filtering).
words = tokenize_basic(frozen_str)
counter = Counter()
counter.update(words)

In [39]:
# The 100 most frequent tokens, most common first, used as the stop-word list.
stop_words = [token for token, _freq in counter.most_common(100)]

In [71]:
# Persist the stop words as one comma-separated, UTF-8 encoded line.
# The file is opened in binary mode because the payload is already encoded:
# a text-mode handle rejects bytes on Python 3, while binary mode works on both
# Python 2 and 3. The handle also gets its own name so it does not rebind
# `words`, which holds the token list.
with open('stopwords.txt', 'wb') as stopword_file:
    stopword_file.write(','.join(stop_words).encode('utf-8'))